From b2cfc4e2e71d0f0a5113ddfbd93c8285cc4d74e4 Mon Sep 17 00:00:00 2001 From: rsc Date: Tue, 30 Sep 2003 17:47:41 +0000 Subject: Initial revision --- src/libregexp/LICENSE | 258 ++++++++++++++ src/libregexp/Make.Darwin-PowerMacintosh | 6 + src/libregexp/Make.FreeBSD-386 | 7 + src/libregexp/Make.HP-UX-9000 | 6 + src/libregexp/Make.Linux-386 | 7 + src/libregexp/Make.NetBSD-386 | 7 + src/libregexp/Make.OSF1-alpha | 6 + src/libregexp/Make.SunOS-sun4u | 2 + src/libregexp/Make.SunOS-sun4u-cc | 6 + src/libregexp/Make.SunOS-sun4u-gcc | 6 + src/libregexp/Makefile | 106 ++++++ src/libregexp/Makefile.MID | 34 ++ src/libregexp/NOTICE | 25 ++ src/libregexp/README | 7 + src/libregexp/bundle.ports | 51 +++ src/libregexp/lib9.h | 6 + src/libregexp/mkfile | 1 + src/libregexp/regaux.c | 76 +++++ src/libregexp/regcomp.c | 557 +++++++++++++++++++++++++++++++ src/libregexp/regcomp.h | 74 ++++ src/libregexp/regerror.c | 14 + src/libregexp/regexec.c | 219 ++++++++++++ src/libregexp/regexp9.3 | 227 +++++++++++++ src/libregexp/regexp9.7 | 150 +++++++++ src/libregexp/regexp9.h | 71 ++++ src/libregexp/regsub.c | 62 ++++ src/libregexp/rpm.spec | 34 ++ src/libregexp/rregaux.c | 26 ++ src/libregexp/rregexec.c | 213 ++++++++++++ src/libregexp/rregsub.c | 55 +++ src/libregexp/test.c | 46 +++ src/libregexp/test2.c | 20 ++ 32 files changed, 2385 insertions(+) create mode 100644 src/libregexp/LICENSE create mode 100644 src/libregexp/Make.Darwin-PowerMacintosh create mode 100644 src/libregexp/Make.FreeBSD-386 create mode 100644 src/libregexp/Make.HP-UX-9000 create mode 100644 src/libregexp/Make.Linux-386 create mode 100644 src/libregexp/Make.NetBSD-386 create mode 100644 src/libregexp/Make.OSF1-alpha create mode 100644 src/libregexp/Make.SunOS-sun4u create mode 100644 src/libregexp/Make.SunOS-sun4u-cc create mode 100644 src/libregexp/Make.SunOS-sun4u-gcc create mode 100644 src/libregexp/Makefile create mode 100644 src/libregexp/Makefile.MID create mode 100644 src/libregexp/NOTICE create mode 100644 src/libregexp/README create mode 100644 src/libregexp/bundle.ports create mode 100644 src/libregexp/lib9.h create mode 100644 src/libregexp/mkfile create mode 100644 src/libregexp/regaux.c create mode 100644 src/libregexp/regcomp.c create mode 100644 src/libregexp/regcomp.h create mode 100644 src/libregexp/regerror.c create mode 100644 src/libregexp/regexec.c create mode 100644 src/libregexp/regexp9.3 create mode 100644 src/libregexp/regexp9.7 create mode 100644 src/libregexp/regexp9.h create mode 100644 src/libregexp/regsub.c create mode 100644 src/libregexp/rpm.spec create mode 100644 src/libregexp/rregaux.c create mode 100644 src/libregexp/rregexec.c create mode 100644 src/libregexp/rregsub.c create mode 100644 src/libregexp/test.c create mode 100644 src/libregexp/test2.c (limited to 'src/libregexp') diff --git a/src/libregexp/LICENSE b/src/libregexp/LICENSE new file mode 100644 index 00000000..a5d7d87d --- /dev/null +++ b/src/libregexp/LICENSE @@ -0,0 +1,258 @@ +The Plan 9 software is provided under the terms of the +Lucent Public License, Version 1.02, reproduced below, +with the following exceptions: + +1. No right is granted to create derivative works of or + to redistribute (other than with the Plan 9 Operating System) + the screen imprinter fonts identified in subdirectory + /lib/font/bit/lucida and printer fonts (Lucida Sans Unicode, Lucida + Sans Italic, Lucida Sans Demibold, Lucida Typewriter, Lucida Sans + Typewriter83), identified in subdirectory /sys/lib/postscript/font. + These directories contain material copyrights by B&H Inc. and Y&Y Inc. + +2. The printer fonts identified in subdirectory /sys/lib/ghostscript/font + are subject to the GNU GPL, reproduced in the file /LICENSE.gpl. + +3. The ghostscript program in the subdirectory /sys/src/cmd/gs is + covered by the Aladdin Free Public License, reproduced in the file + /LICENSE.afpl. + +=================================================================== + +Lucent Public License Version 1.02 + +THE ACCOMPANYING PROGRAM IS PROVIDED UNDER THE TERMS OF THIS PUBLIC +LICENSE ("AGREEMENT"). ANY USE, REPRODUCTION OR DISTRIBUTION OF THE +PROGRAM CONSTITUTES RECIPIENT'S ACCEPTANCE OF THIS AGREEMENT. + +1. DEFINITIONS + +"Contribution" means: + + a. in the case of Lucent Technologies Inc. ("LUCENT"), the Original + Program, and + b. in the case of each Contributor, + + i. changes to the Program, and + ii. additions to the Program; + + where such changes and/or additions to the Program were added to the + Program by such Contributor itself or anyone acting on such + Contributor's behalf, and the Contributor explicitly consents, in + accordance with Section 3C, to characterization of the changes and/or + additions as Contributions. + +"Contributor" means LUCENT and any other entity that has Contributed a +Contribution to the Program. + +"Distributor" means a Recipient that distributes the Program, +modifications to the Program, or any part thereof. + +"Licensed Patents" mean patent claims licensable by a Contributor +which are necessarily infringed by the use or sale of its Contribution +alone or when combined with the Program. + +"Original Program" means the original version of the software +accompanying this Agreement as released by LUCENT, including source +code, object code and documentation, if any. + +"Program" means the Original Program and Contributions or any part +thereof + +"Recipient" means anyone who receives the Program under this +Agreement, including all Contributors. + +2. GRANT OF RIGHTS + + a. Subject to the terms of this Agreement, each Contributor hereby + grants Recipient a non-exclusive, worldwide, royalty-free copyright + license to reproduce, prepare derivative works of, publicly display, + publicly perform, distribute and sublicense the Contribution of such + Contributor, if any, and such derivative works, in source code and + object code form. + + b. Subject to the terms of this Agreement, each Contributor hereby + grants Recipient a non-exclusive, worldwide, royalty-free patent + license under Licensed Patents to make, use, sell, offer to sell, + import and otherwise transfer the Contribution of such Contributor, if + any, in source code and object code form. The patent license granted + by a Contributor shall also apply to the combination of the + Contribution of that Contributor and the Program if, at the time the + Contribution is added by the Contributor, such addition of the + Contribution causes such combination to be covered by the Licensed + Patents. The patent license granted by a Contributor shall not apply + to (i) any other combinations which include the Contribution, nor to + (ii) Contributions of other Contributors. No hardware per se is + licensed hereunder. + + c. Recipient understands that although each Contributor grants the + licenses to its Contributions set forth herein, no assurances are + provided by any Contributor that the Program does not infringe the + patent or other intellectual property rights of any other entity. Each + Contributor disclaims any liability to Recipient for claims brought by + any other entity based on infringement of intellectual property rights + or otherwise. As a condition to exercising the rights and licenses + granted hereunder, each Recipient hereby assumes sole responsibility + to secure any other intellectual property rights needed, if any. For + example, if a third party patent license is required to allow + Recipient to distribute the Program, it is Recipient's responsibility + to acquire that license before distributing the Program. + + d. Each Contributor represents that to its knowledge it has sufficient + copyright rights in its Contribution, if any, to grant the copyright + license set forth in this Agreement. + +3. REQUIREMENTS + +A. Distributor may choose to distribute the Program in any form under +this Agreement or under its own license agreement, provided that: + + a. it complies with the terms and conditions of this Agreement; + + b. if the Program is distributed in source code or other tangible + form, a copy of this Agreement or Distributor's own license agreement + is included with each copy of the Program; and + + c. if distributed under Distributor's own license agreement, such + license agreement: + + i. effectively disclaims on behalf of all Contributors all warranties + and conditions, express and implied, including warranties or + conditions of title and non-infringement, and implied warranties or + conditions of merchantability and fitness for a particular purpose; + ii. effectively excludes on behalf of all Contributors all liability + for damages, including direct, indirect, special, incidental and + consequential damages, such as lost profits; and + iii. states that any provisions which differ from this Agreement are + offered by that Contributor alone and not by any other party. + +B. Each Distributor must include the following in a conspicuous + location in the Program: + + Copyright (C) 2003, Lucent Technologies Inc. and others. All Rights + Reserved. + +C. In addition, each Contributor must identify itself as the +originator of its Contribution in a manner that reasonably allows +subsequent Recipients to identify the originator of the Contribution. +Also, each Contributor must agree that the additions and/or changes +are intended to be a Contribution. Once a Contribution is contributed, +it may not thereafter be revoked. + +4. COMMERCIAL DISTRIBUTION + +Commercial distributors of software may accept certain +responsibilities with respect to end users, business partners and the +like. While this license is intended to facilitate the commercial use +of the Program, the Distributor who includes the Program in a +commercial product offering should do so in a manner which does not +create potential liability for Contributors. Therefore, if a +Distributor includes the Program in a commercial product offering, +such Distributor ("Commercial Distributor") hereby agrees to defend +and indemnify every Contributor ("Indemnified Contributor") against +any losses, damages and costs (collectively"Losses") arising from +claims, lawsuits and other legal actions brought by a third party +against the Indemnified Contributor to the extent caused by the acts +or omissions of such Commercial Distributor in connection with its +distribution of the Program in a commercial product offering. The +obligations in this section do not apply to any claims or Losses +relating to any actual or alleged intellectual property infringement. +In order to qualify, an Indemnified Contributor must: a) promptly +notify the Commercial Distributor in writing of such claim, and b) +allow the Commercial Distributor to control, and cooperate with the +Commercial Distributor in, the defense and any related settlement +negotiations. The Indemnified Contributor may participate in any such +claim at its own expense. + +For example, a Distributor might include the Program in a commercial +product offering, Product X. That Distributor is then a Commercial +Distributor. If that Commercial Distributor then makes performance +claims, or offers warranties related to Product X, those performance +claims and warranties are such Commercial Distributor's responsibility +alone. Under this section, the Commercial Distributor would have to +defend claims against the Contributors related to those performance +claims and warranties, and if a court requires any Contributor to pay +any damages as a result, the Commercial Distributor must pay those +damages. + +5. NO WARRANTY + +EXCEPT AS EXPRESSLY SET FORTH IN THIS AGREEMENT, THE PROGRAM IS +PROVIDED ON AN"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +KIND, EITHER EXPRESS OR IMPLIED INCLUDING, WITHOUT LIMITATION, ANY +WARRANTIES OR CONDITIONS OF TITLE, NON-INFRINGEMENT, MERCHANTABILITY +OR FITNESS FOR A PARTICULAR PURPOSE. Each Recipient is solely +responsible for determining the appropriateness of using and +distributing the Program and assumes all risks associated with its +exercise of rights under this Agreement, including but not limited to +the risks and costs of program errors, compliance with applicable +laws, damage to or loss of data, programs or equipment, and +unavailability or interruption of operations. + +6. DISCLAIMER OF LIABILITY + +EXCEPT AS EXPRESSLY SET FORTH IN THIS AGREEMENT, NEITHER RECIPIENT NOR +ANY CONTRIBUTORS SHALL HAVE ANY LIABILITY FOR ANY DIRECT, INDIRECT, +INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING +WITHOUT LIMITATION LOST PROFITS), HOWEVER CAUSED AND ON ANY THEORY OF +LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OR +DISTRIBUTION OF THE PROGRAM OR THE EXERCISE OF ANY RIGHTS GRANTED +HEREUNDER, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. + +7. EXPORT CONTROL + +Recipient agrees that Recipient alone is responsible for compliance +with the United States export administration regulations (and the +export control laws and regulation of any other countries). + +8. GENERAL + +If any provision of this Agreement is invalid or unenforceable under +applicable law, it shall not affect the validity or enforceability of +the remainder of the terms of this Agreement, and without further +action by the parties hereto, such provision shall be reformed to the +minimum extent necessary to make such provision valid and enforceable. + +If Recipient institutes patent litigation against a Contributor with +respect to a patent applicable to software (including a cross-claim or +counterclaim in a lawsuit), then any patent licenses granted by that +Contributor to such Recipient under this Agreement shall terminate as +of the date such litigation is filed. In addition, if Recipient +institutes patent litigation against any entity (including a +cross-claim or counterclaim in a lawsuit) alleging that the Program +itself (excluding combinations of the Program with other software or +hardware) infringes such Recipient's patent(s), then such Recipient's +rights granted under Section 2(b) shall terminate as of the date such +litigation is filed. + +All Recipient's rights under this Agreement shall terminate if it +fails to comply with any of the material terms or conditions of this +Agreement and does not cure such failure in a reasonable period of +time after becoming aware of such noncompliance. If all Recipient's +rights under this Agreement terminate, Recipient agrees to cease use +and distribution of the Program as soon as reasonably practicable. +However, Recipient's obligations under this Agreement and any licenses +granted by Recipient relating to the Program shall continue and +survive. + +LUCENT may publish new versions (including revisions) of this +Agreement from time to time. Each new version of the Agreement will be +given a distinguishing version number. The Program (including +Contributions) may always be distributed subject to the version of the +Agreement under which it was received. In addition, after a new +version of the Agreement is published, Contributor may elect to +distribute the Program (including its Contributions) under the new +version. No one other than LUCENT has the right to modify this +Agreement. Except as expressly stated in Sections 2(a) and 2(b) above, +Recipient receives no rights or licenses to the intellectual property +of any Contributor under this Agreement, whether expressly, by +implication, estoppel or otherwise. All rights in the Program not +expressly granted under this Agreement are reserved. + +This Agreement is governed by the laws of the State of New York and +the intellectual property laws of the United States of America. No +party to this Agreement will bring a legal action under this Agreement +more than one year after the cause of action arose. Each party waives +its rights to a jury trial in any resulting litigation. + diff --git a/src/libregexp/Make.Darwin-PowerMacintosh b/src/libregexp/Make.Darwin-PowerMacintosh new file mode 100644 index 00000000..14b8d4e7 --- /dev/null +++ b/src/libregexp/Make.Darwin-PowerMacintosh @@ -0,0 +1,6 @@ +CC=gcc +CFLAGS+=-Wall -Wno-missing-braces -Wno-parentheses -Wno-switch -O2 -g -c -I. -I${PREFIX}/include +O=o +AR=ar +ARFLAGS=rvc +NAN=nan64.$O diff --git a/src/libregexp/Make.FreeBSD-386 b/src/libregexp/Make.FreeBSD-386 new file mode 100644 index 00000000..087ed3ab --- /dev/null +++ b/src/libregexp/Make.FreeBSD-386 @@ -0,0 +1,7 @@ +CC=gcc +CFLAGS+=-Wall -Wno-missing-braces -Wno-parentheses -Wno-switch -O2 -g -c -I. -I$(PREFIX)/include +O=o +AR=ar +ARFLAGS=rvc +NAN=nan64.$O # default, can be overriden by Make.$(SYSNAME) +NAN=nan64.$O diff --git a/src/libregexp/Make.HP-UX-9000 b/src/libregexp/Make.HP-UX-9000 new file mode 100644 index 00000000..edbdc111 --- /dev/null +++ b/src/libregexp/Make.HP-UX-9000 @@ -0,0 +1,6 @@ +CC=cc +CFLAGS=-O -c -Ae -I. +O=o +AR=ar +ARFLAGS=rvc +NAN=nan64.$O diff --git a/src/libregexp/Make.Linux-386 b/src/libregexp/Make.Linux-386 new file mode 100644 index 00000000..74b0252c --- /dev/null +++ b/src/libregexp/Make.Linux-386 @@ -0,0 +1,7 @@ +CC=gcc +CFLAGS+=-Wall -Wno-missing-braces -Wno-parentheses -Wno-switch -O2 -g -c -I. +O=o +AR=ar +ARFLAGS=rvc +NAN=nan64.$O # default, can be overriden by Make.$(SYSNAME) +NAN=nan64.$O diff --git a/src/libregexp/Make.NetBSD-386 b/src/libregexp/Make.NetBSD-386 new file mode 100644 index 00000000..087ed3ab --- /dev/null +++ b/src/libregexp/Make.NetBSD-386 @@ -0,0 +1,7 @@ +CC=gcc +CFLAGS+=-Wall -Wno-missing-braces -Wno-parentheses -Wno-switch -O2 -g -c -I. -I$(PREFIX)/include +O=o +AR=ar +ARFLAGS=rvc +NAN=nan64.$O # default, can be overriden by Make.$(SYSNAME) +NAN=nan64.$O diff --git a/src/libregexp/Make.OSF1-alpha b/src/libregexp/Make.OSF1-alpha new file mode 100644 index 00000000..3d45279b --- /dev/null +++ b/src/libregexp/Make.OSF1-alpha @@ -0,0 +1,6 @@ +CC=cc +CFLAGS+=-g -c -I. +O=o +AR=ar +ARFLAGS=rvc +NAN=nan64.$O diff --git a/src/libregexp/Make.SunOS-sun4u b/src/libregexp/Make.SunOS-sun4u new file mode 100644 index 00000000..c5fe67b8 --- /dev/null +++ b/src/libregexp/Make.SunOS-sun4u @@ -0,0 +1,2 @@ +include Make.SunOS-sun4u-$(CC) +NAN=nan64.$O diff --git a/src/libregexp/Make.SunOS-sun4u-cc b/src/libregexp/Make.SunOS-sun4u-cc new file mode 100644 index 00000000..829301de --- /dev/null +++ b/src/libregexp/Make.SunOS-sun4u-cc @@ -0,0 +1,6 @@ +CC=cc +CFLAGS+=-g -c -I. -O +O=o +AR=ar +ARFLAGS=rvc +NAN=nan64.$O diff --git a/src/libregexp/Make.SunOS-sun4u-gcc b/src/libregexp/Make.SunOS-sun4u-gcc new file mode 100644 index 00000000..5c415948 --- /dev/null +++ b/src/libregexp/Make.SunOS-sun4u-gcc @@ -0,0 +1,6 @@ +CC=gcc +CFLAGS+=-Wall -Wno-missing-braces -Wno-parentheses -Wno-switch -O2 -g -c +O=o +AR=ar +ARFLAGS=rvc +NAN=nan64.$O diff --git a/src/libregexp/Makefile b/src/libregexp/Makefile new file mode 100644 index 00000000..498359a3 --- /dev/null +++ b/src/libregexp/Makefile @@ -0,0 +1,106 @@ + +# this works in gnu make +SYSNAME:=${shell uname} +OBJTYPE:=${shell uname -m | sed 's;i.86;386;; s;/.*;;; s; ;;g'} + +# this works in bsd make +SYSNAME!=uname +OBJTYPE!=uname -m | sed 's;i.86;386;; s;/.*;;; s; ;;g' + +# the gnu rules will mess up bsd but not vice versa, +# hence the gnu rules come first. + +include Make.$(SYSNAME)-$(OBJTYPE) + +PREFIX=/usr/local + +NUKEFILES= + +TGZFILES= + +LIB=libregexp9.a +VERSION=2.0 +PORTPLACE=devel/libregexp9 +NAME=libregexp9 + +OFILES=\ + regcomp.$O\ + regerror.$O\ + regexec.$O\ + regsub.$O\ + regaux.$O\ + rregsub.$O\ + rregaux.$O\ + rregexec.$O\ + +HFILES=\ + regexp9.h\ + regcomp.h\ + +all: $(LIB) + +install: $(LIB) + test -d $(PREFIX)/man/man3 || mkdir $(PREFIX)/man/man3 + test -d $(PREFIX)/man/man7 || mkdir $(PREFIX)/man/man7 + install -m 0644 regexp9.3 $(PREFIX)/man/man3/regexp9.3 + install -m 0644 regexp9.7 $(PREFIX)/man/man7/regexp9.7 + install -m 0644 $(LIB) $(PREFIX)/lib/$(LIB) + install -m 0644 regexp9.h $(PREFIX)/include/regexp9.h + +test: test.$O $(LIB) + $(CC) -o test test.$O $(LIB) -L/usr/local/lib -lfmt -lutf + +test2: test2.$O $(LIB) + $(CC) -o test2 test2.$O $(LIB) -L/usr/local/lib -lfmt -lutf + +$(LIB): $(OFILES) + $(AR) $(ARFLAGS) $(LIB) $(OFILES) + +NUKEFILES+=$(LIB) +.c.$O: + $(CC) $(CFLAGS) -I$(PREFIX)/include $*.c + +%.$O: %.c + $(CC) $(CFLAGS) -I$(PREFIX)/include $*.c + + +$(OFILES): $(HFILES) + +tgz: + rm -rf $(NAME)-$(VERSION) + mkdir $(NAME)-$(VERSION) + cp Makefile Make.* README LICENSE NOTICE *.[ch137] rpm.spec bundle.ports $(TGZFILES) $(NAME)-$(VERSION) + tar cf - $(NAME)-$(VERSION) | gzip >$(NAME)-$(VERSION).tgz + rm -rf $(NAME)-$(VERSION) + +clean: + rm -f $(OFILES) $(LIB) + +nuke: + rm -f $(OFILES) *.tgz *.rpm $(NUKEFILES) + +rpm: + make tgz + cp $(NAME)-$(VERSION).tgz /usr/src/RPM/SOURCES + rpm -ba rpm.spec + cp /usr/src/RPM/SRPMS/$(NAME)-$(VERSION)-1.src.rpm . + cp /usr/src/RPM/RPMS/i586/$(NAME)-$(VERSION)-1.i586.rpm . + scp *.rpm rsc@amsterdam.lcs.mit.edu:public_html/software + +PORTDIR=/usr/ports/$(PORTPLACE) + +ports: + make tgz + rm -rf $(PORTDIR) + mkdir $(PORTDIR) + cp $(NAME)-$(VERSION).tgz /usr/ports/distfiles + cat bundle.ports | (cd $(PORTDIR) && awk '$$1=="---" && $$3=="---" { ofile=$$2; next} {if(ofile) print >ofile}') + (cd $(PORTDIR); make makesum) + (cd $(PORTDIR); make) + (cd $(PORTDIR); /usr/local/bin/portlint) + rm -rf $(PORTDIR)/work + shar `find $(PORTDIR)` > ports.shar + (cd $(PORTDIR); tar cf - *) | gzip >$(NAME)-$(VERSION)-ports.tgz + scp *.tgz rsc@amsterdam.lcs.mit.edu:public_html/software + +.phony: all clean nuke install tgz rpm ports diff --git a/src/libregexp/Makefile.MID b/src/libregexp/Makefile.MID new file mode 100644 index 00000000..fa8a3a93 --- /dev/null +++ b/src/libregexp/Makefile.MID @@ -0,0 +1,34 @@ +LIB=libregexp9.a +VERSION=2.0 +PORTPLACE=devel/libregexp9 +NAME=libregexp9 + +OFILES=\ + regcomp.$O\ + regerror.$O\ + regexec.$O\ + regsub.$O\ + regaux.$O\ + rregsub.$O\ + rregaux.$O\ + +HFILES=\ + regexp9.h\ + regcomp.h\ + +all: $(LIB) + +install: $(LIB) + test -d $(PREFIX)/man/man3 || mkdir $(PREFIX)/man/man3 + test -d $(PREFIX)/man/man7 || mkdir $(PREFIX)/man/man7 + install -m 0644 regexp9.3 $(PREFIX)/man/man3/regexp9.3 + install -m 0644 regexp9.7 $(PREFIX)/man/man7/regexp9.7 + install -m 0644 $(LIB) $(PREFIX)/lib/$(LIB) + install -m 0644 regexp9.h $(PREFIX)/include/regexp9.h + +test: test.$O $(LIB) + $(CC) -o test test.$O $(LIB) -L/usr/local/lib -lfmt -lutf + +test2: test2.$O $(LIB) + $(CC) -o test2 test2.$O $(LIB) -L/usr/local/lib -lfmt -lutf + diff --git a/src/libregexp/NOTICE b/src/libregexp/NOTICE new file mode 100644 index 00000000..784ee1d0 --- /dev/null +++ b/src/libregexp/NOTICE @@ -0,0 +1,25 @@ +Copyright © 1994-1999 Lucent Technologies Inc. All rights reserved. +Portions Copyright © 2000-2002 Vita Nuova Holdings Limited (www.vitanuova.com). All rights reserved. + +Under a licence agreement with Lucent Technologies Inc. effective 1st March 2000, +Vita Nuova Holdings Limited has the right to determine (within a specified scope) +the form and content of sublicences for this software. + +Vita Nuova Holdings Limited now makes this software available as Free +Software under the terms of the `GNU Lesser Public License, Version 2.1' +(see the file LICENCE or http://www.fsf.org/copyleft/lesser.html for +the full terms and conditions). One of the conditions of that licence +is that you must keep intact all notices that refer to that licence and to the absence of +of any warranty: for this software, note that includes this NOTICE file in particular. + +This suite of programs is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +`GNU General Public License' for more details. + +This copyright NOTICE applies to all files in this directory and +subdirectories, unless another copyright notice appears in a given +file or subdirectory. If you take code from this software to use in +other programs, you must somehow include with it an appropriate +copyright notice that includes the copyright notice and the other +notices above. diff --git a/src/libregexp/README b/src/libregexp/README new file mode 100644 index 00000000..1f625112 --- /dev/null +++ b/src/libregexp/README @@ -0,0 +1,7 @@ +This is a Unix port of the Plan 9 regular expression library, +originally done for the Inferno operating system. + +Russ Cox repackaged this to build as a standalone +Unix library. Send comments about packaging to +Russ Cox + diff --git a/src/libregexp/bundle.ports b/src/libregexp/bundle.ports new file mode 100644 index 00000000..2205e4b9 --- /dev/null +++ b/src/libregexp/bundle.ports @@ -0,0 +1,51 @@ +--- Makefile --- +# New ports collection makefile for: libbio +# Date Created: 11 Feb 2003 +# Whom: rsc +# +# THIS LINE NEEDS REPLACING. IT'S HERE TO GET BY PORTLINT +# $FreeBSD: ports/devel/libbio/Makefile,v 1.1 2003/02/12 00:51:22 rsc Exp $ + +PORTNAME= libregexp9 +PORTVERSION= 2.0 +CATEGORIES= devel +MASTER_SITES= http://pdos.lcs.mit.edu/~rsc/software/ +EXTRACT_SUFX= .tgz + +MAINTAINER= rsc@post.harvard.edu + +DEPENDS= ${PORTSDIR}/devel/libfmt ${PORTSDIR}/devel/libutf + +MAN3= regexp9.3 +MAN7= regexp9.7 +USE_REINPLACE= yes + +.include + +post-patch: + ${REINPLACE_CMD} -e 's,$$(PREFIX),${PREFIX},g' ${WRKSRC}/Makefile + +.include +--- pkg-comment --- +Simple regular expression library from Plan 9 +--- pkg-descr --- +Libregexp9 is a port of Plan 9's regexp library. +It is small and simple and provides the traditional +extended regular expressions (as opposed to the +current extended regular expressions, which add {} +and various \x character classes, among other +complications). + +It handles Unicode in wide character or UTF8 format! + +WWW: http://pdos.lcs.mit.edu/~rsc/software/ +http://plan9.bell-labs.com/magic/man2html/2/regexp + +Russ Cox +rsc@post.harvard.edu +--- pkg-plist --- +lib/libregexp9.a +include/regex9.h +--- /dev/null --- +This is just a way to make sure blank lines don't +creep into pkg-plist. diff --git a/src/libregexp/lib9.h b/src/libregexp/lib9.h new file mode 100644 index 00000000..d022656d --- /dev/null +++ b/src/libregexp/lib9.h @@ -0,0 +1,6 @@ +#include +#include +#include +#include +#include + diff --git a/src/libregexp/mkfile b/src/libregexp/mkfile new file mode 100644 index 00000000..bb99a25a --- /dev/null +++ b/src/libregexp/mkfile @@ -0,0 +1 @@ +<../libutf/mkfile diff --git a/src/libregexp/regaux.c b/src/libregexp/regaux.c new file mode 100644 index 00000000..956c1eb0 --- /dev/null +++ b/src/libregexp/regaux.c @@ -0,0 +1,76 @@ +#include "lib9.h" +#include "regexp9.h" +#include "regcomp.h" + + +/* + * save a new match in mp + */ +extern void +_renewmatch(Resub *mp, int ms, Resublist *sp) +{ + int i; + + if(mp==0 || ms<=0) + return; + if(mp[0].s.sp==0 || sp->m[0].s.spm[0].s.sp==mp[0].s.sp && sp->m[0].e.ep>mp[0].e.ep)){ + for(i=0; im[i]; + for(; iinst; p++){ + if(p->inst == ip){ + if((sep)->m[0].s.sp < p->se.m[0].s.sp) + p->se = *sep; + return 0; + } + } + p->inst = ip; + p->se = *sep; + (++p)->inst = 0; + return p; +} + +/* + * same as renewthread, but called with + * initial empty start pointer. + */ +extern Relist* +_renewemptythread(Relist *lp, /* _relist to add to */ + Reinst *ip, /* instruction to add */ + char *sp) /* pointers to subexpressions */ +{ + Relist *p; + + for(p=lp; p->inst; p++){ + if(p->inst == ip){ + if(sp < p->se.m[0].s.sp) { + memset((void *)&p->se, 0, sizeof(p->se)); + p->se.m[0].s.sp = sp; + } + return 0; + } + } + p->inst = ip; + memset((void *)&p->se, 0, sizeof(p->se)); + p->se.m[0].s.sp = sp; + (++p)->inst = 0; + return p; +} + diff --git a/src/libregexp/regcomp.c b/src/libregexp/regcomp.c new file mode 100644 index 00000000..6c6939c6 --- /dev/null +++ b/src/libregexp/regcomp.c @@ -0,0 +1,557 @@ +#include "lib9.h" +#include "regexp9.h" +#include "regcomp.h" + +#define TRUE 1 +#define FALSE 0 + +/* + * Parser Information + */ +typedef +struct Node +{ + Reinst* first; + Reinst* last; +}Node; + +Reprog RePrOg; + +#define NSTACK 20 +static Node andstack[NSTACK]; +static Node *andp; +static int atorstack[NSTACK]; +static int* atorp; +static int cursubid; /* id of current subexpression */ +static int subidstack[NSTACK]; /* parallel to atorstack */ +static int* subidp; +static int lastwasand; /* Last token was operand */ +static int nbra; +static char* exprp; /* pointer to next character in source expression */ +static int lexdone; +static int nclass; +static Reclass*classp; +static Reinst* freep; +static int errors; +static Rune yyrune; /* last lex'd rune */ +static Reclass*yyclassp; /* last lex'd class */ + +/* predeclared crap */ +static void operator(int); +static void pushand(Reinst*, Reinst*); +static void pushator(int); +static void evaluntil(int); +static int bldcclass(void); + +static jmp_buf regkaboom; + +static void +rcerror(char *s) +{ + errors++; + regerror(s); + longjmp(regkaboom, 1); +} + +static Reinst* +newinst(int t) +{ + freep->type = t; + freep->u2.left = 0; + freep->u1.right = 0; + return freep++; +} + +static void +operand(int t) +{ + Reinst *i; + + if(lastwasand) + operator(CAT); /* catenate is implicit */ + i = newinst(t); + + if(t == CCLASS || t == NCCLASS) + i->u1.cp = yyclassp; + if(t == RUNE) + i->u1.r = yyrune; + + pushand(i, i); + lastwasand = TRUE; +} + +static void +operator(int t) +{ + if(t==RBRA && --nbra<0) + rcerror("unmatched right paren"); + if(t==LBRA){ + if(++cursubid >= NSUBEXP) + rcerror ("too many subexpressions"); + nbra++; + if(lastwasand) + operator(CAT); + } else + evaluntil(t); + if(t != RBRA) + pushator(t); + lastwasand = FALSE; + if(t==STAR || t==QUEST || t==PLUS || t==RBRA) + lastwasand = TRUE; /* these look like operands */ +} + +static void +regerr2(char *s, int c) +{ + char buf[100]; + char *cp = buf; + while(*s) + *cp++ = *s++; + *cp++ = c; + *cp = '\0'; + rcerror(buf); +} + +static void +cant(char *s) +{ + char buf[100]; + strcpy(buf, "can't happen: "); + strcat(buf, s); + rcerror(buf); +} + +static void +pushand(Reinst *f, Reinst *l) +{ + if(andp >= &andstack[NSTACK]) + cant("operand stack overflow"); + andp->first = f; + andp->last = l; + andp++; +} + +static void +pushator(int t) +{ + if(atorp >= &atorstack[NSTACK]) + cant("operator stack overflow"); + *atorp++ = t; + *subidp++ = cursubid; +} + +static Node* +popand(int op) +{ + Reinst *inst; + + if(andp <= &andstack[0]){ + regerr2("missing operand for ", op); + inst = newinst(NOP); + pushand(inst,inst); + } + return --andp; +} + +static int +popator(void) +{ + if(atorp <= &atorstack[0]) + cant("operator stack underflow"); + --subidp; + return *--atorp; +} + +static void +evaluntil(int pri) +{ + Node *op1, *op2; + Reinst *inst1, *inst2; + + while(pri==RBRA || atorp[-1]>=pri){ + switch(popator()){ + default: + rcerror("unknown operator in evaluntil"); + break; + case LBRA: /* must have been RBRA */ + op1 = popand('('); + inst2 = newinst(RBRA); + inst2->u1.subid = *subidp; + op1->last->u2.next = inst2; + inst1 = newinst(LBRA); + inst1->u1.subid = *subidp; + inst1->u2.next = op1->first; + pushand(inst1, inst2); + return; + case OR: + op2 = popand('|'); + op1 = popand('|'); + inst2 = newinst(NOP); + op2->last->u2.next = inst2; + op1->last->u2.next = inst2; + inst1 = newinst(OR); + inst1->u1.right = op1->first; + inst1->u2.left = op2->first; + pushand(inst1, inst2); + break; + case CAT: + op2 = popand(0); + op1 = popand(0); + op1->last->u2.next = op2->first; + pushand(op1->first, op2->last); + break; + case STAR: + op2 = popand('*'); + inst1 = newinst(OR); + op2->last->u2.next = inst1; + inst1->u1.right = op2->first; + pushand(inst1, inst1); + break; + case PLUS: + op2 = popand('+'); + inst1 = newinst(OR); + op2->last->u2.next = inst1; + inst1->u1.right = op2->first; + pushand(op2->first, inst1); + break; + case QUEST: + op2 = popand('?'); + inst1 = newinst(OR); + inst2 = newinst(NOP); + inst1->u2.left = inst2; + inst1->u1.right = op2->first; + op2->last->u2.next = inst2; + pushand(inst1, inst2); + break; + } + } +} + +static Reprog* +optimize(Reprog *pp) +{ + Reinst *inst, *target; + int size; + Reprog *npp; + Reclass *cl; + int diff; + + /* + * get rid of NOOP chains + */ + for(inst=pp->firstinst; inst->type!=END; inst++){ + target = inst->u2.next; + while(target->type == NOP) + target = target->u2.next; + inst->u2.next = target; + } + + /* + * The original allocation is for an area larger than + * necessary. Reallocate to the actual space used + * and then relocate the code. + */ + size = sizeof(Reprog) + (freep - pp->firstinst)*sizeof(Reinst); + npp = (Reprog *)realloc(pp, size); + if(npp==0 || npp==pp) + return pp; + diff = (char *)npp - (char *)pp; + freep = (Reinst *)((char *)freep + diff); + for(inst=npp->firstinst; insttype){ + case OR: + case STAR: + case PLUS: + case QUEST: + *(char **)&inst->u1.right += diff; + break; + case CCLASS: + case NCCLASS: + *(char **)&inst->u1.right += diff; + cl = inst->u1.cp; + *(char **)&cl->end += diff; + break; + } + *(char **)&inst->u2.left += diff; + } + *(char **)&npp->startinst += diff; + return npp; +} + +#ifdef DEBUG +static void +dumpstack(void){ + Node *stk; + int *ip; + + print("operators\n"); + for(ip=atorstack; ipfirst->type, stk->last->type); +} + +static void +dump(Reprog *pp) +{ + Reinst *l; + Rune *p; + + l = pp->firstinst; + do{ + print("%d:\t0%o\t%d\t%d", l-pp->firstinst, l->type, + l->u2.left-pp->firstinst, l->u1.right-pp->firstinst); + if(l->type == RUNE) + print("\t%C\n", l->r); + else if(l->type == CCLASS || l->type == NCCLASS){ + print("\t["); + if(l->type == NCCLASS) + print("^"); + for(p = l->cp->spans; p < l->cp->end; p += 2) + if(p[0] == p[1]) + print("%C", p[0]); + else + print("%C-%C", p[0], p[1]); + print("]\n"); + } else + print("\n"); + }while(l++->type); +} +#endif + +static Reclass* +newclass(void) +{ + if(nclass >= NCLASS) + regerr2("too many character classes; limit", NCLASS+'0'); + return &(classp[nclass++]); +} + +static int +nextc(Rune *rp) +{ + if(lexdone){ + *rp = 0; + return 1; + } + exprp += chartorune(rp, exprp); + if(*rp == L'\\'){ + exprp += chartorune(rp, exprp); + return 1; + } + if(*rp == 0) + lexdone = 1; + return 0; +} + +static int +lex(int literal, int dot_type) +{ + int quoted; + + quoted = nextc(&yyrune); + if(literal || quoted){ + if(yyrune == 0) + return END; + return RUNE; + } + + switch(yyrune){ + case 0: + return END; + case L'*': + return STAR; + case L'?': + return QUEST; + case L'+': + return PLUS; + case L'|': + return OR; + case L'.': + return dot_type; + case L'(': + return LBRA; + case L')': + return RBRA; + case L'^': + return BOL; + case L'$': + return EOL; + case L'[': + return bldcclass(); + } + return RUNE; +} + +static int +bldcclass(void) +{ + int type; + Rune r[NCCRUNE]; + Rune *p, *ep, *np; + Rune rune; + int quoted; + + /* we have already seen the '[' */ + type = CCLASS; + yyclassp = newclass(); + + /* look ahead for negation */ + /* SPECIAL CASE!!! negated classes don't match \n */ + ep = r; + quoted = nextc(&rune); + if(!quoted && rune == L'^'){ + type = NCCLASS; + quoted = nextc(&rune); + *ep++ = L'\n'; + *ep++ = L'\n'; + } + + /* parse class into a set of spans */ + for(; ep<&r[NCCRUNE];){ + if(rune == 0){ + rcerror("malformed '[]'"); + return 0; + } + if(!quoted && rune == L']') + break; + if(!quoted && rune == L'-'){ + if(ep == r){ + rcerror("malformed '[]'"); + return 0; + } + quoted = nextc(&rune); + if((!quoted && rune == L']') || rune == 0){ + rcerror("malformed '[]'"); + return 0; + } + *(ep-1) = rune; + } else { + *ep++ = rune; + *ep++ = rune; + } + quoted = nextc(&rune); + } + + /* sort on span start */ + for(p = r; p < ep; p += 2){ + for(np = p; np < ep; np += 2) + if(*np < *p){ + rune = np[0]; + np[0] = p[0]; + p[0] = rune; + rune = np[1]; + np[1] = p[1]; + p[1] = rune; + } + } + + /* merge spans */ + np = yyclassp->spans; + p = r; + if(r == ep) + yyclassp->end = np; + else { + np[0] = *p++; + np[1] = *p++; + for(; p < ep; p += 2) + if(p[0] <= np[1]){ + if(p[1] > np[1]) + np[1] = p[1]; + } else { + np += 2; + np[0] = p[0]; + np[1] = p[1]; + } + yyclassp->end = np+2; + } + + return type; +} + +static Reprog* +regcomp1(char *s, int literal, int dot_type) +{ + int token; + Reprog *pp; + + /* get memory for the program */ + pp = (Reprog *)malloc(sizeof(Reprog) + 6*sizeof(Reinst)*strlen(s)); + if(pp == 0){ + regerror("out of memory"); + return 0; + } + freep = pp->firstinst; + classp = pp->class; + errors = 0; + + if(setjmp(regkaboom)) + goto out; + + /* go compile the sucker */ + lexdone = 0; + exprp = s; + nclass = 0; + nbra = 0; + atorp = atorstack; + andp = andstack; + subidp = subidstack; + lastwasand = FALSE; + cursubid = 0; + + /* Start with a low priority operator to prime parser */ + pushator(START-1); + while((token = lex(literal, dot_type)) != END){ + if((token&0300) == OPERATOR) + operator(token); + else + operand(token); + } + + /* Close with a low priority operator */ + evaluntil(START); + + /* Force END */ + operand(END); + evaluntil(START); +#ifdef DEBUG + dumpstack(); +#endif + if(nbra) + rcerror("unmatched left paren"); + --andp; /* points to first and only operand */ + pp->startinst = andp->first; +#ifdef DEBUG + dump(pp); +#endif + pp = optimize(pp); +#ifdef DEBUG + print("start: %d\n", andp->first-pp->firstinst); + dump(pp); +#endif +out: + if(errors){ + free(pp); + pp = 0; + } + return pp; +} + +extern Reprog* +regcomp(char *s) +{ + return regcomp1(s, 0, ANY); +} + +extern Reprog* +regcomplit(char *s) +{ + return regcomp1(s, 1, ANY); +} + +extern Reprog* +regcompnl(char *s) +{ + return regcomp1(s, 0, ANYNL); +} diff --git a/src/libregexp/regcomp.h b/src/libregexp/regcomp.h new file mode 100644 index 00000000..a6728b0e --- /dev/null +++ b/src/libregexp/regcomp.h @@ -0,0 +1,74 @@ +/* + * substitution list + */ +#define uchar __reuchar +typedef unsigned char uchar; +#define nelem(x) (sizeof(x)/sizeof((x)[0])) + +#define NSUBEXP 32 +typedef struct Resublist Resublist; +struct Resublist +{ + Resub m[NSUBEXP]; +}; + +/* max character classes per program */ +extern Reprog RePrOg; +#define NCLASS (sizeof(RePrOg.class)/sizeof(Reclass)) + +/* max rune ranges per character class */ +#define NCCRUNE (sizeof(Reclass)/sizeof(Rune)) + +/* + * Actions and Tokens (Reinst types) + * + * 02xx are operators, value == precedence + * 03xx are tokens, i.e. operands for operators + */ +#define RUNE 0177 +#define OPERATOR 0200 /* Bitmask of all operators */ +#define START 0200 /* Start, used for marker on stack */ +#define RBRA 0201 /* Right bracket, ) */ +#define LBRA 0202 /* Left bracket, ( */ +#define OR 0203 /* Alternation, | */ +#define CAT 0204 /* Concatentation, implicit operator */ +#define STAR 0205 /* Closure, * */ +#define PLUS 0206 /* a+ == aa* */ +#define QUEST 0207 /* a? == a|nothing, i.e. 0 or 1 a's */ +#define ANY 0300 /* Any character except newline, . */ +#define ANYNL 0301 /* Any character including newline, . */ +#define NOP 0302 /* No operation, internal use only */ +#define BOL 0303 /* Beginning of line, ^ */ +#define EOL 0304 /* End of line, $ */ +#define CCLASS 0305 /* Character class, [] */ +#define NCCLASS 0306 /* Negated character class, [] */ +#define END 0377 /* Terminate: match found */ + +/* + * regexec execution lists + */ +#define LISTSIZE 10 +#define BIGLISTSIZE (10*LISTSIZE) +typedef struct Relist Relist; +struct Relist +{ + Reinst* inst; /* Reinstruction of the thread */ + Resublist se; /* matched subexpressions in this thread */ +}; +typedef struct Reljunk Reljunk; +struct Reljunk +{ + Relist* relist[2]; + Relist* reliste[2]; + int starttype; + Rune startchar; + char* starts; + char* eol; + Rune* rstarts; + Rune* reol; +}; + +extern Relist* _renewthread(Relist*, Reinst*, Resublist*); +extern void _renewmatch(Resub*, int, Resublist*); +extern Relist* _renewemptythread(Relist*, Reinst*, char*); +extern Relist* _rrenewemptythread(Relist*, Reinst*, Rune*); diff --git a/src/libregexp/regerror.c b/src/libregexp/regerror.c new file mode 100644 index 00000000..2cd1e3e2 --- /dev/null +++ b/src/libregexp/regerror.c @@ -0,0 +1,14 @@ +#include "lib9.h" +#include "regexp9.h" + +void +regerror(char *s) +{ + char buf[132]; + + strcpy(buf, "regerror: "); + strcat(buf, s); + strcat(buf, "\n"); + write(2, buf, strlen(buf)); + exit(1); +} diff --git a/src/libregexp/regexec.c b/src/libregexp/regexec.c new file mode 100644 index 00000000..c9f1eba2 --- /dev/null +++ b/src/libregexp/regexec.c @@ -0,0 +1,219 @@ +#include "lib9.h" +#include "regexp9.h" +#include "regcomp.h" + + +/* + * return 0 if no match + * >0 if a match + * <0 if we ran out of _relist space + */ +static int +regexec1(Reprog *progp, /* program to run */ + char *bol, /* string to run machine on */ + Resub *mp, /* subexpression elements */ + int ms, /* number of elements at mp */ + Reljunk *j +) +{ + int flag=0; + Reinst *inst; + Relist *tlp; + char *s; + int i, checkstart; + Rune r, *rp, *ep; + int n; + Relist* tl; /* This list, next list */ + Relist* nl; + Relist* tle; /* ends of this and next list */ + Relist* nle; + int match; + char *p; + + match = 0; + checkstart = j->starttype; + if(mp) + for(i=0; irelist[0][0].inst = 0; + j->relist[1][0].inst = 0; + + /* Execute machine once for each character, including terminal NUL */ + s = j->starts; + do{ + /* fast check for first char */ + if(checkstart) { + switch(j->starttype) { + case RUNE: + p = utfrune(s, j->startchar); + if(p == 0) + return match; + s = p; + break; + case BOL: + if(s == bol) + break; + p = utfrune(s, '\n'); + if(p == 0) + return match; + s = p; + break; + } + } + r = *(uchar*)s; + if(r < (Rune)Runeself) + n = 1; + else + n = chartorune(&r, s); + + /* switch run lists */ + tl = j->relist[flag]; + tle = j->reliste[flag]; + nl = j->relist[flag^=1]; + nle = j->reliste[flag]; + nl->inst = 0; + + /* Add first instruction to current list */ + if(match == 0) + _renewemptythread(tl, progp->startinst, s); + + /* Execute machine until current list is empty */ + for(tlp=tl; tlp->inst; tlp++){ /* assignment = */ + for(inst = tlp->inst; ; inst = inst->u2.next){ + switch(inst->type){ + case RUNE: /* regular character */ + if(inst->u1.r == r){ + if(_renewthread(nl, inst->u2.next, &tlp->se)==nle) + return -1; + } + break; + case LBRA: + tlp->se.m[inst->u1.subid].s.sp = s; + continue; + case RBRA: + tlp->se.m[inst->u1.subid].e.ep = s; + continue; + case ANY: + if(r != '\n') + if(_renewthread(nl, inst->u2.next, &tlp->se)==nle) + return -1; + break; + case ANYNL: + if(_renewthread(nl, inst->u2.next, &tlp->se)==nle) + return -1; + break; + case BOL: + if(s == bol || *(s-1) == '\n') + continue; + break; + case EOL: + if(s == j->eol || r == 0 || r == '\n') + continue; + break; + case CCLASS: + ep = inst->u1.cp->end; + for(rp = inst->u1.cp->spans; rp < ep; rp += 2) + if(r >= rp[0] && r <= rp[1]){ + if(_renewthread(nl, inst->u2.next, &tlp->se)==nle) + return -1; + break; + } + break; + case NCCLASS: + ep = inst->u1.cp->end; + for(rp = inst->u1.cp->spans; rp < ep; rp += 2) + if(r >= rp[0] && r <= rp[1]) + break; + if(rp == ep) + if(_renewthread(nl, inst->u2.next, &tlp->se)==nle) + return -1; + break; + case OR: + /* evaluate right choice later */ + if(_renewthread(tlp, inst->u1.right, &tlp->se) == tle) + return -1; + /* efficiency: advance and re-evaluate */ + continue; + case END: /* Match! */ + match = 1; + tlp->se.m[0].e.ep = s; + if(mp != 0) + _renewmatch(mp, ms, &tlp->se); + break; + } + break; + } + } + if(s == j->eol) + break; + checkstart = j->starttype && nl->inst==0; + s += n; + }while(r); + return match; +} + +static int +regexec2(Reprog *progp, /* program to run */ + char *bol, /* string to run machine on */ + Resub *mp, /* subexpression elements */ + int ms, /* number of elements at mp */ + Reljunk *j +) +{ + Relist relist0[BIGLISTSIZE], relist1[BIGLISTSIZE]; + + /* mark space */ + j->relist[0] = relist0; + j->relist[1] = relist1; + j->reliste[0] = relist0 + nelem(relist0) - 2; + j->reliste[1] = relist1 + nelem(relist1) - 2; + + return regexec1(progp, bol, mp, ms, j); +} + +extern int +regexec(Reprog *progp, /* program to run */ + char *bol, /* string to run machine on */ + Resub *mp, /* subexpression elements */ + int ms) /* number of elements at mp */ +{ + Reljunk j; + Relist relist0[LISTSIZE], relist1[LISTSIZE]; + int rv; + + /* + * use user-specified starting/ending location if specified + */ + j.starts = bol; + j.eol = 0; + if(mp && ms>0){ + if(mp->s.sp) + j.starts = mp->s.sp; + if(mp->e.ep) + j.eol = mp->e.ep; + } + j.starttype = 0; + j.startchar = 0; + if(progp->startinst->type == RUNE && progp->startinst->u1.r < (Rune)Runeself) { + j.starttype = RUNE; + j.startchar = progp->startinst->u1.r; + } + if(progp->startinst->type == BOL) + j.starttype = BOL; + + /* mark space */ + j.relist[0] = relist0; + j.relist[1] = relist1; + j.reliste[0] = relist0 + nelem(relist0) - 2; + j.reliste[1] = relist1 + nelem(relist1) - 2; + + rv = regexec1(progp, bol, mp, ms, &j); + if(rv >= 0) + return rv; + rv = regexec2(progp, bol, mp, ms, &j); + if(rv >= 0) + return rv; + return -1; +} diff --git a/src/libregexp/regexp9.3 b/src/libregexp/regexp9.3 new file mode 100644 index 00000000..f2603562 --- /dev/null +++ b/src/libregexp/regexp9.3 @@ -0,0 +1,227 @@ +.TH REGEXP9 3 +.de EX +.nf +.ft B +.. +.de EE +.fi +.ft R +.. +.de LR +.if t .BR \\$1 \\$2 +.if n .RB ` \\$1 '\\$2 +.. +.de L +.nh +.if t .B \\$1 +.if n .RB ` \\$1 ' +.. +.SH NAME +regcomp, regcomplit, regcompnl, regexec, regsub, regerror \- Plan 9 regular expression library +.SH SYNOPSIS +.B #include +.PP +.ta \w'\fLRegprog 'u +.B +Reprog *regcomp(char *exp) +.PP +.B +Reprog *regcomplit(char *exp) +.PP +.B +Reprog *regcompnl(char *exp) +.PP +.nf +.B +int regexec(Reprog *prog, char *string, Resub *match, int msize) +.PP +.nf +.B +void regsub(char *source, char *dest, int dlen, Resub *match, int msize) +.PP +.nf +.B +int rregexec(Reprog *prog, Rune *string, Resub *match, int msize) +.PP +.nf +.B +void rregsub(Rune *source, Rune *dest, int dlen, Resub *match, int msize) +.PP +.B +void regerror(char *msg) +.SH DESCRIPTION +.I Regcomp +compiles a +regular expression and returns +a pointer to the generated description. +The space is allocated by +.IR malloc (3) +and may be released by +.IR free . +Regular expressions are exactly as in +.IR regexp9 (7). +.PP +.I Regcomplit +is like +.I regcomp +except that all characters are treated literally. +.I Regcompnl +is like +.I regcomp +except that the +.B . +metacharacter matches all characters, including newlines. +.PP +.I Regexec +matches a null-terminated +.I string +against the compiled regular expression in +.IR prog . +If it matches, +.I regexec +returns +.B 1 +and fills in the array +.I match +with character pointers to the substrings of +.I string +that correspond to the +parenthesized subexpressions of +.IR exp : +.BI match[ i ].sp +points to the beginning and +.BI match[ i ].ep +points just beyond +the end of the +.IR i th +substring. +(Subexpression +.I i +begins at the +.IR i th +left parenthesis, counting from 1.) +Pointers in +.B match[0] +pick out the substring that corresponds to +the whole regular expression. +Unused elements of +.I match +are filled with zeros. +Matches involving +.LR * , +.LR + , +and +.L ? +are extended as far as possible. +The number of array elements in +.I match +is given by +.IR msize . +The structure of elements of +.I match +is: +.IP +.EX +typedef struct { + union { + char *sp; + Rune *rsp; + } s; + union { + char *ep; + Rune *rep; + } e; +} Resub; +.EE +.LP +If +.B match[0].s.sp +is nonzero on entry, +.I regexec +starts matching at that point within +.IR string . +If +.B match[0].e.ep +is nonzero on entry, +the last character matched is the one +preceding that point. +.PP +.I Regsub +places in +.I dest +a substitution instance of +.I source +in the context of the last +.I regexec +performed using +.IR match . +Each instance of +.BI \e n\f1, +where +.I n +is a digit, is replaced by the +string delimited by +.BI match[ n ].s.sp +and +.BI match[ n ].e.ep\f1. +Each instance of +.L & +is replaced by the string delimited by +.B match[0].s.sp +and +.BR match[0].e.ep . +The substitution will always be null terminated and +trimmed to fit into dlen bytes. +.PP +.IR Regerror , +called whenever an error is detected in +.IR regcomp , +writes the string +.I msg +on the standard error file and exits. +.I Regerror +can be replaced to perform +special error processing. +If the user supplied +.I regerror +returns rather than exits, +.I regcomp +will return 0. +.PP +.I Rregexec +and +.I rregsub +are variants of +.I regexec +and +.I regsub +that use strings of +.B Runes +instead of strings of +.BR chars . +With these routines, the +.I rsp +and +.I rep +fields of the +.I match +array elements should be used. +.SH "SEE ALSO" +.IR grep (1), +.IR regexp9 (7) +.SH DIAGNOSTICS +.I Regcomp +returns +.B 0 +for an illegal expression +or other failure. +.I Regexec +returns 0 +if +.I string +is not matched. +.SH HISTORY +This particular regular expression was first written by Rob Pike for Plan 9. +It has also appeared as part of the Inferno operating system. +.SH BUGS +There is no way to specify or match a NUL character; NULs terminate patterns and strings. diff --git a/src/libregexp/regexp9.7 b/src/libregexp/regexp9.7 new file mode 100644 index 00000000..14a90d0f --- /dev/null +++ b/src/libregexp/regexp9.7 @@ -0,0 +1,150 @@ +.TH REGEXP9 7 +.de EX +.nf +.ft B +.. +.de EE +.fi +.ft R +.. +.de LR +.if t .BR \\$1 \\$2 +.if n .RB ` \\$1 '\\$2 +.. +.de L +.nh +.if t .B \\$1 +.if n .RB ` \\$1 ' +.. +.SH NAME +regexp9 \- Plan 9 regular expression notation +.SH DESCRIPTION +This manual page describes the regular expression +syntax used by the Plan 9 regular expression library +.IR regexp9 (3). +It is the form used by +.IR egrep (1) +before +.I egrep +got complicated. +.PP +A +.I "regular expression" +specifies +a set of strings of characters. +A member of this set of strings is said to be +.I matched +by the regular expression. In many applications +a delimiter character, commonly +.LR / , +bounds a regular expression. +In the following specification for regular expressions +the word `character' means any character (rune) but newline. +.PP +The syntax for a regular expression +.B e0 +is +.IP +.EX +e3: literal | charclass | '.' | '^' | '$' | '(' e0 ')' + +e2: e3 + | e2 REP + +REP: '*' | '+' | '?' + +e1: e2 + | e1 e2 + +e0: e1 + | e0 '|' e1 +.EE +.PP +A +.B literal +is any non-metacharacter, or a metacharacter +(one of +.BR .*+?[]()|\e^$ ), +or the delimiter +preceded by +.LR \e . +.PP +A +.B charclass +is a nonempty string +.I s +bracketed +.BI [ \|s\| ] +(or +.BI [^ s\| ]\fR); +it matches any character in (or not in) +.IR s . +A negated character class never +matches newline. +A substring +.IB a - b\f1, +with +.I a +and +.I b +in ascending +order, stands for the inclusive +range of +characters between +.I a +and +.IR b . +In +.IR s , +the metacharacters +.LR - , +.LR ] , +an initial +.LR ^ , +and the regular expression delimiter +must be preceded by a +.LR \e ; +other metacharacters +have no special meaning and +may appear unescaped. +.PP +A +.L . +matches any character. +.PP +A +.L ^ +matches the beginning of a line; +.L $ +matches the end of the line. +.PP +The +.B REP +operators match zero or more +.RB ( * ), +one or more +.RB ( + ), +zero or one +.RB ( ? ), +instances respectively of the preceding regular expression +.BR e2 . +.PP +A concatenated regular expression, +.BR "e1\|e2" , +matches a match to +.B e1 +followed by a match to +.BR e2 . +.PP +An alternative regular expression, +.BR "e0\||\|e1" , +matches either a match to +.B e0 +or a match to +.BR e1 . +.PP +A match to any part of a regular expression +extends as far as possible without preventing +a match to the remainder of the regular expression. +.SH "SEE ALSO" +.IR regexp9 (3) diff --git a/src/libregexp/regexp9.h b/src/libregexp/regexp9.h new file mode 100644 index 00000000..e25658a3 --- /dev/null +++ b/src/libregexp/regexp9.h @@ -0,0 +1,71 @@ +#ifndef _REGEXP9H_ + +#define _REGEXP9H_ 1 +#include + +typedef struct Resub Resub; +typedef struct Reclass Reclass; +typedef struct Reinst Reinst; +typedef struct Reprog Reprog; + +/* + * Sub expression matches + */ +struct Resub{ + union + { + char *sp; + Rune *rsp; + }s; + union + { + char *ep; + Rune *rep; + }e; +}; + +/* + * character class, each pair of rune's defines a range + */ +struct Reclass{ + Rune *end; + Rune spans[64]; +}; + +/* + * Machine instructions + */ +struct Reinst{ + int type; + union { + Reclass *cp; /* class pointer */ + Rune r; /* character */ + int subid; /* sub-expression id for RBRA and LBRA */ + Reinst *right; /* right child of OR */ + }u1; + union { /* regexp relies on these two being in the same union */ + Reinst *left; /* left child of OR */ + Reinst *next; /* next instruction for CAT & LBRA */ + }u2; +}; + +/* + * Reprogram definition + */ +struct Reprog{ + Reinst *startinst; /* start pc */ + Reclass class[16]; /* .data */ + Reinst firstinst[5]; /* .text */ +}; + +extern Reprog *regcomp(char*); +extern Reprog *regcomplit(char*); +extern Reprog *regcompnl(char*); +extern void regerror(char*); +extern int regexec(Reprog*, char*, Resub*, int); +extern void regsub(char*, char*, int, Resub*, int); + +extern int rregexec(Reprog*, Rune*, Resub*, int); +extern void rregsub(Rune*, Rune*, Resub*, int); + +#endif diff --git a/src/libregexp/regsub.c b/src/libregexp/regsub.c new file mode 100644 index 00000000..6de2c957 --- /dev/null +++ b/src/libregexp/regsub.c @@ -0,0 +1,62 @@ +#include "lib9.h" +#include "regexp9.h" + +/* substitute into one string using the matches from the last regexec() */ +extern void +regsub(char *sp, /* source string */ + char *dp, /* destination string */ + int dlen, + Resub *mp, /* subexpression elements */ + int ms) /* number of elements pointed to by mp */ +{ + char *ssp, *ep; + int i; + + ep = dp+dlen-1; + while(*sp != '\0'){ + if(*sp == '\\'){ + switch(*++sp){ + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': + i = *sp-'0'; + if(mp[i].s.sp != 0 && mp!=0 && ms>i) + for(ssp = mp[i].s.sp; + ssp < mp[i].e.ep; + ssp++) + if(dp < ep) + *dp++ = *ssp; + break; + case '\\': + if(dp < ep) + *dp++ = '\\'; + break; + case '\0': + sp--; + break; + default: + if(dp < ep) + *dp++ = *sp; + break; + } + }else if(*sp == '&'){ + if(mp[0].s.sp != 0 && mp!=0 && ms>0) + if(mp[0].s.sp != 0) + for(ssp = mp[0].s.sp; + ssp < mp[0].e.ep; ssp++) + if(dp < ep) + *dp++ = *ssp; + }else + if(dp < ep) + *dp++ = *sp; + sp++; + } + *dp = '\0'; +} diff --git a/src/libregexp/rpm.spec b/src/libregexp/rpm.spec new file mode 100644 index 00000000..f4c92d66 --- /dev/null +++ b/src/libregexp/rpm.spec @@ -0,0 +1,34 @@ +Summary: Simple regular expression library from Plan 9 +Name: libregexp9 +Version: 2.0 +Release: 1 +Group: Development/C +Copyright: Public Domain +Packager: Russ Cox +Source: http://pdos.lcs.mit.edu/~rsc/software/libregexp9-2.0.tgz +URL: http://pdos.lcs.mit.edu/~rsc/software/#libregexp9 +Requires: libfmt libutf + +%description +Libregexp9 is a port of Plan 9's regexp library. +It is small and simple and provides the traditional +extended regular expressions (as opposed to the +current extended regular expressions, which add {} +and various \x character classes, among other +complications). + +http://plan9.bell-labs.com/magic/man2html/2/regexp +%prep +%setup + +%build +make + +%install +make install + +%files +/usr/local/include/regexp9.h +/usr/local/lib/libregexp9.a +/usr/local/man/man3/regexp9.3 +/usr/local/man/man7/regexp9.7 diff --git a/src/libregexp/rregaux.c b/src/libregexp/rregaux.c new file mode 100644 index 00000000..f4cb0068 --- /dev/null +++ b/src/libregexp/rregaux.c @@ -0,0 +1,26 @@ +#include "lib9.h" +#include "regexp9.h" +#include "regcomp.h" + +extern Relist* +_rrenewemptythread(Relist *lp, /* _relist to add to */ + Reinst *ip, /* instruction to add */ + Rune *rsp) /* pointers to subexpressions */ +{ + Relist *p; + + for(p=lp; p->inst; p++){ + if(p->inst == ip){ + if(rsp < p->se.m[0].s.rsp) { + memset((void *)&p->se, 0, sizeof(p->se)); + p->se.m[0].s.rsp = rsp; + } + return 0; + } + } + p->inst = ip; + memset((void *)&p->se, 0, sizeof(p->se)); + p->se.m[0].s.rsp = rsp; + (++p)->inst = 0; + return p; +} diff --git a/src/libregexp/rregexec.c b/src/libregexp/rregexec.c new file mode 100644 index 00000000..e96c9721 --- /dev/null +++ b/src/libregexp/rregexec.c @@ -0,0 +1,213 @@ +#include "lib9.h" +#include "regexp9.h" +#include "regcomp.h" + +/* + * return 0 if no match + * >0 if a match + * <0 if we ran out of _relist space + */ +static int +rregexec1(Reprog *progp, /* program to run */ + Rune *bol, /* string to run machine on */ + Resub *mp, /* subexpression elements */ + int ms, /* number of elements at mp */ + Reljunk *j) +{ + int flag=0; + Reinst *inst; + Relist *tlp; + Rune *s; + int i, checkstart; + Rune r, *rp, *ep; + Relist* tl; /* This list, next list */ + Relist* nl; + Relist* tle; /* ends of this and next list */ + Relist* nle; + int match; + + match = 0; + checkstart = j->startchar; + if(mp) + for(i=0; irelist[0][0].inst = 0; + j->relist[1][0].inst = 0; + + /* Execute machine once for each character, including terminal NUL */ + s = j->rstarts; + do{ + + /* fast check for first char */ + if(checkstart) { + switch(j->starttype) { + case RUNE: + while(*s != j->startchar) { + if(*s == 0) + return match; + s++; + } + break; + case BOL: + if(s == bol) + break; + while(*s != '\n') { + if(*s == 0) + return match; + s++; + } + break; + } + } + + r = *s; + + /* switch run lists */ + tl = j->relist[flag]; + tle = j->reliste[flag]; + nl = j->relist[flag^=1]; + nle = j->reliste[flag]; + nl->inst = 0; + + /* Add first instruction to current list */ + _rrenewemptythread(tl, progp->startinst, s); + + /* Execute machine until current list is empty */ + for(tlp=tl; tlp->inst; tlp++){ + for(inst=tlp->inst; ; inst = inst->u2.next){ + switch(inst->type){ + case RUNE: /* regular character */ + if(inst->u1.r == r) + if(_renewthread(nl, inst->u2.next, &tlp->se)==nle) + return -1; + break; + case LBRA: + tlp->se.m[inst->u1.subid].s.rsp = s; + continue; + case RBRA: + tlp->se.m[inst->u1.subid].e.rep = s; + continue; + case ANY: + if(r != '\n') + if(_renewthread(nl, inst->u2.next, &tlp->se)==nle) + return -1; + break; + case ANYNL: + if(_renewthread(nl, inst->u2.next, &tlp->se)==nle) + return -1; + break; + case BOL: + if(s == bol || *(s-1) == '\n') + continue; + break; + case EOL: + if(s == j->reol || r == 0 || r == '\n') + continue; + break; + case CCLASS: + ep = inst->u1.cp->end; + for(rp = inst->u1.cp->spans; rp < ep; rp += 2) + if(r >= rp[0] && r <= rp[1]){ + if(_renewthread(nl, inst->u2.next, &tlp->se)==nle) + return -1; + break; + } + break; + case NCCLASS: + ep = inst->u1.cp->end; + for(rp = inst->u1.cp->spans; rp < ep; rp += 2) + if(r >= rp[0] && r <= rp[1]) + break; + if(rp == ep) + if(_renewthread(nl, inst->u2.next, &tlp->se)==nle) + return -1; + break; + case OR: + /* evaluate right choice later */ + if(_renewthread(tlp, inst->u1.right, &tlp->se) == tle) + return -1; + /* efficiency: advance and re-evaluate */ + continue; + case END: /* Match! */ + match = 1; + tlp->se.m[0].e.rep = s; + if(mp != 0) + _renewmatch(mp, ms, &tlp->se); + break; + } + break; + } + } + if(s == j->reol) + break; + checkstart = j->startchar && nl->inst==0; + s++; + }while(r); + return match; +} + +static int +rregexec2(Reprog *progp, /* program to run */ + Rune *bol, /* string to run machine on */ + Resub *mp, /* subexpression elements */ + int ms, /* number of elements at mp */ + Reljunk *j +) +{ + Relist relist0[5*LISTSIZE], relist1[5*LISTSIZE]; + + /* mark space */ + j->relist[0] = relist0; + j->relist[1] = relist1; + j->reliste[0] = relist0 + nelem(relist0) - 2; + j->reliste[1] = relist1 + nelem(relist1) - 2; + + return rregexec1(progp, bol, mp, ms, j); +} + +extern int +rregexec(Reprog *progp, /* program to run */ + Rune *bol, /* string to run machine on */ + Resub *mp, /* subexpression elements */ + int ms) /* number of elements at mp */ +{ + Reljunk j; + Relist relist0[LISTSIZE], relist1[LISTSIZE]; + int rv; + + /* + * use user-specified starting/ending location if specified + */ + j.rstarts = bol; + j.reol = 0; + if(mp && ms>0){ + if(mp->s.sp) + j.rstarts = mp->s.rsp; + if(mp->e.ep) + j.reol = mp->e.rep; + } + j.starttype = 0; + j.startchar = 0; + if(progp->startinst->type == RUNE && progp->startinst->u1.r < (Rune)Runeself) { + j.starttype = RUNE; + j.startchar = progp->startinst->u1.r; + } + if(progp->startinst->type == BOL) + j.starttype = BOL; + + /* mark space */ + j.relist[0] = relist0; + j.relist[1] = relist1; + j.reliste[0] = relist0 + nelem(relist0) - 2; + j.reliste[1] = relist1 + nelem(relist1) - 2; + + rv = rregexec1(progp, bol, mp, ms, &j); + if(rv >= 0) + return rv; + rv = rregexec2(progp, bol, mp, ms, &j); + if(rv >= 0) + return rv; + return -1; +} diff --git a/src/libregexp/rregsub.c b/src/libregexp/rregsub.c new file mode 100644 index 00000000..15f3c174 --- /dev/null +++ b/src/libregexp/rregsub.c @@ -0,0 +1,55 @@ +#include "lib9.h" +#include "regexp9.h" + +/* substitute into one string using the matches from the last regexec() */ +extern void +rregsub(Rune *sp, /* source string */ + Rune *dp, /* destination string */ + Resub *mp, /* subexpression elements */ + int ms) /* number of elements pointed to by mp */ +{ + Rune *ssp; + int i; + + while(*sp != '\0'){ + if(*sp == '\\'){ + switch(*++sp){ + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': + i = *sp-'0'; + if(mp[i].s.rsp != 0 && mp!=0 && ms>i) + for(ssp = mp[i].s.rsp; + ssp < mp[i].e.rep; + ssp++) + *dp++ = *ssp; + break; + case '\\': + *dp++ = '\\'; + break; + case '\0': + sp--; + break; + default: + *dp++ = *sp; + break; + } + }else if(*sp == '&'){ + if(mp[0].s.rsp != 0 && mp!=0 && ms>0) + if(mp[0].s.rsp != 0) + for(ssp = mp[0].s.rsp; + ssp < mp[0].e.rep; ssp++) + *dp++ = *ssp; + }else + *dp++ = *sp; + sp++; + } + *dp = '\0'; +} diff --git a/src/libregexp/test.c b/src/libregexp/test.c new file mode 100644 index 00000000..83533ee6 --- /dev/null +++ b/src/libregexp/test.c @@ -0,0 +1,46 @@ +#include "lib9.h" +#include + +struct x +{ + char *re; + char *s; + Reprog *p; +}; + +struct x t[] = { + { "^[^!@]+$", "/bin/upas/aliasmail '&'", 0 }, + { "^local!(.*)$", "/mail/box/\\1/mbox", 0 }, + { "^plan9!(.*)$", "\\1", 0 }, + { "^helix!(.*)$", "\\1", 0 }, + { "^([^!]+)@([^!@]+)$", "\\2!\\1", 0 }, + { "^(uk\\.[^!]*)(!.*)$", "/bin/upas/uk2uk '\\1' '\\2'", 0 }, + { "^[^!]*\\.[^!]*!.*$", "inet!&", 0 }, + { "^\xE2\x98\xBA$", "smiley", 0 }, + { "^(coma|research|pipe|pyxis|inet|hunny|gauss)!(.*)$", "/mail/lib/qmail '\\s' 'net!\\1' '\\2'", 0 }, + { "^.*$", "/mail/lib/qmail '\\s' 'net!research' '&'", 0 }, + { 0, 0, 0 }, +}; + +main(int ac, char **av) +{ + Resub rs[10]; + char dst[128]; + int n; + struct x *tp; + + for(tp = t; tp->re; tp++) + tp->p = regcomp(tp->re); + + + for(tp = t; tp->re; tp++){ + print("%s VIA %s", av[1], tp->re); + memset(rs, 0, sizeof rs); + if(regexec(tp->p, av[1], rs, 10)){ + regsub(tp->s, dst, sizeof dst, rs, 10); + print(" sub %s -> %s", tp->s, dst); + } + print("\n"); + } + exit(0); +} diff --git a/src/libregexp/test2.c b/src/libregexp/test2.c new file mode 100644 index 00000000..150953e4 --- /dev/null +++ b/src/libregexp/test2.c @@ -0,0 +1,20 @@ +#include "lib9.h" +#include + + +main(int ac, char **av) +{ + Resub rs[10]; + Reprog *p; + char *s; + int i; + + p = regcomp("[^a-z]"); + s = "\n"; + if(regexec(p, s, rs, 10)) + print("%s %lux %lux %lux\n", s, s, rs[0].sp, rs[0].ep); + s = "0"; + if(regexec(p, s, rs, 10)) + print("%s %lux %lux %lux\n", s, s, rs[0].sp, rs[0].ep); + exit(0); +} -- cgit v1.2.3