From b3994ec5c78e6c18885079b58abb7fb997899c3f Mon Sep 17 00:00:00 2001 From: rsc Date: Thu, 11 Dec 2003 17:50:28 +0000 Subject: More files related to user-level file servers. Also add acme! --- src/cmd/acme/regx.c | 835 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 835 insertions(+) create mode 100644 src/cmd/acme/regx.c (limited to 'src/cmd/acme/regx.c') diff --git a/src/cmd/acme/regx.c b/src/cmd/acme/regx.c new file mode 100644 index 00000000..f934187b --- /dev/null +++ b/src/cmd/acme/regx.c @@ -0,0 +1,835 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "dat.h" +#include "fns.h" + +Rangeset sel; +Rune *lastregexp; + +/* + * Machine Information + */ +typedef struct Inst Inst; +struct Inst +{ + uint type; /* < 0x10000 ==> literal, otherwise action */ + union { + int sid; + int subid; + int class; + Inst *other; + Inst *right; + } u; + union{ + Inst *left; + Inst *next; + } u1; +}; + +#define NPROG 1024 +Inst program[NPROG]; +Inst *progp; +Inst *startinst; /* First inst. of program; might not be program[0] */ +Inst *bstartinst; /* same for backwards machine */ +Channel *rechan; /* chan(Inst*) */ + +typedef struct Ilist Ilist; +struct Ilist +{ + Inst *inst; /* Instruction of the thread */ + Rangeset se; + uint startp; /* first char of match */ +}; + +#define NLIST 128 + +Ilist *tl, *nl; /* This list, next list */ +Ilist list[2][NLIST]; +static Rangeset sempty; + +/* + * Actions and Tokens + * + * 0x100xx are operators, value == precedence + * 0x200xx are tokens, i.e. operands for operators + */ +#define OPERATOR 0x10000 /* Bitmask of all operators */ +#define START 0x10000 /* Start, used for marker on stack */ +#define RBRA 0x10001 /* Right bracket, ) */ +#define LBRA 0x10002 /* Left bracket, ( */ +#define OR 0x10003 /* Alternation, | */ +#define CAT 0x10004 /* Concatentation, implicit operator */ +#define STAR 0x10005 /* Closure, * */ +#define PLUS 0x10006 /* a+ == aa* */ +#define QUEST 0x10007 /* a? == a|nothing, i.e. 0 or 1 a's */ +#define ANY 0x20000 /* Any character but newline, . */ +#define NOP 0x20001 /* No operation, internal use only */ +#define BOL 0x20002 /* Beginning of line, ^ */ +#define EOL 0x20003 /* End of line, $ */ +#define CCLASS 0x20004 /* Character class, [] */ +#define NCCLASS 0x20005 /* Negated character class, [^] */ +#define END 0x20077 /* Terminate: match found */ + +#define ISATOR 0x10000 +#define ISAND 0x20000 + +/* + * Parser Information + */ +typedef struct Node Node; +struct Node +{ + Inst *first; + Inst *last; +}; + +#define NSTACK 20 +Node andstack[NSTACK]; +Node *andp; +int atorstack[NSTACK]; +int *atorp; +int lastwasand; /* Last token was operand */ +int cursubid; +int subidstack[NSTACK]; +int *subidp; +int backwards; +int nbra; +Rune *exprp; /* pointer to next character in source expression */ +#define DCLASS 10 /* allocation increment */ +int nclass; /* number active */ +int Nclass; /* high water mark */ +Rune **class; +int negateclass; + +void addinst(Ilist *l, Inst *inst, Rangeset *sep); +void newmatch(Rangeset*); +void bnewmatch(Rangeset*); +void pushand(Inst*, Inst*); +void pushator(int); +Node *popand(int); +int popator(void); +void startlex(Rune*); +int lex(void); +void operator(int); +void operand(int); +void evaluntil(int); +void optimize(Inst*); +void bldcclass(void); + +void +rxinit(void) +{ + rechan = chancreate(sizeof(Inst*), 0); + lastregexp = runemalloc(1); +} + +void +regerror(char *e) +{ + lastregexp[0] = 0; + warning(nil, "regexp: %s\n", e); + sendp(rechan, nil); + threadexits(nil); +} + +Inst * +newinst(int t) +{ + if(progp >= &program[NPROG]) + regerror("expression too long"); + progp->type = t; + progp->u1.left = nil; + progp->u.right = nil; + return progp++; +} + +void +realcompile(void *arg) +{ + int token; + Rune *s; + + threadsetname("regcomp"); + s = arg; + startlex(s); + atorp = atorstack; + andp = andstack; + subidp = subidstack; + cursubid = 0; + lastwasand = FALSE; + /* Start with a low priority operator to prime parser */ + pushator(START-1); + while((token=lex()) != END){ + if((token&ISATOR) == OPERATOR) + operator(token); + else + operand(token); + } + /* Close with a low priority operator */ + evaluntil(START); + /* Force END */ + operand(END); + evaluntil(START); + if(nbra) + regerror("unmatched `('"); + --andp; /* points to first and only operand */ + sendp(rechan, andp->first); + threadexits(nil); +} + +/* r is null terminated */ +int +rxcompile(Rune *r) +{ + int i, nr; + Inst *oprogp; + + nr = runestrlen(r)+1; + if(runeeq(lastregexp, runestrlen(lastregexp)+1, r, nr)==TRUE) + return TRUE; + lastregexp[0] = 0; + for(i=0; itype = NCCLASS; /* UGH */ + i->u.class = nclass-1; /* UGH */ + } + pushand(i, i); + lastwasand = TRUE; +} + +void +operator(int t) +{ + if(t==RBRA && --nbra<0) + regerror("unmatched `)'"); + if(t==LBRA){ + cursubid++; /* silently ignored */ + nbra++; + if(lastwasand) + operator(CAT); + }else + evaluntil(t); + if(t!=RBRA) + pushator(t); + lastwasand = FALSE; + if(t==STAR || t==QUEST || t==PLUS || t==RBRA) + lastwasand = TRUE; /* these look like operands */ +} + +void +pushand(Inst *f, Inst *l) +{ + if(andp >= &andstack[NSTACK]) + error("operand stack overflow"); + andp->first = f; + andp->last = l; + andp++; +} + +void +pushator(int t) +{ + if(atorp >= &atorstack[NSTACK]) + error("operator stack overflow"); + *atorp++=t; + if(cursubid >= NRange) + *subidp++= -1; + else + *subidp++=cursubid; +} + +Node * +popand(int op) +{ + char buf[64]; + + if(andp <= &andstack[0]) + if(op){ + sprint(buf, "missing operand for %c", op); + regerror(buf); + }else + regerror("malformed regexp"); + return --andp; +} + +int +popator() +{ + if(atorp <= &atorstack[0]) + error("operator stack underflow"); + --subidp; + return *--atorp; +} + +void +evaluntil(int pri) +{ + Node *op1, *op2, *t; + Inst *inst1, *inst2; + + while(pri==RBRA || atorp[-1]>=pri){ + switch(popator()){ + case LBRA: + op1 = popand('('); + inst2 = newinst(RBRA); + inst2->u.subid = *subidp; + op1->last->u1.next = inst2; + inst1 = newinst(LBRA); + inst1->u.subid = *subidp; + inst1->u1.next = op1->first; + pushand(inst1, inst2); + return; /* must have been RBRA */ + default: + error("unknown regexp operator"); + break; + case OR: + op2 = popand('|'); + op1 = popand('|'); + inst2 = newinst(NOP); + op2->last->u1.next = inst2; + op1->last->u1.next = inst2; + inst1 = newinst(OR); + inst1->u.right = op1->first; + inst1->u1.left = op2->first; + pushand(inst1, inst2); + break; + case CAT: + op2 = popand(0); + op1 = popand(0); + if(backwards && op2->first->type!=END){ + t = op1; + op1 = op2; + op2 = t; + } + op1->last->u1.next = op2->first; + pushand(op1->first, op2->last); + break; + case STAR: + op2 = popand('*'); + inst1 = newinst(OR); + op2->last->u1.next = inst1; + inst1->u.right = op2->first; + pushand(inst1, inst1); + break; + case PLUS: + op2 = popand('+'); + inst1 = newinst(OR); + op2->last->u1.next = inst1; + inst1->u.right = op2->first; + pushand(op2->first, inst1); + break; + case QUEST: + op2 = popand('?'); + inst1 = newinst(OR); + inst2 = newinst(NOP); + inst1->u1.left = inst2; + inst1->u.right = op2->first; + op2->last->u1.next = inst2; + pushand(inst1, inst2); + break; + } + } +} + + +void +optimize(Inst *start) +{ + Inst *inst, *target; + + for(inst=start; inst->type!=END; inst++){ + target = inst->u1.next; + while(target->type == NOP) + target = target->u1.next; + inst->u1.next = target; + } +} + +void +startlex(Rune *s) +{ + exprp = s; + nbra = 0; +} + + +int +lex(void){ + int c; + + c = *exprp++; + switch(c){ + case '\\': + if(*exprp) + if((c= *exprp++)=='n') + c='\n'; + break; + case 0: + c = END; + --exprp; /* In case we come here again */ + break; + case '*': + c = STAR; + break; + case '?': + c = QUEST; + break; + case '+': + c = PLUS; + break; + case '|': + c = OR; + break; + case '.': + c = ANY; + break; + case '(': + c = LBRA; + break; + case ')': + c = RBRA; + break; + case '^': + c = BOL; + break; + case '$': + c = EOL; + break; + case '[': + c = CCLASS; + bldcclass(); + break; + } + return c; +} + +int +nextrec(void) +{ + if(exprp[0]==0 || (exprp[0]=='\\' && exprp[1]==0)) + regerror("malformed `[]'"); + if(exprp[0] == '\\'){ + exprp++; + if(*exprp=='n'){ + exprp++; + return '\n'; + } + return *exprp++|0x10000; + } + return *exprp++; +} + +void +bldcclass(void) +{ + int c1, c2, n, na; + Rune *classp; + + classp = runemalloc(DCLASS); + n = 0; + na = DCLASS; + /* we have already seen the '[' */ + if(*exprp == '^'){ + classp[n++] = '\n'; /* don't match newline in negate case */ + negateclass = TRUE; + exprp++; + }else + negateclass = FALSE; + while((c1 = nextrec()) != ']'){ + if(c1 == '-'){ + Error: + free(classp); + regerror("malformed `[]'"); + } + if(n+4 >= na){ /* 3 runes plus NUL */ + na += DCLASS; + classp = runerealloc(classp, na); + } + if(*exprp == '-'){ + exprp++; /* eat '-' */ + if((c2 = nextrec()) == ']') + goto Error; + classp[n+0] = 0xFFFF; + classp[n+1] = c1; + classp[n+2] = c2; + n += 3; + }else + classp[n++] = c1; + } + classp[n] = 0; + if(nclass == Nclass){ + Nclass += DCLASS; + class = realloc(class, Nclass*sizeof(Rune*)); + } + class[nclass++] = classp; +} + +int +classmatch(int classno, int c, int negate) +{ + Rune *p; + + p = class[classno]; + while(*p){ + if(*p == 0xFFFF){ + if(p[1]<=c && c<=p[2]) + return !negate; + p += 3; + }else if(*p++ == c) + return !negate; + } + return negate; +} + +/* + * Note optimization in addinst: + * *l must be pending when addinst called; if *l has been looked + * at already, the optimization is a bug. + */ +void +addinst(Ilist *l, Inst *inst, Rangeset *sep) +{ + Ilist *p; + + for(p = l; p->inst; p++){ + if(p->inst==inst){ + if((sep)->r[0].q0 < p->se.r[0].q0) + p->se= *sep; /* this would be bug */ + return; /* It's already there */ + } + } + p->inst = inst; + p->se= *sep; + (p+1)->inst = nil; +} + +int +rxnull(void) +{ + return startinst==nil || bstartinst==nil; +} + +/* either t!=nil or r!=nil, and we match the string in the appropriate place */ +int +rxexecute(Text *t, Rune *r, uint startp, uint eof, Rangeset *rp) +{ + int flag; + Inst *inst; + Ilist *tlp; + uint p; + int nnl, ntl; + int nc, c; + int wrapped; + int startchar; + + flag = 0; + p = startp; + startchar = 0; + wrapped = 0; + nnl = 0; + if(startinst->typetype; + list[0][0].inst = list[1][0].inst = nil; + sel.r[0].q0 = -1; + if(t != nil) + nc = t->file->b.nc; + else + nc = runestrlen(r); + /* Execute machine once for each character */ + for(;;p++){ + doloop: + if(p>=eof || p>=nc){ + switch(wrapped++){ + case 0: /* let loop run one more click */ + case 2: + break; + case 1: /* expired; wrap to beginning */ + if(sel.r[0].q0>=0 || eof!=Infinity) + goto Return; + list[0][0].inst = list[1][0].inst = nil; + p = 0; + goto doloop; + default: + goto Return; + } + c = 0; + }else{ + if(((wrapped && p>=startp) || sel.r[0].q0>0) && nnl==0) + break; + if(t != nil) + c = textreadc(t, p); + else + c = r[p]; + } + /* fast check for first char */ + if(startchar && nnl==0 && c!=startchar) + continue; + tl = list[flag]; + nl = list[flag^=1]; + nl->inst = nil; + ntl = nnl; + nnl = 0; + if(sel.r[0].q0<0 && (!wrapped || p= NLIST){ + Overflow: + warning(nil, "regexp list overflow\n"); + sel.r[0].q0 = -1; + goto Return; + } + sempty.r[0].q0 = p; + addinst(tl, startinst, &sempty); + } + /* Execute machine until this list is empty */ + for(tlp = tl; inst = tlp->inst; tlp++){ /* assignment = */ + Switchstmt: + switch(inst->type){ + default: /* regular character */ + if(inst->type==c){ + Addinst: + if(++nnl >= NLIST) + goto Overflow; + addinst(nl, inst->u1.next, &tlp->se); + } + break; + case LBRA: + if(inst->u.subid>=0) + tlp->se.r[inst->u.subid].q0 = p; + inst = inst->u1.next; + goto Switchstmt; + case RBRA: + if(inst->u.subid>=0) + tlp->se.r[inst->u.subid].q1 = p; + inst = inst->u1.next; + goto Switchstmt; + case ANY: + if(c!='\n') + goto Addinst; + break; + case BOL: + if(p==0 || (t!=nil && textreadc(t, p-1)=='\n') || (r!=nil && r[p-1]=='\n')){ + Step: + inst = inst->u1.next; + goto Switchstmt; + } + break; + case EOL: + if(c == '\n') + goto Step; + break; + case CCLASS: + if(c>=0 && classmatch(inst->u.class, c, 0)) + goto Addinst; + break; + case NCCLASS: + if(c>=0 && classmatch(inst->u.class, c, 1)) + goto Addinst; + break; + case OR: + /* evaluate right choice later */ + if(++ntl >= NLIST) + goto Overflow; + addinst(tlp, inst->u.right, &tlp->se); + /* efficiency: advance and re-evaluate */ + inst = inst->u1.left; + goto Switchstmt; + case END: /* Match! */ + tlp->se.r[0].q1 = p; + newmatch(&tlp->se); + break; + } + } + } + Return: + *rp = sel; + return sel.r[0].q0 >= 0; +} + +void +newmatch(Rangeset *sp) +{ + if(sel.r[0].q0<0 || sp->r[0].q0r[0].q0==sel.r[0].q0 && sp->r[0].q1>sel.r[0].q1)) + sel = *sp; +} + +int +rxbexecute(Text *t, uint startp, Rangeset *rp) +{ + int flag; + Inst *inst; + Ilist *tlp; + int p; + int nnl, ntl; + int c; + int wrapped; + int startchar; + + flag = 0; + nnl = 0; + wrapped = 0; + p = startp; + startchar = 0; + if(bstartinst->typetype; + list[0][0].inst = list[1][0].inst = nil; + sel.r[0].q0= -1; + /* Execute machine once for each character, including terminal NUL */ + for(;;--p){ + doloop: + if(p <= 0){ + switch(wrapped++){ + case 0: /* let loop run one more click */ + case 2: + break; + case 1: /* expired; wrap to end */ + if(sel.r[0].q0>=0) + goto Return; + list[0][0].inst = list[1][0].inst = nil; + p = t->file->b.nc; + goto doloop; + case 3: + default: + goto Return; + } + c = 0; + }else{ + if(((wrapped && p<=startp) || sel.r[0].q0>0) && nnl==0) + break; + c = textreadc(t, p-1); + } + /* fast check for first char */ + if(startchar && nnl==0 && c!=startchar) + continue; + tl = list[flag]; + nl = list[flag^=1]; + nl->inst = nil; + ntl = nnl; + nnl = 0; + if(sel.r[0].q0<0 && (!wrapped || p>startp)){ + /* Add first instruction to this list */ + if(++ntl >= NLIST){ + Overflow: + warning(nil, "regexp list overflow\n"); + sel.r[0].q0 = -1; + goto Return; + } + /* the minus is so the optimizations in addinst work */ + sempty.r[0].q0 = -p; + addinst(tl, bstartinst, &sempty); + } + /* Execute machine until this list is empty */ + for(tlp = tl; inst = tlp->inst; tlp++){ /* assignment = */ + Switchstmt: + switch(inst->type){ + default: /* regular character */ + if(inst->type == c){ + Addinst: + if(++nnl >= NLIST) + goto Overflow; + addinst(nl, inst->u1.next, &tlp->se); + } + break; + case LBRA: + if(inst->u.subid>=0) + tlp->se.r[inst->u.subid].q0 = p; + inst = inst->u1.next; + goto Switchstmt; + case RBRA: + if(inst->u.subid >= 0) + tlp->se.r[inst->u.subid].q1 = p; + inst = inst->u1.next; + goto Switchstmt; + case ANY: + if(c != '\n') + goto Addinst; + break; + case BOL: + if(c=='\n' || p==0){ + Step: + inst = inst->u1.next; + goto Switchstmt; + } + break; + case EOL: + if(pfile->b.nc && textreadc(t, p)=='\n') + goto Step; + break; + case CCLASS: + if(c>0 && classmatch(inst->u.class, c, 0)) + goto Addinst; + break; + case NCCLASS: + if(c>0 && classmatch(inst->u.class, c, 1)) + goto Addinst; + break; + case OR: + /* evaluate right choice later */ + if(++ntl >= NLIST) + goto Overflow; + addinst(tlp, inst->u.right, &tlp->se); + /* efficiency: advance and re-evaluate */ + inst = inst->u1.left; + goto Switchstmt; + case END: /* Match! */ + tlp->se.r[0].q0 = -tlp->se.r[0].q0; /* minus sign */ + tlp->se.r[0].q1 = p; + bnewmatch(&tlp->se); + break; + } + } + } + Return: + *rp = sel; + return sel.r[0].q0 >= 0; +} + +void +bnewmatch(Rangeset *sp) +{ + int i; + + if(sel.r[0].q0<0 || sp->r[0].q0>sel.r[0].q1 || (sp->r[0].q0==sel.r[0].q1 && sp->r[0].q1r[i].q1; + sel.r[i].q1 = sp->r[i].q0; + } +} -- cgit v1.2.3