diff options
author | rsc <devnull@localhost> | 2003-11-25 03:37:45 +0000 |
---|---|---|
committer | rsc <devnull@localhost> | 2003-11-25 03:37:45 +0000 |
commit | 08708877939323c1e1cb87210193ec25fc472ff7 (patch) | |
tree | bd34e2144a3e9532ab228619d7ae8d4a0078aeeb /src/cmd/dict/dict.c | |
parent | 091f74d0a0db5ba1e098a518922525cb032a97b4 (diff) | |
download | plan9port-08708877939323c1e1cb87210193ec25fc472ff7.tar.gz plan9port-08708877939323c1e1cb87210193ec25fc472ff7.tar.bz2 plan9port-08708877939323c1e1cb87210193ec25fc472ff7.zip |
add dict
Diffstat (limited to 'src/cmd/dict/dict.c')
-rw-r--r-- | src/cmd/dict/dict.c | 681 |
1 files changed, 681 insertions, 0 deletions
diff --git a/src/cmd/dict/dict.c b/src/cmd/dict/dict.c new file mode 100644 index 00000000..bafe90d5 --- /dev/null +++ b/src/cmd/dict/dict.c @@ -0,0 +1,681 @@ +#include <u.h> +#include <libc.h> +#include <bio.h> +#include <regexp.h> +#include <ctype.h> +#include "dict.h" + +/* + * Assumed index file structure: lines of form + * [^\t]+\t[0-9]+ + * First field is key, second is byte offset into dictionary. + * Should be sorted with args -u -t' ' +0f -1 +0 -1 +1n -2 + */ +typedef struct Addr Addr; + +struct Addr { + int n; /* number of offsets */ + int cur; /* current position within doff array */ + int maxn; /* actual current size of doff array */ + ulong doff[1]; /* doff[maxn], with 0..n-1 significant */ +}; + +Biobuf binbuf; +Biobuf boutbuf; +Biobuf *bin = &binbuf; /* user cmd input */ +Biobuf *bout = &boutbuf; /* output */ +Biobuf *bdict; /* dictionary */ +Biobuf *bindex; /* index file */ +long indextop; /* index offset at end of file */ +int lastcmd; /* last executed command */ +Addr *dot; /* "current" address */ +Dict *dict; /* current dictionary */ +int linelen; +int breaklen = 60; +int outinhibit; +int debug; + +void execcmd(int); +int getpref(char*, Rune*); +Entry getentry(int); +int getfield(Rune*); +long locate(Rune*); +int parseaddr(char*, char**); +int parsecmd(char*); +int search(char*, int); +long seeknextline(Biobuf*, long); +void setdotnext(void); +void setdotprev(void); +void sortaddr(Addr*); +void usage(void); + +enum { + Plen=300, /* max length of a search pattern */ + Fieldlen=200, /* max length of an index field */ + Aslots=10, /* initial number of slots in an address */ +}; + +void +main(int argc, char **argv) +{ + int i, cmd, kflag; + char *line, *p; + + Binit(&binbuf, 0, OREAD); + Binit(&boutbuf, 1, OWRITE); + kflag = 0; + line = 0; + dict = 0; + p = getenv("PLAN9"); + if(p == nil) + p = "/usr/local/plan9"; + if(chdir(p) < 0) + sysfatal("chdir %s: %r", p); + + for(i=0; dicts[i].name; i++){ + if(access(dicts[i].path, 0)>=0 && access(dicts[i].indexpath, 0)>=0){ + dict = &dicts[i]; + break; + } + } + ARGBEGIN { + case 'd': + p = ARGF(); + dict = 0; + if(p) { + for(i=0; dicts[i].name; i++) + if(strcmp(p, dicts[i].name)==0) { + dict = &dicts[i]; + break; + } + } + if(!dict) + usage(); + break; + case 'c': + line = ARGF(); + if(!line) + usage(); + break; + case 'k': + kflag++; + break; + case 'D': + debug++; + break; + default: + usage(); + ARGEND } + if(dict == 0){ + err("no dictionaries present on this system"); + exits("nodict"); + } + + if(kflag) { + (*dict->printkey)(); + exits(0); + } + if(argc > 1) + usage(); + else if(argc == 1) { + if(line) + usage(); + p = argv[0]; + line = malloc(strlen(p)+5); + sprint(line, "/%s/P\n", p); + } + bdict = Bopen(dict->path, OREAD); + if(!bdict) { + err("can't open dictionary %s/%s", p, dict->path); + exits("nodict"); + } + bindex = Bopen(dict->indexpath, OREAD); + if(!bindex) { + err("can't open index %s/%s", p, dict->indexpath); + exits("noindex"); + } + indextop = Bseek(bindex, 0L, 2); + + dot = malloc(sizeof(Addr)+(Aslots-1)*sizeof(ulong)); + dot->n = 0; + dot->cur = 0; + dot->maxn = Aslots; + lastcmd = 0; + + if(line) { + cmd = parsecmd(line); + if(cmd) + execcmd(cmd); + } else { + for(;;) { + Bprint(bout, "*"); + Bflush(bout); + line = Brdline(bin, '\n'); + linelen = 0; + if(!line) + break; + cmd = parsecmd(line); + if(cmd) { + execcmd(cmd); + lastcmd = cmd; + } + } + } + exits(0); +} + +void +usage(void) +{ + int i; + char *a, *b; + + Bprint(bout, "Usage: %s [-d dict] [-k] [-c cmd] [word]\n", argv0); + Bprint(bout, "dictionaries (brackets mark dictionaries not present on this system):\n"); + for(i = 0; dicts[i].name; i++){ + a = b = ""; + if(access(dicts[i].path, 0)<0 || access(dicts[i].indexpath, 0)<0){ + a = "["; + b = "]"; + } + Bprint(bout, " %s%s\t%s%s\n", a, dicts[i].name, dicts[i].desc, b); + } + exits("usage"); +} + +int +parsecmd(char *line) +{ + char *e; + int cmd, ans; + + if(parseaddr(line, &e) >= 0) + line = e; + else + return 0; + cmd = *line; + ans = cmd; + if(isupper(cmd)) + cmd = tolower(cmd); + if(!(cmd == 'a' || cmd == 'h' || cmd == 'p' || cmd == 'r' || + cmd == '\n')) { + err("unknown command %c", cmd); + return 0; + } + if(cmd == '\n') + switch(lastcmd) { + case 0: ans = 'H'; break; + case 'H': ans = 'p'; break; + default : ans = lastcmd; break; + } + else if(line[1] != '\n' && line[1] != 0) + err("extra stuff after command %c ignored", cmd); + return ans; +} + +void +execcmd(int cmd) +{ + Entry e; + int cur, doall; + + if(isupper(cmd)) { + doall = 1; + cmd = tolower(cmd); + cur = 0; + } else { + doall = 0; + cur = dot->cur; + } + + if(debug && doall && cmd == 'a') + Bprint(bout, "%d entries, cur=%d\n", dot->n, cur+1); + for(;;){ + if(cur >= dot->n) + break; + if(doall) { + Bprint(bout, "%d\t", cur+1); + linelen += 4 + (cur >= 10); + } + switch(cmd) { + case 'a': + Bprint(bout, "#%lud\n", dot->doff[cur]); + break; + case 'h': + case 'p': + case 'r': + e = getentry(cur); + (*dict->printentry)(e, cmd); + break; + } + cur++; + if(doall) { + if(cmd == 'p' || cmd == 'r') { + Bputc(bout, '\n'); + linelen = 0; + } + } else + break; + } + if(cur >= dot->n) + cur = 0; + dot->cur = cur; +} + +/* + * Address syntax: ('.' | '/' re '/' | '!' re '!' | number | '#' number) ('+' | '-')* + * Answer goes in dot. + * Return -1 if address starts, but get error. + * Return 0 if no address. + */ +int +parseaddr(char *line, char **eptr) +{ + int delim, plen; + ulong v; + char *e; + char pat[Plen]; + + if(*line == '/' || *line == '!') { + /* anchored regular expression match; '!' means no folding */ + if(*line == '/') { + delim = '/'; + e = strpbrk(line+1, "/\n"); + } else { + delim = '!'; + e = strpbrk(line+1, "!\n"); + } + plen = e-line-1; + if(plen >= Plen-3) { + err("pattern too big"); + return -1; + } + pat[0] = '^'; + memcpy(pat+1, line+1, plen); + pat[plen+1] = '$'; + pat[plen+2] = 0; + if(*e == '\n') + line = e; + else + line = e+1; + if(!search(pat, delim == '/')) { + err("pattern not found"); + return -1; + } + } else if(*line == '#') { + /* absolute byte offset into dictionary */ + line++; + if(!isdigit(*line)) + return -1; + v = strtoul(line, &e, 10); + line = e; + dot->doff[0] = v; + dot->n = 1; + dot->cur = 0; + } else if(isdigit(*line)) { + v = strtoul(line, &e, 10); + line = e; + if(v < 1 || v > dot->n) + err(".%d not in range [1,%d], ignored", + v, dot->n); + else + dot->cur = v-1; + } else if(*line == '.') { + line++; + } else { + *eptr = line; + return 0; + } + while(*line == '+' || *line == '-') { + if(*line == '+') + setdotnext(); + else + setdotprev(); + line++; + } + *eptr = line; + return 1; +} + +/* + * Index file is sorted by folded field1. + * Method: find pre, a folded prefix of r.e. pat, + * and then low = offset to beginning of + * line in index file where first match of prefix occurs. + * Then go through index until prefix no longer matches, + * adding each line that matches real pattern to dot. + * Finally, sort dot offsets (uniquing). + * We know pat len < Plen, and that it is surrounded by ^..$ + */ +int +search(char *pat, int dofold) +{ + int needre, prelen, match, n; + Reprog *re; + long ioff, v; + Rune pre[Plen]; + Rune lit[Plen]; + Rune entry[Fieldlen]; + char fpat[Plen]; + + prelen = getpref(pat+1, pre); + if(pat[prelen+1] == 0 || pat[prelen+1] == '$') { + runescpy(lit, pre); + if(dofold) + fold(lit); + needre = 0; + SET(re); + } else { + needre = 1; + if(dofold) { + foldre(fpat, pat); + re = regcomp(fpat); + } else + re = regcomp(pat); + } + fold(pre); + ioff = locate(pre); + if(ioff < 0) + return 0; + dot->n = 0; + Bseek(bindex, ioff, 0); + for(;;) { + if(!getfield(entry)) + break; + if(dofold) + fold(entry); + if(needre) + match = rregexec(re, entry, 0, 0); + else + match = (acomp(lit, entry) == 0); + if(match) { + if(!getfield(entry)) + break; + v = runetol(entry); + if(dot->n >= dot->maxn) { + n = 2*dot->maxn; + dot = realloc(dot, + sizeof(Addr)+(n-1)*sizeof(long)); + if(!dot) { + err("out of memory"); + exits("nomem"); + } + dot->maxn = n; + } + dot->doff[dot->n++] = v; + } else { + if(!dofold) + fold(entry); + if(*pre) { + n = acomp(pre, entry); + if(n < -1 || (!needre && n < 0)) + break; + } + /* get to next index entry */ + if(!getfield(entry)) + break; + } + } + sortaddr(dot); + dot->cur = 0; + return dot->n; +} + +/* + * Return offset in index file of first line whose folded + * first field has pre as a prefix. -1 if none found. + */ +long +locate(Rune *pre) +{ + long top, bot, mid; + Rune entry[Fieldlen]; + + if(*pre == 0) + return 0; + bot = 0; + top = indextop; + if(debug>1) + fprint(2, "locate looking for prefix %S\n", pre); + for(;;) { + /* + * Loop invariant: foldkey(bot) < pre <= foldkey(top) + * and bot < top, and bot,top point at beginning of lines + */ + mid = (top+bot) / 2; + mid = seeknextline(bindex, mid); + if(debug > 1) + fprint(2, "bot=%ld, mid=%ld->%ld, top=%ld\n", + bot, (top+bot) / 2, mid, top); + if(mid == top || !getfield(entry)) + break; + if(debug > 1) + fprint(2, "key=%S\n", entry); + /* + * here mid is strictly between bot and top + */ + fold(entry); + if(acomp(pre, entry) <= 0) + top = mid; + else + bot = mid; + } + /* + * bot < top, but they don't necessarily point at successive lines + * Use linear search from bot to find first line that pre is a + * prefix of + */ + while((bot = seeknextline(bindex, bot)) <= top) { + if(!getfield(entry)) + return -1; + if(debug > 1) + fprint(2, "key=%S\n", entry); + fold(entry); + switch(acomp(pre, entry)) { + case -2: + return -1; + case -1: + case 0: + return bot; + case 1: + case 2: + continue; + } + } + return -1; + +} + +/* + * Get prefix of non re-metacharacters, runified, into pre, + * and return length + */ +int +getpref(char *pat, Rune *pre) +{ + int n, r; + char *p; + + p = pat; + while(*p) { + n = chartorune(pre, p); + r = *pre; + switch(r) { + case 0x2e: case 0x2a: case 0x2b: case 0x3f: + case 0x5b: case 0x5d: case 0x28: case ')': + case 0x7c: case 0x5e: case 0x24: + *pre = 0; + return p-pat; + case L'\\': + p += n; + p += chartorune(++pre, p); + pre++; + break; + default: + p += n; + pre++; + } + } + return p-pat; +} + +long +seeknextline(Biobuf *b, long off) +{ + long c; + + Bseek(b, off, 0); + do { + c = Bgetrune(b); + } while(c>=0 && c!='\n'); + return Boffset(b); +} + +/* + * Get next field out of index file (either tab- or nl- terminated) + * Answer in *rp, assumed to be Fieldlen long. + * Return 0 if read error first. + */ +int +getfield(Rune *rp) +{ + long c; + int n; + + for(n=Fieldlen; n-- > 0; ) { + if ((c = Bgetrune(bindex)) < 0) + return 0; + if(c == '\t' || c == '\n') { + *rp = L'\0'; + return 1; + } + *rp++ = c; + } + err("word too long"); + return 0; +} + +/* + * A compare longs function suitable for qsort + */ +static int +longcmp(const void *av, const void *bv) +{ + long v; + long *a, *b; + + a = (long*)av; + b = (long*)bv; + + v = *a - *b; + if(v < 0) + return -1; + else if(v == 0) + return 0; + else + return 1; +} + +void +sortaddr(Addr *a) +{ + int i, j; + long v; + + if(a->n <= 1) + return; + + qsort(a->doff, a->n, sizeof(long), longcmp); + + /* remove duplicates */ + for(i=0, j=0; j < a->n; j++) { + v = a->doff[j]; + if(i > 0 && v == a->doff[i-1]) + continue; + a->doff[i++] = v; + } + a->n = i; +} + +Entry +getentry(int i) +{ + long b, e, n; + static Entry ans; + static int anslen = 0; + + b = dot->doff[i]; + e = (*dict->nextoff)(b+1); + ans.doff = b; + if(e < 0) { + err("couldn't seek to entry"); + ans.start = 0; + ans.end = 0; + } else { + n = e-b; + if(n+1 > anslen) { + ans.start = realloc(ans.start, n+1); + if(!ans.start) { + err("out of memory"); + exits("nomem"); + } + anslen = n+1; + } + Bseek(bdict, b, 0); + n = Bread(bdict, ans.start, n); + ans.end = ans.start + n; + *ans.end = 0; + } + return ans; +} + +void +setdotnext(void) +{ + long b; + + b = (*dict->nextoff)(dot->doff[dot->cur]+1); + if(b < 0) { + err("couldn't find a next entry"); + return; + } + dot->doff[0] = b; + dot->n = 1; + dot->cur = 0; +} + +void +setdotprev(void) +{ + int tryback; + long here, last, p; + + if(dot->cur < 0 || dot->cur >= dot->n) + return; + tryback = 2000; + here = dot->doff[dot->cur]; + last = 0; + while(last == 0) { + p = here - tryback; + if(p < 0) + p = 0; + for(;;) { + p = (*dict->nextoff)(p+1); + if(p < 0) + return; /* shouldn't happen */ + if(p >= here) + break; + last = p; + } + if(!last) { + if(here - tryback < 0) { + err("can't find a previous entry"); + return; + } + tryback = 2*tryback; + } + } + dot->doff[0] = last; + dot->n = 1; + dot->cur = 0; +} |