aboutsummaryrefslogtreecommitdiff
path: root/src/cmd/dict/dict.c
diff options
context:
space:
mode:
authorrsc <devnull@localhost>2003-11-25 03:37:45 +0000
committerrsc <devnull@localhost>2003-11-25 03:37:45 +0000
commit08708877939323c1e1cb87210193ec25fc472ff7 (patch)
treebd34e2144a3e9532ab228619d7ae8d4a0078aeeb /src/cmd/dict/dict.c
parent091f74d0a0db5ba1e098a518922525cb032a97b4 (diff)
downloadplan9port-08708877939323c1e1cb87210193ec25fc472ff7.tar.gz
plan9port-08708877939323c1e1cb87210193ec25fc472ff7.tar.bz2
plan9port-08708877939323c1e1cb87210193ec25fc472ff7.zip
add dict
Diffstat (limited to 'src/cmd/dict/dict.c')
-rw-r--r--src/cmd/dict/dict.c681
1 files changed, 681 insertions, 0 deletions
diff --git a/src/cmd/dict/dict.c b/src/cmd/dict/dict.c
new file mode 100644
index 00000000..bafe90d5
--- /dev/null
+++ b/src/cmd/dict/dict.c
@@ -0,0 +1,681 @@
+#include <u.h>
+#include <libc.h>
+#include <bio.h>
+#include <regexp.h>
+#include <ctype.h>
+#include "dict.h"
+
+/*
+ * Assumed index file structure: lines of form
+ * [^\t]+\t[0-9]+
+ * First field is key, second is byte offset into dictionary.
+ * Should be sorted with args -u -t' ' +0f -1 +0 -1 +1n -2
+ */
+typedef struct Addr Addr;
+
+struct Addr {
+ int n; /* number of offsets */
+ int cur; /* current position within doff array */
+ int maxn; /* actual current size of doff array */
+ ulong doff[1]; /* doff[maxn], with 0..n-1 significant */
+};
+
+Biobuf binbuf;
+Biobuf boutbuf;
+Biobuf *bin = &binbuf; /* user cmd input */
+Biobuf *bout = &boutbuf; /* output */
+Biobuf *bdict; /* dictionary */
+Biobuf *bindex; /* index file */
+long indextop; /* index offset at end of file */
+int lastcmd; /* last executed command */
+Addr *dot; /* "current" address */
+Dict *dict; /* current dictionary */
+int linelen;
+int breaklen = 60;
+int outinhibit;
+int debug;
+
+void execcmd(int);
+int getpref(char*, Rune*);
+Entry getentry(int);
+int getfield(Rune*);
+long locate(Rune*);
+int parseaddr(char*, char**);
+int parsecmd(char*);
+int search(char*, int);
+long seeknextline(Biobuf*, long);
+void setdotnext(void);
+void setdotprev(void);
+void sortaddr(Addr*);
+void usage(void);
+
+enum {
+ Plen=300, /* max length of a search pattern */
+ Fieldlen=200, /* max length of an index field */
+ Aslots=10, /* initial number of slots in an address */
+};
+
+void
+main(int argc, char **argv)
+{
+ int i, cmd, kflag;
+ char *line, *p;
+
+ Binit(&binbuf, 0, OREAD);
+ Binit(&boutbuf, 1, OWRITE);
+ kflag = 0;
+ line = 0;
+ dict = 0;
+ p = getenv("PLAN9");
+ if(p == nil)
+ p = "/usr/local/plan9";
+ if(chdir(p) < 0)
+ sysfatal("chdir %s: %r", p);
+
+ for(i=0; dicts[i].name; i++){
+ if(access(dicts[i].path, 0)>=0 && access(dicts[i].indexpath, 0)>=0){
+ dict = &dicts[i];
+ break;
+ }
+ }
+ ARGBEGIN {
+ case 'd':
+ p = ARGF();
+ dict = 0;
+ if(p) {
+ for(i=0; dicts[i].name; i++)
+ if(strcmp(p, dicts[i].name)==0) {
+ dict = &dicts[i];
+ break;
+ }
+ }
+ if(!dict)
+ usage();
+ break;
+ case 'c':
+ line = ARGF();
+ if(!line)
+ usage();
+ break;
+ case 'k':
+ kflag++;
+ break;
+ case 'D':
+ debug++;
+ break;
+ default:
+ usage();
+ ARGEND }
+ if(dict == 0){
+ err("no dictionaries present on this system");
+ exits("nodict");
+ }
+
+ if(kflag) {
+ (*dict->printkey)();
+ exits(0);
+ }
+ if(argc > 1)
+ usage();
+ else if(argc == 1) {
+ if(line)
+ usage();
+ p = argv[0];
+ line = malloc(strlen(p)+5);
+ sprint(line, "/%s/P\n", p);
+ }
+ bdict = Bopen(dict->path, OREAD);
+ if(!bdict) {
+ err("can't open dictionary %s/%s", p, dict->path);
+ exits("nodict");
+ }
+ bindex = Bopen(dict->indexpath, OREAD);
+ if(!bindex) {
+ err("can't open index %s/%s", p, dict->indexpath);
+ exits("noindex");
+ }
+ indextop = Bseek(bindex, 0L, 2);
+
+ dot = malloc(sizeof(Addr)+(Aslots-1)*sizeof(ulong));
+ dot->n = 0;
+ dot->cur = 0;
+ dot->maxn = Aslots;
+ lastcmd = 0;
+
+ if(line) {
+ cmd = parsecmd(line);
+ if(cmd)
+ execcmd(cmd);
+ } else {
+ for(;;) {
+ Bprint(bout, "*");
+ Bflush(bout);
+ line = Brdline(bin, '\n');
+ linelen = 0;
+ if(!line)
+ break;
+ cmd = parsecmd(line);
+ if(cmd) {
+ execcmd(cmd);
+ lastcmd = cmd;
+ }
+ }
+ }
+ exits(0);
+}
+
+void
+usage(void)
+{
+ int i;
+ char *a, *b;
+
+ Bprint(bout, "Usage: %s [-d dict] [-k] [-c cmd] [word]\n", argv0);
+ Bprint(bout, "dictionaries (brackets mark dictionaries not present on this system):\n");
+ for(i = 0; dicts[i].name; i++){
+ a = b = "";
+ if(access(dicts[i].path, 0)<0 || access(dicts[i].indexpath, 0)<0){
+ a = "[";
+ b = "]";
+ }
+ Bprint(bout, " %s%s\t%s%s\n", a, dicts[i].name, dicts[i].desc, b);
+ }
+ exits("usage");
+}
+
+int
+parsecmd(char *line)
+{
+ char *e;
+ int cmd, ans;
+
+ if(parseaddr(line, &e) >= 0)
+ line = e;
+ else
+ return 0;
+ cmd = *line;
+ ans = cmd;
+ if(isupper(cmd))
+ cmd = tolower(cmd);
+ if(!(cmd == 'a' || cmd == 'h' || cmd == 'p' || cmd == 'r' ||
+ cmd == '\n')) {
+ err("unknown command %c", cmd);
+ return 0;
+ }
+ if(cmd == '\n')
+ switch(lastcmd) {
+ case 0: ans = 'H'; break;
+ case 'H': ans = 'p'; break;
+ default : ans = lastcmd; break;
+ }
+ else if(line[1] != '\n' && line[1] != 0)
+ err("extra stuff after command %c ignored", cmd);
+ return ans;
+}
+
+void
+execcmd(int cmd)
+{
+ Entry e;
+ int cur, doall;
+
+ if(isupper(cmd)) {
+ doall = 1;
+ cmd = tolower(cmd);
+ cur = 0;
+ } else {
+ doall = 0;
+ cur = dot->cur;
+ }
+
+ if(debug && doall && cmd == 'a')
+ Bprint(bout, "%d entries, cur=%d\n", dot->n, cur+1);
+ for(;;){
+ if(cur >= dot->n)
+ break;
+ if(doall) {
+ Bprint(bout, "%d\t", cur+1);
+ linelen += 4 + (cur >= 10);
+ }
+ switch(cmd) {
+ case 'a':
+ Bprint(bout, "#%lud\n", dot->doff[cur]);
+ break;
+ case 'h':
+ case 'p':
+ case 'r':
+ e = getentry(cur);
+ (*dict->printentry)(e, cmd);
+ break;
+ }
+ cur++;
+ if(doall) {
+ if(cmd == 'p' || cmd == 'r') {
+ Bputc(bout, '\n');
+ linelen = 0;
+ }
+ } else
+ break;
+ }
+ if(cur >= dot->n)
+ cur = 0;
+ dot->cur = cur;
+}
+
+/*
+ * Address syntax: ('.' | '/' re '/' | '!' re '!' | number | '#' number) ('+' | '-')*
+ * Answer goes in dot.
+ * Return -1 if address starts, but get error.
+ * Return 0 if no address.
+ */
+int
+parseaddr(char *line, char **eptr)
+{
+ int delim, plen;
+ ulong v;
+ char *e;
+ char pat[Plen];
+
+ if(*line == '/' || *line == '!') {
+ /* anchored regular expression match; '!' means no folding */
+ if(*line == '/') {
+ delim = '/';
+ e = strpbrk(line+1, "/\n");
+ } else {
+ delim = '!';
+ e = strpbrk(line+1, "!\n");
+ }
+ plen = e-line-1;
+ if(plen >= Plen-3) {
+ err("pattern too big");
+ return -1;
+ }
+ pat[0] = '^';
+ memcpy(pat+1, line+1, plen);
+ pat[plen+1] = '$';
+ pat[plen+2] = 0;
+ if(*e == '\n')
+ line = e;
+ else
+ line = e+1;
+ if(!search(pat, delim == '/')) {
+ err("pattern not found");
+ return -1;
+ }
+ } else if(*line == '#') {
+ /* absolute byte offset into dictionary */
+ line++;
+ if(!isdigit(*line))
+ return -1;
+ v = strtoul(line, &e, 10);
+ line = e;
+ dot->doff[0] = v;
+ dot->n = 1;
+ dot->cur = 0;
+ } else if(isdigit(*line)) {
+ v = strtoul(line, &e, 10);
+ line = e;
+ if(v < 1 || v > dot->n)
+ err(".%d not in range [1,%d], ignored",
+ v, dot->n);
+ else
+ dot->cur = v-1;
+ } else if(*line == '.') {
+ line++;
+ } else {
+ *eptr = line;
+ return 0;
+ }
+ while(*line == '+' || *line == '-') {
+ if(*line == '+')
+ setdotnext();
+ else
+ setdotprev();
+ line++;
+ }
+ *eptr = line;
+ return 1;
+}
+
+/*
+ * Index file is sorted by folded field1.
+ * Method: find pre, a folded prefix of r.e. pat,
+ * and then low = offset to beginning of
+ * line in index file where first match of prefix occurs.
+ * Then go through index until prefix no longer matches,
+ * adding each line that matches real pattern to dot.
+ * Finally, sort dot offsets (uniquing).
+ * We know pat len < Plen, and that it is surrounded by ^..$
+ */
+int
+search(char *pat, int dofold)
+{
+ int needre, prelen, match, n;
+ Reprog *re;
+ long ioff, v;
+ Rune pre[Plen];
+ Rune lit[Plen];
+ Rune entry[Fieldlen];
+ char fpat[Plen];
+
+ prelen = getpref(pat+1, pre);
+ if(pat[prelen+1] == 0 || pat[prelen+1] == '$') {
+ runescpy(lit, pre);
+ if(dofold)
+ fold(lit);
+ needre = 0;
+ SET(re);
+ } else {
+ needre = 1;
+ if(dofold) {
+ foldre(fpat, pat);
+ re = regcomp(fpat);
+ } else
+ re = regcomp(pat);
+ }
+ fold(pre);
+ ioff = locate(pre);
+ if(ioff < 0)
+ return 0;
+ dot->n = 0;
+ Bseek(bindex, ioff, 0);
+ for(;;) {
+ if(!getfield(entry))
+ break;
+ if(dofold)
+ fold(entry);
+ if(needre)
+ match = rregexec(re, entry, 0, 0);
+ else
+ match = (acomp(lit, entry) == 0);
+ if(match) {
+ if(!getfield(entry))
+ break;
+ v = runetol(entry);
+ if(dot->n >= dot->maxn) {
+ n = 2*dot->maxn;
+ dot = realloc(dot,
+ sizeof(Addr)+(n-1)*sizeof(long));
+ if(!dot) {
+ err("out of memory");
+ exits("nomem");
+ }
+ dot->maxn = n;
+ }
+ dot->doff[dot->n++] = v;
+ } else {
+ if(!dofold)
+ fold(entry);
+ if(*pre) {
+ n = acomp(pre, entry);
+ if(n < -1 || (!needre && n < 0))
+ break;
+ }
+ /* get to next index entry */
+ if(!getfield(entry))
+ break;
+ }
+ }
+ sortaddr(dot);
+ dot->cur = 0;
+ return dot->n;
+}
+
+/*
+ * Return offset in index file of first line whose folded
+ * first field has pre as a prefix. -1 if none found.
+ */
+long
+locate(Rune *pre)
+{
+ long top, bot, mid;
+ Rune entry[Fieldlen];
+
+ if(*pre == 0)
+ return 0;
+ bot = 0;
+ top = indextop;
+ if(debug>1)
+ fprint(2, "locate looking for prefix %S\n", pre);
+ for(;;) {
+ /*
+ * Loop invariant: foldkey(bot) < pre <= foldkey(top)
+ * and bot < top, and bot,top point at beginning of lines
+ */
+ mid = (top+bot) / 2;
+ mid = seeknextline(bindex, mid);
+ if(debug > 1)
+ fprint(2, "bot=%ld, mid=%ld->%ld, top=%ld\n",
+ bot, (top+bot) / 2, mid, top);
+ if(mid == top || !getfield(entry))
+ break;
+ if(debug > 1)
+ fprint(2, "key=%S\n", entry);
+ /*
+ * here mid is strictly between bot and top
+ */
+ fold(entry);
+ if(acomp(pre, entry) <= 0)
+ top = mid;
+ else
+ bot = mid;
+ }
+ /*
+ * bot < top, but they don't necessarily point at successive lines
+ * Use linear search from bot to find first line that pre is a
+ * prefix of
+ */
+ while((bot = seeknextline(bindex, bot)) <= top) {
+ if(!getfield(entry))
+ return -1;
+ if(debug > 1)
+ fprint(2, "key=%S\n", entry);
+ fold(entry);
+ switch(acomp(pre, entry)) {
+ case -2:
+ return -1;
+ case -1:
+ case 0:
+ return bot;
+ case 1:
+ case 2:
+ continue;
+ }
+ }
+ return -1;
+
+}
+
+/*
+ * Get prefix of non re-metacharacters, runified, into pre,
+ * and return length
+ */
+int
+getpref(char *pat, Rune *pre)
+{
+ int n, r;
+ char *p;
+
+ p = pat;
+ while(*p) {
+ n = chartorune(pre, p);
+ r = *pre;
+ switch(r) {
+ case 0x2e: case 0x2a: case 0x2b: case 0x3f:
+ case 0x5b: case 0x5d: case 0x28: case ')':
+ case 0x7c: case 0x5e: case 0x24:
+ *pre = 0;
+ return p-pat;
+ case L'\\':
+ p += n;
+ p += chartorune(++pre, p);
+ pre++;
+ break;
+ default:
+ p += n;
+ pre++;
+ }
+ }
+ return p-pat;
+}
+
+long
+seeknextline(Biobuf *b, long off)
+{
+ long c;
+
+ Bseek(b, off, 0);
+ do {
+ c = Bgetrune(b);
+ } while(c>=0 && c!='\n');
+ return Boffset(b);
+}
+
+/*
+ * Get next field out of index file (either tab- or nl- terminated)
+ * Answer in *rp, assumed to be Fieldlen long.
+ * Return 0 if read error first.
+ */
+int
+getfield(Rune *rp)
+{
+ long c;
+ int n;
+
+ for(n=Fieldlen; n-- > 0; ) {
+ if ((c = Bgetrune(bindex)) < 0)
+ return 0;
+ if(c == '\t' || c == '\n') {
+ *rp = L'\0';
+ return 1;
+ }
+ *rp++ = c;
+ }
+ err("word too long");
+ return 0;
+}
+
+/*
+ * A compare longs function suitable for qsort
+ */
+static int
+longcmp(const void *av, const void *bv)
+{
+ long v;
+ long *a, *b;
+
+ a = (long*)av;
+ b = (long*)bv;
+
+ v = *a - *b;
+ if(v < 0)
+ return -1;
+ else if(v == 0)
+ return 0;
+ else
+ return 1;
+}
+
+void
+sortaddr(Addr *a)
+{
+ int i, j;
+ long v;
+
+ if(a->n <= 1)
+ return;
+
+ qsort(a->doff, a->n, sizeof(long), longcmp);
+
+ /* remove duplicates */
+ for(i=0, j=0; j < a->n; j++) {
+ v = a->doff[j];
+ if(i > 0 && v == a->doff[i-1])
+ continue;
+ a->doff[i++] = v;
+ }
+ a->n = i;
+}
+
+Entry
+getentry(int i)
+{
+ long b, e, n;
+ static Entry ans;
+ static int anslen = 0;
+
+ b = dot->doff[i];
+ e = (*dict->nextoff)(b+1);
+ ans.doff = b;
+ if(e < 0) {
+ err("couldn't seek to entry");
+ ans.start = 0;
+ ans.end = 0;
+ } else {
+ n = e-b;
+ if(n+1 > anslen) {
+ ans.start = realloc(ans.start, n+1);
+ if(!ans.start) {
+ err("out of memory");
+ exits("nomem");
+ }
+ anslen = n+1;
+ }
+ Bseek(bdict, b, 0);
+ n = Bread(bdict, ans.start, n);
+ ans.end = ans.start + n;
+ *ans.end = 0;
+ }
+ return ans;
+}
+
+void
+setdotnext(void)
+{
+ long b;
+
+ b = (*dict->nextoff)(dot->doff[dot->cur]+1);
+ if(b < 0) {
+ err("couldn't find a next entry");
+ return;
+ }
+ dot->doff[0] = b;
+ dot->n = 1;
+ dot->cur = 0;
+}
+
+void
+setdotprev(void)
+{
+ int tryback;
+ long here, last, p;
+
+ if(dot->cur < 0 || dot->cur >= dot->n)
+ return;
+ tryback = 2000;
+ here = dot->doff[dot->cur];
+ last = 0;
+ while(last == 0) {
+ p = here - tryback;
+ if(p < 0)
+ p = 0;
+ for(;;) {
+ p = (*dict->nextoff)(p+1);
+ if(p < 0)
+ return; /* shouldn't happen */
+ if(p >= here)
+ break;
+ last = p;
+ }
+ if(!last) {
+ if(here - tryback < 0) {
+ err("can't find a previous entry");
+ return;
+ }
+ tryback = 2*tryback;
+ }
+ }
+ dot->doff[0] = last;
+ dot->n = 1;
+ dot->cur = 0;
+}