diff options
author | rsc <devnull@localhost> | 2005-10-29 16:26:32 +0000 |
---|---|---|
committer | rsc <devnull@localhost> | 2005-10-29 16:26:32 +0000 |
commit | d1f529f46f957c78a3db73b42c2fcd2d3c9f8a34 (patch) | |
tree | a4d6f28106cca984926b9dd5ecddd6053b654617 /src/cmd/upas/bayes/msgclass.c | |
parent | 9f1fdc128738b2ed76258ac22a8574c681f3df3a (diff) | |
download | plan9port-d1f529f46f957c78a3db73b42c2fcd2d3c9f8a34.tar.gz plan9port-d1f529f46f957c78a3db73b42c2fcd2d3c9f8a34.tar.bz2 plan9port-d1f529f46f957c78a3db73b42c2fcd2d3c9f8a34.zip |
Thanks to John Cummings.
Diffstat (limited to 'src/cmd/upas/bayes/msgclass.c')
-rw-r--r-- | src/cmd/upas/bayes/msgclass.c | 296 |
1 files changed, 296 insertions, 0 deletions
diff --git a/src/cmd/upas/bayes/msgclass.c b/src/cmd/upas/bayes/msgclass.c new file mode 100644 index 00000000..bd3c571c --- /dev/null +++ b/src/cmd/upas/bayes/msgclass.c @@ -0,0 +1,296 @@ +#include <u.h> +#include <libc.h> +#include <bio.h> +#include <ctype.h> +#include "msgdb.h" + +void +usage(void) +{ + fprint(2, "usage: upas/msgclass [-a] [-d name dbfile]... [-l lockfile] [-m mul] [-t thresh] [tokenfile ...]\n"); + exits("usage"); +} + +enum +{ + MAXBEST = 32, + MAXLEN = 64, + MAXTAB = 256, +}; + +typedef struct Ndb Ndb; +struct Ndb +{ + char *name; + char *file; + Msgdb *db; + double p; + long nmsg; +}; + +typedef struct Word Word; +struct Word +{ + char s[MAXLEN]; + int count[MAXTAB]; + double p[MAXTAB]; + double mp; + int mi; /* w.p[w.mi] = w.mp */ + int nmsg; +}; + +Ndb db[MAXTAB]; +int ndb; + +int add; +int mul; +Msgdb *indb; + +Word best[MAXBEST]; +int mbest = 15; +int nbest; + +void process(Biobuf*, char*); +void lockfile(char*); + +void +noteword(Word *w, char *s) +{ + int i; + + for(i=nbest-1; i>=0; i--) + if(w->mp < best[i].mp) + break; + i++; + + if(i >= mbest) + return; + if(nbest == mbest) + nbest--; + if(i < nbest) + memmove(&best[i+1], &best[i], (nbest-i)*sizeof(best[0])); + best[i] = *w; + strecpy(best[i].s, best[i].s+MAXLEN, s); + nbest++; +} + +void +main(int argc, char **argv) +{ + int i, bad, m, tot, nn, j; + Biobuf bin, *b, bout; + char *s, *lf; + double totp, p, thresh; + long n; + Word w; + + lf = nil; + thresh = 0; + ARGBEGIN{ + case 'a': + add = 1; + break; + case 'd': + if(ndb >= MAXTAB) + sysfatal("too many db classes"); + db[ndb].name = EARGF(usage()); + db[ndb].file = EARGF(usage()); + ndb++; + break; + case 'l': + lf = EARGF(usage()); + break; + case 'm': + mul = atoi(EARGF(usage())); + break; + case 't': + thresh = atof(EARGF(usage())); + break; + default: + usage(); + }ARGEND + + if(ndb == 0){ + fprint(2, "must have at least one -d option\n"); + usage(); + } + + indb = mdopen(nil, 1); + if(argc == 0){ + Binit(&bin, 0, OREAD); + process(&bin, "<stdin>"); + Bterm(&bin); + }else{ + bad = 0; + for(i=0; i<argc; i++){ + if((b = Bopen(argv[i], OREAD)) == nil){ + fprint(2, "opening %s: %r\n", argv[i]); + bad = 1; + continue; + } + process(b, argv[i]); + Bterm(b); + } + if(bad) + exits("open inputs"); + } + + lockfile(lf); + bad = 0; + for(i=0; i<ndb; i++){ + if((db[i].db = mdopen(db[i].file, 0)) == nil){ + fprint(2, "opendb %s: %r\n", db[i].file); + bad = 1; + } + db[i].nmsg = mdget(db[i].db, "*From*"); + } + if(bad) + exits("open databases"); + + /* run conditional probabilities of input words, getting 15 most specific */ + mdenum(indb); + nbest = 0; + while(mdnext(indb, &s, &n) >= 0){ + tot = 0; + totp = 0.0; + for(i=0; i<ndb; i++){ + nn = mdget(db[i].db, s)*(i==0 ? 3 : 1); + tot += nn; + w.count[i] = nn; + p = w.count[i]/(double)db[i].nmsg; + if(p >= 1.0) + p = 1.0; + w.p[i] = p; + totp += p; + } +//fprint(2, "%s tot %d totp %g\n", s, tot, totp); + if(tot < 2) + continue; + w.mp = 0.0; + for(i=0; i<ndb; i++){ + p = w.p[i]; + p /= totp; + if(p < 0.001) + p = 0.001; + else if(p > 0.999) + p = 0.999; + if(p > w.mp){ + w.mp = p; + w.mi = i; + } + w.p[i] = p; + } + noteword(&w, s); + } + + /* compute conditional probabilities of message classes using 15 most specific */ + totp = 0.0; + for(i=0; i<ndb; i++){ + p = 1.0; + for(j=0; j<nbest; j++) + p *= best[j].p[i]; + db[i].p = p; + totp += p; + } + for(i=0; i<ndb; i++) + db[i].p /= totp; + m = 0; + for(i=1; i<ndb; i++) + if(db[i].p > db[m].p) + m = i; + + Binit(&bout, 1, OWRITE); + if(db[m].p < thresh) + m = -1; + if(m >= 0) + Bprint(&bout, "%s", db[m].name); + else + Bprint(&bout, "inconclusive"); + for(j=0; j<ndb; j++) + Bprint(&bout, " %s=%g", db[j].name, db[j].p); + Bprint(&bout, "\n"); + for(i=0; i<nbest; i++){ + Bprint(&bout, "%s", best[i].s); + for(j=0; j<ndb; j++) + Bprint(&bout, " %s=%g", db[j].name, best[i].p[j]); + Bprint(&bout, "\n"); + } + Bprint(&bout, "%s %g\n", best[i].s, best[i].p[m]); + Bterm(&bout); + + if(m >= 0 && add){ + mdenum(indb); + while(mdnext(indb, &s, &n) >= 0) + mdput(db[m].db, s, mdget(db[m].db, s)+n*mul); + mdclose(db[m].db); + } + exits(nil); +} + +void +process(Biobuf *b, char*) +{ + char *s; + char *p; + long n; + + while((s = Brdline(b, '\n')) != nil){ + s[Blinelen(b)-1] = 0; + if((p = strrchr(s, ' ')) != nil){ + *p++ = 0; + n = atoi(p); + }else + n = 1; + mdput(indb, s, mdget(indb, s)+n); + } +} + +int tpid; +void +killtickle(void) +{ + postnote(PNPROC, tpid, "die"); +} + +void +lockfile(char *s) +{ + int fd, t, w; + char err[ERRMAX]; + + if(s == nil) + return; + w = 50; + t = 0; + for(;;){ + fd = open(s, OREAD); + if(fd >= 0) + break; + rerrstr(err, sizeof err); + if(strstr(err, "file is locked")==nil && strstr(err, "exclusive lock")==nil)) + break; + sleep(w); + t += w; + if(w < 1000) + w = (w*3)/2; + if(t > 120*1000) + break; + } + if(fd < 0) + sysfatal("could not lock %s", s); + switch(tpid = fork()){ + case -1: + sysfatal("fork: %r"); + case 0: + for(;;){ + sleep(30*1000); + free(dirfstat(fd)); + } + _exits(nil); + default: + break; + } + close(fd); + atexit(killtickle); +} + |