From d1f529f46f957c78a3db73b42c2fcd2d3c9f8a34 Mon Sep 17 00:00:00 2001 From: rsc Date: Sat, 29 Oct 2005 16:26:32 +0000 Subject: Thanks to John Cummings. --- src/cmd/upas/bayes/msgclass.c | 296 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 296 insertions(+) create mode 100644 src/cmd/upas/bayes/msgclass.c (limited to 'src/cmd/upas/bayes/msgclass.c') diff --git a/src/cmd/upas/bayes/msgclass.c b/src/cmd/upas/bayes/msgclass.c new file mode 100644 index 00000000..bd3c571c --- /dev/null +++ b/src/cmd/upas/bayes/msgclass.c @@ -0,0 +1,296 @@ +#include +#include +#include +#include +#include "msgdb.h" + +void +usage(void) +{ + fprint(2, "usage: upas/msgclass [-a] [-d name dbfile]... [-l lockfile] [-m mul] [-t thresh] [tokenfile ...]\n"); + exits("usage"); +} + +enum +{ + MAXBEST = 32, + MAXLEN = 64, + MAXTAB = 256, +}; + +typedef struct Ndb Ndb; +struct Ndb +{ + char *name; + char *file; + Msgdb *db; + double p; + long nmsg; +}; + +typedef struct Word Word; +struct Word +{ + char s[MAXLEN]; + int count[MAXTAB]; + double p[MAXTAB]; + double mp; + int mi; /* w.p[w.mi] = w.mp */ + int nmsg; +}; + +Ndb db[MAXTAB]; +int ndb; + +int add; +int mul; +Msgdb *indb; + +Word best[MAXBEST]; +int mbest = 15; +int nbest; + +void process(Biobuf*, char*); +void lockfile(char*); + +void +noteword(Word *w, char *s) +{ + int i; + + for(i=nbest-1; i>=0; i--) + if(w->mp < best[i].mp) + break; + i++; + + if(i >= mbest) + return; + if(nbest == mbest) + nbest--; + if(i < nbest) + memmove(&best[i+1], &best[i], (nbest-i)*sizeof(best[0])); + best[i] = *w; + strecpy(best[i].s, best[i].s+MAXLEN, s); + nbest++; +} + +void +main(int argc, char **argv) +{ + int i, bad, m, tot, nn, j; + Biobuf bin, *b, bout; + char *s, *lf; + double totp, p, thresh; + long n; + Word w; + + lf = nil; + thresh = 0; + ARGBEGIN{ + case 'a': + add = 1; + break; + case 'd': + if(ndb >= MAXTAB) + sysfatal("too many db classes"); + db[ndb].name = EARGF(usage()); + db[ndb].file = EARGF(usage()); + ndb++; + break; + case 'l': + lf = EARGF(usage()); + break; + case 'm': + mul = atoi(EARGF(usage())); + break; + case 't': + thresh = atof(EARGF(usage())); + break; + default: + usage(); + }ARGEND + + if(ndb == 0){ + fprint(2, "must have at least one -d option\n"); + usage(); + } + + indb = mdopen(nil, 1); + if(argc == 0){ + Binit(&bin, 0, OREAD); + process(&bin, ""); + Bterm(&bin); + }else{ + bad = 0; + for(i=0; i= 0){ + tot = 0; + totp = 0.0; + for(i=0; i= 1.0) + p = 1.0; + w.p[i] = p; + totp += p; + } +//fprint(2, "%s tot %d totp %g\n", s, tot, totp); + if(tot < 2) + continue; + w.mp = 0.0; + for(i=0; i 0.999) + p = 0.999; + if(p > w.mp){ + w.mp = p; + w.mi = i; + } + w.p[i] = p; + } + noteword(&w, s); + } + + /* compute conditional probabilities of message classes using 15 most specific */ + totp = 0.0; + for(i=0; i db[m].p) + m = i; + + Binit(&bout, 1, OWRITE); + if(db[m].p < thresh) + m = -1; + if(m >= 0) + Bprint(&bout, "%s", db[m].name); + else + Bprint(&bout, "inconclusive"); + for(j=0; j= 0 && add){ + mdenum(indb); + while(mdnext(indb, &s, &n) >= 0) + mdput(db[m].db, s, mdget(db[m].db, s)+n*mul); + mdclose(db[m].db); + } + exits(nil); +} + +void +process(Biobuf *b, char*) +{ + char *s; + char *p; + long n; + + while((s = Brdline(b, '\n')) != nil){ + s[Blinelen(b)-1] = 0; + if((p = strrchr(s, ' ')) != nil){ + *p++ = 0; + n = atoi(p); + }else + n = 1; + mdput(indb, s, mdget(indb, s)+n); + } +} + +int tpid; +void +killtickle(void) +{ + postnote(PNPROC, tpid, "die"); +} + +void +lockfile(char *s) +{ + int fd, t, w; + char err[ERRMAX]; + + if(s == nil) + return; + w = 50; + t = 0; + for(;;){ + fd = open(s, OREAD); + if(fd >= 0) + break; + rerrstr(err, sizeof err); + if(strstr(err, "file is locked")==nil && strstr(err, "exclusive lock")==nil)) + break; + sleep(w); + t += w; + if(w < 1000) + w = (w*3)/2; + if(t > 120*1000) + break; + } + if(fd < 0) + sysfatal("could not lock %s", s); + switch(tpid = fork()){ + case -1: + sysfatal("fork: %r"); + case 0: + for(;;){ + sleep(30*1000); + free(dirfstat(fd)); + } + _exits(nil); + default: + break; + } + close(fd); + atexit(killtickle); +} + -- cgit v1.2.3