diff options
author | rsc <devnull@localhost> | 2005-10-29 16:26:32 +0000 |
---|---|---|
committer | rsc <devnull@localhost> | 2005-10-29 16:26:32 +0000 |
commit | d1f529f46f957c78a3db73b42c2fcd2d3c9f8a34 (patch) | |
tree | a4d6f28106cca984926b9dd5ecddd6053b654617 /src/cmd/upas/bayes/bayes.c | |
parent | 9f1fdc128738b2ed76258ac22a8574c681f3df3a (diff) | |
download | plan9port-d1f529f46f957c78a3db73b42c2fcd2d3c9f8a34.tar.gz plan9port-d1f529f46f957c78a3db73b42c2fcd2d3c9f8a34.tar.bz2 plan9port-d1f529f46f957c78a3db73b42c2fcd2d3c9f8a34.zip |
Thanks to John Cummings.
Diffstat (limited to 'src/cmd/upas/bayes/bayes.c')
-rw-r--r-- | src/cmd/upas/bayes/bayes.c | 232 |
1 files changed, 232 insertions, 0 deletions
diff --git a/src/cmd/upas/bayes/bayes.c b/src/cmd/upas/bayes/bayes.c new file mode 100644 index 00000000..a0404290 --- /dev/null +++ b/src/cmd/upas/bayes/bayes.c @@ -0,0 +1,232 @@ +#include <u.h> +#include <libc.h> +#include <bio.h> +#include <regexp.h> +#include "hash.h" + +enum +{ + MAXTAB = 256, + MAXBEST = 32, +}; + +typedef struct Table Table; +struct Table +{ + char *file; + Hash *hash; + int nmsg; +}; + +typedef struct Word Word; +struct Word +{ + Stringtab *s; /* from hmsg */ + int count[MAXTAB]; /* counts from each table */ + double p[MAXTAB]; /* probabilities from each table */ + double mp; /* max probability */ + int mi; /* w.p[w.mi] = w.mp */ +}; + +Table tab[MAXTAB]; +int ntab; + +Word best[MAXBEST]; +int mbest; +int nbest; + +int debug; + +void +usage(void) +{ + fprint(2, "usage: bayes [-D] [-m maxword] boxhash ... ~ msghash ...\n"); + exits("usage"); +} + +void* +emalloc(int n) +{ + void *v; + + v = mallocz(n, 1); + if(v == nil) + sysfatal("out of memory"); + return v; +} + +void +noteword(Word *w) +{ + int i; + + for(i=nbest-1; i>=0; i--) + if(w->mp < best[i].mp) + break; + i++; + + if(i >= mbest) + return; + if(nbest == mbest) + nbest--; + if(i < nbest) + memmove(&best[i+1], &best[i], (nbest-i)*sizeof(best[0])); + best[i] = *w; + nbest++; +} + +Hash* +hread(char *s) +{ + Hash *h; + Biobuf *b; + + if((b = Bopenlock(s, OREAD)) == nil) + sysfatal("open %s: %r", s); + + h = emalloc(sizeof(Hash)); + Breadhash(b, h, 1); + Bterm(b); + return h; +} + +void +main(int argc, char **argv) +{ + int i, j, a, mi, oi, tot, keywords; + double totp, p, xp[MAXTAB]; + Hash *hmsg; + Word w; + Stringtab *s, *t; + Biobuf bout; + + mbest = 15; + keywords = 0; + ARGBEGIN{ + case 'D': + debug = 1; + break; + case 'k': + keywords = 1; + break; + case 'm': + mbest = atoi(EARGF(usage())); + if(mbest > MAXBEST) + sysfatal("cannot keep more than %d words", MAXBEST); + break; + default: + usage(); + }ARGEND + + for(i=0; i<argc; i++) + if(strcmp(argv[i], "~") == 0) + break; + + if(i > MAXTAB) + sysfatal("cannot handle more than %d tables", MAXTAB); + + if(i+1 >= argc) + usage(); + + for(i=0; i<argc; i++){ + if(strcmp(argv[i], "~") == 0) + break; + tab[ntab].file = argv[i]; + tab[ntab].hash = hread(argv[i]); + s = findstab(tab[ntab].hash, "*nmsg*", 6, 1); + if(s == nil || s->count == 0) + tab[ntab].nmsg = 1; + else + tab[ntab].nmsg = s->count; + ntab++; + } + + Binit(&bout, 1, OWRITE); + + oi = ++i; + for(a=i; a<argc; a++){ + hmsg = hread(argv[a]); + nbest = 0; + for(s=hmsg->all; s; s=s->link){ + w.s = s; + tot = 0; + totp = 0.0; + for(i=0; i<ntab; i++){ + t = findstab(tab[i].hash, s->str, s->n, 0); + if(t == nil) + w.count[i] = 0; + else + w.count[i] = t->count; + tot += w.count[i]; + p = w.count[i]/(double)tab[i].nmsg; + if(p >= 1.0) + p = 1.0; + w.p[i] = p; + totp += p; + } + + if(tot < 5){ /* word does not appear enough; give to box 0 */ + w.p[0] = 0.5; + for(i=1; i<ntab; i++) + w.p[i] = 0.1; + w.mp = 0.5; + w.mi = 0; + noteword(&w); + continue; + } + + w.mp = 0.0; + for(i=0; i<ntab; i++){ + p = w.p[i]; + p /= totp; + if(p < 0.01) + p = 0.01; + else if(p > 0.99) + p = 0.99; + if(p > w.mp){ + w.mp = p; + w.mi = i; + } + w.p[i] = p; + } + noteword(&w); + } + + totp = 0.0; + for(i=0; i<ntab; i++){ + p = 1.0; + for(j=0; j<nbest; j++) + p *= best[j].p[i]; + xp[i] = p; + totp += p; + } + for(i=0; i<ntab; i++) + xp[i] /= totp; + mi = 0; + for(i=1; i<ntab; i++) + if(xp[i] > xp[mi]) + mi = i; + if(oi != argc-1) + Bprint(&bout, "%s: ", argv[a]); + Bprint(&bout, "%s %f", tab[mi].file, xp[mi]); + if(keywords){ + for(i=0; i<nbest; i++){ + Bprint(&bout, " "); + Bwrite(&bout, best[i].s->str, best[i].s->n); + Bprint(&bout, " %f", best[i].p[mi]); + } + } + freehash(hmsg); + Bprint(&bout, "\n"); + if(debug){ + for(i=0; i<nbest; i++){ + Bwrite(&bout, best[i].s->str, best[i].s->n); + Bprint(&bout, " %f", best[i].p[mi]); + if(best[i].p[mi] < best[i].mp) + Bprint(&bout, " (%f %s)", best[i].mp, tab[best[i].mi].file); + Bprint(&bout, "\n"); + } + } + } + Bterm(&bout); +} |