aboutsummaryrefslogtreecommitdiff
path: root/src/cmd/upas/bayes/bayes.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/cmd/upas/bayes/bayes.c')
-rw-r--r--src/cmd/upas/bayes/bayes.c232
1 files changed, 232 insertions, 0 deletions
diff --git a/src/cmd/upas/bayes/bayes.c b/src/cmd/upas/bayes/bayes.c
new file mode 100644
index 00000000..a0404290
--- /dev/null
+++ b/src/cmd/upas/bayes/bayes.c
@@ -0,0 +1,232 @@
+#include <u.h>
+#include <libc.h>
+#include <bio.h>
+#include <regexp.h>
+#include "hash.h"
+
+enum /* fixed capacities of the static arrays in this file */
+{
+ MAXTAB = 256, /* size of tab[], Word.count[] and Word.p[]; main rejects more tables */
+ MAXBEST = 32, /* size of best[]; upper bound accepted for the -m option */
+};
+
+typedef struct Table Table;
+struct Table /* one box table named on the command line before "~" */
+{
+ char *file; /* command-line argument the table was read from */
+ Hash *hash; /* contents loaded by hread() */
+ int nmsg; /* count of the "*nmsg*" entry; main stores 1 when it is missing or 0 */
+};
+
+typedef struct Word Word;
+struct Word /* per-table scores for one word of a message, computed in main */
+{
+ Stringtab *s; /* entry from the message hash (hmsg) */
+ int count[MAXTAB]; /* count of the word in each table; 0 when absent */
+ double p[MAXTAB]; /* probability assigned to each table */
+ double mp; /* max probability */
+ int mi; /* index of that maximum: w.p[w.mi] == w.mp */
+};
+
+Table tab[MAXTAB]; /* box tables, filled by main */
+int ntab; /* number of entries of tab[] in use */
+
+Word best[MAXBEST]; /* words kept by noteword(), in non-increasing order of mp */
+int mbest; /* maximum number of words to keep (main defaults it to 15; -m overrides) */
+int nbest; /* number of entries of best[] in use */
+
+int debug; /* set by -D; main then prints each kept word on its own line */
+
+/*
+ * Print the command synopsis on file descriptor 2 and exit
+ * with status "usage".  The synopsis lists every flag that
+ * main's ARGBEGIN block accepts: -D, -k and -m.
+ */
+void
+usage(void)
+{
+	fprint(2, "usage: bayes [-Dk] [-m maxword] boxhash ... ~ msghash ...\n");
+	exits("usage");
+}
+
+/*
+ * Allocate n bytes with mallocz(n, 1).  A nil result is
+ * reported through sysfatal rather than handed back to
+ * the caller.
+ */
+void*
+emalloc(int n)
+{
+	void *mem;
+
+	if((mem = mallocz(n, 1)) != nil)
+		return mem;
+	sysfatal("out of memory");
+	return nil;
+}
+
+/*
+ * Offer *w to the best[] list, which is kept ordered by
+ * non-increasing mp and holds at most mbest entries.
+ * The word is copied in just after the last entry whose mp is
+ * strictly greater than w->mp, so it lands ahead of entries
+ * with an equal mp.  When the list is full the final entry
+ * is discarded to make room; a word that would fall at or
+ * beyond slot mbest is ignored.
+ */
+void
+noteword(Word *w)
+{
+	int pos;
+
+	/* walk back from the tail past everything w does not rank below */
+	pos = nbest;
+	while(pos > 0 && !(w->mp < best[pos-1].mp))
+		pos--;
+
+	if(pos >= mbest)
+		return;
+
+	/* full list: give up the last slot */
+	if(nbest == mbest)
+		nbest--;
+
+	/* open a gap at pos by shifting the tail down one slot */
+	if(pos < nbest)
+		memmove(&best[pos+1], &best[pos], (nbest-pos)*sizeof(best[0]));
+
+	best[pos] = *w;
+	nbest++;
+}
+
+/*
+ * Load the hash file named s.  The file is opened with
+ * Bopenlock for reading, a fresh Hash is allocated with
+ * emalloc and filled by Breadhash, and the Biobuf is closed
+ * again before the Hash is returned.  Failure to open the
+ * file is fatal.
+ */
+Hash*
+hread(char *s)
+{
+	Biobuf *in;
+	Hash *tbl;
+
+	in = Bopenlock(s, OREAD);
+	if(in == nil)
+		sysfatal("open %s: %r", s);
+
+	tbl = emalloc(sizeof *tbl);
+	Breadhash(in, tbl, 1);
+	Bterm(in);
+	return tbl;
+}
+
+/*
+ * bayes [-Dk] [-m maxword] boxhash ... ~ msghash ...
+ *
+ * The arguments before "~" name box tables; those after it
+ * name message hashes.  For every message, each of its words
+ * is scored against every table, the mbest highest-scoring
+ * words are kept via noteword(), and the per-table products
+ * of their probabilities are normalized to pick the winning
+ * table.  One line is written to file descriptor 1 per message:
+ * the table's file name and its normalized score, preceded by
+ * "msghash: " when more than one message was given.
+ *
+ *	-D	after each result, list the kept words one per line
+ *	-k	append the kept words and their scores to the result line
+ *	-m n	keep n words instead of 15 (at most MAXBEST)
+ */
+void
+main(int argc, char **argv)
+{
+	int i, j, a, mi, oi, tot, keywords;
+	double totp, p, xp[MAXTAB];
+	Hash *hmsg;
+	Word w;
+	Stringtab *s, *t;
+	Biobuf bout;
+
+	mbest = 15;
+	keywords = 0;
+	ARGBEGIN{
+	case 'D':
+		debug = 1;
+		break;
+	case 'k':
+		keywords = 1;
+		break;
+	case 'm':
+		mbest = atoi(EARGF(usage()));
+		if(mbest > MAXBEST)
+			sysfatal("cannot keep more than %d words", MAXBEST);
+		break;
+	default:
+		usage();
+	}ARGEND
+
+	/* locate the "~" separating tables from messages */
+	for(i=0; i<argc; i++)
+		if(strcmp(argv[i], "~") == 0)
+			break;
+
+	if(i > MAXTAB)
+		sysfatal("cannot handle more than %d tables", MAXTAB);
+
+	/*
+	 * Require at least one table before "~" and one message
+	 * after it.  With no tables, tab[0] and xp[0] would be
+	 * printed below without ever having been set.
+	 */
+	if(i == 0 || i+1 >= argc)
+		usage();
+
+	/* load the tables; leaves i at the index of "~" */
+	for(i=0; i<argc; i++){
+		if(strcmp(argv[i], "~") == 0)
+			break;
+		tab[ntab].file = argv[i];
+		tab[ntab].hash = hread(argv[i]);
+		s = findstab(tab[ntab].hash, "*nmsg*", 6, 1);
+		/* nmsg is used as a divisor below, so never let it be 0 */
+		if(s == nil || s->count == 0)
+			tab[ntab].nmsg = 1;
+		else
+			tab[ntab].nmsg = s->count;
+		ntab++;
+	}
+
+	Binit(&bout, 1, OWRITE);
+
+	oi = ++i;	/* index of the first message argument */
+	for(a=i; a<argc; a++){
+		hmsg = hread(argv[a]);
+		nbest = 0;
+		for(s=hmsg->all; s; s=s->link){
+			w.s = s;
+			tot = 0;
+			totp = 0.0;
+			/* per-table frequency of the word, capped at 1.0 */
+			for(i=0; i<ntab; i++){
+				t = findstab(tab[i].hash, s->str, s->n, 0);
+				if(t == nil)
+					w.count[i] = 0;
+				else
+					w.count[i] = t->count;
+				tot += w.count[i];
+				p = w.count[i]/(double)tab[i].nmsg;
+				if(p >= 1.0)
+					p = 1.0;
+				w.p[i] = p;
+				totp += p;
+			}
+
+			if(tot < 5){		/* word does not appear enough; give to box 0 */
+				w.p[0] = 0.5;
+				for(i=1; i<ntab; i++)
+					w.p[i] = 0.1;
+				w.mp = 0.5;
+				w.mi = 0;
+				noteword(&w);
+				continue;
+			}
+
+			/*
+			 * Normalize across tables and clamp to [0.01, 0.99].
+			 * tot >= 5 here, so some count is nonzero and totp > 0.
+			 * Every clamped p exceeds the initial mp of 0, so
+			 * w.mi is always assigned.
+			 */
+			w.mp = 0.0;
+			for(i=0; i<ntab; i++){
+				p = w.p[i];
+				p /= totp;
+				if(p < 0.01)
+					p = 0.01;
+				else if(p > 0.99)
+					p = 0.99;
+				if(p > w.mp){
+					w.mp = p;
+					w.mi = i;
+				}
+				w.p[i] = p;
+			}
+			noteword(&w);
+		}
+
+		/* combine the kept words: product per table, then normalize */
+		totp = 0.0;
+		for(i=0; i<ntab; i++){
+			p = 1.0;
+			for(j=0; j<nbest; j++)
+				p *= best[j].p[i];
+			xp[i] = p;
+			totp += p;
+		}
+		for(i=0; i<ntab; i++)
+			xp[i] /= totp;
+		mi = 0;
+		for(i=1; i<ntab; i++)
+			if(xp[i] > xp[mi])
+				mi = i;
+
+		/* name the message only when more than one was given */
+		if(oi != argc-1)
+			Bprint(&bout, "%s: ", argv[a]);
+		Bprint(&bout, "%s %f", tab[mi].file, xp[mi]);
+		if(keywords){
+			for(i=0; i<nbest; i++){
+				Bprint(&bout, " ");
+				Bwrite(&bout, best[i].s->str, best[i].s->n);
+				Bprint(&bout, " %f", best[i].p[mi]);
+			}
+		}
+		Bprint(&bout, "\n");
+		if(debug){
+			for(i=0; i<nbest; i++){
+				Bwrite(&bout, best[i].s->str, best[i].s->n);
+				Bprint(&bout, " %f", best[i].p[mi]);
+				/* also show the table this word favoured, if not the winner */
+				if(best[i].p[mi] < best[i].mp)
+					Bprint(&bout, " (%f %s)", best[i].mp, tab[best[i].mi].file);
+				Bprint(&bout, "\n");
+			}
+		}
+		/*
+		 * best[].s points at entries of hmsg, so release the
+		 * message hash only after the last use of best[] above.
+		 */
+		freehash(hmsg);
+	}
+	Bterm(&bout);
+	/* by Plan 9 convention, returning from main reports status "main" */
+	exits(nil);
+}