aboutsummaryrefslogtreecommitdiff
path: root/src/cmd/upas/bayes/msgclass.c
diff options
context:
space:
mode:
authorrsc <devnull@localhost>2005-10-29 16:26:32 +0000
committerrsc <devnull@localhost>2005-10-29 16:26:32 +0000
commitd1f529f46f957c78a3db73b42c2fcd2d3c9f8a34 (patch)
treea4d6f28106cca984926b9dd5ecddd6053b654617 /src/cmd/upas/bayes/msgclass.c
parent9f1fdc128738b2ed76258ac22a8574c681f3df3a (diff)
downloadplan9port-d1f529f46f957c78a3db73b42c2fcd2d3c9f8a34.tar.gz
plan9port-d1f529f46f957c78a3db73b42c2fcd2d3c9f8a34.tar.bz2
plan9port-d1f529f46f957c78a3db73b42c2fcd2d3c9f8a34.zip
Thanks to John Cummings.
Diffstat (limited to 'src/cmd/upas/bayes/msgclass.c')
-rw-r--r--src/cmd/upas/bayes/msgclass.c296
1 files changed, 296 insertions, 0 deletions
diff --git a/src/cmd/upas/bayes/msgclass.c b/src/cmd/upas/bayes/msgclass.c
new file mode 100644
index 00000000..bd3c571c
--- /dev/null
+++ b/src/cmd/upas/bayes/msgclass.c
@@ -0,0 +1,296 @@
+#include <u.h>
+#include <libc.h>
+#include <bio.h>
+#include <ctype.h>
+#include "msgdb.h"
+
+/*
+ * Print the command-line synopsis on file descriptor 2 and
+ * terminate the process with exit status "usage".
+ */
+void
+usage(void)
+{
+ fprint(2, "usage: upas/msgclass [-a] [-d name dbfile]... [-l lockfile] [-m mul] [-t thresh] [tokenfile ...]\n");
+ exits("usage");
+}
+
+/* Fixed table sizes used by the declarations below. */
+enum
+{
+ MAXBEST = 32, /* capacity of the best[] table */
+ MAXLEN = 64, /* size of the Word.s buffer that holds a token */
+ MAXTAB = 256, /* capacity of db[] and of the per-class arrays in Word */
+};
+
+/* One message class given by a -d option, together with its token database. */
+typedef struct Ndb Ndb;
+struct Ndb
+{
+ char *name; /* class name: first argument of -d */
+ char *file; /* database file name: second argument of -d */
+ Msgdb *db; /* handle returned by mdopen(file, 0) in main */
+ double p; /* class score computed and normalized in main */
+ long nmsg; /* value stored under the key "*From*"; divisor for word frequencies (presumably the message count) */
+};
+
+/* Per-class statistics for a single input token. */
+typedef struct Word Word;
+struct Word
+{
+ char s[MAXLEN]; /* token text; filled in by noteword */
+ int count[MAXTAB]; /* per-class count from mdget (class 0 is multiplied by 3 in main) */
+ double p[MAXTAB]; /* per-class probability after normalizing and clamping in main */
+ double mp; /* largest value in p[] */
+ int mi; /* w.p[w.mi] = w.mp */
+ int nmsg; /* not referenced by the code visible in this file */
+};
+
+Ndb db[MAXTAB]; /* classes, in the order their -d options appeared */
+int ndb; /* number of entries in use in db[] */
+
+int add; /* set by -a: add the input tokens to the winning class's database */
+int mul; /* set by -m: multiplier applied to the input counts when adding */
+Msgdb *indb; /* token counts accumulated from the input by process */
+
+Word best[MAXBEST]; /* words retained by noteword, ordered by decreasing mp */
+int mbest = 15; /* maximum number of entries noteword keeps in best[] */
+int nbest; /* number of entries currently in best[] */
+
+/* Defined later in this file. */
+void process(Biobuf*, char*);
+void lockfile(char*);
+
+/*
+ * Offer word w (whose text is s) to the best[] table.
+ *
+ * best[] is kept ordered by decreasing mp and holds at most mbest
+ * entries.  The new word goes in front of any entries whose mp is
+ * not greater than its own; if that position falls outside the
+ * table the word is dropped, and if the table is full the last
+ * entry is discarded to make room.
+ */
+void
+noteword(Word *w, char *s)
+{
+	int pos;
+	Word *slot;
+
+	/* walk back from the tail past every entry that w does not lose to */
+	pos = nbest;
+	while(pos > 0 && !(w->mp < best[pos-1].mp))
+		pos--;
+
+	if(pos >= mbest)
+		return;
+
+	/* a full table gives up its last entry */
+	if(nbest == mbest)
+		nbest--;
+
+	slot = &best[pos];
+	if(pos < nbest)
+		memmove(slot+1, slot, (nbest-pos)*sizeof(best[0]));
+
+	/* copy the statistics, then store the token text (truncated to MAXLEN) */
+	*slot = *w;
+	strecpy(slot->s, slot->s+MAXLEN, s);
+	nbest++;
+}
+
+/*
+ * Classify a tokenized message against the databases named by -d.
+ *
+ * Steps:
+ *  1. parse options and read token counts from the named token
+ *     files (or standard input) into indb;
+ *  2. take the lock named by -l, if any, and open every class database;
+ *  3. for every input token compute a per-class probability and keep
+ *     the mbest most decisive tokens in best[];
+ *  4. multiply the retained probabilities per class, normalize, and
+ *     pick the class with the largest score (or "inconclusive" when
+ *     that score is below the -t threshold);
+ *  5. print the verdict, the class scores and the retained tokens;
+ *  6. with -a, add the input counts (times mul) to the winning
+ *     class's database.
+ */
+void
+main(int argc, char **argv)
+{
+	int i, bad, m, tot, nn, j;
+	Biobuf bin, *b, bout;
+	char *s, *lf;
+	double totp, p, thresh;
+	long n;
+	Word w;
+
+	lf = nil;
+	thresh = 0;
+	ARGBEGIN{
+	case 'a':
+		add = 1;
+		break;
+	case 'd':
+		if(ndb >= MAXTAB)
+			sysfatal("too many db classes");
+		db[ndb].name = EARGF(usage());
+		db[ndb].file = EARGF(usage());
+		ndb++;
+		break;
+	case 'l':
+		lf = EARGF(usage());
+		break;
+	case 'm':
+		mul = atoi(EARGF(usage()));
+		break;
+	case 't':
+		thresh = atof(EARGF(usage()));
+		break;
+	default:
+		usage();
+	}ARGEND
+
+	if(ndb == 0){
+		fprint(2, "must have at least one -d option\n");
+		usage();
+	}
+
+	/* collect the input token counts before taking the lock */
+	indb = mdopen(nil, 1);
+	if(argc == 0){
+		Binit(&bin, 0, OREAD);
+		process(&bin, "<stdin>");
+		Bterm(&bin);
+	}else{
+		bad = 0;
+		for(i=0; i<argc; i++){
+			if((b = Bopen(argv[i], OREAD)) == nil){
+				fprint(2, "opening %s: %r\n", argv[i]);
+				bad = 1;
+				continue;
+			}
+			process(b, argv[i]);
+			Bterm(b);
+		}
+		if(bad)
+			exits("open inputs");
+	}
+
+	lockfile(lf);
+	bad = 0;
+	for(i=0; i<ndb; i++){
+		if((db[i].db = mdopen(db[i].file, 0)) == nil){
+			fprint(2, "opendb %s: %r\n", db[i].file);
+			bad = 1;
+			/* there is no handle to query; report the remaining failures too */
+			continue;
+		}
+		db[i].nmsg = mdget(db[i].db, "*From*");
+	}
+	if(bad)
+		exits("open databases");
+
+	/* run conditional probabilities of input words, keeping the mbest most specific */
+	mdenum(indb);
+	nbest = 0;
+	while(mdnext(indb, &s, &n) >= 0){
+		tot = 0;
+		totp = 0.0;
+		for(i=0; i<ndb; i++){
+			/* counts from the first class are weighted three times */
+			nn = mdget(db[i].db, s)*(i==0 ? 3 : 1);
+			tot += nn;
+			w.count[i] = nn;
+			/*
+			 * A class with no recorded messages would give 0/0 (NaN)
+			 * and poison every later sum; treat it as 0, or as 1 if
+			 * the word was nevertheless seen there.
+			 */
+			if(db[i].nmsg <= 0)
+				p = nn > 0 ? 1.0 : 0.0;
+			else
+				p = nn/(double)db[i].nmsg;
+			if(p >= 1.0)
+				p = 1.0;
+			w.p[i] = p;
+			totp += p;
+		}
+		/* skip words seen fewer than twice, or with nothing to normalize by */
+		if(tot < 2 || totp <= 0.0)
+			continue;
+		w.mp = 0.0;
+		for(i=0; i<ndb; i++){
+			/* normalize across classes and clamp away from 0 and 1 */
+			p = w.p[i];
+			p /= totp;
+			if(p < 0.001)
+				p = 0.001;
+			else if(p > 0.999)
+				p = 0.999;
+			if(p > w.mp){
+				w.mp = p;
+				w.mi = i;
+			}
+			w.p[i] = p;
+		}
+		noteword(&w, s);
+	}
+
+	/* compute conditional probabilities of message classes using the retained words */
+	totp = 0.0;
+	for(i=0; i<ndb; i++){
+		p = 1.0;
+		for(j=0; j<nbest; j++)
+			p *= best[j].p[i];
+		db[i].p = p;
+		totp += p;
+	}
+	for(i=0; i<ndb; i++)
+		db[i].p /= totp;
+	m = 0;
+	for(i=1; i<ndb; i++)
+		if(db[i].p > db[m].p)
+			m = i;
+
+	Binit(&bout, 1, OWRITE);
+	if(db[m].p < thresh)
+		m = -1;	/* best score is below the threshold: no verdict */
+	if(m >= 0)
+		Bprint(&bout, "%s", db[m].name);
+	else
+		Bprint(&bout, "inconclusive");
+	for(j=0; j<ndb; j++)
+		Bprint(&bout, " %s=%g", db[j].name, db[j].p);
+	Bprint(&bout, "\n");
+	for(i=0; i<nbest; i++){
+		Bprint(&bout, "%s", best[i].s);
+		for(j=0; j<ndb; j++)
+			Bprint(&bout, " %s=%g", db[j].name, best[i].p[j]);
+		Bprint(&bout, "\n");
+	}
+	/*
+	 * No further line is printed here: after the loop i == nbest, so
+	 * best[i] is not a retained word, and m may be -1.
+	 */
+	Bterm(&bout);
+
+	if(m >= 0 && add){
+		/*
+		 * NOTE(review): mul is 0 unless -m is given, so -a without -m
+		 * stores the existing counts back unchanged -- confirm intended.
+		 */
+		mdenum(indb);
+		while(mdnext(indb, &s, &n) >= 0)
+			mdput(db[m].db, s, mdget(db[m].db, s)+n*mul);
+		mdclose(db[m].db);
+	}
+	exits(nil);
+}
+
+/*
+ * Read token lines from b and accumulate their counts in indb.
+ *
+ * Each line is a token optionally followed by a space and a count;
+ * the text after the last space is parsed with atoi.  A line with
+ * no space counts as 1.  The second argument is the name of the
+ * input and is currently unused.
+ */
+void
+process(Biobuf *b, char *name)
+{
+	char *s;
+	char *p;
+	long n;
+
+	/* the parameter is named so that standard C compilers accept the definition */
+	USED(name);
+	while((s = Brdline(b, '\n')) != nil){
+		/* replace the newline delimiter with a terminator */
+		s[Blinelen(b)-1] = 0;
+		if((p = strrchr(s, ' ')) != nil){
+			*p++ = 0;
+			n = atoi(p);
+		}else
+			n = 1;
+		mdput(indb, s, mdget(indb, s)+n);
+	}
+}
+
+/* Process id of the child forked by lockfile. */
+int tpid;
+/*
+ * Exit handler registered by lockfile: post the note "die" to the
+ * child process recorded in tpid.
+ */
+void
+killtickle(void)
+{
+ postnote(PNPROC, tpid, "die");
+}
+
+/*
+ * Take the lock represented by file s; a nil s means no locking.
+ *
+ * The file is opened repeatedly, sleeping between attempts, for as
+ * long as the failure reads "file is locked" or "exclusive lock"
+ * (presumably s is an exclusive-use file -- confirm).  The delay
+ * starts at 50ms and grows by half each round until it reaches a
+ * second; the wait is abandoned after about two minutes.  Any other
+ * open error, or the timeout, is fatal.
+ *
+ * Once the file is open a child process is forked that keeps its
+ * copy of the descriptor and calls dirfstat on it every 30 seconds;
+ * the parent closes its own copy and registers killtickle to stop
+ * the child at exit.
+ */
+void
+lockfile(char *s)
+{
+	int fd, t, w;
+	char err[ERRMAX];
+
+	if(s == nil)
+		return;
+	w = 50;
+	t = 0;
+	for(;;){
+		fd = open(s, OREAD);
+		if(fd >= 0)
+			break;
+		rerrstr(err, sizeof err);
+		/* only a lock conflict is worth retrying */
+		if(strstr(err, "file is locked")==nil && strstr(err, "exclusive lock")==nil)
+			break;
+		sleep(w);
+		t += w;
+		if(w < 1000)
+			w = (w*3)/2;
+		if(t > 120*1000)
+			break;
+	}
+	if(fd < 0)
+		sysfatal("could not lock %s", s);
+	switch(tpid = fork()){
+	case -1:
+		sysfatal("fork: %r");
+	case 0:
+		/* child: holds fd open and touches it periodically */
+		for(;;){
+			sleep(30*1000);
+			free(dirfstat(fd));
+		}
+		_exits(nil);
+	default:
+		break;
+	}
+	/* the child's copy of fd stays open */
+	close(fd);
+	atexit(killtickle);
+}
+