From d1f529f46f957c78a3db73b42c2fcd2d3c9f8a34 Mon Sep 17 00:00:00 2001 From: rsc Date: Sat, 29 Oct 2005 16:26:32 +0000 Subject: Thanks to John Cummings. --- src/cmd/upas/bayes/msgtok.c | 245 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 245 insertions(+) create mode 100644 src/cmd/upas/bayes/msgtok.c (limited to 'src/cmd/upas/bayes/msgtok.c') diff --git a/src/cmd/upas/bayes/msgtok.c b/src/cmd/upas/bayes/msgtok.c new file mode 100644 index 00000000..7c450546 --- /dev/null +++ b/src/cmd/upas/bayes/msgtok.c @@ -0,0 +1,245 @@ +/* + * RFC822 message tokenizer (really feature generator) for spam filter. + * + * See Paul Graham's musings on spam filtering for theory. + */ + +#include +#include +#include +#include +#include +#include "dfa.h" + +void buildre(Dreprog*[3]); +int debug; +char *refile = "/mail/lib/classify.re"; +int maxtoklen = 20; +int trim(char*); + +void +usage(void) +{ + fprint(2, "usage: msgtok [-D] [-r /mail/lib/classify.re] [file]\n"); + exits("usage"); +} + +void +main(int argc, char **argv) +{ + int i, hdr, n, eof, off; + Dreprog *re[3]; + int m[3]; + char *p, *ep, *tag; + Biobuf bout, bin; + char msg[1024+1]; + char buf[1024]; + + buildre(re); + ARGBEGIN{ + case 'D': + debug = 1; + break; + case 'n': + maxtoklen = atoi(EARGF(usage())); + break; + case 'r': + refile = EARGF(usage()); + break; + default: + usage(); + }ARGEND; + + if(argc > 1) + usage(); + if(argc == 1){ + close(0); + if(open(argv[0], OREAD) < 0) + sysfatal("open %s: %r", argv[0]); + } + + tag = nil; + Binit(&bin, 0, OREAD); + Binit(&bout, 1, OWRITE); + ep = msg; + p = msg; + eof = 0; + off = 0; + hdr = 1; + for(;;){ + /* replenish buffer */ + if(ep - p < 512 && !eof){ + if(p > msg + 1){ + n = ep - p; + memmove(msg, p-1, ep-(p-1)); + off += (p-1) - msg; + p = msg+1; + ep = p + n; + } + n = Bread(&bin, ep, msg+(sizeof msg - 1)- ep); + if(n < 0) + sysfatal("read error: %r"); + if(n == 0) + eof = 1; + ep += n; + *ep = 0; + } + if(p >= ep) + break; + + if(*p == 0){ + p++; + continue; + } + + if(hdr && p[-1]=='\n'){ + if(p[0]=='\n') + hdr = 0; + else if(cistrncmp(p-1, "\nfrom:", 6) == 0) + tag = "From*"; + else if(cistrncmp(p-1, "\nto:", 4) == 0) + tag = "To*"; + else if(cistrncmp(p-1, "\nsubject:", 9) == 0) + tag = "Subject*"; + else if(cistrncmp(p-1, "\nreturn-path:", 13) == 0) + tag = "Return-Path*"; + else + tag = nil; + } + m[0] = dregexec(re[0], p, p==msg || p[-1]=='\n'); + m[1] = dregexec(re[1], p, p==msg || p[-1]=='\n'); + m[2] = dregexec(re[2], p, p==msg || p[-1]=='\n'); + + n = m[0]; + if(n < m[1]) + n = m[1]; + if(n < m[2]) + n = m[2]; + if(n <= 0){ +fprint(2, "«%s» %.2ux", p, p[0]); + sysfatal("no regexps matched at %ld", off + (p-msg)); + } + + if(m[0] >= m[1] && m[0] >= m[2]){ + /* "From " marks start of new message */ + Bprint(&bout, "*From*\n"); + n = m[0]; + hdr = 1; + }else if(m[2] > 1){ + /* ignore */ + n = m[2]; + }else if(m[1] >= m[0] && m[1] >= m[2] && m[1] > 2 && m[1] <= maxtoklen){ + /* keyword */ + /* should do UTF-aware lowercasing, too much bother */ +/* + for(i=0; i= 0) + Bprint(&bout, "stem*%s\n", buf); + n = m[1]; + }else + n = m[2]; + if(debug) + fprint(2, "%.*s¦", utfnlen(p, n), p); + p += n; + } + Bterm(&bout); + exits(0); +} + +void +buildre(Dreprog *re[3]) +{ + Biobuf *b; + + if((b = Bopen(refile, OREAD)) == nil) + sysfatal("open %s: %r", refile); + + re[0] = Breaddfa(b); + re[1] = Breaddfa(b); + re[2] = Breaddfa(b); + + if(re[0]==nil || re[1]==nil || re[2]==nil) + sysfatal("Breaddfa: %r"); + Bterm(b); +} + +/* perhaps this belongs in the tokenizer */ +int +trim(char *s) +{ + char *p, *op; + int mix, mix1; + + if(*s == '*') + return -1; + + /* strip leading punctuation */ + p = strchr(s, '*'); + if(p == nil) + p = s; + while(*p && !isalpha(*p)) + p++; + if(strlen(p) < 2) +{ + return -1; +} + memmove(s, p, strlen(p)+1); + + /* strip suffix of punctuation */ + p = s+strlen(s); + op = p; + while(p > s && (uchar)p[-1]<0x80 && !isalpha(p[-1])) + p--; + + /* chop punctuation */ + if(p > s){ + /* free!!! -> free! */ + if(p+1 < op){ + p[1] = 0; + return 0; + } + /* free! -> free */ + if(p < op){ + p[0] = 0; + return 0; + } + } + + mix = mix1 = 0; + if(isupper(s[0])) + mix = 1; + for(p=s+1; *p; p++) + if(isupper(*p)){ + mix1 = 1; + break; + } + + /* turn FREE into Free */ + if(mix1){ + for(p=s+1; *p; p++) + if(isupper(*p)) + *p += 'a'-'A'; + return 0; + } + + /* turn Free into free */ + if(mix){ + *s += 'a'-'A'; + return 0; + } + return -1; +} + -- cgit v1.2.3