aboutsummaryrefslogtreecommitdiff
path: root/src/cmd/upas/bayes/msgtok.c
diff options
context:
space:
mode:
author rsc <devnull@localhost> 2005-10-29 16:26:32 +0000
committer rsc <devnull@localhost> 2005-10-29 16:26:32 +0000
commit d1f529f46f957c78a3db73b42c2fcd2d3c9f8a34 (patch)
tree a4d6f28106cca984926b9dd5ecddd6053b654617 /src/cmd/upas/bayes/msgtok.c
parent 9f1fdc128738b2ed76258ac22a8574c681f3df3a (diff)
download plan9port-d1f529f46f957c78a3db73b42c2fcd2d3c9f8a34.tar.gz
plan9port-d1f529f46f957c78a3db73b42c2fcd2d3c9f8a34.tar.bz2
plan9port-d1f529f46f957c78a3db73b42c2fcd2d3c9f8a34.zip
Thanks to John Cummings.
Diffstat (limited to 'src/cmd/upas/bayes/msgtok.c')
-rw-r--r-- src/cmd/upas/bayes/msgtok.c 245
1 files changed, 245 insertions, 0 deletions
diff --git a/src/cmd/upas/bayes/msgtok.c b/src/cmd/upas/bayes/msgtok.c
new file mode 100644
index 00000000..7c450546
--- /dev/null
+++ b/src/cmd/upas/bayes/msgtok.c
@@ -0,0 +1,245 @@
+/*
+ * RFC822 message tokenizer (really feature generator) for spam filter.
+ *
+ * See Paul Graham's musings on spam filtering for theory.
+ */
+
+#include <u.h>
+#include <libc.h>
+#include <bio.h>
+#include <regexp.h>
+#include <ctype.h>
+#include "dfa.h"
+
+void buildre(Dreprog*[3]);
+int debug;
+char *refile = "/mail/lib/classify.re";
+int maxtoklen = 20;
+int trim(char*);
+
/*
 * Print the command synopsis on standard error and exit with
 * status "usage".  Lists every option main accepts, including -n.
 */
void
usage(void)
{
	fprint(2, "usage: msgtok [-D] [-n maxtoklen] [-r /mail/lib/classify.re] [file]\n");
	exits("usage");
}
+
+void
+main(int argc, char **argv)
+{
+ int i, hdr, n, eof, off;
+ Dreprog *re[3];
+ int m[3];
+ char *p, *ep, *tag;
+ Biobuf bout, bin;
+ char msg[1024+1];
+ char buf[1024];
+
+ buildre(re);
+ ARGBEGIN{
+ case 'D':
+ debug = 1;
+ break;
+ case 'n':
+ maxtoklen = atoi(EARGF(usage()));
+ break;
+ case 'r':
+ refile = EARGF(usage());
+ break;
+ default:
+ usage();
+ }ARGEND;
+
+ if(argc > 1)
+ usage();
+ if(argc == 1){
+ close(0);
+ if(open(argv[0], OREAD) < 0)
+ sysfatal("open %s: %r", argv[0]);
+ }
+
+ tag = nil;
+ Binit(&bin, 0, OREAD);
+ Binit(&bout, 1, OWRITE);
+ ep = msg;
+ p = msg;
+ eof = 0;
+ off = 0;
+ hdr = 1;
+ for(;;){
+ /* replenish buffer */
+ if(ep - p < 512 && !eof){
+ if(p > msg + 1){
+ n = ep - p;
+ memmove(msg, p-1, ep-(p-1));
+ off += (p-1) - msg;
+ p = msg+1;
+ ep = p + n;
+ }
+ n = Bread(&bin, ep, msg+(sizeof msg - 1)- ep);
+ if(n < 0)
+ sysfatal("read error: %r");
+ if(n == 0)
+ eof = 1;
+ ep += n;
+ *ep = 0;
+ }
+ if(p >= ep)
+ break;
+
+ if(*p == 0){
+ p++;
+ continue;
+ }
+
+ if(hdr && p[-1]=='\n'){
+ if(p[0]=='\n')
+ hdr = 0;
+ else if(cistrncmp(p-1, "\nfrom:", 6) == 0)
+ tag = "From*";
+ else if(cistrncmp(p-1, "\nto:", 4) == 0)
+ tag = "To*";
+ else if(cistrncmp(p-1, "\nsubject:", 9) == 0)
+ tag = "Subject*";
+ else if(cistrncmp(p-1, "\nreturn-path:", 13) == 0)
+ tag = "Return-Path*";
+ else
+ tag = nil;
+ }
+ m[0] = dregexec(re[0], p, p==msg || p[-1]=='\n');
+ m[1] = dregexec(re[1], p, p==msg || p[-1]=='\n');
+ m[2] = dregexec(re[2], p, p==msg || p[-1]=='\n');
+
+ n = m[0];
+ if(n < m[1])
+ n = m[1];
+ if(n < m[2])
+ n = m[2];
+ if(n <= 0){
+fprint(2, "«%s» %.2ux", p, p[0]);
+ sysfatal("no regexps matched at %ld", off + (p-msg));
+ }
+
+ if(m[0] >= m[1] && m[0] >= m[2]){
+ /* "From " marks start of new message */
+ Bprint(&bout, "*From*\n");
+ n = m[0];
+ hdr = 1;
+ }else if(m[2] > 1){
+ /* ignore */
+ n = m[2];
+ }else if(m[1] >= m[0] && m[1] >= m[2] && m[1] > 2 && m[1] <= maxtoklen){
+ /* keyword */
+ /* should do UTF-aware lowercasing, too much bother */
+/*
+ for(i=0; i<n; i++)
+ if('A' <= p[i] && p[i] <= 'Z')
+ p[i] += 'a' - 'A';
+*/
+ if(tag){
+ i = strlen(tag);
+ memmove(buf, tag, i);
+ memmove(buf+i, p, m[1]);
+ buf[i+m[1]] = 0;
+ }else{
+ memmove(buf, p, m[1]);
+ buf[m[1]] = 0;
+ }
+ Bprint(&bout, "%s\n", buf);
+ while(trim(buf) >= 0)
+ Bprint(&bout, "stem*%s\n", buf);
+ n = m[1];
+ }else
+ n = m[2];
+ if(debug)
+ fprint(2, "%.*s¦", utfnlen(p, n), p);
+ p += n;
+ }
+ Bterm(&bout);
+ exits(0);
+}
+
+void
+buildre(Dreprog *re[3])
+{
+ Biobuf *b;
+
+ if((b = Bopen(refile, OREAD)) == nil)
+ sysfatal("open %s: %r", refile);
+
+ re[0] = Breaddfa(b);
+ re[1] = Breaddfa(b);
+ re[2] = Breaddfa(b);
+
+ if(re[0]==nil || re[1]==nil || re[2]==nil)
+ sysfatal("Breaddfa: %r");
+ Bterm(b);
+}
+
/*
 * Apply one simplification step to the token in s, in place.
 * Returns 0 if a step was applied and -1 if there was none to apply;
 * main prints s after every 0 and stops at the first -1.
 *
 * A string starting with '*' is rejected at once.  Otherwise
 * everything up to the first '*' (a header tag, if present) and any
 * non-letters that follow are dropped; if fewer than two bytes would
 * remain, s is left untouched and -1 is returned.  Then the first of
 * these that applies is done:
 *	trailing run of 2+ ASCII non-letters cut to one	(free!!! -> free!)
 *	single trailing ASCII non-letter removed	(free! -> free)
 *	upper case after the first byte lowered		(FREE -> Free)
 *	upper case first byte lowered			(Free -> free)
 * Note that s may already have lost its prefix when -1 is returned.
 *
 * Bytes are cast to unsigned char before being handed to isalpha and
 * isupper, since passing a negative char (any byte >= 0x80 where char
 * is signed) to them is undefined.
 *
 * perhaps this belongs in the tokenizer
 */
int
trim(char *s)
{
	char *p, *end;
	int firstup, restup;

	if(*s == '*')
		return -1;

	/* strip tag and leading non-letters */
	p = strchr(s, '*');
	if(!p)
		p = s;
	while(*p && !isalpha((unsigned char)*p))
		p++;
	if(strlen(p) < 2)
		return -1;
	memmove(s, p, strlen(p)+1);

	/* find the trailing run of ASCII non-letters; bytes >= 0x80 stop the scan */
	end = s+strlen(s);
	p = end;
	while(p > s && (unsigned char)p[-1] < 0x80 && !isalpha((unsigned char)p[-1]))
		p--;

	/* chop punctuation */
	if(p > s){
		/* free!!! -> free! */
		if(p+1 < end){
			p[1] = 0;
			return 0;
		}
		/* free! -> free */
		if(p < end){
			p[0] = 0;
			return 0;
		}
	}

	firstup = isupper((unsigned char)s[0]) != 0;
	restup = 0;
	for(p = s+1; *p; p++)
		if(isupper((unsigned char)*p)){
			restup = 1;
			break;
		}

	/* turn FREE into Free */
	if(restup){
		for(p = s+1; *p; p++)
			if(isupper((unsigned char)*p))
				*p += 'a'-'A';
		return 0;
	}

	/* turn Free into free */
	if(firstup){
		*s += 'a'-'A';
		return 0;
	}
	return -1;
}
+