diff options
author | rsc <devnull@localhost> | 2005-10-29 16:26:32 +0000 |
---|---|---|
committer | rsc <devnull@localhost> | 2005-10-29 16:26:32 +0000 |
commit | d1f529f46f957c78a3db73b42c2fcd2d3c9f8a34 (patch) | |
tree | a4d6f28106cca984926b9dd5ecddd6053b654617 /src/cmd/upas/bayes/regen.c | |
parent | 9f1fdc128738b2ed76258ac22a8574c681f3df3a (diff) | |
download | plan9port-d1f529f46f957c78a3db73b42c2fcd2d3c9f8a34.tar.gz plan9port-d1f529f46f957c78a3db73b42c2fcd2d3c9f8a34.tar.bz2 plan9port-d1f529f46f957c78a3db73b42c2fcd2d3c9f8a34.zip |
Thanks to John Cummings.
Diffstat (limited to 'src/cmd/upas/bayes/regen.c')
-rw-r--r-- | src/cmd/upas/bayes/regen.c | 176 |
1 files changed, 176 insertions, 0 deletions
diff --git a/src/cmd/upas/bayes/regen.c b/src/cmd/upas/bayes/regen.c new file mode 100644 index 00000000..4f550095 --- /dev/null +++ b/src/cmd/upas/bayes/regen.c @@ -0,0 +1,176 @@ +#include <u.h> +#include <libc.h> +#include <bio.h> +#include <regexp.h> +#include "dfa.h" + +/*** + * Regular expression for matching. + */ + +char *ignore[] = +{ + /* HTML that isn't A, IMG, or FONT */ + /* Must have a space somewhere to avoid catching <email@address> */ + "<[ \n\r]*(" + "[^aif]|" + "a[^> \t\r\n]|" + "i[^mM \t\r\n]|" + "im[^gG \t\r\n]|" + "img[^> \t\r\n]|" + "f[^oO \t\r\n]|" + "fo[^Nn \t\r\n]|" + "fon[^tT \t\r\n]|" + "font[^> \r\t\n]" + ")[^>]*[ \t\n\r][^>]*>", + "<[ \n\r]*(" + "i|im|f|fo|fon" + ")[ \t\r\n][^>]*>", + + /* ignore html comments */ + "<!--([^\\-]|-[^\\-]|--[^>]|\n)*-->", + + /* random mail strings */ + "^message-id:.*\n([ ].*\n)*", + "^in-reply-to:.*\n([ ].*\n)*", + "^references:.*\n([ ].*\n)*", + "^date:.*\n([ ].*\n)*", + "^delivery-date:.*\n([ ].*\n)*", + "e?smtp id .*", + "^ id.*", + "boundary=.*", + "name=\"", + "filename=\"", + "news:<[^>]+>", + "^--[^ ]*$", + + /* base64 encoding */ + "^[0-9a-zA-Z+\\-=/]+$", + + /* uu encoding */ + "^[!-Z]+$", + + /* little things */ + ".", + "\n", +}; + +char *keywords[] = +{ + "([a-zA-Z'`$!¡-]|[0-9]([.,][0-9])*)+", +}; + +int debug; + +Dreprog* +dregcomp(char *buf) +{ + Reprog *r; + Dreprog *d; + + if(debug) + print(">>> '%s'\n", buf); + + r = regcomp(buf); + if(r == nil) + sysfatal("regcomp"); + d = dregcvt(r); + if(d == nil) + sysfatal("dregcomp"); + free(r); + return d; +} + +char* +strcpycase(char *d, char *s) +{ + int cc, esc; + + cc = 0; + esc = 0; + while(*s){ + if(*s == '[') + cc++; + if(*s == ']') + cc--; + if(!cc && 'a' <= *s && *s <= 'z'){ + *d++ = '['; + *d++ = *s; + *d++ = *s+'A'-'a'; + *d++ = ']'; + }else + *d++ = *s; + if(*s == '\\') + esc++; + else if(esc) + esc--; + s++; + } + return d; +} + +void +regerror(char *msg) +{ + sysfatal("regerror: %s", msg); +} + +void +buildre(Dreprog *re[3]) +{ + int i; + static char buf[16384], *s; + + re[0] = dregcomp("^From "); + + s = buf; + for(i=0; i<nelem(keywords); i++){ + if(i != 0) + *s++ = '|'; + s = strcpycase(s, keywords[i]); + } + *s = 0; + re[1] = dregcomp(buf); + + s = buf; + for(i=0; i<nelem(ignore); i++){ + if(i != 0) + *s++ = '|'; + s = strcpycase(s, ignore[i]); + } + *s = 0; + re[2] = dregcomp(buf); +} + +void +usage(void) +{ + fprint(2, "usage: regen [-d]\n"); + exits("usage"); +} + +void +main(int argc, char **argv) +{ + Dreprog *re[3]; + Biobuf b; + + ARGBEGIN{ + default: + usage(); + case 'd': + debug = 1; + }ARGEND + + if(argc != 0) + usage(); + + buildre(re); + Binit(&b, 1, OWRITE); + Bprintdfa(&b, re[0]); + Bprintdfa(&b, re[1]); + Bprintdfa(&b, re[2]); + exits(0); +} + +
\ No newline at end of file |