aboutsummaryrefslogtreecommitdiff
path: root/src/cmd/upas/scanmail
diff options
context:
space:
mode:
author rsc <devnull@localhost> 2005-10-29 16:26:44 +0000
committer rsc <devnull@localhost> 2005-10-29 16:26:44 +0000
commit 5cdb17983ae6e6367ad7a940cb219eab247a9304 (patch)
tree 8ca1ef49af2a96e7daebe624d91fdf679814a057 /src/cmd/upas/scanmail
parent cd3745196389579fb78b9b01ef1daefb5a57aa71 (diff)
downloadplan9port-5cdb17983ae6e6367ad7a940cb219eab247a9304.tar.gz
plan9port-5cdb17983ae6e6367ad7a940cb219eab247a9304.tar.bz2
plan9port-5cdb17983ae6e6367ad7a940cb219eab247a9304.zip
Thanks to John Cummings.
Diffstat (limited to 'src/cmd/upas/scanmail')
-rw-r--r-- src/cmd/upas/scanmail/common.c 667
-rw-r--r-- src/cmd/upas/scanmail/mkfile 24
-rw-r--r-- src/cmd/upas/scanmail/scanmail.c 476
-rw-r--r-- src/cmd/upas/scanmail/spam.h 62
-rw-r--r-- src/cmd/upas/scanmail/testscan.c 212
5 files changed, 1441 insertions, 0 deletions
diff --git a/src/cmd/upas/scanmail/common.c b/src/cmd/upas/scanmail/common.c
new file mode 100644
index 00000000..b6ea720d
--- /dev/null
+++ b/src/cmd/upas/scanmail/common.c
@@ -0,0 +1,667 @@
+#include <u.h>
+#include <libc.h>
+#include <bio.h>
+#include <regexp.h>
+#include "spam.h"
+
+enum {
+ Quanta = 8192, /* bytes requested per Bread while readmsg looks for the end of the header */
+ Minbody = 6000, /* readmsg tries to have at least this many body bytes in the buffer */
+ HdrMax = 15, /* size of the keyword scratch buffer in endofhdr */
+};
+
+typedef struct keyword Keyword;
+typedef struct word Word;
+
+/* a literal prefix and its length; isword walks a table of these until string is nil */
+struct word{
+ char *string;
+ int n; /* number of bytes of string that isword compares */
+};
+
+/* maps an action name from the pattern file to the action code findkey returns */
+struct keyword{
+ char *string;
+ int value;
+};
+
+/* tag prefixes that make htmlchk start treating the text as HTML; nil-terminated */
+Word htmlcmds[] =
+{
+ "html", 4,
+ "!doctype html", 13,
+ 0,
+
+};
+
+/*
+ * tag prefixes that htmlchk copies through (returns '<') once the text
+ * is known to be HTML; every other tag is skipped.  nil-terminated.
+ */
+Word hrefs[] =
+{
+ "a href=", 7,
+ "a title=", 8,
+ "a target=", 9,
+ "base href=", 10,
+ "img src=", 8,
+ "img border=", 11,
+ "form action=", 12,
+ "!--", 3,
+ 0,
+
+};
+
+/*
+ * RFC822 header keywords to look for when detecting a fractured header:
+ * endofhdr does not treat a blank line as the end of the header when
+ * the text after it starts with one of these.
+ * all lengths must be less than HdrMax defined above.
+ */
+Word hdrwords[] =
+{
+ "cc:", 3,
+ "bcc:", 4,
+ "to:", 3,
+ 0, 0,
+
+};
+
+/*
+ * action names accepted before the ':' on a pattern-file line.
+ * the nil entry's value, Nactions, is what findkey returns for an
+ * unknown name; parsepats skips such lines.
+ */
+Keyword keywords[] =
+{
+ "header", HoldHeader,
+ "line", SaveLine,
+ "hold", Hold,
+ "dump", Dump,
+ "loff", Lineoff,
+ 0, Nactions,
+};
+
+/*
+ * one slot per action code: a display label plus the string and regexp
+ * pattern lists that parsepats fills in.  the nil label in the Nactions
+ * slot ends loops of the form patterns[i].action.
+ */
+Patterns patterns[] = {
+[Dump] { "DUMP:", 0, 0 },
+[HoldHeader] { "HEADER:", 0, 0 },
+[Hold] { "HOLD:", 0, 0 },
+[SaveLine] { "LINE:", 0, 0 },
+[Lineoff] { "LINEOFF:", 0, 0 },
+[Nactions] { 0, 0, 0 },
+};
+
+static char* endofhdr(char*, char*);
+static int escape(char**);
+static int extract(char*);
+static int findkey(char*);
+static int hash(int);
+static int isword(Word*, char*, int);
+static void parsealt(Biobuf*, char*, Spat**);
+
+/*
+ * The canonicalizer: convert input to canonical representation
+ */
+/*
+ * Read the start of a message from bp into an allocated, NUL-terminated
+ * buffer and return it; the caller owns the buffer.
+ *
+ * Input is read Quanta bytes at a time until endofhdr finds the end of
+ * the header, then one more Bread tries to bring the body up to Minbody
+ * bytes.  If hsize is nil no header is looked for: the first chunk is
+ * taken as body.  If no end of header shows up within Maxread bytes the
+ * whole buffer is reported as header.
+ *
+ * On return *hsize (if non-nil) is the offset of the body and *bufsize
+ * (if non-nil) the number of bytes read.  Returns 0 on a read error or
+ * when there is no input at all.
+ */
+char*
+readmsg(Biobuf *bp, int *hsize, int *bufsize)
+{
+	char *p, *buf;
+	int n, offset, eoh, bsize, delta;
+
+	buf = 0;
+	offset = 0;
+	if(bufsize)
+		*bufsize = 0;
+	if(hsize)
+		*hsize = 0;
+	for(;;) {
+		buf = Realloc(buf, offset+Quanta+1);
+		n = Bread(bp, buf+offset, Quanta);
+		if(n < 0){
+			free(buf);
+			return 0;
+		}
+		p = buf+offset;			/* start of this chunk */
+		offset += n;			/* end of this chunk */
+		buf[offset] = 0;
+		if(n == 0){
+			if(offset == 0){
+				free(buf);	/* empty input: don't leak the buffer */
+				return 0;
+			}
+			break;
+		}
+
+		if(hsize == 0)			/* don't process header */
+			break;
+		if(p != buf && p[-1] == '\n')	/* check for EOH across buffer split */
+			p--;
+		p = endofhdr(p, buf+offset);
+		if(p)
+			break;
+		if(offset >= Maxread)		/* gargantuan header - just punt*/
+		{
+			if(hsize)
+				*hsize = offset;
+			if(bufsize)
+				*bufsize = offset;
+			return buf;
+		}
+	}
+	eoh = p-buf;				/* End of header */
+	bsize = offset - eoh;			/* amount of body already read */
+
+	/* Read at least Minbody bytes of the body */
+	if (bsize < Minbody){
+		delta = Minbody-bsize;
+		buf = Realloc(buf, offset+delta+1);
+		n = Bread(bp, buf+offset, delta);
+		if(n > 0) {
+			offset += n;
+			buf[offset] = 0;
+		}
+	}
+	if(hsize)
+		*hsize = eoh;
+	if(bufsize)
+		*bufsize = offset;
+	return buf;
+}
+
+/*
+ * report whether the len bytes at text begin with any entry of the
+ * nil-terminated table wp.
+ */
+static int
+isword(Word *wp, char *text, int len)
+{
+	Word *w;
+
+	w = wp;
+	while(w->string != 0){
+		if(w->n <= len && strncmp(text, w->string, w->n) == 0)
+			return 1;
+		w++;
+	}
+	return 0;
+}
+
+/*
+ * Find the end of the header in [raw, end): the first blank line
+ * (two adjacent newlines) that is not followed by one of hdrwords.
+ * Returns a pointer just past the blank line, or 0 if none is found.
+ */
+static char*
+endofhdr(char *raw, char *end)
+{
+	int i;
+	char *p, *q;
+	char buf[HdrMax];
+
+	/*
+	 * can't use strchr to search for newlines because
+	 * there may be embedded NULL's.
+	 */
+	for(p = raw; p+1 < end; p++){
+		if(*p != '\n' || p[1] != '\n')
+			continue;
+		p++;
+		/* collect the lower-cased word after the blank line, staying inside the text */
+		for(i = 0, q = p+1; i < sizeof(buf) && q < end && *q; q++){
+			buf[i++] = tolower((uchar)*q);
+			if(*q == ':' || *q == '\n')
+				break;
+		}
+		if(!isword(hdrwords, buf, i))
+			return p+1;
+	}
+	return 0;
+}
+
+/*
+ * Canonicalize the tag text starting at text (just after a '<'):
+ * decode MIME escapes, drop NULs and carriage returns, squeeze white
+ * space to single blanks and fold to lower case, stopping at '>' or
+ * when the scratch buffer is full.  Returns whether the result starts
+ * with an entry of wp; *n (if non-nil) gets the number of input bytes
+ * consumed, never more than end-text.
+ */
+static int
+htmlmatch(Word *wp, char *text, char *end, int *n)
+{
+	char *cp;
+	int i, c, lastc;
+	char buf[MaxHtml];
+
+	/*
+	 * extract a string up to '>'
+	 */
+
+	i = lastc = 0;
+	cp = text;
+	while (cp < end && i < sizeof(buf)-1){
+		c = *cp++;
+		if(c == '=')
+			c = escape(&cp);
+		switch(c){
+		case 0:
+		case '\r':
+			continue;
+		case '>':
+			goto out;
+		case '\n':
+		case ' ':
+		case '\t':
+			if(lastc == ' ')
+				continue;
+			c = ' ';
+			break;
+		default:
+			c = tolower((uchar)c);
+			break;
+		}
+		buf[i++] = lastc = c;
+	}
+out:
+	buf[i] = 0;
+	if(cp > end)	/* escape() can step past end on a trailing "=\n" */
+		cp = end;
+	if(n)
+		*n = cp-text;
+	return isword(wp, buf, i);
+}
+
+/*
+ * Decode what follows an '=' at *msg and advance *msg past whatever
+ * was consumed:
+ *	"\n" + c	returns c (both bytes consumed)
+ *	"2e" "2f" "20"	return '.', '/', ' '	(hex digit case-insensitive)
+ *	"3d"		returns '='
+ * anything else returns '=' and consumes nothing.
+ */
+static int
+escape(char **msg)
+{
+	char *s;
+	int r;
+
+	s = *msg;
+	r = '=';
+	switch(*s){
+	case '\n':
+		r = s[1];
+		s += 2;
+		break;
+	case '2':
+		switch(tolower(s[1])){
+		case 'e':
+			r = '.';
+			s += 2;
+			break;
+		case 'f':
+			r = '/';
+			s += 2;
+			break;
+		case '0':
+			r = ' ';
+			s += 2;
+			break;
+		}
+		break;
+	case '3':
+		if(tolower(s[1]) == 'd')
+			s += 2;
+		break;
+	}
+	*msg = s;
+	return r;
+}
+
+/*
+ * Called with *msg just past a '<'.  Returns '<' (leaving *msg alone)
+ * when the tag should be copied into the output; otherwise skips the
+ * tag, sets *msg past the character following it and returns that
+ * character.  If the tag runs to the end of the text, *msg is set to
+ * end and 0 is returned.
+ *
+ * ishtml is static, so once an htmlcmds tag has been seen every later
+ * call in the process treats its input as HTML.
+ */
+static int
+htmlchk(char **msg, char *end)
+{
+	int n;
+	char *p;
+
+	static int ishtml;
+
+	p = *msg;
+	if(ishtml == 0){
+		ishtml = htmlmatch(htmlcmds, p, end, &n);
+
+		/* If not an HTML keyword, check if it's
+		 * an HTML comment (<!comment>). if so,
+		 * skip over it; otherwise copy it in.
+		 */
+		if(ishtml == 0 && *p != '!')	/* not comment */
+			return '<';		/* copy it */
+
+	} else if(htmlmatch(hrefs, p, end, &n))	/* if special HTML string */
+		return '<';			/* copy it */
+
+	/*
+	 * this is an uninteresting HTML command; skip over it.
+	 */
+	p += n;
+	if(p >= end){		/* nothing left after the tag: don't read beyond the text */
+		*msg = end;
+		return 0;
+	}
+	*msg = p+1;
+	return *p;
+}
+
+/*
+ * decode a base 64 encoded body in [msg, end) and canonicalize the
+ * result into buf with convert (as body text).  buf needs room for
+ * bufsize+1 bytes.
+ */
+void
+conv64(char *msg, char *end, char *buf, int bufsize)
+{
+	int len, i;
+	char *cp;
+
+	len = end - msg;
+	if(len < 0)
+		len = 0;
+	i = (len*3)/4+1;	/* room for max chars + null */
+	cp = Malloc(i);
+	len = dec64((uchar*)cp, i, msg, len);
+	if(len < 0)
+		len = 0;
+	if(len >= i)
+		len = i-1;
+	/* convert and its helpers look at the byte at end, so terminate the text */
+	cp[len] = 0;
+	convert(cp, cp+len, buf, bufsize, 1);
+	free(cp);
+}
+
+/*
+ * Canonicalize [msg, end) into buf: drop NULs and carriage returns,
+ * squeeze runs of white space into one blank and fold to lower case.
+ * When isbody is set, HTML tags are stripped via htmlchk and MIME '='
+ * escapes are decoded via escape.  At most bufsize characters are
+ * stored, followed by a NUL, so buf must hold bufsize+1 bytes.
+ *
+ * Returns 1 if, with isbody clear, the text contains
+ * "content-transfer-encoding: base64" in any letter case; else 0.
+ */
+int
+convert(char *msg, char *end, char *buf, int bufsize, int isbody)
+{
+
+	char *p;
+	int c, lastc, base64;
+
+	lastc = 0;
+	base64 = 0;
+	while(msg < end && bufsize > 0){
+		c = *msg++;
+
+		/*
+		 * In the body only, try to strip most HTML and
+		 * replace certain MIME escape sequences with the character
+		 */
+		if(isbody) {
+			do{
+				p = msg;
+				if(c == '<')
+					c = htmlchk(&msg, end);
+				if(c == '=')
+					c = escape(&msg);
+			} while(p != msg && p < end);
+		}
+		switch(c){
+		case 0:
+		case '\r':
+			continue;
+		case '\t':
+		case ' ':
+		case '\n':
+			if(lastc == ' ')
+				continue;
+			c = ' ';
+			break;
+		case 'C':	/* check for MIME base 64 encoding in header */
+		case 'c':
+			/* header names are case-insensitive: compare the whole remainder that way */
+			if(isbody == 0)
+			if(end - msg >= 32)
+			if(cistrncmp(msg, "ontent-transfer-encoding: base64", 32) == 0)
+				base64 = 1;
+			c = 'c';
+			break;
+		default:
+			c = tolower((uchar)c);
+			break;
+		}
+		*buf++ = c;
+		lastc = c;
+		bufsize--;
+	}
+	*buf = 0;
+	return base64;
+}
+
+/*
+ * The pattern parser: build data structures from the pattern file
+ */
+
+/* bucket index for a string pattern's first character: its low seven bits */
+static int
+hash(int c)
+{
+	return c & 0x7f;
+}
+
+/*
+ * look val up in keywords and return its action code; an unknown
+ * name yields the value stored in the nil terminator entry.
+ */
+static int
+findkey(char *val)
+{
+	Keyword *kp;
+
+	kp = keywords;
+	while(kp->string != 0 && strcmp(val, kp->string) != 0)
+		kp++;
+	return kp->value;
+}
+
+#define whitespace(c) ((c) == ' ' || (c) == '\t')
+
+/*
+ * Read the pattern file on bp and add each pattern to patterns[].
+ * A line has the form  [*]action: pattern [~~ alternative ...]
+ * where a leading '*' marks a literal string (otherwise the pattern is
+ * a regular expression), action is one of the names in keywords, and
+ * the "~~"-separated alternatives are handed to parsealt.  Blank
+ * lines, lines starting with '#', lines without ':' and lines with an
+ * unknown action are skipped.
+ */
+void
+parsepats(Biobuf *bp)
+{
+ Pattern *p, *new;
+ char *cp, *qp;
+ int type, action, n, h;
+ Spat *spat;
+
+ for(;;){
+ cp = Brdline(bp, '\n');
+ if(cp == 0)
+ break;
+ cp[Blinelen(bp)-1] = 0; /* replace the newline with a terminator */
+ while(*cp == ' ' || *cp == '\t')
+ cp++;
+ if(*cp == '#' || *cp == 0)
+ continue;
+ type = regexp;
+ if(*cp == '*'){
+ type = string;
+ cp++;
+ }
+ qp = strchr(cp, ':');
+ if(qp == 0)
+ continue;
+ *qp = 0;
+ if(debug)
+ fprint(2, "action = %s\n", cp);
+ action = findkey(cp);
+ if(action >= Nactions)
+ continue;
+ cp = qp+1;
+ /* extract rewrites the pattern in place: quotes removed, lower-cased, comment stripped */
+ n = extract(cp);
+ if(n <= 0 || *cp == 0)
+ continue;
+
+ /* cut the pattern at the first "~~"; what follows are alternatives */
+ qp = strstr(cp, "~~");
+ if(qp){
+ *qp = 0;
+ n = strlen(cp);
+ }
+ if(debug)
+ fprint(2, " Pattern: `%s'\n", cp);
+
+ /* Hook regexps into a chain */
+ if(type == regexp) {
+ new = Malloc(sizeof(Pattern));
+ new->action = action;
+ new->pat = regcomp(cp);
+ if(new->pat == 0){
+ free(new);
+ continue;
+ }
+ new->type = regexp;
+ new->alt = 0;
+ new->next = 0;
+
+ if(qp)
+ parsealt(bp, qp+2, &new->alt);
+
+ /* push onto the front of this action's regexp list */
+ new->next = patterns[action].regexps;
+ patterns[action].regexps = new;
+ continue;
+
+ }
+ /* not a Regexp - hook strings into Pattern hash chain */
+ spat = Malloc(sizeof(*spat));
+ spat->next = 0;
+ spat->alt = 0;
+ spat->len = n;
+ spat->string = Malloc(n+1);
+ spat->c1 = cp[1]; /* second character; matchpat compares it before the full string */
+ strcpy(spat->string, cp);
+
+ if(qp)
+ parsealt(bp, qp+2, &spat->alt);
+
+ /* all string patterns of an action share one Pattern, created on first use */
+ p = patterns[action].strings;
+ if(p == 0) {
+ p = Malloc(sizeof(Pattern));
+ memset(p, 0, sizeof(*p));
+ p->action = action;
+ p->type = string;
+ patterns[action].strings = p;
+ }
+ h = hash(*spat->string);
+ spat->next = p->spat[h];
+ p->spat[h] = spat;
+ }
+}
+
+/*
+ * Parse the "~~"-separated alternatives that follow a pattern and push
+ * each non-empty one onto the list at *head.  cp points just past the
+ * pattern's "~~".  An empty remainder means the list continues on the
+ * next line of bp; lines that extract reduces to nothing are skipped.
+ * Parsing stops at the last alternative of a line or at end of file.
+ */
+static void
+parsealt(Biobuf *bp, char *cp, Spat** head)
+{
+	char *p;
+	Spat *alt;
+
+	while(cp){
+		if(*cp == 0){	/*escaped newline*/
+			do{
+				cp = Brdline(bp, '\n');
+				if(cp == 0)
+					return;
+				cp[Blinelen(bp)-1] = 0;
+			} while(extract(cp) <= 0 || *cp == 0);
+		}
+
+		p = cp;
+		cp = strstr(p, "~~");
+		if(cp){
+			*cp = 0;
+			cp += 2;
+		}
+		if(*p){
+			alt = Malloc(sizeof(*alt));
+			memset(alt, 0, sizeof(*alt));	/* leave no field uninitialised */
+			alt->len = strlen(p);
+			/* Malloc exits on failure, unlike strdup which could hand back nil */
+			alt->string = Malloc(alt->len+1);
+			strcpy(alt->string, p);
+			alt->next = *head;
+			*head = alt;
+		}
+	}
+}
+
+/*
+ * Rewrite the pattern text at cp in place and return its new length:
+ * leading white space is dropped, upper case is folded to lower,
+ * double quotes are removed (\" inside quotes yields a literal quote),
+ * everything from an unquoted '#' on is discarded, and trailing white
+ * space is trimmed - but never back into a quoted section.
+ */
+static int
+extract(char *cp)
+{
+	char *src, *dst, *keep;
+	int ch;
+
+	src = cp;
+	dst = cp;
+	keep = cp;
+	while(whitespace(*src))
+		src++;
+	for(;;){
+		ch = *src++;
+		if(ch == 0 || ch == '#')
+			break;
+		if(ch != '"'){
+			if(ch >= 'A' && ch <= 'Z')
+				ch += 'a'-'A';
+			*dst++ = ch;
+			continue;
+		}
+		/* inside quotes: copy up to the closing quote */
+		for(; *src != 0 && *src != '"'; src++){
+			if(src[0] == '\\' && src[1] == '"')
+				src++;
+			if(*src >= 'A' && *src <= 'Z')
+				*dst++ = *src + ('a'-'A');
+			else
+				*dst++ = *src;
+		}
+		if(*src != 0)
+			src++;
+		keep = dst;	/* never back up over a quoted string */
+	}
+	while(dst > keep && whitespace(dst[-1]))
+		dst--;
+	*dst = 0;
+	return dst-cp;
+}
+
+/*
+ * The matching engine: compare canonical input to pattern structures
+ */
+
+/*
+ * Return the first entry of the list alt whose string occurs in the
+ * command line (cmd, when non-empty), in the canonical header
+ * (header+1) or in message; 0 if none does.  cmd and header are only
+ * searched separately when message is not that very buffer.
+ */
+static Spat*
+isalt(char *message, Spat *alt)
+{
+	char *s;
+
+	for(; alt != 0; alt = alt->next){
+		s = alt->string;
+		if(*cmd != 0 && message != cmd && strstr(cmd, s) != 0)
+			return alt;
+		if(message != header+1 && strstr(header+1, s) != 0)
+			return alt;
+		if(strstr(message, s) != 0)
+			return alt;
+	}
+	return 0;
+}
+
+/*
+ * Match pattern p against the NUL-terminated text message.
+ *
+ * For a string Pattern every position is tried: the character there
+ * selects a hash chain, and an entry matches when its text is found at
+ * that position and none of its alternatives is present (isalt).
+ * For a regexp Pattern, regexec does the search and the alternatives
+ * are checked afterwards.
+ *
+ * Returns 1 on a match; for string patterns m->sp/m->ep are set here,
+ * for regexps they are whatever regexec reports.  Returns 0 otherwise.
+ */
+int
+matchpat(Pattern *p, char *message, Resub *m)
+{
+	Spat *spat;
+	char *s;
+	int c, c1;
+
+	if(p->type == string){
+		c1 = *message;
+		for(s=message; c=c1; s++){
+			c1 = s[1];
+			for(spat=p->spat[hash(c)]; spat; spat=spat->next){
+				/*
+				 * c1 is a quick reject on the second character.  A
+				 * one-character pattern has none (its c1 is the
+				 * terminator), so don't require the text to end here.
+				 */
+				if(spat->len == 1 || c1 == spat->c1)
+				/* strncmp stops at the text's terminator; memcmp need not */
+				if(strncmp(s, spat->string, spat->len) == 0)
+				if(!isalt(message, spat->alt)){
+					m->sp = s;
+					m->ep = s + spat->len;
+					return 1;
+				}
+			}
+		}
+		return 0;
+	}
+	m->sp = m->ep = 0;
+	if(regexec(p->pat, message, m, 1) == 0)
+		return 0;
+	if(isalt(message, p->alt))
+		return 0;
+	return 1;
+}
+
+
+/*
+ * Print the match m on fd as "type before~match~after", with roughly
+ * 30 characters of context on each side, extended to a blank.
+ * Does nothing if m has no start or end.
+ *
+ * The backward scan stops only at a NUL or, after 30 characters, at a
+ * blank, so the text must be preceded by a NUL byte.
+ * NOTE(review): scanmail.c passes header+1 and body+1 with element 0
+ * zeroed, but also matches against cmd, which has no such guard byte -
+ * confirm.
+ */
+void
+xprint(int fd, char *type, Resub *m)
+{
+	char *left, *right;
+	int k;
+
+	if(m->sp == 0 || m->ep == 0)
+		return;
+
+	/* back up approx 30 characters to whitespace */
+	left = m->sp;
+	k = 0;
+	while(*left != 0 && k < 30){
+		left--;
+		k++;
+	}
+	while(*left != 0 && *left != ' ')
+		left--;
+	left++;
+
+	/* grab about 30 more chars beyond the end of the match */
+	right = m->ep;
+	k = 0;
+	while(*right != 0 && k < 30){
+		right++;
+		k++;
+	}
+	while(*right != 0 && *right != ' ')
+		right++;
+
+	fprint(fd, "%s %.*s~%.*s~%.*s\n", type,
+		(int)(m->sp-left), left,
+		(int)(m->ep-m->sp), m->sp,
+		(int)(right-m->ep), m->ep);
+}
+
+enum {
+ INVAL= 255 /* marks a byte that is not a base64 digit */
+};
+
+/*
+ * base64 decoding table: maps a byte to its 6-bit value
+ * ('A'-'Z' 0-25, 'a'-'z' 26-51, '0'-'9' 52-61, '+' 62, '/' 63)
+ * or INVAL.  Nothing else in this file refers to it; conv64 calls
+ * dec64 for the actual decoding.
+ */
+static uchar t64d[256] = {
+/*00 */ INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
+ INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
+/*10*/ INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
+ INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
+/*20*/ INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
+ INVAL, INVAL, INVAL, 62, INVAL, INVAL, INVAL, 63,
+/*30*/ 52, 53, 54, 55, 56, 57, 58, 59,
+ 60, 61, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
+/*40*/ INVAL, 0, 1, 2, 3, 4, 5, 6,
+ 7, 8, 9, 10, 11, 12, 13, 14,
+/*50*/ 15, 16, 17, 18, 19, 20, 21, 22,
+ 23, 24, 25, INVAL, INVAL, INVAL, INVAL, INVAL,
+/*60*/ INVAL, 26, 27, 28, 29, 30, 31, 32,
+ 33, 34, 35, 36, 37, 38, 39, 40,
+/*70*/ 41, 42, 43, 44, 45, 46, 47, 48,
+ 49, 50, 51, INVAL, INVAL, INVAL, INVAL, INVAL,
+/*80*/ INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
+ INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
+/*90*/ INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
+ INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
+/*A0*/ INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
+ INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
+/*B0*/ INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
+ INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
+/*C0*/ INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
+ INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
+/*D0*/ INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
+ INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
+/*E0*/ INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
+ INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
+/*F0*/ INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
+ INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
+};
diff --git a/src/cmd/upas/scanmail/mkfile b/src/cmd/upas/scanmail/mkfile
new file mode 100644
index 00000000..5f0db855
--- /dev/null
+++ b/src/cmd/upas/scanmail/mkfile
@@ -0,0 +1,24 @@
+</$objtype/mkfile
+
+TARG=scanmail\
+ testscan
+
+OFILES= common.$O
+
+HFILES= spam.h\
+ ../common/sys.h\
+
+LIB= ../common/libcommon.a$O\
+
+BIN=/$objtype/bin/upas
+UPDATE=\
+ mkfile\
+ $HFILES\
+ ${OFILES:%.$O=%.c}\
+ ${TARG:%=%.c}\
+
+</sys/src/cmd/mkmany
+CFLAGS=$CFLAGS -I../common
+
+scanmail.$O: scanmail.c
+ $CC $CFLAGS -D'SPOOL="/mail"' scanmail.c
diff --git a/src/cmd/upas/scanmail/scanmail.c b/src/cmd/upas/scanmail/scanmail.c
new file mode 100644
index 00000000..444bbcdd
--- /dev/null
+++ b/src/cmd/upas/scanmail/scanmail.c
@@ -0,0 +1,476 @@
+#include "common.h"
+#include "spam.h"
+
+int cflag;
+int debug;
+int hflag;
+int nflag;
+int sflag;
+int tflag;
+int vflag;
+Biobuf bin, bout, *cout;
+
+ /* file names */
+char patfile[128];
+char linefile[128];
+char holdqueue[128];
+char copydir[128];
+
+char header[Hdrsize+2];
+char cmd[1024];
+char **qname;
+char **qdir;
+char *sender;
+String *recips;
+
+char* canon(Biobuf*, char*, char*, int*);
+int matcher(char*, Pattern*, char*, Resub*);
+int matchaction(int, char*, Resub*);
+Biobuf *opencopy(char*);
+Biobuf *opendump(char*);
+char *qmail(char**, char*, int, Biobuf*);
+void saveline(char*, char*, Resub*);
+int optoutofspamfilter(char*);
+
+/*
+ * complain on standard error and exit with status "usage".
+ * NOTE(review): the message names qer, the program scanmail hands its
+ * arguments on to (see qmail) - presumably deliberate; confirm.
+ */
+void
+usage(void)
+{
+ fprint(2, "missing or bad arguments to qer\n");
+ exits("usage");
+}
+
+/*
+ * print s on standard error, prefixed with the program name, and return.
+ * NOTE(review): presumably the hook the regexp library calls on a bad
+ * pattern - confirm against regexp(3).
+ */
+void
+regerror(char *s)
+{
+ fprint(2, "scanmail: %s\n", s);
+}
+
+/* malloc that never returns nil: exits with status "malloc" on failure */
+void *
+Malloc(long n)
+{
+	void *mem;
+
+	mem = malloc(n);
+	if(mem != 0)
+		return mem;
+	exits("malloc");
+	return 0;	/* not reached */
+}
+
+/* realloc that never returns nil: exits with status "realloc" on failure */
+void*
+Realloc(void *p, ulong n)
+{
+	void *mem;
+
+	mem = realloc(p, n);
+	if(mem != 0)
+		return mem;
+	exits("realloc");
+	return 0;	/* not reached */
+}
+
+/*
+ * scanmail: read a message on standard input, match the sender and
+ * recipients, the canonical header and the canonical body against the
+ * pattern file, then pass the message on through qmail.
+ *
+ * Options handled here are consumed; -d, -q name and unknown options
+ * are copied into the argument vector given to qmail.  After the
+ * options at least three arguments are required; argv[2] is the sender
+ * and arguments from index 4 on are recipients (a "-g" and the word
+ * after it are passed through but not treated as recipients).
+ * Matching is skipped when every recipient has opted out.
+ */
+void
+main(int argc, char *argv[])
+{
+	int i, n, nolines, optout;
+	char **args, **a, *cp, *buf;
+	char body[Bodysize+2];
+	Resub match[1];
+	Biobuf *bp;
+
+	optout = 1;
+	a = args = Malloc((argc+1)*sizeof(char*));
+	snprint(patfile, sizeof patfile, "%s/patterns", UPASLIB);
+	snprint(linefile, sizeof linefile, "%s/lines", UPASLOG);
+	snprint(holdqueue, sizeof holdqueue, "%s/queue.hold", SPOOL);
+	snprint(copydir, sizeof copydir, "%s/copy", SPOOL);
+
+	*a++ = argv[0];
+	for(argc--, argv++; argv[0] && argv[0][0] == '-'; argc--, argv++){
+		switch(argv[0][1]){
+		case 'c':			/* save copy of message */
+			cflag = 1;
+			break;
+		case 'd':			/* debug */
+			debug++;
+			*a++ = argv[0];
+			break;
+		case 'h':			/* queue held messages by sender domain */
+			hflag = 1;		/* -q flag must be set also */
+			break;
+		case 'n':			/* NOHOLD mode */
+			nflag = 1;
+			break;
+		case 'p':			/* pattern file */
+			if(argv[0][2] || argv[1] == 0)
+				usage();
+			argc--;
+			argv++;
+			strecpy(patfile, patfile+sizeof patfile, *argv);
+			break;
+		case 'q':			/* queue name */
+			if(argv[0][2] || argv[1] == 0)
+				usage();
+			*a++ = argv[0];
+			argc--;
+			argv++;
+			qname = a;
+			*a++ = argv[0];
+			break;
+		case 's':			/* save copy of dumped message */
+			sflag = 1;
+			break;
+		case 't':			/* test mode - don't log match
+						 * and write message to /dev/null
+						 */
+			tflag = 1;
+			break;
+		case 'v':			/* verbose - print matches */
+			vflag = 1;
+			break;
+		default:
+			*a++ = argv[0];
+			break;
+		}
+	}
+
+	if(argc < 3)
+		usage();
+
+	Binit(&bin, 0, OREAD);
+	bp = Bopen(patfile, OREAD);
+	if(bp){
+		parsepats(bp);
+		Bterm(bp);
+	}
+	qdir = a;
+	sender = argv[2];
+
+	/* copy the rest of argv, accumulating the recipients as we go */
+	for(i = 0; argv[i]; i++){
+		*a++ = argv[i];
+		if(i < 4)	/* skip queue, 'mail', sender, dest sys */
+			continue;
+		/* recipients and smtp flags - skip the latter*/
+		if(strcmp(argv[i], "-g") == 0){
+			if(argv[i+1] == 0)	/* "-g" was the last word: nothing follows */
+				break;
+			*a++ = argv[++i];
+			continue;
+		}
+		if(recips)
+			s_append(recips, ", ");
+		else
+			recips = s_new();
+		s_append(recips, argv[i]);
+		if(optout && !optoutofspamfilter(argv[i]))
+			optout = 0;
+	}
+	*a = 0;
+	if(recips == 0)		/* no recipients given: keep later uses of recips valid */
+		recips = s_new();
+	/* construct a command string for matching */
+	snprint(cmd, sizeof(cmd)-1, "%s %s", sender, s_to_c(recips));
+	cmd[sizeof(cmd)-1] = 0;
+	for(cp = cmd; *cp; cp++)
+		*cp = tolower(*cp);
+
+	/* canonicalize a copy of the header and body.
+	 * buf points to original message and n contains
+	 * number of bytes of original message read during
+	 * canonicalization.
+	 */
+	*body = 0;
+	*header = 0;
+	buf = canon(&bin, header+1, body+1, &n);
+	if (buf == 0)
+		exits("read");
+
+	/* if all users opt out, don't try matches */
+	if(optout){
+		if(cflag)
+			cout = opencopy(sender);
+		exits(qmail(args, buf, n, cout));
+	}
+
+	/* Turn off line logging, if command line matches */
+	nolines = matchaction(Lineoff, cmd, match);
+
+	for(i = 0; patterns[i].action; i++){
+		/* Lineoff patterns were already done above */
+		if(i == Lineoff)
+			continue;
+		/* don't apply "Line" patterns if excluded above */
+		if(nolines && i == SaveLine)
+			continue;
+		/* apply patterns to the sender/recips, header and body */
+		if(matchaction(i, cmd, match))
+			break;
+		if(matchaction(i, header+1, match))
+			break;
+		if(i == HoldHeader)
+			continue;
+		if(matchaction(i, body+1, match))
+			break;
+	}
+	if(cflag && patterns[i].action == 0)	/* no match found - save msg */
+		cout = opencopy(sender);
+
+	exits(qmail(args, buf, n, cout));
+}
+
+/*
+ * Deliver the message: unless tflag is set, run UPASBIN/qer with argv
+ * (argv[0] is replaced by its path) and feed it the message through a
+ * pipe; with tflag set the message goes to /dev/null instead.  The
+ * first n bytes come from buf, the rest is read from bin.  If cout is
+ * non-nil everything is also copied to it.
+ *
+ * Returns 0 in test mode, otherwise qer's exit message (copied into
+ * buf) or "wait failed".
+ */
+char*
+qmail(char **argv, char *buf, int n, Biobuf *cout)
+{
+	Waitmsg *status;
+	int i, pid, pipefd[2], bufsize;
+	char path[512];
+	Biobuf *bp;
+
+	/*
+	 * buf is a pointer, so sizeof(buf) is not its capacity.  It holds
+	 * at least the n bytes already read, so reuse that much of it.
+	 */
+	bufsize = n;
+	pid = 0;
+	if(tflag == 0){
+		if(pipe(pipefd) < 0)
+			exits("pipe");
+		pid = fork();
+		if(pid < 0)
+			exits("fork");
+		if(pid == 0){
+			dup(pipefd[0], 0);
+			for(i = sysfiles(); i >= 3; i--)
+				close(i);
+			snprint(path, sizeof(path), "%s/qer", UPASBIN);
+			*argv=path;
+			exec(path, argv);
+			exits("exec");
+		}
+		Binit(&bout, pipefd[1], OWRITE);
+		bp = &bout;
+	} else
+		bp = Bopen("/dev/null", OWRITE);	/* may be nil; checked below */
+
+	while(n > 0){
+		if(bp)
+			Bwrite(bp, buf, n);
+		if(cout)
+			Bwrite(cout, buf, n);
+		n = Bread(&bin, buf, bufsize);
+	}
+	if(bp)
+		Bterm(bp);
+	if(cout)
+		Bterm(cout);
+	if(tflag)
+		return 0;
+
+	close(pipefd[1]);
+	close(pipefd[0]);
+	for(;;){
+		status = wait();
+		if(status == nil || status->pid == pid)
+			break;
+		free(status);
+	}
+	if(status == nil)
+		strcpy(buf, "wait failed");
+	else{
+		strcpy(buf, status->msg);
+		free(status);
+	}
+	return buf;
+}
+
+/*
+ * Read a message from bp and fill header and body with its canonical
+ * forms (at most Hdrsize and Bodysize characters).  The body is base64
+ * decoded first when the header says so.  *n gets the number of raw
+ * bytes read.  Returns the raw buffer from readmsg, or 0 if nothing
+ * could be read.
+ */
+char*
+canon(Biobuf *bp, char *header, char *body, int *n)
+{
+	int hsize;
+	char *raw;
+
+	hsize = 0;
+	*header = 0;
+	*body = 0;
+	raw = readmsg(bp, &hsize, n);
+	if(raw == 0)
+		return raw;
+	if(convert(raw, raw+hsize, header, Hdrsize, 0) != 0)
+		conv64(raw+hsize, raw+*n, body, Bodysize);	/* base64 */
+	else
+		convert(raw+hsize, raw+*n, body, Bodysize, 1);	/* text */
+	return raw;
+}
+
+/*
+ * Apply every pattern registered for action to message: the string
+ * patterns first, then each regexp.  Returns 1 as soon as matcher
+ * reports 1, otherwise 0; an empty or nil message never matches.
+ */
+int
+matchaction(int action, char *message, Resub *m)
+{
+	char *name;
+	Pattern *p;
+
+	if(message == 0 || *message == 0)
+		return 0;
+
+	name = patterns[action].action;
+	p = patterns[action].strings;
+	if(p != 0 && matcher(name, p, message, m))
+		return 1;
+
+	p = patterns[action].regexps;
+	while(p != 0){
+		if(matcher(name, p, message, m))
+			return 1;
+		p = p->next;
+	}
+	return 0;
+}
+
+/*
+ * Find the matches of p in message, one after another, and act on
+ * each according to p->action:
+ *	SaveLine	log the match with saveline and keep going
+ *	HoldHeader/Hold	(unless nflag) point *qdir at the hold queue,
+ *			optionally set *qname from the sender, return 1
+ *	Dump		log to syslog, set tflag so the message is
+ *			discarded, optionally open a dump file, return 1
+ * Returns 0 when no match ended the search.
+ */
+int
+matcher(char *action, Pattern *p, char *message, Resub *m)
+{
+	char *cp, *bang;
+	String *s;
+
+	for(cp = message; matchpat(p, cp, m); cp = m->ep){
+		switch(p->action){
+		case SaveLine:
+			if(vflag)
+				xprint(2, action, m);
+			saveline(linefile, sender, m);
+			break;
+		case HoldHeader:
+		case Hold:
+			if(nflag)
+				break;		/* holding disabled: look for further matches */
+			if(vflag)
+				xprint(2, action, m);
+			*qdir = holdqueue;
+			if(hflag && qname){
+				bang = strchr(sender, '!');
+				if(bang){
+					*bang = 0;
+					*qname = strdup(sender);
+					*bang = '!';
+				} else
+					*qname = strdup(sender);
+			}
+			return 1;
+		case Dump:
+			if(vflag)
+				xprint(2, action, m);
+			*(m->ep) = 0;
+			if(!tflag){
+				s = s_new();
+				s_append(s, sender);
+				s = unescapespecial(s);
+				syslog(0, "smtpd", "Dumped %s [%s] to %s", s_to_c(s), m->sp,
+					s_to_c(s_restart(recips)));
+				s_free(s);
+			}
+			tflag = 1;
+			if(sflag)
+				cout = opendump(sender);
+			return 1;
+		default:
+			break;
+		}
+		/*
+		 * a regexp may match the empty string; without this the
+		 * next search would start at the same place forever.
+		 */
+		if(m->ep <= cp){
+			if(*cp == 0)
+				break;
+			m->ep = cp+1;
+		}
+	}
+	return 0;
+}
+
+/*
+ * Append "sender-> context" to file, where context is the match rp
+ * with about 20 characters on either side, extended to a blank.  The
+ * text is NUL-terminated in place while it is printed and restored
+ * afterwards.  If the file cannot be opened the line is reported on
+ * standard error when debug is set.
+ *
+ * The backward scan stops only at a NUL or, after 20 characters, at a
+ * blank, so the text must be preceded by a NUL byte.
+ * NOTE(review): matches against cmd have no such guard byte - confirm.
+ */
+void
+saveline(char *file, char *sender, Resub *rp)
+{
+	char *from, *to;
+	int k, saved;
+	Biobuf *out;
+
+	if(rp->sp == 0 || rp->ep == 0)
+		return;
+
+	/* back up approx 20 characters to whitespace */
+	from = rp->sp;
+	k = 0;
+	while(*from != 0 && k < 20){
+		from--;
+		k++;
+	}
+	while(*from != 0 && *from != ' ')
+		from--;
+	from++;
+
+	/* grab about 20 more chars beyond the end of the match */
+	to = rp->ep;
+	k = 0;
+	while(*to != 0 && k < 20){
+		to++;
+		k++;
+	}
+	while(*to != 0 && *to != ' ')
+		to++;
+
+	saved = *to;
+	*to = 0;
+	out = sysopen(file, "al", 0644);
+	if(out != 0){
+		Bprint(out, "%s-> %s\n", sender, from);
+		Bterm(out);
+	} else if(debug)
+		fprint(2, "can't save line: (%s) %s\n", sender, from);
+	*to = saved;
+}
+
+/*
+ * Open a new file for a dumped message under SPOOL/queue.dump/<dir>,
+ * where <dir> is built from characters 4-6 and 8-9 of the ctime
+ * string (a blank at position 8 is dropped).  The directory is created
+ * if missing.  File names are numbers derived from a hash of sender
+ * plus lrand(); up to 50 names are tried.  Returns the open Biobuf,
+ * or 0 on failure.
+ */
+Biobuf*
+opendump(char *sender)
+{
+	int try;
+	ulong h;
+	char path[512];
+	Biobuf *b;
+	char *t, *tail;
+
+	t = ctime(time(0));
+	t[7] = 0;
+	t[10] = 0;
+	if(t[8] == ' ')
+		sprint(path, "%s/queue.dump/%s%c", SPOOL, t+4, t[9]);
+	else
+		sprint(path, "%s/queue.dump/%s%c%c", SPOOL, t+4, t[8], t[9]);
+	tail = path+strlen(path);
+	if(access(path, 0) < 0 && sysmkdir(path, 0777) < 0){
+		syslog(0, "smtpd", "couldn't dump mail from %s: %r", sender);
+		return 0;
+	}
+
+	for(h = 0; *sender != 0; sender++)
+		h = h*257 + *sender;
+	for(try = 0; try < 50; try++){
+		h += lrand();
+		sprint(tail, "/%lud", h);
+		b = sysopen(path, "wlc", 0644);
+		if(b == 0)
+			continue;
+		if(vflag)
+			fprint(2, "saving in %s\n", path);
+		return b;
+	}
+	return 0;
+}
+
+/*
+ * Open a new file in copydir for a copy of the message.  The name is
+ * a number derived from a hash of sender plus lrand(); up to 50 names
+ * are tried.  Returns the open Biobuf, or 0 if none could be opened.
+ */
+Biobuf*
+opencopy(char *sender)
+{
+	int try;
+	ulong h;
+	char path[512];
+	Biobuf *b;
+
+	for(h = 0; *sender != 0; sender++)
+		h = h*257 + *sender;
+	for(try = 0; try < 50; try++){
+		h += lrand();
+		sprint(path, "%s/%lud", copydir, h);
+		b = sysopen(path, "wlc", 0600);
+		if(b != 0)
+			return b;
+	}
+	return 0;
+}
+
+/*
+ * Report whether the recipient addr has opted out of filtering: true
+ * when /mail/box/<user>/nospamfiltering exists, <user> being the part
+ * of addr after the first '!' (or all of addr).  Returns 0 if the
+ * path cannot be built.
+ */
+int
+optoutofspamfilter(char *addr)
+{
+	char *user, *path;
+	int rv;
+
+	user = strchr(addr, '!');
+	user = user ? user+1 : addr;
+
+	path = smprint("/mail/box/%s/nospamfiltering", user);
+	if(path == nil)
+		return 0;
+	rv = access(path, 0)==0;
+	free(path);
+	return rv;
+}
diff --git a/src/cmd/upas/scanmail/spam.h b/src/cmd/upas/scanmail/spam.h
new file mode 100644
index 00000000..f1d24b2e
--- /dev/null
+++ b/src/cmd/upas/scanmail/spam.h
@@ -0,0 +1,62 @@
+
+enum{
+ Dump = 0, /* Actions must be in order of descending importance */
+ HoldHeader,
+ Hold,
+ SaveLine,
+ Lineoff, /* Lineoff must be the last action code */
+ Nactions, /* number of actions; also findkey's result for an unknown name */
+
+ Nhash = 128, /* hash chains per string Pattern (pattern.spat) */
+
+ regexp = 1, /* types: literal string or regular expression */
+ string = 2,
+
+ MaxHtml = 256, /* size of htmlmatch's tag buffer */
+ Hdrsize = 4096, /* canonical header capacity passed to convert */
+ Bodysize = 8192, /* canonical body capacity passed to convert/conv64 */
+ Maxread = 64*1024, /* readmsg stops looking for the end of header after this many bytes */
+};
+
+typedef struct spat Spat;
+typedef struct pattern Pattern;
+typedef struct patterns Patterns;
+/* a literal string pattern, or one alternative of a pattern */
+struct spat
+{
+ char* string;
+ int len; /* length of string (set by parsepats) */
+ int c1; /* second character of string; matchpat checks it before comparing */
+ Spat* next; /* next entry on the same hash chain or alternative list */
+ Spat* alt; /* alternatives: isalt finding one of them vetoes the match */
+};
+
+/* either one compiled regexp or the hash table of all string patterns of an action */
+struct pattern{
+ struct pattern *next; /* next regexp of the same action */
+ int action;
+ int type; /* regexp or string: selects the union member */
+ Spat* alt; /* alternatives for a regexp pattern */
+ union{
+ Reprog* pat;
+ Spat* spat[Nhash];
+ };
+};
+
+/* per-action entry of the patterns[] table */
+struct patterns {
+ char *action; /* display label, e.g. "DUMP:"; nil ends the table */
+ Pattern *strings;
+ Pattern *regexps;
+};
+
+extern int debug;
+extern Patterns patterns[];
+extern char header[];
+extern char cmd[];
+
+extern void conv64(char*, char*, char*, int);
+extern int convert(char*, char*, char*, int, int);
+extern void* Malloc(long n);
+extern int matchpat(Pattern*, char*, Resub*);
+extern char* readmsg(Biobuf*, int*, int*);
+extern void parsepats(Biobuf*);
+extern void* Realloc(void*, ulong);
+extern void xprint(int, char*, Resub*);
diff --git a/src/cmd/upas/scanmail/testscan.c b/src/cmd/upas/scanmail/testscan.c
new file mode 100644
index 00000000..e5ea59ad
--- /dev/null
+++ b/src/cmd/upas/scanmail/testscan.c
@@ -0,0 +1,212 @@
+#include "sys.h"
+#include "spam.h"
+
+int debug;
+Biobuf bin;
+char patfile[128], header[Hdrsize+2];
+char cmd[1024];
+
+char* canon(Biobuf*, char*, char*, int*);
+int matcher(char *, Pattern*, char*, Resub*);
+int matchaction(Patterns*, char*);
+
+/*
+ * complain on standard error and exit with status "usage".
+ * NOTE(review): the text is the same as scanmail's and names qer,
+ * although this is testscan - confirm whether that is intended.
+ */
+void
+usage(void)
+{
+ fprint(2, "missing or bad arguments to qer\n");
+ exits("usage");
+}
+
+/* malloc that never returns nil: reports and exits with status "malloc" on failure */
+void *
+Malloc(long n)
+{
+	void *mem;
+
+	mem = malloc(n);
+	if(mem != 0)
+		return mem;
+	fprint(2, "malloc error");
+	exits("malloc");
+	return 0;	/* not reached */
+}
+
+/* realloc that never returns nil: reports and exits with status "realloc" on failure */
+void*
+Realloc(void *p, ulong n)
+{
+	void *mem;
+
+	mem = realloc(p, n);
+	if(mem != 0)
+		return mem;
+	fprint(2, "realloc error");
+	exits("realloc");
+	return 0;	/* not reached */
+}
+
+/*
+ * Print the parsed pattern tables on standard output: for every
+ * action, a "<REGEXP>" line per regexp and a line per string pattern,
+ * each followed by its alternatives.
+ */
+void
+dumppats(void)
+{
+	int bucket;
+	Patterns *pp;
+	Pattern *p;
+	Spat *s, *a;
+
+	for(pp = patterns; pp->action; pp++){
+		p = pp->regexps;
+		while(p != 0){
+			print("%s <REGEXP>\n", pp->action);
+			if(p->alt)
+				print("Alt:");
+			for(a = p->alt; a != 0; a = a->next)
+				print("\t%s\n", a->string);
+			p = p->next;
+		}
+		p = pp->strings;
+		if(p == 0)
+			continue;
+
+		for(bucket = 0; bucket < Nhash; bucket++){
+			s = p->spat[bucket];
+			while(s != 0){
+				print("%s %s\n", pp->action, s->string);
+				if(s->alt)
+					print("Alt:");
+				for(a = s->alt; a != 0; a = a->next)
+					print("\t%s\n", a->string);
+				s = s->next;
+			}
+		}
+	}
+}
+
+/*
+ * testscan: run the patterns over a message (the file argument, or
+ * standard input) and print every match.
+ *	-a	process every chunk canon returns, not just the first
+ *	-v	also print the canonical header and body on standard error
+ *	-d	increase debug
+ *	-p file	use file instead of UPASLIB/patterns
+ * The exit status is the label of the last action that matched, or
+ * nil if none did.
+ */
+void
+main(int argc, char *argv[])
+{
+	int i, fd, n, aflag, vflag;
+	char body[Bodysize+2], *raw, *ret, *pat;
+	Biobuf *bp;
+
+	sprint(patfile, "%s/patterns", UPASLIB);
+	aflag = -1;
+	vflag = 0;
+	ARGBEGIN {
+	case 'a':
+		aflag = 1;
+		break;
+	case 'v':
+		vflag = 1;
+		break;
+	case 'd':
+		debug++;
+		break;
+	case 'p':
+		/* ARGF yields nil when the file name is missing; and don't overrun patfile */
+		pat = ARGF();
+		if(pat == 0)
+			usage();
+		strecpy(patfile, patfile+sizeof patfile, pat);
+		break;
+	} ARGEND
+
+	bp = Bopen(patfile, OREAD);
+	if(bp){
+		parsepats(bp);
+		Bterm(bp);
+	}
+
+	if(argc >= 1){
+		fd = open(*argv, OREAD);
+		if(fd < 0){
+			fprint(2, "can't open %s\n", *argv);
+			exits("open");
+		}
+		Binit(&bin, fd, OREAD);
+	} else
+		Binit(&bin, 0, OREAD);
+
+	*body = 0;
+	*header = 0;
+	ret = 0;
+	for(;;){
+		raw = canon(&bin, header+1, body+1, &n);
+		if(raw == 0)
+			break;
+		if(aflag == 0)		/* without -a only the first chunk is matched */
+			continue;
+		if(aflag < 0)
+			aflag = 0;
+		if(vflag){
+			if(header[1]) {
+				fprint(2, "\t**** Header ****\n\n");
+				write(2, header+1, strlen(header+1));
+				fprint(2, "\n");
+			}
+			fprint(2, "\t**** Body ****\n\n");
+			if(body[1])
+				write(2, body+1, strlen(body+1));
+			fprint(2, "\n");
+		}
+
+		for(i = 0; patterns[i].action; i++){
+			if(matchaction(&patterns[i], header+1))
+				ret = patterns[i].action;
+			if(i == HoldHeader)
+				continue;
+			if(matchaction(&patterns[i], body+1))
+				ret = patterns[i].action;
+		}
+	}
+	exits(ret);
+}
+
+/*
+ * Read the next chunk of the message from bp and fill header and body
+ * with canonical text.  The first call (no buffer held) reads header
+ * and body; each later call frees the previous buffer and reads more
+ * input as body only, leaving header empty.  Returns the raw buffer,
+ * which stays owned by this function, or 0 when nothing could be read.
+ */
+char*
+canon(Biobuf *bp, char *header, char *body, int *n)
+{
+	int hsize, base64;
+
+	static char *raw;
+
+	hsize = 0;
+	base64 = 0;
+	*header = 0;
+	*body = 0;
+	if(raw != 0){
+		free(raw);
+		raw = readmsg(bp, 0, n);
+	} else {
+		raw = readmsg(bp, &hsize, n);
+		if(raw != 0)
+			base64 = convert(raw, raw+hsize, header, Hdrsize, 0);
+	}
+	if(raw == 0)
+		return raw;
+	if(base64)
+		conv64(raw+hsize, raw+*n, body, Bodysize);
+	else
+		convert(raw+hsize, raw+*n, body, Bodysize, 1);
+	return raw;
+}
+
+/*
+ * count (and, through matcher, print) the successive matches of p in
+ * message.
+ */
+static int
+countmatches(char *name, Pattern *p, char *message)
+{
+	char *cp;
+	int ret;
+	Resub m[1];
+
+	ret = 0;
+	for(cp = message; matcher(name, p, cp, m); cp = m[0].ep){
+		ret++;
+		/* an empty regexp match would restart at the same place forever */
+		if(m[0].ep <= cp){
+			if(*cp == 0)
+				break;
+			m[0].ep = cp+1;
+		}
+	}
+	return ret;
+}
+
+/*
+ * Apply all string and regexp patterns of pp to message and return
+ * the total number of matches; 0 for an empty or nil message.
+ */
+int
+matchaction(Patterns *pp, char *message)
+{
+	char *name;
+	int ret;
+	Pattern *p;
+
+	if(message == 0 || *message == 0)
+		return 0;
+
+	name = pp->action;
+	p = pp->strings;
+	ret = 0;
+	if(p)
+		ret += countmatches(name, p, message);
+
+	for(p = pp->regexps; p; p = p->next)
+		ret += countmatches(name, p, message);
+	return ret;
+}
+
+/*
+ * try p on message; on a match print it on standard output (except
+ * for Lineoff patterns) and return 1, else return 0.
+ */
+int
+matcher(char *action, Pattern *p, char *message, Resub *m)
+{
+	if(!matchpat(p, message, m))
+		return 0;
+	if(p->action != Lineoff)
+		xprint(1, action, m);
+	return 1;
+}