diff options
Diffstat (limited to 'src/cmd/upas/scanmail')
-rw-r--r-- | src/cmd/upas/scanmail/common.c | 667 | ||||
-rw-r--r-- | src/cmd/upas/scanmail/mkfile | 24 | ||||
-rw-r--r-- | src/cmd/upas/scanmail/scanmail.c | 476 | ||||
-rw-r--r-- | src/cmd/upas/scanmail/spam.h | 62 | ||||
-rw-r--r-- | src/cmd/upas/scanmail/testscan.c | 212 |
5 files changed, 1441 insertions, 0 deletions
diff --git a/src/cmd/upas/scanmail/common.c b/src/cmd/upas/scanmail/common.c new file mode 100644 index 00000000..b6ea720d --- /dev/null +++ b/src/cmd/upas/scanmail/common.c @@ -0,0 +1,667 @@ +#include <u.h> +#include <libc.h> +#include <bio.h> +#include <regexp.h> +#include "spam.h" + +enum { + Quanta = 8192, + Minbody = 6000, + HdrMax = 15, +}; + +typedef struct keyword Keyword; +typedef struct word Word; + +struct word{ + char *string; + int n; +}; + +struct keyword{ + char *string; + int value; +}; + +Word htmlcmds[] = +{ + "html", 4, + "!doctype html", 13, + 0, + +}; + +Word hrefs[] = +{ + "a href=", 7, + "a title=", 8, + "a target=", 9, + "base href=", 10, + "img src=", 8, + "img border=", 11, + "form action=", 12, + "!--", 3, + 0, + +}; + +/* + * RFC822 header keywords to look for for fractured header. + * all lengths must be less than HdrMax defined above. + */ +Word hdrwords[] = +{ + "cc:", 3, + "bcc:", 4, + "to:", 3, + 0, 0, + +}; + +Keyword keywords[] = +{ + "header", HoldHeader, + "line", SaveLine, + "hold", Hold, + "dump", Dump, + "loff", Lineoff, + 0, Nactions, +}; + +Patterns patterns[] = { +[Dump] { "DUMP:", 0, 0 }, +[HoldHeader] { "HEADER:", 0, 0 }, +[Hold] { "HOLD:", 0, 0 }, +[SaveLine] { "LINE:", 0, 0 }, +[Lineoff] { "LINEOFF:", 0, 0 }, +[Nactions] { 0, 0, 0 }, +}; + +static char* endofhdr(char*, char*); +static int escape(char**); +static int extract(char*); +static int findkey(char*); +static int hash(int); +static int isword(Word*, char*, int); +static void parsealt(Biobuf*, char*, Spat**); + +/* + * The canonicalizer: convert input to canonical representation + */ +char* +readmsg(Biobuf *bp, int *hsize, int *bufsize) +{ + char *p, *buf; + int n, offset, eoh, bsize, delta; + + buf = 0; + offset = 0; + if(bufsize) + *bufsize = 0; + if(hsize) + *hsize = 0; + for(;;) { + buf = Realloc(buf, offset+Quanta+1); + n = Bread(bp, buf+offset, Quanta); + if(n < 0){ + free(buf); + return 0; + } + p = buf+offset; /* start of this chunk */ + offset += n; /* end of this chunk */ + buf[offset] = 0; + if(n == 0){ + if(offset == 0) + return 0; + break; + } + + if(hsize == 0) /* don't process header */ + break; + if(p != buf && p[-1] == '\n') /* check for EOH across buffer split */ + p--; + p = endofhdr(p, buf+offset); + if(p) + break; + if(offset >= Maxread) /* gargantuan header - just punt*/ + { + if(hsize) + *hsize = offset; + if(bufsize) + *bufsize = offset; + return buf; + } + } + eoh = p-buf; /* End of header */ + bsize = offset - eoh; /* amount of body already read */ + + /* Read at least Minbody bytes of the body */ + if (bsize < Minbody){ + delta = Minbody-bsize; + buf = Realloc(buf, offset+delta+1); + n = Bread(bp, buf+offset, delta); + if(n > 0) { + offset += n; + buf[offset] = 0; + } + } + if(hsize) + *hsize = eoh; + if(bufsize) + *bufsize = offset; + return buf; +} + +static int +isword(Word *wp, char *text, int len) +{ + for(;wp->string; wp++) + if(len >= wp->n && strncmp(text, wp->string, wp->n) == 0) + return 1; + return 0; +} + +static char* +endofhdr(char *raw, char *end) +{ + int i; + char *p, *q; + char buf[HdrMax]; + + /* + * can't use strchr to search for newlines because + * there may be embedded NULL's. + */ + for(p = raw; p < end; p++){ + if(*p != '\n' || p[1] != '\n') + continue; + p++; + for(i = 0, q = p+1; i < sizeof(buf) && *q; q++){ + buf[i++] = tolower(*q); + if(*q == ':' || *q == '\n') + break; + } + if(!isword(hdrwords, buf, i)) + return p+1; + } + return 0; +} + +static int +htmlmatch(Word *wp, char *text, char *end, int *n) +{ + char *cp; + int i, c, lastc; + char buf[MaxHtml]; + + /* + * extract a string up to '>' + */ + + i = lastc = 0; + cp = text; + while (cp < end && i < sizeof(buf)-1){ + c = *cp++; + if(c == '=') + c = escape(&cp); + switch(c){ + case 0: + case '\r': + continue; + case '>': + goto out; + case '\n': + case ' ': + case '\t': + if(lastc == ' ') + continue; + c = ' '; + break; + default: + c = tolower(c); + break; + } + buf[i++] = lastc = c; + } +out: + buf[i] = 0; + if(n) + *n = cp-text; + return isword(wp, buf, i); +} + +static int +escape(char **msg) +{ + int c; + char *p; + + p = *msg; + c = *p; + if(c == '\n'){ + p++; + c = *p++; + } else + if(c == '2'){ + c = tolower(p[1]); + if(c == 'e'){ + p += 2; + c = '.'; + }else + if(c == 'f'){ + p += 2; + c = '/'; + }else + if(c == '0'){ + p += 2; + c = ' '; + } + else c = '='; + } else { + if(c == '3' && tolower(p[1]) == 'd') + p += 2; + c = '='; + } + *msg = p; + return c; +} + +static int +htmlchk(char **msg, char *end) +{ + int n; + char *p; + + static int ishtml; + + p = *msg; + if(ishtml == 0){ + ishtml = htmlmatch(htmlcmds, p, end, &n); + + /* If not an HTML keyword, check if it's + * an HTML comment (<!comment>). if so, + * skip over it; otherwise copy it in. + */ + if(ishtml == 0 && *p != '!') /* not comment */ + return '<'; /* copy it */ + + } else if(htmlmatch(hrefs, p, end, &n)) /* if special HTML string */ + return '<'; /* copy it */ + + /* + * this is an uninteresting HTML command; skip over it. + */ + p += n; + *msg = p+1; + return *p; +} + +/* + * decode a base 64 encode body + */ +void +conv64(char *msg, char *end, char *buf, int bufsize) +{ + int len, i; + char *cp; + + len = end - msg; + i = (len*3)/4+1; // room for max chars + null + cp = Malloc(i); + len = dec64((uchar*)cp, i, msg, len); + convert(cp, cp+len, buf, bufsize, 1); + free(cp); +} + +int +convert(char *msg, char *end, char *buf, int bufsize, int isbody) +{ + + char *p; + int c, lastc, base64; + + lastc = 0; + base64 = 0; + while(msg < end && bufsize > 0){ + c = *msg++; + + /* + * In the body only, try to strip most HTML and + * replace certain MIME escape sequences with the character + */ + if(isbody) { + do{ + p = msg; + if(c == '<') + c = htmlchk(&msg, end); + if(c == '=') + c = escape(&msg); + } while(p != msg && p < end); + } + switch(c){ + case 0: + case '\r': + continue; + case '\t': + case ' ': + case '\n': + if(lastc == ' ') + continue; + c = ' '; + break; + case 'C': /* check for MIME base 64 encoding in header */ + case 'c': + if(isbody == 0) + if(msg < end-32 && *msg == 'o' && msg[1] == 'n') + if(cistrncmp(msg+2, "tent-transfer-encoding: base64", 30) == 0) + base64 = 1; + c = 'c'; + break; + default: + c = tolower(c); + break; + } + *buf++ = c; + lastc = c; + bufsize--; + } + *buf = 0; + return base64; +} + +/* + * The pattern parser: build data structures from the pattern file + */ + +static int +hash(int c) +{ + return c & 127; +} + +static int +findkey(char *val) +{ + Keyword *kp; + + for(kp = keywords; kp->string; kp++) + if(strcmp(val, kp->string) == 0) + break; + return kp->value; +} + +#define whitespace(c) ((c) == ' ' || (c) == '\t') + +void +parsepats(Biobuf *bp) +{ + Pattern *p, *new; + char *cp, *qp; + int type, action, n, h; + Spat *spat; + + for(;;){ + cp = Brdline(bp, '\n'); + if(cp == 0) + break; + cp[Blinelen(bp)-1] = 0; + while(*cp == ' ' || *cp == '\t') + cp++; + if(*cp == '#' || *cp == 0) + continue; + type = regexp; + if(*cp == '*'){ + type = string; + cp++; + } + qp = strchr(cp, ':'); + if(qp == 0) + continue; + *qp = 0; + if(debug) + fprint(2, "action = %s\n", cp); + action = findkey(cp); + if(action >= Nactions) + continue; + cp = qp+1; + n = extract(cp); + if(n <= 0 || *cp == 0) + continue; + + qp = strstr(cp, "~~"); + if(qp){ + *qp = 0; + n = strlen(cp); + } + if(debug) + fprint(2, " Pattern: `%s'\n", cp); + + /* Hook regexps into a chain */ + if(type == regexp) { + new = Malloc(sizeof(Pattern)); + new->action = action; + new->pat = regcomp(cp); + if(new->pat == 0){ + free(new); + continue; + } + new->type = regexp; + new->alt = 0; + new->next = 0; + + if(qp) + parsealt(bp, qp+2, &new->alt); + + new->next = patterns[action].regexps; + patterns[action].regexps = new; + continue; + + } + /* not a Regexp - hook strings into Pattern hash chain */ + spat = Malloc(sizeof(*spat)); + spat->next = 0; + spat->alt = 0; + spat->len = n; + spat->string = Malloc(n+1); + spat->c1 = cp[1]; + strcpy(spat->string, cp); + + if(qp) + parsealt(bp, qp+2, &spat->alt); + + p = patterns[action].strings; + if(p == 0) { + p = Malloc(sizeof(Pattern)); + memset(p, 0, sizeof(*p)); + p->action = action; + p->type = string; + patterns[action].strings = p; + } + h = hash(*spat->string); + spat->next = p->spat[h]; + p->spat[h] = spat; + } +} + +static void +parsealt(Biobuf *bp, char *cp, Spat** head) +{ + char *p; + Spat *alt; + + while(cp){ + if(*cp == 0){ /*escaped newline*/ + do{ + cp = Brdline(bp, '\n'); + if(cp == 0) + return; + cp[Blinelen(bp)-1] = 0; + } while(extract(cp) <= 0 || *cp == 0); + } + + p = cp; + cp = strstr(p, "~~"); + if(cp){ + *cp = 0; + cp += 2; + } + if(strlen(p)){ + alt = Malloc(sizeof(*alt)); + alt->string = strdup(p); + alt->next = *head; + *head = alt; + } + } +} + +static int +extract(char *cp) +{ + int c; + char *p, *q, *r; + + p = q = r = cp; + while(whitespace(*p)) + p++; + while(c = *p++){ + if (c == '#') + break; + if(c == '"'){ + while(*p && *p != '"'){ + if(*p == '\\' && p[1] == '"') + p++; + if('A' <= *p && *p <= 'Z') + *q++ = *p++ + ('a'-'A'); + else + *q++ = *p++; + } + if(*p) + p++; + r = q; /* never back up over a quoted string */ + } else { + if('A' <= c && c <= 'Z') + c += ('a'-'A'); + *q++ = c; + } + } + while(q > r && whitespace(q[-1])) + q--; + *q = 0; + return q-cp; +} + +/* + * The matching engine: compare canonical input to pattern structures + */ + +static Spat* +isalt(char *message, Spat *alt) +{ + while(alt) { + if(*cmd) + if(message != cmd && strstr(cmd, alt->string)) + break; + if(message != header+1 && strstr(header+1, alt->string)) + break; + if(strstr(message, alt->string)) + break; + alt = alt->next; + } + return alt; +} + +int +matchpat(Pattern *p, char *message, Resub *m) +{ + Spat *spat; + char *s; + int c, c1; + + if(p->type == string){ + c1 = *message; + for(s=message; c=c1; s++){ + c1 = s[1]; + for(spat=p->spat[hash(c)]; spat; spat=spat->next){ + if(c1 == spat->c1) + if(memcmp(s, spat->string, spat->len) == 0) + if(!isalt(message, spat->alt)){ + m->sp = s; + m->ep = s + spat->len; + return 1; + } + } + } + return 0; + } + m->sp = m->ep = 0; + if(regexec(p->pat, message, m, 1) == 0) + return 0; + if(isalt(message, p->alt)) + return 0; + return 1; +} + + +void +xprint(int fd, char *type, Resub *m) +{ + char *p, *q; + int i; + + if(m->sp == 0 || m->ep == 0) + return; + + /* back up approx 30 characters to whitespace */ + for(p = m->sp, i = 0; *p && i < 30; i++, p--) + ; + while(*p && *p != ' ') + p--; + p++; + + /* grab about 30 more chars beyond the end of the match */ + for(q = m->ep, i = 0; *q && i < 30; i++, q++) + ; + while(*q && *q != ' ') + q++; + + fprint(fd, "%s %.*s~%.*s~%.*s\n", type, (int)(m->sp-p), p, (int)(m->ep-m->sp), m->sp, (int)(q-m->ep), m->ep); +} + +enum { + INVAL= 255 +}; + +static uchar t64d[256] = { +/*00 */ INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, + INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, +/*10*/ INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, + INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, +/*20*/ INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, + INVAL, INVAL, INVAL, 62, INVAL, INVAL, INVAL, 63, +/*30*/ 52, 53, 54, 55, 56, 57, 58, 59, + 60, 61, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, +/*40*/ INVAL, 0, 1, 2, 3, 4, 5, 6, + 7, 8, 9, 10, 11, 12, 13, 14, +/*50*/ 15, 16, 17, 18, 19, 20, 21, 22, + 23, 24, 25, INVAL, INVAL, INVAL, INVAL, INVAL, +/*60*/ INVAL, 26, 27, 28, 29, 30, 31, 32, + 33, 34, 35, 36, 37, 38, 39, 40, +/*70*/ 41, 42, 43, 44, 45, 46, 47, 48, + 49, 50, 51, INVAL, INVAL, INVAL, INVAL, INVAL, +/*80*/ INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, + INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, +/*90*/ INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, + INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, +/*A0*/ INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, + INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, +/*B0*/ INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, + INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, +/*C0*/ INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, + INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, +/*D0*/ INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, + INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, +/*E0*/ INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, + INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, +/*F0*/ INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, + INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, +}; diff --git a/src/cmd/upas/scanmail/mkfile b/src/cmd/upas/scanmail/mkfile new file mode 100644 index 00000000..5f0db855 --- /dev/null +++ b/src/cmd/upas/scanmail/mkfile @@ -0,0 +1,24 @@ +</$objtype/mkfile + +TARG=scanmail\ + testscan + +OFILES= common.$O + +HFILES= spam.h\ + ../common/sys.h\ + +LIB= ../common/libcommon.a$O\ + +BIN=/$objtype/bin/upas +UPDATE=\ + mkfile\ + $HFILES\ + ${OFILES:%.$O=%.c}\ + ${TARG:%=%.c}\ + +</sys/src/cmd/mkmany +CFLAGS=$CFLAGS -I../common + +scanmail.$O: scanmail.c + $CC $CFLAGS -D'SPOOL="/mail"' scanmail.c diff --git a/src/cmd/upas/scanmail/scanmail.c b/src/cmd/upas/scanmail/scanmail.c new file mode 100644 index 00000000..444bbcdd --- /dev/null +++ b/src/cmd/upas/scanmail/scanmail.c @@ -0,0 +1,476 @@ +#include "common.h" +#include "spam.h" + +int cflag; +int debug; +int hflag; +int nflag; +int sflag; +int tflag; +int vflag; +Biobuf bin, bout, *cout; + + /* file names */ +char patfile[128]; +char linefile[128]; +char holdqueue[128]; +char copydir[128]; + +char header[Hdrsize+2]; +char cmd[1024]; +char **qname; +char **qdir; +char *sender; +String *recips; + +char* canon(Biobuf*, char*, char*, int*); +int matcher(char*, Pattern*, char*, Resub*); +int matchaction(int, char*, Resub*); +Biobuf *opencopy(char*); +Biobuf *opendump(char*); +char *qmail(char**, char*, int, Biobuf*); +void saveline(char*, char*, Resub*); +int optoutofspamfilter(char*); + +void +usage(void) +{ + fprint(2, "missing or bad arguments to qer\n"); + exits("usage"); +} + +void +regerror(char *s) +{ + fprint(2, "scanmail: %s\n", s); +} + +void * +Malloc(long n) +{ + void *p; + + p = malloc(n); + if(p == 0) + exits("malloc"); + return p; +} + +void* +Realloc(void *p, ulong n) +{ + p = realloc(p, n); + if(p == 0) + exits("realloc"); + return p; +} + +void +main(int argc, char *argv[]) +{ + int i, n, nolines, optout; + char **args, **a, *cp, *buf; + char body[Bodysize+2]; + Resub match[1]; + Biobuf *bp; + + optout = 1; + a = args = Malloc((argc+1)*sizeof(char*)); + sprint(patfile, "%s/patterns", UPASLIB); + sprint(linefile, "%s/lines", UPASLOG); + sprint(holdqueue, "%s/queue.hold", SPOOL); + sprint(copydir, "%s/copy", SPOOL); + + *a++ = argv[0]; + for(argc--, argv++; argv[0] && argv[0][0] == '-'; argc--, argv++){ + switch(argv[0][1]){ + case 'c': /* save copy of message */ + cflag = 1; + break; + case 'd': /* debug */ + debug++; + *a++ = argv[0]; + break; + case 'h': /* queue held messages by sender domain */ + hflag = 1; /* -q flag must be set also */ + break; + case 'n': /* NOHOLD mode */ + nflag = 1; + break; + case 'p': /* pattern file */ + if(argv[0][2] || argv[1] == 0) + usage(); + argc--; + argv++; + strecpy(patfile, patfile+sizeof patfile, *argv); + break; + case 'q': /* queue name */ + if(argv[0][2] || argv[1] == 0) + usage(); + *a++ = argv[0]; + argc--; + argv++; + qname = a; + *a++ = argv[0]; + break; + case 's': /* save copy of dumped message */ + sflag = 1; + break; + case 't': /* test mode - don't log match + * and write message to /dev/null + */ + tflag = 1; + break; + case 'v': /* vebose - print matches */ + vflag = 1; + break; + default: + *a++ = argv[0]; + break; + } + } + + if(argc < 3) + usage(); + + Binit(&bin, 0, OREAD); + bp = Bopen(patfile, OREAD); + if(bp){ + parsepats(bp); + Bterm(bp); + } + qdir = a; + sender = argv[2]; + + /* copy the rest of argv, acummulating the recipients as we go */ + for(i = 0; argv[i]; i++){ + *a++ = argv[i]; + if(i < 4) /* skip queue, 'mail', sender, dest sys */ + continue; + /* recipients and smtp flags - skip the latter*/ + if(strcmp(argv[i], "-g") == 0){ + *a++ = argv[++i]; + continue; + } + if(recips) + s_append(recips, ", "); + else + recips = s_new(); + s_append(recips, argv[i]); + if(optout && !optoutofspamfilter(argv[i])) + optout = 0; + } + *a = 0; + /* construct a command string for matching */ + snprint(cmd, sizeof(cmd)-1, "%s %s", sender, s_to_c(recips)); + cmd[sizeof(cmd)-1] = 0; + for(cp = cmd; *cp; cp++) + *cp = tolower(*cp); + + /* canonicalize a copy of the header and body. + * buf points to orginal message and n contains + * number of bytes of original message read during + * canonicalization. + */ + *body = 0; + *header = 0; + buf = canon(&bin, header+1, body+1, &n); + if (buf == 0) + exits("read"); + + /* if all users opt out, don't try matches */ + if(optout){ + if(cflag) + cout = opencopy(sender); + exits(qmail(args, buf, n, cout)); + } + + /* Turn off line logging, if command line matches */ + nolines = matchaction(Lineoff, cmd, match); + + for(i = 0; patterns[i].action; i++){ + /* Lineoff patterns were already done above */ + if(i == Lineoff) + continue; + /* don't apply "Line" patterns if excluded above */ + if(nolines && i == SaveLine) + continue; + /* apply patterns to the sender/recips, header and body */ + if(matchaction(i, cmd, match)) + break; + if(matchaction(i, header+1, match)) + break; + if(i == HoldHeader) + continue; + if(matchaction(i, body+1, match)) + break; + } + if(cflag && patterns[i].action == 0) /* no match found - save msg */ + cout = opencopy(sender); + + exits(qmail(args, buf, n, cout)); +} + +char* +qmail(char **argv, char *buf, int n, Biobuf *cout) +{ + Waitmsg *status; + int i, pid, pipefd[2]; + char path[512]; + Biobuf *bp; + + pid = 0; + if(tflag == 0){ + if(pipe(pipefd) < 0) + exits("pipe"); + pid = fork(); + if(pid == 0){ + dup(pipefd[0], 0); + for(i = sysfiles(); i >= 3; i--) + close(i); + snprint(path, sizeof(path), "%s/qer", UPASBIN); + *argv=path; + exec(path, argv); + exits("exec"); + } + Binit(&bout, pipefd[1], OWRITE); + bp = &bout; + } else + bp = Bopen("/dev/null", OWRITE); + + while(n > 0){ + Bwrite(bp, buf, n); + if(cout) + Bwrite(cout, buf, n); + n = Bread(&bin, buf, sizeof(buf)-1); + } + Bterm(bp); + if(cout) + Bterm(cout); + if(tflag) + return 0; + + close(pipefd[1]); + close(pipefd[0]); + for(;;){ + status = wait(); + if(status == nil || status->pid == pid) + break; + free(status); + } + if(status == nil) + strcpy(buf, "wait failed"); + else{ + strcpy(buf, status->msg); + free(status); + } + return buf; +} + +char* +canon(Biobuf *bp, char *header, char *body, int *n) +{ + int hsize; + char *raw; + + hsize = 0; + *header = 0; + *body = 0; + raw = readmsg(bp, &hsize, n); + if(raw){ + if(convert(raw, raw+hsize, header, Hdrsize, 0)) + conv64(raw+hsize, raw+*n, body, Bodysize); /* base64 */ + else + convert(raw+hsize, raw+*n, body, Bodysize, 1); /* text */ + } + return raw; +} + +int +matchaction(int action, char *message, Resub *m) +{ + char *name; + Pattern *p; + + if(message == 0 || *message == 0) + return 0; + + name = patterns[action].action; + p = patterns[action].strings; + if(p) + if(matcher(name, p, message, m)) + return 1; + + for(p = patterns[action].regexps; p; p = p->next) + if(matcher(name, p, message, m)) + return 1; + return 0; +} + +int +matcher(char *action, Pattern *p, char *message, Resub *m) +{ + char *cp; + String *s; + + for(cp = message; matchpat(p, cp, m); cp = m->ep){ + switch(p->action){ + case SaveLine: + if(vflag) + xprint(2, action, m); + saveline(linefile, sender, m); + break; + case HoldHeader: + case Hold: + if(nflag) + continue; + if(vflag) + xprint(2, action, m); + *qdir = holdqueue; + if(hflag && qname){ + cp = strchr(sender, '!'); + if(cp){ + *cp = 0; + *qname = strdup(sender); + *cp = '!'; + } else + *qname = strdup(sender); + } + return 1; + case Dump: + if(vflag) + xprint(2, action, m); + *(m->ep) = 0; + if(!tflag){ + s = s_new(); + s_append(s, sender); + s = unescapespecial(s); + syslog(0, "smtpd", "Dumped %s [%s] to %s", s_to_c(s), m->sp, + s_to_c(s_restart(recips))); + s_free(s); + } + tflag = 1; + if(sflag) + cout = opendump(sender); + return 1; + default: + break; + } + } + return 0; +} + +void +saveline(char *file, char *sender, Resub *rp) +{ + char *p, *q; + int i, c; + Biobuf *bp; + + if(rp->sp == 0 || rp->ep == 0) + return; + /* back up approx 20 characters to whitespace */ + for(p = rp->sp, i = 0; *p && i < 20; i++, p--) + ; + while(*p && *p != ' ') + p--; + p++; + + /* grab about 20 more chars beyond the end of the match */ + for(q = rp->ep, i = 0; *q && i < 20; i++, q++) + ; + while(*q && *q != ' ') + q++; + + c = *q; + *q = 0; + bp = sysopen(file, "al", 0644); + if(bp){ + Bprint(bp, "%s-> %s\n", sender, p); + Bterm(bp); + } + else if(debug) + fprint(2, "can't save line: (%s) %s\n", sender, p); + *q = c; +} + +Biobuf* +opendump(char *sender) +{ + int i; + ulong h; + char buf[512]; + Biobuf *b; + char *cp; + + cp = ctime(time(0)); + cp[7] = 0; + cp[10] = 0; + if(cp[8] == ' ') + sprint(buf, "%s/queue.dump/%s%c", SPOOL, cp+4, cp[9]); + else + sprint(buf, "%s/queue.dump/%s%c%c", SPOOL, cp+4, cp[8], cp[9]); + cp = buf+strlen(buf); + if(access(buf, 0) < 0 && sysmkdir(buf, 0777) < 0){ + syslog(0, "smtpd", "couldn't dump mail from %s: %r", sender); + return 0; + } + + h = 0; + while(*sender) + h = h*257 + *sender++; + for(i = 0; i < 50; i++){ + h += lrand(); + sprint(cp, "/%lud", h); + b = sysopen(buf, "wlc", 0644); + if(b){ + if(vflag) + fprint(2, "saving in %s\n", buf); + return b; + } + } + return 0; +} + +Biobuf* +opencopy(char *sender) +{ + int i; + ulong h; + char buf[512]; + Biobuf *b; + + h = 0; + while(*sender) + h = h*257 + *sender++; + for(i = 0; i < 50; i++){ + h += lrand(); + sprint(buf, "%s/%lud", copydir, h); + b = sysopen(buf, "wlc", 0600); + if(b) + return b; + } + return 0; +} + +int +optoutofspamfilter(char *addr) +{ + char *p, *f; + int rv; + + p = strchr(addr, '!'); + if(p) + p++; + else + p = addr; + + rv = 0; + f = smprint("/mail/box/%s/nospamfiltering", p); + if(f != nil){ + rv = access(f, 0)==0; + free(f); + } + + return rv; +} diff --git a/src/cmd/upas/scanmail/spam.h b/src/cmd/upas/scanmail/spam.h new file mode 100644 index 00000000..f1d24b2e --- /dev/null +++ b/src/cmd/upas/scanmail/spam.h @@ -0,0 +1,62 @@ + +enum{ + Dump = 0, /* Actions must be in order of descending importance */ + HoldHeader, + Hold, + SaveLine, + Lineoff, /* Lineoff must be the last action code */ + Nactions, + + Nhash = 128, + + regexp = 1, /* types: literal string or regular expression */ + string = 2, + + MaxHtml = 256, + Hdrsize = 4096, + Bodysize = 8192, + Maxread = 64*1024, +}; + +typedef struct spat Spat; +typedef struct pattern Pattern; +typedef struct patterns Patterns; +struct spat +{ + char* string; + int len; + int c1; + Spat* next; + Spat* alt; +}; + +struct pattern{ + struct pattern *next; + int action; + int type; + Spat* alt; + union{ + Reprog* pat; + Spat* spat[Nhash]; + }; +}; + +struct patterns { + char *action; + Pattern *strings; + Pattern *regexps; +}; + +extern int debug; +extern Patterns patterns[]; +extern char header[]; +extern char cmd[]; + +extern void conv64(char*, char*, char*, int); +extern int convert(char*, char*, char*, int, int); +extern void* Malloc(long n); +extern int matchpat(Pattern*, char*, Resub*); +extern char* readmsg(Biobuf*, int*, int*); +extern void parsepats(Biobuf*); +extern void* Realloc(void*, ulong); +extern void xprint(int, char*, Resub*); diff --git a/src/cmd/upas/scanmail/testscan.c b/src/cmd/upas/scanmail/testscan.c new file mode 100644 index 00000000..e5ea59ad --- /dev/null +++ b/src/cmd/upas/scanmail/testscan.c @@ -0,0 +1,212 @@ +#include "sys.h" +#include "spam.h" + +int debug; +Biobuf bin; +char patfile[128], header[Hdrsize+2]; +char cmd[1024]; + +char* canon(Biobuf*, char*, char*, int*); +int matcher(char *, Pattern*, char*, Resub*); +int matchaction(Patterns*, char*); + +void +usage(void) +{ + fprint(2, "missing or bad arguments to qer\n"); + exits("usage"); +} + +void * +Malloc(long n) +{ + void *p; + + p = malloc(n); + if(p == 0){ + fprint(2, "malloc error"); + exits("malloc"); + } + return p; +} + +void* +Realloc(void *p, ulong n) +{ + p = realloc(p, n); + if(p == 0){ + fprint(2, "realloc error"); + exits("realloc"); + } + return p; +} + +void +dumppats(void) +{ + int i, j; + Pattern *p; + Spat *s, *q; + + for(i = 0; patterns[i].action; i++){ + for(p = patterns[i].regexps; p; p = p->next){ + print("%s <REGEXP>\n", patterns[i].action); + if(p->alt) + print("Alt:"); + for(s = p->alt; s; s = s->next) + print("\t%s\n", s->string); + } + p = patterns[i].strings; + if(p == 0) + continue; + + for(j = 0; j < Nhash; j++){ + for(s = p->spat[j]; s; s = s->next){ + print("%s %s\n", patterns[i].action, s->string); + if(s->alt) + print("Alt:"); + for(q = s->alt; q; q = q->next) + print("\t%s\n", q->string); + } + } + } +} + +void +main(int argc, char *argv[]) +{ + int i, fd, n, aflag, vflag; + char body[Bodysize+2], *raw, *ret; + Biobuf *bp; + + sprint(patfile, "%s/patterns", UPASLIB); + aflag = -1; + vflag = 0; + ARGBEGIN { + case 'a': + aflag = 1; + break; + case 'v': + vflag = 1; + break; + case 'd': + debug++; + break; + case 'p': + strcpy(patfile,ARGF()); + break; + } ARGEND + + bp = Bopen(patfile, OREAD); + if(bp){ + parsepats(bp); + Bterm(bp); + } + + if(argc >= 1){ + fd = open(*argv, OREAD); + if(fd < 0){ + fprint(2, "can't open %s\n", *argv); + exits("open"); + } + Binit(&bin, fd, OREAD); + } else + Binit(&bin, 0, OREAD); + + *body = 0; + *header = 0; + ret = 0; + for(;;){ + raw = canon(&bin, header+1, body+1, &n); + if(raw == 0) + break; + if(aflag == 0) + continue; + if(aflag < 0) + aflag = 0; + if(vflag){ + if(header[1]) { + fprint(2, "\t**** Header ****\n\n"); + write(2, header+1, strlen(header+1)); + fprint(2, "\n"); + } + fprint(2, "\t**** Body ****\n\n"); + if(body[1]) + write(2, body+1, strlen(body+1)); + fprint(2, "\n"); + } + + for(i = 0; patterns[i].action; i++){ + if(matchaction(&patterns[i], header+1)) + ret = patterns[i].action; + if(i == HoldHeader) + continue; + if(matchaction(&patterns[i], body+1)) + ret = patterns[i].action; + } + } + exits(ret); +} + +char* +canon(Biobuf *bp, char *header, char *body, int *n) +{ + int hsize, base64; + + static char *raw; + + hsize = 0; + base64 = 0; + *header = 0; + *body = 0; + if(raw == 0){ + raw = readmsg(bp, &hsize, n); + if(raw) + base64 = convert(raw, raw+hsize, header, Hdrsize, 0); + } else { + free(raw); + raw = readmsg(bp, 0, n); + } + if(raw){ + if(base64) + conv64(raw+hsize, raw+*n, body, Bodysize); + else + convert(raw+hsize, raw+*n, body, Bodysize, 1); + } + return raw; +} + +int +matchaction(Patterns *pp, char *message) +{ + char *name, *cp; + int ret; + Pattern *p; + Resub m[1]; + + if(message == 0 || *message == 0) + return 0; + + name = pp->action; + p = pp->strings; + ret = 0; + if(p) + for(cp = message; matcher(name, p, cp, m); cp = m[0].ep) + ret++; + + for(p = pp->regexps; p; p = p->next) + for(cp = message; matcher(name, p, cp, m); cp = m[0].ep) + ret++; + return ret; +} + +int +matcher(char *action, Pattern *p, char *message, Resub *m) +{ + if(matchpat(p, message, m)){ + if(p->action != Lineoff) + xprint(1, action, m); + return 1; + } + return 0; +} |