| /* |
| * RFC822 message tokenizer (really feature generator) for spam filter. |
| * |
| * See Paul Graham's musings on spam filtering for theory. |
| */ |
| |
| #include <u.h> |
| #include <libc.h> |
| #include <bio.h> |
| #include <regexp.h> |
| #include <ctype.h> |
| #include "dfa.h" |
| |
| void buildre(Dreprog*[3]); |
| int debug; |
| char *refile = "#9/mail/lib/classify.re"; |
| int maxtoklen = 20; |
| int trim(char*); |
| |
| void |
| usage(void) |
| { |
| fprint(2, "usage: msgtok [-D] [-r /mail/lib/classify.re] [file]\n"); |
| exits("usage"); |
| } |
| |
| void |
| main(int argc, char **argv) |
| { |
| int i, hdr, n, eof, off; |
| Dreprog *re[3]; |
| int m[3]; |
| char *p, *ep, *tag; |
| Biobuf bout, bin; |
| char msg[1024+1]; |
| char buf[1024]; |
| |
| refile = unsharp(refile); |
| buildre(re); |
| ARGBEGIN{ |
| case 'D': |
| debug = 1; |
| break; |
| case 'n': |
| maxtoklen = atoi(EARGF(usage())); |
| break; |
| case 'r': |
| refile = EARGF(usage()); |
| break; |
| default: |
| usage(); |
| }ARGEND; |
| |
| if(argc > 1) |
| usage(); |
| if(argc == 1){ |
| close(0); |
| if(open(argv[0], OREAD) < 0) |
| sysfatal("open %s: %r", argv[0]); |
| } |
| |
| tag = nil; |
| Binit(&bin, 0, OREAD); |
| Binit(&bout, 1, OWRITE); |
| ep = msg; |
| p = msg; |
| eof = 0; |
| off = 0; |
| hdr = 1; |
| for(;;){ |
| /* replenish buffer */ |
| if(ep - p < 512 && !eof){ |
| if(p > msg + 1){ |
| n = ep - p; |
| memmove(msg, p-1, ep-(p-1)); |
| off += (p-1) - msg; |
| p = msg+1; |
| ep = p + n; |
| } |
| n = Bread(&bin, ep, msg+(sizeof msg - 1)- ep); |
| if(n < 0) |
| sysfatal("read error: %r"); |
| if(n == 0) |
| eof = 1; |
| ep += n; |
| *ep = 0; |
| } |
| if(p >= ep) |
| break; |
| |
| if(*p == 0){ |
| p++; |
| continue; |
| } |
| |
| if(hdr && p[-1]=='\n'){ |
| if(p[0]=='\n') |
| hdr = 0; |
| else if(cistrncmp(p-1, "\nfrom:", 6) == 0) |
| tag = "From*"; |
| else if(cistrncmp(p-1, "\nto:", 4) == 0) |
| tag = "To*"; |
| else if(cistrncmp(p-1, "\nsubject:", 9) == 0) |
| tag = "Subject*"; |
| else if(cistrncmp(p-1, "\nreturn-path:", 13) == 0) |
| tag = "Return-Path*"; |
| else |
| tag = nil; |
| } |
| m[0] = dregexec(re[0], p, p==msg || p[-1]=='\n'); |
| m[1] = dregexec(re[1], p, p==msg || p[-1]=='\n'); |
| m[2] = dregexec(re[2], p, p==msg || p[-1]=='\n'); |
| |
| n = m[0]; |
| if(n < m[1]) |
| n = m[1]; |
| if(n < m[2]) |
| n = m[2]; |
| if(n <= 0){ |
| fprint(2, "«%s» %.2ux", p, p[0]); |
| sysfatal("no regexps matched at %ld", off + (p-msg)); |
| } |
| |
| if(m[0] >= m[1] && m[0] >= m[2]){ |
| /* "From " marks start of new message */ |
| Bprint(&bout, "*From*\n"); |
| n = m[0]; |
| hdr = 1; |
| }else if(m[2] > 1){ |
| /* ignore */ |
| n = m[2]; |
| }else if(m[1] >= m[0] && m[1] >= m[2] && m[1] > 2 && m[1] <= maxtoklen){ |
| /* keyword */ |
| /* should do UTF-aware lowercasing, too much bother */ |
| /* |
| for(i=0; i<n; i++) |
| if('A' <= p[i] && p[i] <= 'Z') |
| p[i] += 'a' - 'A'; |
| */ |
| if(tag){ |
| i = strlen(tag); |
| memmove(buf, tag, i); |
| memmove(buf+i, p, m[1]); |
| buf[i+m[1]] = 0; |
| }else{ |
| memmove(buf, p, m[1]); |
| buf[m[1]] = 0; |
| } |
| Bprint(&bout, "%s\n", buf); |
| while(trim(buf) >= 0) |
| Bprint(&bout, "stem*%s\n", buf); |
| n = m[1]; |
| }else |
| n = m[2]; |
| if(debug) |
| fprint(2, "%.*s¦", utfnlen(p, n), p); |
| p += n; |
| } |
| Bterm(&bout); |
| exits(0); |
| } |
| |
| void |
| buildre(Dreprog *re[3]) |
| { |
| Biobuf *b; |
| |
| if((b = Bopen(refile, OREAD)) == nil) |
| sysfatal("open %s: %r", refile); |
| |
| re[0] = Breaddfa(b); |
| re[1] = Breaddfa(b); |
| re[2] = Breaddfa(b); |
| |
| if(re[0]==nil || re[1]==nil || re[2]==nil) |
| sysfatal("Breaddfa: %r"); |
| Bterm(b); |
| } |
| |
| /* perhaps this belongs in the tokenizer */ |
| int |
| trim(char *s) |
| { |
| char *p, *op; |
| int mix, mix1; |
| |
| if(*s == '*') |
| return -1; |
| |
| /* strip leading punctuation */ |
| p = strchr(s, '*'); |
| if(p == nil) |
| p = s; |
| while(*p && !isalpha(*p)) |
| p++; |
| if(strlen(p) < 2) |
| { |
| return -1; |
| } |
| memmove(s, p, strlen(p)+1); |
| |
| /* strip suffix of punctuation */ |
| p = s+strlen(s); |
| op = p; |
| while(p > s && (uchar)p[-1]<0x80 && !isalpha(p[-1])) |
| p--; |
| |
| /* chop punctuation */ |
| if(p > s){ |
| /* free!!! -> free! */ |
| if(p+1 < op){ |
| p[1] = 0; |
| return 0; |
| } |
| /* free! -> free */ |
| if(p < op){ |
| p[0] = 0; |
| return 0; |
| } |
| } |
| |
| mix = mix1 = 0; |
| if(isupper(s[0])) |
| mix = 1; |
| for(p=s+1; *p; p++) |
| if(isupper(*p)){ |
| mix1 = 1; |
| break; |
| } |
| |
| /* turn FREE into Free */ |
| if(mix1){ |
| for(p=s+1; *p; p++) |
| if(isupper(*p)) |
| *p += 'a'-'A'; |
| return 0; |
| } |
| |
| /* turn Free into free */ |
| if(mix){ |
| *s += 'a'-'A'; |
| return 0; |
| } |
| return -1; |
| } |
| |