blob: 122e0c64606aa22a1e02bca5c3c83d8abbba46d0 [file] [log] [blame]
/*
* RFC822 message tokenizer (really feature generator) for spam filter.
*
* See Paul Graham's musings on spam filtering for theory.
*/
#include <u.h>
#include <libc.h>
#include <bio.h>
#include <regexp.h>
#include <ctype.h>
#include "dfa.h"
/* fills its argument with the three compiled regexps read from refile */
void buildre(Dreprog*[3]);
/* set by -D: main echoes each consumed span of input to file descriptor 2 */
int debug;
/* file holding the compiled regexps; assigned from the -r argument */
char *refile = "#9/mail/lib/classify.re";
/* assigned from the -n argument: a keyword match longer than this many bytes is not emitted */
int maxtoklen = 20;
/* one in-place stemming step on a token; returns 0 after a change, -1 when there is nothing left to do */
int trim(char*);
/*
 * Print the command-line synopsis on file descriptor 2 and exit
 * with status "usage".  Lists every option main accepts,
 * including -n (maximum keyword length).
 */
void
usage(void)
{
	fprint(2, "usage: msgtok [-D] [-n maxtoklen] [-r /mail/lib/classify.re] [file]\n");
	exits("usage");
}
void
main(int argc, char **argv)
{
int i, hdr, n, eof, off;
Dreprog *re[3];
int m[3];
char *p, *ep, *tag;
Biobuf bout, bin;
char msg[1024+1];
char buf[1024];
refile = unsharp(refile);
buildre(re);
ARGBEGIN{
case 'D':
debug = 1;
break;
case 'n':
maxtoklen = atoi(EARGF(usage()));
break;
case 'r':
refile = EARGF(usage());
break;
default:
usage();
}ARGEND;
if(argc > 1)
usage();
if(argc == 1){
close(0);
if(open(argv[0], OREAD) < 0)
sysfatal("open %s: %r", argv[0]);
}
tag = nil;
Binit(&bin, 0, OREAD);
Binit(&bout, 1, OWRITE);
ep = msg;
p = msg;
eof = 0;
off = 0;
hdr = 1;
for(;;){
/* replenish buffer */
if(ep - p < 512 && !eof){
if(p > msg + 1){
n = ep - p;
memmove(msg, p-1, ep-(p-1));
off += (p-1) - msg;
p = msg+1;
ep = p + n;
}
n = Bread(&bin, ep, msg+(sizeof msg - 1)- ep);
if(n < 0)
sysfatal("read error: %r");
if(n == 0)
eof = 1;
ep += n;
*ep = 0;
}
if(p >= ep)
break;
if(*p == 0){
p++;
continue;
}
if(hdr && p[-1]=='\n'){
if(p[0]=='\n')
hdr = 0;
else if(cistrncmp(p-1, "\nfrom:", 6) == 0)
tag = "From*";
else if(cistrncmp(p-1, "\nto:", 4) == 0)
tag = "To*";
else if(cistrncmp(p-1, "\nsubject:", 9) == 0)
tag = "Subject*";
else if(cistrncmp(p-1, "\nreturn-path:", 13) == 0)
tag = "Return-Path*";
else
tag = nil;
}
m[0] = dregexec(re[0], p, p==msg || p[-1]=='\n');
m[1] = dregexec(re[1], p, p==msg || p[-1]=='\n');
m[2] = dregexec(re[2], p, p==msg || p[-1]=='\n');
n = m[0];
if(n < m[1])
n = m[1];
if(n < m[2])
n = m[2];
if(n <= 0){
fprint(2, "«%s» %.2ux", p, p[0]);
sysfatal("no regexps matched at %ld", off + (p-msg));
}
if(m[0] >= m[1] && m[0] >= m[2]){
/* "From " marks start of new message */
Bprint(&bout, "*From*\n");
n = m[0];
hdr = 1;
}else if(m[2] > 1){
/* ignore */
n = m[2];
}else if(m[1] >= m[0] && m[1] >= m[2] && m[1] > 2 && m[1] <= maxtoklen){
/* keyword */
/* should do UTF-aware lowercasing, too much bother */
/*
for(i=0; i<n; i++)
if('A' <= p[i] && p[i] <= 'Z')
p[i] += 'a' - 'A';
*/
if(tag){
i = strlen(tag);
memmove(buf, tag, i);
memmove(buf+i, p, m[1]);
buf[i+m[1]] = 0;
}else{
memmove(buf, p, m[1]);
buf[m[1]] = 0;
}
Bprint(&bout, "%s\n", buf);
while(trim(buf) >= 0)
Bprint(&bout, "stem*%s\n", buf);
n = m[1];
}else
n = m[2];
if(debug)
fprint(2, "%.*s¦", utfnlen(p, n), p);
p += n;
}
Bterm(&bout);
exits(0);
}
/*
 * Open refile and read three compiled regexps from it, in order,
 * into re[0], re[1] and re[2].  Any failure to open the file or to
 * read one of the regexps is fatal.
 */
void
buildre(Dreprog *re[3])
{
	Biobuf *in;
	int k, failed;

	in = Bopen(refile, OREAD);
	if(in == nil)
		sysfatal("open %s: %r", refile);

	/* read all three before complaining about any of them */
	failed = 0;
	for(k = 0; k < 3; k++){
		re[k] = Breaddfa(in);
		if(re[k] == nil)
			failed = 1;
	}
	if(failed)
		sysfatal("Breaddfa: %r");

	Bterm(in);
}
/*
 * Reduce the token in s, in place, by one step toward a simpler form.
 * (Perhaps this belongs in the tokenizer.)
 *
 * Each call applies the first step that changes anything:
 *	- drop everything up to the first ASCII letter, including a
 *	  leading "tag*" prefix;
 *	- shorten a run of trailing ASCII punctuation to one character
 *	  (free!!! -> free!), or drop a single one (free! -> free);
 *	- lower-case every upper-case letter after the first (FREE -> Free);
 *	- lower-case the first letter (Free -> free).
 * Bytes >= 0x80 are never treated as trailing punctuation.
 *
 * Returns 0 when s holds a new form, -1 when s starts with '*', has
 * fewer than two bytes left after the leading strip, or cannot be
 * reduced further.  A leading prefix may have been removed from s
 * even when -1 is returned.
 */
int
trim(char *s)
{
	char *p, *op;
	int mix, mix1;

	if(*s == '*')
		return -1;

	/* strip leading punctuation, starting at the tag separator if present */
	p = strchr(s, '*');
	if(p == 0)
		p = s;
	/* ctype functions need an unsigned char value; plain char may be negative */
	while(*p && !isalpha((unsigned char)*p))
		p++;
	if(strlen(p) < 2)
		return -1;
	memmove(s, p, strlen(p)+1);

	/* strip suffix of punctuation */
	p = s+strlen(s);
	op = p;
	while(p > s && (unsigned char)p[-1] < 0x80 && !isalpha((unsigned char)p[-1]))
		p--;

	/* chop punctuation */
	if(p > s){
		/* free!!! -> free! */
		if(p+1 < op){
			p[1] = 0;
			return 0;
		}
		/* free! -> free */
		if(p < op){
			p[0] = 0;
			return 0;
		}
	}

	/* mix: first letter is upper case; mix1: some later letter is */
	mix = mix1 = 0;
	if(isupper((unsigned char)s[0]))
		mix = 1;
	for(p=s+1; *p; p++)
		if(isupper((unsigned char)*p)){
			mix1 = 1;
			break;
		}

	/* turn FREE into Free */
	if(mix1){
		for(p=s+1; *p; p++)
			if(isupper((unsigned char)*p))
				*p += 'a'-'A';
		return 0;
	}

	/* turn Free into free */
	if(mix){
		*s += 'a'-'A';
		return 0;
	}
	return -1;
}