|  | #include <u.h> | 
|  | #include <libc.h> | 
|  | #include <bio.h> | 
|  | #include <regexp.h> | 
|  | #include "dfa.h" | 
|  |  | 
/*
 * Regular expressions from which the DFAs are built.
 */
|  |  | 
/*
 * Patterns joined with '|' by buildre and compiled into re[2].
 *
 * Each pattern is passed through strcpycase first, which makes lower-case
 * letters OUTSIDE character classes case-insensitive.  Letters inside a
 * [...] class are left alone, so classes must list both cases themselves.
 * Tabs are written as \t so they stay visible and survive reformatting.
 */
char *ignore[] =
{
	/* HTML that isn't A, IMG, or FONT */
	/* Must have a space somewhere to avoid catching <email@address> */
	"<[ \t\n\r]*("
		"[^aifAIF]|"	/* both cases: strcpycase does not fold inside classes */
		"a[^> \t\r\n]|"
		"i[^mM \t\r\n]|"
		"im[^gG \t\r\n]|"
		"img[^> \t\r\n]|"
		"f[^oO \t\r\n]|"
		"fo[^Nn \t\r\n]|"
		"fon[^tT \t\r\n]|"
		"font[^> \r\t\n]"
	")[^>]*[ \t\n\r][^>]*>",
	/* proper prefixes of IMG and FONT used as tag names, followed by white space */
	"<[ \t\n\r]*("
		"i|im|f|fo|fon"
	")[ \t\r\n][^>]*>",

	/* ignore html comments */
	"<!--([^\\-]|-[^\\-]|--[^>]|\n)*-->",

	/* random mail strings; header lines include their continuation lines */
	"^message-id:.*\n([ \t].*\n)*",
	"^in-reply-to:.*\n([ \t].*\n)*",
	"^references:.*\n([ \t].*\n)*",
	"^date:.*\n([ \t].*\n)*",
	"^delivery-date:.*\n([ \t].*\n)*",
	"e?smtp id .*",
	"^\tid.*",
	"boundary=.*",
	"name=\"",
	"filename=\"",
	"news:<[^>]+>",
	"^--[^ \t]*$",

	/* base64 encoding */
	"^[0-9a-zA-Z+\\-=/]+$",

	/* uu encoding */
	"^[!-Z]+$",

	/* little things: any single character, or a newline */
	".",
	"\n"
};
|  |  | 
/*
 * Patterns joined with '|' by buildre and compiled into re[1].
 * The single pattern matches a run of one or more of: a character from
 * the bracketed class, or a digit followed by any number of
 * (separator '.' or ',', digit) pairs.
 * NOTE(review): the class ends in "¡-", which looks like a range whose
 * upper bound is missing -- confirm against the upstream source.
 */
char *keywords[] =
{
	"([a-zA-Z'`$!¡-]|[0-9]([.,][0-9])*)+"
};
|  |  | 
/* set by the -d option; makes dregcomp print each pattern before compiling it */
int debug;
|  |  | 
|  | Dreprog* | 
|  | dregcomp(char *buf) | 
|  | { | 
|  | Reprog *r; | 
|  | Dreprog *d; | 
|  |  | 
|  | if(debug) | 
|  | print(">>> '%s'\n", buf); | 
|  |  | 
|  | r = regcomp(buf); | 
|  | if(r == nil) | 
|  | sysfatal("regcomp"); | 
|  | d = dregcvt(r); | 
|  | if(d == nil) | 
|  | sysfatal("dregcomp"); | 
|  | free(r); | 
|  | return d; | 
|  | } | 
|  |  | 
/*
 * Copy the regular expression s to d, making it case-insensitive: every
 * lower-case ASCII letter that is outside a [...] character class and is
 * not preceded by a backslash is replaced by the class [xX].  Everything
 * else -- the contents of character classes and backslash-escaped
 * characters included -- is copied unchanged.
 *
 * d must have room for up to four bytes per byte of s; no bound is
 * checked here.  No terminating NUL is written.  Returns a pointer just
 * past the last byte stored, so calls can be chained.
 */
char*
strcpycase(char *d, char *s)
{
	int inclass, esc;

	inclass = 0;
	esc = 0;
	for(; *s; s++){
		if(esc){
			/* escaped character: literal, never a bracket or a letter to expand */
			*d++ = *s;
			esc = 0;
			continue;
		}
		if(*s == '\\'){
			*d++ = *s;
			esc = 1;
			continue;
		}
		if(inclass){
			/* only an unescaped ']' ends the class; a '[' in here is literal */
			if(*s == ']')
				inclass = 0;
			*d++ = *s;
			continue;
		}
		if(*s == '['){
			inclass = 1;
			*d++ = *s;
			continue;
		}
		if('a' <= *s && *s <= 'z'){
			*d++ = '[';
			*d++ = *s;
			*d++ = *s+'A'-'a';
			*d++ = ']';
		}else
			*d++ = *s;
	}
	return d;
}
|  |  | 
/*
 * Report a regular expression error: terminates the program via
 * sysfatal with msg prefixed by "regerror: ".
 * NOTE(review): presumably this replaces the regerror hook that the
 * regexp library calls when regcomp rejects a pattern -- confirm
 * against regexp(2).
 */
void
regerror(char *msg)
{
	sysfatal("regerror: %s", msg);
}
|  |  | 
|  | void | 
|  | buildre(Dreprog *re[3]) | 
|  | { | 
|  | int i; | 
|  | static char buf[16384], *s; | 
|  |  | 
|  | re[0] = dregcomp("^From "); | 
|  |  | 
|  | s = buf; | 
|  | for(i=0; i<nelem(keywords); i++){ | 
|  | if(i != 0) | 
|  | *s++ = '|'; | 
|  | s = strcpycase(s, keywords[i]); | 
|  | } | 
|  | *s = 0; | 
|  | re[1] = dregcomp(buf); | 
|  |  | 
|  | s = buf; | 
|  | for(i=0; i<nelem(ignore); i++){ | 
|  | if(i != 0) | 
|  | *s++ = '|'; | 
|  | s = strcpycase(s, ignore[i]); | 
|  | } | 
|  | *s = 0; | 
|  | re[2] = dregcomp(buf); | 
|  | } | 
|  |  | 
/*
 * Print the command synopsis on file descriptor 2 and exit with
 * status "usage".
 */
void
usage(void)
{
	fprint(2, "usage: regen [-d]\n");
	exits("usage");
}
|  |  | 
|  | void | 
|  | main(int argc, char **argv) | 
|  | { | 
|  | Dreprog *re[3]; | 
|  | Biobuf b; | 
|  |  | 
|  | ARGBEGIN{ | 
|  | default: | 
|  | usage(); | 
|  | case 'd': | 
|  | debug = 1; | 
|  | }ARGEND | 
|  |  | 
|  | if(argc != 0) | 
|  | usage(); | 
|  |  | 
|  | buildre(re); | 
|  | Binit(&b, 1, OWRITE); | 
|  | Bprintdfa(&b, re[0]); | 
|  | Bprintdfa(&b, re[1]); | 
|  | Bprintdfa(&b, re[2]); | 
|  | exits(0); | 
|  | } | 
|  |  | 
|  |  |