| #include <u.h> |
| #include <libc.h> |
| #include <bio.h> |
| #include <ctype.h> |
| #include "msgdb.h" |
| |
/*
 * Write the command synopsis to file descriptor 2 and terminate
 * the process with exit status "usage".
 */
void
usage(void)
{
	static char synopsis[] =
		"usage: upas/msgclass [-a] [-d name dbfile]... [-l lockfile] [-m mul] [-t thresh] [tokenfile ...]\n";

	fprint(2, "%s", synopsis);
	exits("usage");
}
| |
/* Fixed table sizes used by the declarations in this file. */
enum
{
	MAXBEST	= 32,	/* number of slots in best[] */
	MAXLEN	= 64,	/* bytes reserved for the text of a Word */
	MAXTAB	= 256,	/* number of slots in db[] and in the per-class arrays of a Word */
};
| |
| typedef struct Ndb Ndb; |
| struct Ndb |
| { |
| char *name; |
| char *file; |
| Msgdb *db; |
| double p; |
| long nmsg; |
| }; |
| |
| typedef struct Word Word; |
| struct Word |
| { |
| char s[MAXLEN]; |
| int count[MAXTAB]; |
| double p[MAXTAB]; |
| double mp; |
| int mi; /* w.p[w.mi] = w.mp */ |
| int nmsg; |
| }; |
| |
| Ndb db[MAXTAB]; |
| int ndb; |
| |
| int add; |
| int mul; |
| Msgdb *indb; |
| |
| Word best[MAXBEST]; |
| int mbest = 15; |
| int nbest; |
| |
| void process(Biobuf*, char*); |
| void lockfile(char*); |
| |
| void |
| noteword(Word *w, char *s) |
| { |
| int i; |
| |
| for(i=nbest-1; i>=0; i--) |
| if(w->mp < best[i].mp) |
| break; |
| i++; |
| |
| if(i >= mbest) |
| return; |
| if(nbest == mbest) |
| nbest--; |
| if(i < nbest) |
| memmove(&best[i+1], &best[i], (nbest-i)*sizeof(best[0])); |
| best[i] = *w; |
| strecpy(best[i].s, best[i].s+MAXLEN, s); |
| nbest++; |
| } |
| |
| void |
| main(int argc, char **argv) |
| { |
| int i, bad, m, tot, nn, j; |
| Biobuf bin, *b, bout; |
| char *s, *lf; |
| double totp, p, thresh; |
| long n; |
| Word w; |
| |
| lf = nil; |
| thresh = 0; |
| ARGBEGIN{ |
| case 'a': |
| add = 1; |
| break; |
| case 'd': |
| if(ndb >= MAXTAB) |
| sysfatal("too many db classes"); |
| db[ndb].name = EARGF(usage()); |
| db[ndb].file = EARGF(usage()); |
| ndb++; |
| break; |
| case 'l': |
| lf = EARGF(usage()); |
| break; |
| case 'm': |
| mul = atoi(EARGF(usage())); |
| break; |
| case 't': |
| thresh = atof(EARGF(usage())); |
| break; |
| default: |
| usage(); |
| }ARGEND |
| |
| if(ndb == 0){ |
| fprint(2, "must have at least one -d option\n"); |
| usage(); |
| } |
| |
| indb = mdopen(nil, 1); |
| if(argc == 0){ |
| Binit(&bin, 0, OREAD); |
| process(&bin, "<stdin>"); |
| Bterm(&bin); |
| }else{ |
| bad = 0; |
| for(i=0; i<argc; i++){ |
| if((b = Bopen(argv[i], OREAD)) == nil){ |
| fprint(2, "opening %s: %r\n", argv[i]); |
| bad = 1; |
| continue; |
| } |
| process(b, argv[i]); |
| Bterm(b); |
| } |
| if(bad) |
| exits("open inputs"); |
| } |
| |
| lockfile(lf); |
| bad = 0; |
| for(i=0; i<ndb; i++){ |
| if((db[i].db = mdopen(db[i].file, 0)) == nil){ |
| fprint(2, "opendb %s: %r\n", db[i].file); |
| bad = 1; |
| } |
| db[i].nmsg = mdget(db[i].db, "*From*"); |
| } |
| if(bad) |
| exits("open databases"); |
| |
| /* run conditional probabilities of input words, getting 15 most specific */ |
| mdenum(indb); |
| nbest = 0; |
| while(mdnext(indb, &s, &n) >= 0){ |
| tot = 0; |
| totp = 0.0; |
| for(i=0; i<ndb; i++){ |
| nn = mdget(db[i].db, s)*(i==0 ? 3 : 1); |
| tot += nn; |
| w.count[i] = nn; |
| p = w.count[i]/(double)db[i].nmsg; |
| if(p >= 1.0) |
| p = 1.0; |
| w.p[i] = p; |
| totp += p; |
| } |
| //fprint(2, "%s tot %d totp %g\n", s, tot, totp); |
| if(tot < 2) |
| continue; |
| w.mp = 0.0; |
| for(i=0; i<ndb; i++){ |
| p = w.p[i]; |
| p /= totp; |
| if(p < 0.001) |
| p = 0.001; |
| else if(p > 0.999) |
| p = 0.999; |
| if(p > w.mp){ |
| w.mp = p; |
| w.mi = i; |
| } |
| w.p[i] = p; |
| } |
| noteword(&w, s); |
| } |
| |
| /* compute conditional probabilities of message classes using 15 most specific */ |
| totp = 0.0; |
| for(i=0; i<ndb; i++){ |
| p = 1.0; |
| for(j=0; j<nbest; j++) |
| p *= best[j].p[i]; |
| db[i].p = p; |
| totp += p; |
| } |
| for(i=0; i<ndb; i++) |
| db[i].p /= totp; |
| m = 0; |
| for(i=1; i<ndb; i++) |
| if(db[i].p > db[m].p) |
| m = i; |
| |
| Binit(&bout, 1, OWRITE); |
| if(db[m].p < thresh) |
| m = -1; |
| if(m >= 0) |
| Bprint(&bout, "%s", db[m].name); |
| else |
| Bprint(&bout, "inconclusive"); |
| for(j=0; j<ndb; j++) |
| Bprint(&bout, " %s=%g", db[j].name, db[j].p); |
| Bprint(&bout, "\n"); |
| for(i=0; i<nbest; i++){ |
| Bprint(&bout, "%s", best[i].s); |
| for(j=0; j<ndb; j++) |
| Bprint(&bout, " %s=%g", db[j].name, best[i].p[j]); |
| Bprint(&bout, "\n"); |
| } |
| Bprint(&bout, "%s %g\n", best[i].s, best[i].p[m]); |
| Bterm(&bout); |
| |
| if(m >= 0 && add){ |
| mdenum(indb); |
| while(mdnext(indb, &s, &n) >= 0) |
| mdput(db[m].db, s, mdget(db[m].db, s)+n*mul); |
| mdclose(db[m].db); |
| } |
| exits(nil); |
| } |
| |
| void |
| process(Biobuf *b, char*) |
| { |
| char *s; |
| char *p; |
| long n; |
| |
| while((s = Brdline(b, '\n')) != nil){ |
| s[Blinelen(b)-1] = 0; |
| if((p = strrchr(s, ' ')) != nil){ |
| *p++ = 0; |
| n = atoi(p); |
| }else |
| n = 1; |
| mdput(indb, s, mdget(indb, s)+n); |
| } |
| } |
| |
| int tpid; |
| void |
| killtickle(void) |
| { |
| postnote(PNPROC, tpid, "die"); |
| } |
| |
| void |
| lockfile(char *s) |
| { |
| int fd, t, w; |
| char err[ERRMAX]; |
| |
| if(s == nil) |
| return; |
| w = 50; |
| t = 0; |
| for(;;){ |
| fd = open(s, OREAD); |
| if(fd >= 0) |
| break; |
| rerrstr(err, sizeof err); |
| if(strstr(err, "file is locked")==nil && strstr(err, "exclusive lock")==nil)) |
| break; |
| sleep(w); |
| t += w; |
| if(w < 1000) |
| w = (w*3)/2; |
| if(t > 120*1000) |
| break; |
| } |
| if(fd < 0) |
| sysfatal("could not lock %s", s); |
| switch(tpid = fork()){ |
| case -1: |
| sysfatal("fork: %r"); |
| case 0: |
| for(;;){ |
| sleep(30*1000); |
| free(dirfstat(fd)); |
| } |
| _exits(nil); |
| default: |
| break; |
| } |
| close(fd); |
| atexit(killtickle); |
| } |
| |