| #include <u.h> |
| #include <libc.h> |
| #include <bio.h> |
| #include <regexp.h> |
| #include <ctype.h> |
| #include "dict.h" |
| |
| /* |
| * Assumed index file structure: lines of form |
| * [^\t]+\t[0-9]+ |
| * First field is key, second is byte offset into dictionary. |
| * Should be sorted with args -u -t' ' +0f -1 +0 -1 +1n -2 |
| */ |
| typedef struct Addr Addr; |
| |
| struct Addr { |
| int n; /* number of offsets */ |
| int cur; /* current position within doff array */ |
| int maxn; /* actual current size of doff array */ |
| ulong doff[1]; /* doff[maxn], with 0..n-1 significant */ |
| }; |
| |
| Biobuf binbuf; |
| Biobuf boutbuf; |
| Biobuf *bin = &binbuf; /* user cmd input */ |
| Biobuf *bout = &boutbuf; /* output */ |
| Biobuf *bdict; /* dictionary */ |
| Biobuf *bindex; /* index file */ |
| long indextop; /* index offset at end of file */ |
| int lastcmd; /* last executed command */ |
| Addr *dot; /* "current" address */ |
| Dict *dict; /* current dictionary */ |
| int linelen; |
| int breaklen = 60; |
| int outinhibit; |
| int debug; |
| |
| void execcmd(int); |
| int getpref(char*, Rune*); |
| Entry getentry(int); |
| int getfield(Rune*); |
| long locate(Rune*); |
| int parseaddr(char*, char**); |
| int parsecmd(char*); |
| int search(char*, int); |
| long seeknextline(Biobuf*, long); |
| void setdotnext(void); |
| void setdotprev(void); |
| void sortaddr(Addr*); |
| void usage(void); |
| |
| enum { |
| Plen=300, /* max length of a search pattern */ |
| Fieldlen=200, /* max length of an index field */ |
| Aslots=10 /* initial number of slots in an address */ |
| }; |
| |
| void |
| main(int argc, char **argv) |
| { |
| int i, cmd, kflag; |
| char *line, *p; |
| |
| Binit(&binbuf, 0, OREAD); |
| Binit(&boutbuf, 1, OWRITE); |
| kflag = 0; |
| line = 0; |
| dict = 0; |
| |
| for(i=0; dicts[i].name; i++){ |
| if(access(dictfile(dicts[i].path), 0)>=0 && access(dictfile(dicts[i].indexpath), 0)>=0){ |
| dict = &dicts[i]; |
| break; |
| } |
| } |
| ARGBEGIN { |
| case 'd': |
| p = ARGF(); |
| dict = 0; |
| if(p) { |
| for(i=0; dicts[i].name; i++) |
| if(strcmp(p, dicts[i].name)==0) { |
| dict = &dicts[i]; |
| break; |
| } |
| } |
| if(!dict) |
| usage(); |
| break; |
| case 'c': |
| line = ARGF(); |
| if(!line) |
| usage(); |
| break; |
| case 'k': |
| kflag++; |
| break; |
| case 'D': |
| debug++; |
| break; |
| default: |
| usage(); |
| ARGEND } |
| if(dict == 0){ |
| err("no dictionaries present on this system"); |
| exits("nodict"); |
| } |
| |
| if(kflag) { |
| (*dict->printkey)(); |
| exits(0); |
| } |
| if(argc > 1) |
| usage(); |
| else if(argc == 1) { |
| if(line) |
| usage(); |
| p = argv[0]; |
| line = malloc(strlen(p)+5); |
| sprint(line, "/%s/P\n", p); |
| } |
| dict->path = dictfile(dict->path); |
| dict->indexpath = dictfile(dict->indexpath); |
| bdict = Bopen(dict->path, OREAD); |
| if(!bdict) { |
| err("can't open dictionary %s", dict->path); |
| exits("nodict"); |
| } |
| bindex = Bopen(dict->indexpath, OREAD); |
| if(!bindex) { |
| err("can't open index %s", dict->indexpath); |
| exits("noindex"); |
| } |
| indextop = Bseek(bindex, 0L, 2); |
| |
| dot = malloc(sizeof(Addr)+(Aslots-1)*sizeof(ulong)); |
| dot->n = 0; |
| dot->cur = 0; |
| dot->maxn = Aslots; |
| lastcmd = 0; |
| |
| if(line) { |
| cmd = parsecmd(line); |
| if(cmd) |
| execcmd(cmd); |
| } else { |
| for(;;) { |
| Bprint(bout, "*"); |
| Bflush(bout); |
| line = Brdline(bin, '\n'); |
| linelen = 0; |
| if(!line) |
| break; |
| cmd = parsecmd(line); |
| if(cmd) { |
| execcmd(cmd); |
| lastcmd = cmd; |
| } |
| } |
| } |
| exits(0); |
| } |
| |
| void |
| usage(void) |
| { |
| int i; |
| char *a, *b; |
| |
| Bprint(bout, "Usage: %s [-d dict] [-k] [-c cmd] [word]\n", argv0); |
| Bprint(bout, "dictionaries (brackets mark dictionaries not present on this system):\n"); |
| for(i = 0; dicts[i].name; i++){ |
| a = b = ""; |
| if(access(dictfile(dicts[i].path), 0)<0 || access(dictfile(dicts[i].indexpath), 0)<0){ |
| a = "["; |
| b = "]"; |
| } |
| Bprint(bout, " %s%s\t%s%s\n", a, dicts[i].name, dicts[i].desc, b); |
| } |
| exits("usage"); |
| } |
| |
| int |
| parsecmd(char *line) |
| { |
| char *e; |
| int cmd, ans; |
| |
| if(parseaddr(line, &e) >= 0) |
| line = e; |
| else |
| return 0; |
| cmd = *line; |
| ans = cmd; |
| if(isupper(cmd)) |
| cmd = tolower(cmd); |
| if(!(cmd == 'a' || cmd == 'h' || cmd == 'p' || cmd == 'r' || |
| cmd == '\n')) { |
| err("unknown command %c", cmd); |
| return 0; |
| } |
| if(cmd == '\n') |
| switch(lastcmd) { |
| case 0: ans = 'H'; break; |
| case 'H': ans = 'p'; break; |
| default : ans = lastcmd; break; |
| } |
| else if(line[1] != '\n' && line[1] != 0) |
| err("extra stuff after command %c ignored", cmd); |
| return ans; |
| } |
| |
| void |
| execcmd(int cmd) |
| { |
| Entry e; |
| int cur, doall; |
| |
| if(isupper(cmd)) { |
| doall = 1; |
| cmd = tolower(cmd); |
| cur = 0; |
| } else { |
| doall = 0; |
| cur = dot->cur; |
| } |
| if(debug && doall && cmd == 'a') |
| Bprint(bout, "%d entries, cur=%d\n", dot->n, cur+1); |
| for(;;){ |
| if(cur >= dot->n) |
| break; |
| if(doall) { |
| Bprint(bout, "%d\t", cur+1); |
| linelen += 4 + (cur >= 10); |
| } |
| switch(cmd) { |
| case 'a': |
| Bprint(bout, "#%lud\n", dot->doff[cur]); |
| break; |
| case 'h': |
| case 'p': |
| case 'r': |
| e = getentry(cur); |
| (*dict->printentry)(e, cmd); |
| break; |
| } |
| cur++; |
| if(doall) { |
| if(cmd == 'p' || cmd == 'r') { |
| Bputc(bout, '\n'); |
| linelen = 0; |
| } |
| } else |
| break; |
| } |
| if(cur >= dot->n) |
| cur = 0; |
| dot->cur = cur; |
| } |
| |
| /* |
| * Address syntax: ('.' | '/' re '/' | '!' re '!' | number | '#' number) ('+' | '-')* |
| * Answer goes in dot. |
| * Return -1 if address starts, but get error. |
| * Return 0 if no address. |
| */ |
| int |
| parseaddr(char *line, char **eptr) |
| { |
| int delim, plen; |
| ulong v; |
| char *e; |
| char pat[Plen]; |
| |
| if(*line == '/' || *line == '!') { |
| /* anchored regular expression match; '!' means no folding */ |
| if(*line == '/') { |
| delim = '/'; |
| e = strpbrk(line+1, "/\n"); |
| } else { |
| delim = '!'; |
| e = strpbrk(line+1, "!\n"); |
| } |
| plen = e-line-1; |
| if(plen >= Plen-3) { |
| err("pattern too big"); |
| return -1; |
| } |
| pat[0] = '^'; |
| memcpy(pat+1, line+1, plen); |
| pat[plen+1] = '$'; |
| pat[plen+2] = 0; |
| if(*e == '\n') |
| line = e; |
| else |
| line = e+1; |
| if(!search(pat, delim == '/')) { |
| err("pattern not found"); |
| return -1; |
| } |
| } else if(*line == '#') { |
| /* absolute byte offset into dictionary */ |
| line++; |
| if(!isdigit((uchar)*line)) |
| return -1; |
| v = strtoul(line, &e, 10); |
| line = e; |
| dot->doff[0] = v; |
| dot->n = 1; |
| dot->cur = 0; |
| } else if(isdigit((uchar)*line)) { |
| v = strtoul(line, &e, 10); |
| line = e; |
| if(v < 1 || v > dot->n) |
| err(".%d not in range [1,%d], ignored", |
| v, dot->n); |
| else |
| dot->cur = v-1; |
| } else if(*line == '.') { |
| line++; |
| } else { |
| *eptr = line; |
| return 0; |
| } |
| while(*line == '+' || *line == '-') { |
| if(*line == '+') |
| setdotnext(); |
| else |
| setdotprev(); |
| line++; |
| } |
| *eptr = line; |
| return 1; |
| } |
| |
| /* |
| * Index file is sorted by folded field1. |
| * Method: find pre, a folded prefix of r.e. pat, |
| * and then low = offset to beginning of |
| * line in index file where first match of prefix occurs. |
| * Then go through index until prefix no longer matches, |
| * adding each line that matches real pattern to dot. |
| * Finally, sort dot offsets (uniquing). |
| * We know pat len < Plen, and that it is surrounded by ^..$ |
| */ |
| int |
| search(char *pat, int dofold) |
| { |
| int needre, prelen, match, n; |
| Reprog *re; |
| long ioff, v; |
| Rune pre[Plen]; |
| Rune lit[Plen]; |
| Rune entry[Fieldlen]; |
| char fpat[Plen]; |
| |
| prelen = getpref(pat+1, pre); |
| if(pat[prelen+1] == 0 || pat[prelen+1] == '$') { |
| runescpy(lit, pre); |
| if(dofold) |
| fold(lit); |
| needre = 0; |
| SET(re); |
| } else { |
| needre = 1; |
| if(dofold) { |
| foldre(fpat, pat); |
| re = regcomp(fpat); |
| } else |
| re = regcomp(pat); |
| } |
| fold(pre); |
| ioff = locate(pre); |
| if(ioff < 0) |
| return 0; |
| dot->n = 0; |
| Bseek(bindex, ioff, 0); |
| for(;;) { |
| if(!getfield(entry)) |
| break; |
| if(dofold) |
| fold(entry); |
| if(needre) |
| match = rregexec(re, entry, 0, 0); |
| else |
| match = (acomp(lit, entry) == 0); |
| if(match) { |
| if(!getfield(entry)) |
| break; |
| v = runetol(entry); |
| if(dot->n >= dot->maxn) { |
| n = 2*dot->maxn; |
| dot = realloc(dot, |
| sizeof(Addr)+(n-1)*sizeof(long)); |
| if(!dot) { |
| err("out of memory"); |
| exits("nomem"); |
| } |
| dot->maxn = n; |
| } |
| dot->doff[dot->n++] = v; |
| } else { |
| if(!dofold) |
| fold(entry); |
| if(*pre) { |
| n = acomp(pre, entry); |
| if(n < -1 || (!needre && n < 0)) |
| break; |
| } |
| /* get to next index entry */ |
| if(!getfield(entry)) |
| break; |
| } |
| } |
| sortaddr(dot); |
| dot->cur = 0; |
| return dot->n; |
| } |
| |
| /* |
| * Return offset in index file of first line whose folded |
| * first field has pre as a prefix. -1 if none found. |
| */ |
| long |
| locate(Rune *pre) |
| { |
| long top, bot, mid; |
| Rune entry[Fieldlen]; |
| |
| if(*pre == 0) |
| return 0; |
| bot = 0; |
| top = indextop; |
| if(debug>1) |
| fprint(2, "locate looking for prefix %S\n", pre); |
| for(;;) { |
| /* |
| * Loop invariant: foldkey(bot) < pre <= foldkey(top) |
| * and bot < top, and bot,top point at beginning of lines |
| */ |
| mid = (top+bot) / 2; |
| mid = seeknextline(bindex, mid); |
| if(debug > 1) |
| fprint(2, "bot=%ld, mid=%ld->%ld, top=%ld\n", |
| bot, (top+bot) / 2, mid, top); |
| if(mid == top || !getfield(entry)) |
| break; |
| if(debug > 1) |
| fprint(2, "key=%S\n", entry); |
| /* |
| * here mid is strictly between bot and top |
| */ |
| fold(entry); |
| if(acomp(pre, entry) <= 0) |
| top = mid; |
| else |
| bot = mid; |
| } |
| /* |
| * bot < top, but they don't necessarily point at successive lines |
| * Use linear search from bot to find first line that pre is a |
| * prefix of |
| */ |
| while((bot = seeknextline(bindex, bot)) <= top) { |
| if(!getfield(entry)) |
| return -1; |
| if(debug > 1) |
| fprint(2, "key=%S\n", entry); |
| fold(entry); |
| switch(acomp(pre, entry)) { |
| case -2: |
| return -1; |
| case -1: |
| case 0: |
| return bot; |
| case 1: |
| case 2: |
| continue; |
| } |
| } |
| return -1; |
| |
| } |
| |
| /* |
| * Get prefix of non re-metacharacters, runified, into pre, |
| * and return length |
| */ |
| int |
| getpref(char *pat, Rune *pre) |
| { |
| int n, r; |
| char *p; |
| |
| p = pat; |
| while(*p) { |
| n = chartorune(pre, p); |
| r = *pre; |
| switch(r) { |
| case 0x2e: case 0x2a: case 0x2b: case 0x3f: |
| case 0x5b: case 0x5d: case 0x28: case ')': |
| case 0x7c: case 0x5e: case 0x24: |
| *pre = 0; |
| return p-pat; |
| case '\\': |
| p += n; |
| p += chartorune(++pre, p); |
| pre++; |
| break; |
| default: |
| p += n; |
| pre++; |
| } |
| } |
| return p-pat; |
| } |
| |
| long |
| seeknextline(Biobuf *b, long off) |
| { |
| long c; |
| |
| Bseek(b, off, 0); |
| do { |
| c = Bgetrune(b); |
| } while(c>=0 && c!='\n'); |
| return Boffset(b); |
| } |
| |
| /* |
| * Get next field out of index file (either tab- or nl- terminated) |
| * Answer in *rp, assumed to be Fieldlen long. |
| * Return 0 if read error first. |
| */ |
| int |
| getfield(Rune *rp) |
| { |
| long c; |
| int n; |
| |
| for(n=Fieldlen; n-- > 0; ) { |
| if ((c = Bgetrune(bindex)) < 0) |
| return 0; |
| if(c == '\t' || c == '\n') { |
| *rp = '\0'; |
| return 1; |
| } |
| *rp++ = c; |
| } |
| err("word too long"); |
| return 0; |
| } |
| |
| /* |
| * A compare longs function suitable for qsort |
| */ |
| static int |
| longcmp(const void *av, const void *bv) |
| { |
| long v; |
| long *a, *b; |
| |
| a = (long*)av; |
| b = (long*)bv; |
| |
| v = *a - *b; |
| if(v < 0) |
| return -1; |
| else if(v == 0) |
| return 0; |
| else |
| return 1; |
| } |
| |
| void |
| sortaddr(Addr *a) |
| { |
| int i, j; |
| long v; |
| |
| if(a->n <= 1) |
| return; |
| |
| qsort(a->doff, a->n, sizeof(long), longcmp); |
| |
| /* remove duplicates */ |
| for(i=0, j=0; j < a->n; j++) { |
| v = a->doff[j]; |
| if(i > 0 && v == a->doff[i-1]) |
| continue; |
| a->doff[i++] = v; |
| } |
| a->n = i; |
| } |
| |
| Entry |
| getentry(int i) |
| { |
| long b, e, n; |
| static Entry ans; |
| static int anslen = 0; |
| |
| b = dot->doff[i]; |
| e = (*dict->nextoff)(b+1); |
| ans.doff = b; |
| if(e < 0) { |
| err("couldn't seek to entry"); |
| ans.start = 0; |
| ans.end = 0; |
| } else { |
| n = e-b; |
| if(n+1 > anslen) { |
| ans.start = realloc(ans.start, n+1); |
| if(!ans.start) { |
| err("out of memory"); |
| exits("nomem"); |
| } |
| anslen = n+1; |
| } |
| Bseek(bdict, b, 0); |
| n = Bread(bdict, ans.start, n); |
| ans.end = ans.start + n; |
| *ans.end = 0; |
| } |
| return ans; |
| } |
| |
| void |
| setdotnext(void) |
| { |
| long b; |
| |
| b = (*dict->nextoff)(dot->doff[dot->cur]+1); |
| if(b < 0) { |
| err("couldn't find a next entry"); |
| return; |
| } |
| dot->doff[0] = b; |
| dot->n = 1; |
| dot->cur = 0; |
| } |
| |
| void |
| setdotprev(void) |
| { |
| int tryback; |
| long here, last, p; |
| |
| if(dot->cur < 0 || dot->cur >= dot->n) |
| return; |
| tryback = 2000; |
| here = dot->doff[dot->cur]; |
| last = 0; |
| while(last == 0) { |
| p = here - tryback; |
| if(p < 0) |
| p = 0; |
| for(;;) { |
| p = (*dict->nextoff)(p+1); |
| if(p < 0) |
| return; /* shouldn't happen */ |
| if(p >= here) |
| break; |
| last = p; |
| } |
| if(!last) { |
| if(here - tryback < 0) { |
| err("can't find a previous entry"); |
| return; |
| } |
| tryback = 2*tryback; |
| } |
| } |
| dot->doff[0] = last; |
| dot->n = 1; |
| dot->cur = 0; |
| } |
| |
| /* |
| * find the specified file and return a path. |
| * default location is #9/dict, but can be |
| * in $dictdir instead. |
| */ |
| char* |
| dictfile(char *f) |
| { |
| static char *dict; |
| static int did; |
| |
| if(!did){ |
| dict = getenv("dictpath"); |
| did = 1; |
| } |
| |
| if(dict) |
| return smprint("%s/%s", dict, f); |
| return unsharp(smprint("#9/dict/%s", f)); |
| } |