blob: d9cdca624f5314c9f4a7a4647810962c9d2609c9 [file] [log] [blame]
#include <u.h>
#include <libc.h>
#include <bio.h>
#include <regexp.h>
#include <ctype.h>
#include "dict.h"
/*
* Assumed index file structure: lines of form
* [^\t]+\t[0-9]+
* First field is key, second is byte offset into dictionary.
* Should be sorted with args -u -t' ' +0f -1 +0 -1 +1n -2
*/
typedef struct Addr Addr;
struct Addr {
int n; /* number of offsets */
int cur; /* current position within doff array */
int maxn; /* actual current size of doff array */
ulong doff[1]; /* doff[maxn], with 0..n-1 significant */
};
Biobuf binbuf;
Biobuf boutbuf;
Biobuf *bin = &binbuf; /* user cmd input */
Biobuf *bout = &boutbuf; /* output */
Biobuf *bdict; /* dictionary */
Biobuf *bindex; /* index file */
long indextop; /* index offset at end of file */
int lastcmd; /* last executed command */
Addr *dot; /* "current" address */
Dict *dict; /* current dictionary */
int linelen;
int breaklen = 60;
int outinhibit;
int debug;
void execcmd(int);
int getpref(char*, Rune*);
Entry getentry(int);
int getfield(Rune*);
long locate(Rune*);
int parseaddr(char*, char**);
int parsecmd(char*);
int search(char*, int);
long seeknextline(Biobuf*, long);
void setdotnext(void);
void setdotprev(void);
void sortaddr(Addr*);
void usage(void);
enum {
Plen=300, /* max length of a search pattern */
Fieldlen=200, /* max length of an index field */
Aslots=10 /* initial number of slots in an address */
};
void
main(int argc, char **argv)
{
int i, cmd, kflag;
char *line, *p;
Binit(&binbuf, 0, OREAD);
Binit(&boutbuf, 1, OWRITE);
kflag = 0;
line = 0;
dict = 0;
for(i=0; dicts[i].name; i++){
if(access(dictfile(dicts[i].path), 0)>=0 && access(dictfile(dicts[i].indexpath), 0)>=0){
dict = &dicts[i];
break;
}
}
ARGBEGIN {
case 'd':
p = ARGF();
dict = 0;
if(p) {
for(i=0; dicts[i].name; i++)
if(strcmp(p, dicts[i].name)==0) {
dict = &dicts[i];
break;
}
}
if(!dict)
usage();
break;
case 'c':
line = ARGF();
if(!line)
usage();
break;
case 'k':
kflag++;
break;
case 'D':
debug++;
break;
default:
usage();
ARGEND }
if(dict == 0){
err("no dictionaries present on this system");
exits("nodict");
}
if(kflag) {
(*dict->printkey)();
exits(0);
}
if(argc > 1)
usage();
else if(argc == 1) {
if(line)
usage();
p = argv[0];
line = malloc(strlen(p)+5);
sprint(line, "/%s/P\n", p);
}
dict->path = dictfile(dict->path);
dict->indexpath = dictfile(dict->indexpath);
bdict = Bopen(dict->path, OREAD);
if(!bdict) {
err("can't open dictionary %s", dict->path);
exits("nodict");
}
bindex = Bopen(dict->indexpath, OREAD);
if(!bindex) {
err("can't open index %s", dict->indexpath);
exits("noindex");
}
indextop = Bseek(bindex, 0L, 2);
dot = malloc(sizeof(Addr)+(Aslots-1)*sizeof(ulong));
dot->n = 0;
dot->cur = 0;
dot->maxn = Aslots;
lastcmd = 0;
if(line) {
cmd = parsecmd(line);
if(cmd)
execcmd(cmd);
} else {
for(;;) {
Bprint(bout, "*");
Bflush(bout);
line = Brdline(bin, '\n');
linelen = 0;
if(!line)
break;
cmd = parsecmd(line);
if(cmd) {
execcmd(cmd);
lastcmd = cmd;
}
}
}
exits(0);
}
void
usage(void)
{
int i;
char *a, *b;
Bprint(bout, "Usage: %s [-d dict] [-k] [-c cmd] [word]\n", argv0);
Bprint(bout, "dictionaries (brackets mark dictionaries not present on this system):\n");
for(i = 0; dicts[i].name; i++){
a = b = "";
if(access(dictfile(dicts[i].path), 0)<0 || access(dictfile(dicts[i].indexpath), 0)<0){
a = "[";
b = "]";
}
Bprint(bout, " %s%s\t%s%s\n", a, dicts[i].name, dicts[i].desc, b);
}
exits("usage");
}
int
parsecmd(char *line)
{
char *e;
int cmd, ans;
if(parseaddr(line, &e) >= 0)
line = e;
else
return 0;
cmd = *line;
ans = cmd;
if(isupper(cmd))
cmd = tolower(cmd);
if(!(cmd == 'a' || cmd == 'h' || cmd == 'p' || cmd == 'r' ||
cmd == '\n')) {
err("unknown command %c", cmd);
return 0;
}
if(cmd == '\n')
switch(lastcmd) {
case 0: ans = 'H'; break;
case 'H': ans = 'p'; break;
default : ans = lastcmd; break;
}
else if(line[1] != '\n' && line[1] != 0)
err("extra stuff after command %c ignored", cmd);
return ans;
}
void
execcmd(int cmd)
{
Entry e;
int cur, doall;
if(isupper(cmd)) {
doall = 1;
cmd = tolower(cmd);
cur = 0;
} else {
doall = 0;
cur = dot->cur;
}
if(debug && doall && cmd == 'a')
Bprint(bout, "%d entries, cur=%d\n", dot->n, cur+1);
for(;;){
if(cur >= dot->n)
break;
if(doall) {
Bprint(bout, "%d\t", cur+1);
linelen += 4 + (cur >= 10);
}
switch(cmd) {
case 'a':
Bprint(bout, "#%lud\n", dot->doff[cur]);
break;
case 'h':
case 'p':
case 'r':
e = getentry(cur);
(*dict->printentry)(e, cmd);
break;
}
cur++;
if(doall) {
if(cmd == 'p' || cmd == 'r') {
Bputc(bout, '\n');
linelen = 0;
}
} else
break;
}
if(cur >= dot->n)
cur = 0;
dot->cur = cur;
}
/*
* Address syntax: ('.' | '/' re '/' | '!' re '!' | number | '#' number) ('+' | '-')*
* Answer goes in dot.
* Return -1 if address starts, but get error.
* Return 0 if no address.
*/
int
parseaddr(char *line, char **eptr)
{
int delim, plen;
ulong v;
char *e;
char pat[Plen];
if(*line == '/' || *line == '!') {
/* anchored regular expression match; '!' means no folding */
if(*line == '/') {
delim = '/';
e = strpbrk(line+1, "/\n");
} else {
delim = '!';
e = strpbrk(line+1, "!\n");
}
plen = e-line-1;
if(plen >= Plen-3) {
err("pattern too big");
return -1;
}
pat[0] = '^';
memcpy(pat+1, line+1, plen);
pat[plen+1] = '$';
pat[plen+2] = 0;
if(*e == '\n')
line = e;
else
line = e+1;
if(!search(pat, delim == '/')) {
err("pattern not found");
return -1;
}
} else if(*line == '#') {
/* absolute byte offset into dictionary */
line++;
if(!isdigit((uchar)*line))
return -1;
v = strtoul(line, &e, 10);
line = e;
dot->doff[0] = v;
dot->n = 1;
dot->cur = 0;
} else if(isdigit((uchar)*line)) {
v = strtoul(line, &e, 10);
line = e;
if(v < 1 || v > dot->n)
err(".%d not in range [1,%d], ignored",
v, dot->n);
else
dot->cur = v-1;
} else if(*line == '.') {
line++;
} else {
*eptr = line;
return 0;
}
while(*line == '+' || *line == '-') {
if(*line == '+')
setdotnext();
else
setdotprev();
line++;
}
*eptr = line;
return 1;
}
/*
* Index file is sorted by folded field1.
* Method: find pre, a folded prefix of r.e. pat,
* and then low = offset to beginning of
* line in index file where first match of prefix occurs.
* Then go through index until prefix no longer matches,
* adding each line that matches real pattern to dot.
* Finally, sort dot offsets (uniquing).
* We know pat len < Plen, and that it is surrounded by ^..$
*/
int
search(char *pat, int dofold)
{
int needre, prelen, match, n;
Reprog *re;
long ioff, v;
Rune pre[Plen];
Rune lit[Plen];
Rune entry[Fieldlen];
char fpat[Plen];
prelen = getpref(pat+1, pre);
if(pat[prelen+1] == 0 || pat[prelen+1] == '$') {
runescpy(lit, pre);
if(dofold)
fold(lit);
needre = 0;
SET(re);
} else {
needre = 1;
if(dofold) {
foldre(fpat, pat);
re = regcomp(fpat);
} else
re = regcomp(pat);
}
fold(pre);
ioff = locate(pre);
if(ioff < 0)
return 0;
dot->n = 0;
Bseek(bindex, ioff, 0);
for(;;) {
if(!getfield(entry))
break;
if(dofold)
fold(entry);
if(needre)
match = rregexec(re, entry, 0, 0);
else
match = (acomp(lit, entry) == 0);
if(match) {
if(!getfield(entry))
break;
v = runetol(entry);
if(dot->n >= dot->maxn) {
n = 2*dot->maxn;
dot = realloc(dot,
sizeof(Addr)+(n-1)*sizeof(long));
if(!dot) {
err("out of memory");
exits("nomem");
}
dot->maxn = n;
}
dot->doff[dot->n++] = v;
} else {
if(!dofold)
fold(entry);
if(*pre) {
n = acomp(pre, entry);
if(n < -1 || (!needre && n < 0))
break;
}
/* get to next index entry */
if(!getfield(entry))
break;
}
}
sortaddr(dot);
dot->cur = 0;
return dot->n;
}
/*
* Return offset in index file of first line whose folded
* first field has pre as a prefix. -1 if none found.
*/
long
locate(Rune *pre)
{
long top, bot, mid;
Rune entry[Fieldlen];
if(*pre == 0)
return 0;
bot = 0;
top = indextop;
if(debug>1)
fprint(2, "locate looking for prefix %S\n", pre);
for(;;) {
/*
* Loop invariant: foldkey(bot) < pre <= foldkey(top)
* and bot < top, and bot,top point at beginning of lines
*/
mid = (top+bot) / 2;
mid = seeknextline(bindex, mid);
if(debug > 1)
fprint(2, "bot=%ld, mid=%ld->%ld, top=%ld\n",
bot, (top+bot) / 2, mid, top);
if(mid == top || !getfield(entry))
break;
if(debug > 1)
fprint(2, "key=%S\n", entry);
/*
* here mid is strictly between bot and top
*/
fold(entry);
if(acomp(pre, entry) <= 0)
top = mid;
else
bot = mid;
}
/*
* bot < top, but they don't necessarily point at successive lines
* Use linear search from bot to find first line that pre is a
* prefix of
*/
while((bot = seeknextline(bindex, bot)) <= top) {
if(!getfield(entry))
return -1;
if(debug > 1)
fprint(2, "key=%S\n", entry);
fold(entry);
switch(acomp(pre, entry)) {
case -2:
return -1;
case -1:
case 0:
return bot;
case 1:
case 2:
continue;
}
}
return -1;
}
/*
* Get prefix of non re-metacharacters, runified, into pre,
* and return length
*/
int
getpref(char *pat, Rune *pre)
{
int n, r;
char *p;
p = pat;
while(*p) {
n = chartorune(pre, p);
r = *pre;
switch(r) {
case 0x2e: case 0x2a: case 0x2b: case 0x3f:
case 0x5b: case 0x5d: case 0x28: case ')':
case 0x7c: case 0x5e: case 0x24:
*pre = 0;
return p-pat;
case '\\':
p += n;
p += chartorune(++pre, p);
pre++;
break;
default:
p += n;
pre++;
}
}
return p-pat;
}
long
seeknextline(Biobuf *b, long off)
{
long c;
Bseek(b, off, 0);
do {
c = Bgetrune(b);
} while(c>=0 && c!='\n');
return Boffset(b);
}
/*
* Get next field out of index file (either tab- or nl- terminated)
* Answer in *rp, assumed to be Fieldlen long.
* Return 0 if read error first.
*/
int
getfield(Rune *rp)
{
long c;
int n;
for(n=Fieldlen; n-- > 0; ) {
if ((c = Bgetrune(bindex)) < 0)
return 0;
if(c == '\t' || c == '\n') {
*rp = '\0';
return 1;
}
*rp++ = c;
}
err("word too long");
return 0;
}
/*
* A compare longs function suitable for qsort
*/
static int
longcmp(const void *av, const void *bv)
{
long v;
long *a, *b;
a = (long*)av;
b = (long*)bv;
v = *a - *b;
if(v < 0)
return -1;
else if(v == 0)
return 0;
else
return 1;
}
void
sortaddr(Addr *a)
{
int i, j;
long v;
if(a->n <= 1)
return;
qsort(a->doff, a->n, sizeof(long), longcmp);
/* remove duplicates */
for(i=0, j=0; j < a->n; j++) {
v = a->doff[j];
if(i > 0 && v == a->doff[i-1])
continue;
a->doff[i++] = v;
}
a->n = i;
}
Entry
getentry(int i)
{
long b, e, n;
static Entry ans;
static int anslen = 0;
b = dot->doff[i];
e = (*dict->nextoff)(b+1);
ans.doff = b;
if(e < 0) {
err("couldn't seek to entry");
ans.start = 0;
ans.end = 0;
} else {
n = e-b;
if(n+1 > anslen) {
ans.start = realloc(ans.start, n+1);
if(!ans.start) {
err("out of memory");
exits("nomem");
}
anslen = n+1;
}
Bseek(bdict, b, 0);
n = Bread(bdict, ans.start, n);
ans.end = ans.start + n;
*ans.end = 0;
}
return ans;
}
void
setdotnext(void)
{
long b;
b = (*dict->nextoff)(dot->doff[dot->cur]+1);
if(b < 0) {
err("couldn't find a next entry");
return;
}
dot->doff[0] = b;
dot->n = 1;
dot->cur = 0;
}
void
setdotprev(void)
{
int tryback;
long here, last, p;
if(dot->cur < 0 || dot->cur >= dot->n)
return;
tryback = 2000;
here = dot->doff[dot->cur];
last = 0;
while(last == 0) {
p = here - tryback;
if(p < 0)
p = 0;
for(;;) {
p = (*dict->nextoff)(p+1);
if(p < 0)
return; /* shouldn't happen */
if(p >= here)
break;
last = p;
}
if(!last) {
if(here - tryback < 0) {
err("can't find a previous entry");
return;
}
tryback = 2*tryback;
}
}
dot->doff[0] = last;
dot->n = 1;
dot->cur = 0;
}
/*
* find the specified file and return a path.
* default location is #9/dict, but can be
* in $dictdir instead.
*/
char*
dictfile(char *f)
{
static char *dict;
static int did;
if(!did){
dict = getenv("dictpath");
did = 1;
}
if(dict)
return smprint("%s/%s", dict, f);
return unsharp(smprint("#9/dict/%s", f));
}