| #include <u.h> |
| #include <libc.h> |
| #include <bio.h> |
| |
| /* |
| bugs: |
| 00/ff for end of file can conflict with 00/ff characters |
| */ |
| |
| enum |
| { |
| Nline = 500000, /* default max number of lines saved in memory */ |
| Nmerge = 10, /* max number of temporary files merged */ |
| Nfield = 20, /* max number of argument fields */ |
| |
| Bflag = 1<<0, /* flags per field */ |
| B1flag = 1<<1, |
| |
| Dflag = 1<<2, |
| Fflag = 1<<3, |
| Gflag = 1<<4, |
| Iflag = 1<<5, |
| Mflag = 1<<6, |
| Nflag = 1<<7, |
| Rflag = 1<<8, |
| Wflag = 1<<9, |
| |
| NSstart = 0, /* states for number to key decoding */ |
| NSsign, |
| NSzero, |
| NSdigit, |
| NSpoint, |
| NSfract, |
| NSzerofract, |
| NSexp, |
| NSexpsign, |
| NSexpdigit |
| }; |
| |
| typedef struct Line Line; |
| typedef struct Key Key; |
| typedef struct Merge Merge; |
| typedef struct Field Field; |
| |
| struct Line |
| { |
| Key* key; |
| int llen; /* always >= 1 */ |
| uchar line[1]; /* always ends in '\n' */ |
| }; |
| |
| struct Merge |
| { |
| Key* key; /* copy of line->key so (Line*) looks like (Merge*) */ |
| Line* line; /* line at the head of a merged temp file */ |
| int fd; /* file descriptor */ |
| Biobuf b; /* iobuf for reading a temp file */ |
| }; |
| |
| struct Key |
| { |
| int klen; |
| uchar key[1]; |
| }; |
| |
| struct Field |
| { |
| int beg1; |
| int beg2; |
| int end1; |
| int end2; |
| |
| long flags; |
| uchar mapto[256]; |
| |
| void (*dokey)(Key*, uchar*, uchar*, Field*); |
| }; |
| |
| struct args |
| { |
| char* ofile; |
| char* tname; |
| Rune tabchar; |
| char cflag; |
| char uflag; |
| char vflag; |
| int nfield; |
| int nfile; |
| Field field[Nfield]; |
| |
| Line** linep; |
| long nline; /* number of lines in this temp file */ |
| long lineno; /* overall ordinal for -s option */ |
| int ntemp; |
| long mline; /* max lines per file */ |
| } args; |
| |
| extern int latinmap[]; |
| extern Rune* month[12]; |
| |
| void buildkey(Line*); |
| void doargs(int, char*[]); |
| void dofield(char*, int*, int*, int, int); |
| void dofile(Biobuf*); |
| void dokey_(Key*, uchar*, uchar*, Field*); |
| void dokey_dfi(Key*, uchar*, uchar*, Field*); |
| void dokey_gn(Key*, uchar*, uchar*, Field*); |
| void dokey_m(Key*, uchar*, uchar*, Field*); |
| void dokey_r(Key*, uchar*, uchar*, Field*); |
| void done(char*); |
| int kcmp(Key*, Key*); |
| void makemapd(Field*); |
| void makemapm(Field*); |
| void mergefiles(int, int, Biobuf*); |
| void mergeout(Biobuf*); |
| void newfield(void); |
| Line* newline(Biobuf*); |
| void nomem(void); |
| void notifyf(void*, char*); |
| void printargs(void); |
| void printout(Biobuf*); |
| void setfield(int, int); |
| uchar* skip(uchar*, int, int, int, int); |
| void sort4(void*, ulong); |
| char* tempfile(int); |
| void tempout(void); |
| void lineout(Biobuf*, Line*); |
| |
| void |
| main(int argc, char *argv[]) |
| { |
| int i, f; |
| char *s; |
| Biobuf bbuf; |
| |
| notify(notifyf); /**/ |
| doargs(argc, argv); |
| if(args.vflag) |
| printargs(); |
| |
| for(i=1; i<argc; i++) { |
| s = argv[i]; |
| if(s == 0) |
| continue; |
| if(strcmp(s, "-") == 0) { |
| Binit(&bbuf, 0, OREAD); |
| dofile(&bbuf); |
| Bterm(&bbuf); |
| continue; |
| } |
| f = open(s, OREAD); |
| if(f < 0) { |
| fprint(2, "sort: open %s: %r\n", s); |
| done("open"); |
| } |
| Binit(&bbuf, f, OREAD); |
| dofile(&bbuf); |
| Bterm(&bbuf); |
| close(f); |
| } |
| if(args.nfile == 0) { |
| Binit(&bbuf, 0, OREAD); |
| dofile(&bbuf); |
| Bterm(&bbuf); |
| } |
| if(args.cflag) |
| done(0); |
| if(args.vflag) |
| fprint(2, "=========\n"); |
| |
| f = 1; |
| if(args.ofile) { |
| f = create(args.ofile, OWRITE, 0666); |
| if(f < 0) { |
| fprint(2, "sort: create %s: %r\n", args.ofile); |
| done("create"); |
| } |
| } |
| |
| Binit(&bbuf, f, OWRITE); |
| if(args.ntemp) { |
| tempout(); |
| mergeout(&bbuf); |
| } else { |
| printout(&bbuf); |
| } |
| Bterm(&bbuf); |
| done(0); |
| } |
| |
| void |
| dofile(Biobuf *b) |
| { |
| Line *l, *ol; |
| int n; |
| |
| if(args.cflag) { |
| ol = newline(b); |
| if(ol == 0) |
| return; |
| for(;;) { |
| l = newline(b); |
| if(l == 0) |
| break; |
| n = kcmp(ol->key, l->key); |
| if(n > 0 || (n == 0 && args.uflag)) { |
| fprint(2, "sort: -c file not in sort\n"); /**/ |
| done("order"); |
| } |
| free(ol->key); |
| free(ol); |
| ol = l; |
| } |
| return; |
| } |
| |
| if(args.linep == 0) { |
| args.linep = malloc(args.mline * sizeof(args.linep)); |
| if(args.linep == 0) |
| nomem(); |
| } |
| for(;;) { |
| l = newline(b); |
| if(l == 0) |
| break; |
| if(args.nline >= args.mline) |
| tempout(); |
| args.linep[args.nline] = l; |
| args.nline++; |
| args.lineno++; |
| } |
| } |
| |
| void |
| notifyf(void *a, char *s) |
| { |
| USED(a); |
| if(strcmp(s, "interrupt") == 0) |
| done(0); |
| if(strcmp(s, "hangup") == 0) |
| done(0); |
| if(strcmp(s, "kill") == 0) |
| done(0); |
| if(strncmp(s, "sys: write on closed pipe", 25) == 0) |
| done(0); |
| noted(NDFLT); |
| } |
| |
| Line* |
| newline(Biobuf *b) |
| { |
| Line *l; |
| char *p; |
| int n, c; |
| |
| p = Brdline(b, '\n'); |
| n = Blinelen(b); |
| if(p == 0) { |
| if(n == 0) |
| return 0; |
| l = 0; |
| for(n=0;;) { |
| if((n & 31) == 0) { |
| l = realloc(l, sizeof(Line) + |
| (n+31)*sizeof(l->line[0])); |
| if(l == 0) |
| nomem(); |
| } |
| c = Bgetc(b); |
| if(c < 0) { |
| fprint(2, "sort: newline added\n"); |
| c = '\n'; |
| } |
| l->line[n++] = c; |
| if(c == '\n') |
| break; |
| } |
| l->llen = n; |
| buildkey(l); |
| return l; |
| } |
| l = malloc(sizeof(Line) + |
| (n-1)*sizeof(l->line[0])); |
| if(l == 0) |
| nomem(); |
| l->llen = n; |
| memmove(l->line, p, n); |
| buildkey(l); |
| return l; |
| } |
| |
| void |
| lineout(Biobuf *b, Line *l) |
| { |
| int n, m; |
| |
| n = l->llen; |
| m = Bwrite(b, l->line, n); |
| if(n != m) |
| exits("write"); |
| } |
| |
| void |
| tempout(void) |
| { |
| long n; |
| Line **lp, *l; |
| char *tf; |
| int f; |
| Biobuf tb; |
| |
| sort4(args.linep, args.nline); |
| tf = tempfile(args.ntemp); |
| args.ntemp++; |
| f = create(tf, OWRITE, 0666); |
| if(f < 0) { |
| fprint(2, "sort: create %s: %r\n", tf); |
| done("create"); |
| } |
| |
| Binit(&tb, f, OWRITE); |
| lp = args.linep; |
| for(n=args.nline; n>0; n--) { |
| l = *lp++; |
| lineout(&tb, l); |
| free(l->key); |
| free(l); |
| } |
| args.nline = 0; |
| Bterm(&tb); |
| close(f); |
| } |
| |
| void |
| done(char *xs) |
| { |
| int i; |
| |
| for(i=0; i<args.ntemp; i++) |
| remove(tempfile(i)); |
| exits(xs); |
| } |
| |
| void |
| nomem(void) |
| { |
| fprint(2, "sort: out of memory\n"); |
| done("mem"); |
| } |
| |
| char* |
| tempfile(int n) |
| { |
| static char file[100]; |
| static uint pid; |
| char *dir; |
| |
| dir = "/var/tmp"; |
| if(args.tname) |
| dir = args.tname; |
| if(strlen(dir) >= nelem(file)-20) { |
| fprint(2, "temp file directory name is too long: %s\n", dir); |
| done("tdir"); |
| } |
| |
| if(pid == 0) { |
| pid = getpid(); |
| if(pid == 0) { |
| pid = time(0); |
| if(pid == 0) |
| pid = 1; |
| } |
| } |
| |
| sprint(file, "%s/sort.%.4d.%.4d", dir, pid%10000, n); |
| return file; |
| } |
| |
| void |
| mergeout(Biobuf *b) |
| { |
| int n, i, f; |
| char *tf; |
| Biobuf tb; |
| |
| for(i=0; i<args.ntemp; i+=n) { |
| n = args.ntemp - i; |
| if(n > Nmerge) { |
| tf = tempfile(args.ntemp); |
| args.ntemp++; |
| f = create(tf, OWRITE, 0666); |
| if(f < 0) { |
| fprint(2, "sort: create %s: %r\n", tf); |
| done("create"); |
| } |
| Binit(&tb, f, OWRITE); |
| |
| n = Nmerge; |
| mergefiles(i, n, &tb); |
| |
| Bterm(&tb); |
| close(f); |
| } else |
| mergefiles(i, n, b); |
| } |
| } |
| |
| void |
| mergefiles(int t, int n, Biobuf *b) |
| { |
| Merge *m, *mp, **mmp; |
| Key *ok; |
| Line *l; |
| char *tf; |
| int i, f, nn; |
| |
| mmp = malloc(n*sizeof(*mmp)); |
| mp = malloc(n*sizeof(*mp)); |
| if(mmp == 0 || mp == 0) |
| nomem(); |
| |
| nn = 0; |
| m = mp; |
| for(i=0; i<n; i++,m++) { |
| tf = tempfile(t+i); |
| f = open(tf, OREAD); |
| if(f < 0) { |
| fprint(2, "sort: reopen %s: %r\n", tf); |
| done("open"); |
| } |
| m->fd = f; |
| Binit(&m->b, f, OREAD); |
| mmp[nn] = m; |
| |
| l = newline(&m->b); |
| if(l == 0) |
| continue; |
| nn++; |
| m->line = l; |
| m->key = l->key; |
| } |
| |
| ok = 0; |
| for(;;) { |
| sort4(mmp, nn); |
| m = *mmp; |
| if(nn == 0) |
| break; |
| for(;;) { |
| l = m->line; |
| if(args.uflag && ok && kcmp(ok, l->key) == 0) { |
| free(l->key); |
| free(l); |
| } else { |
| lineout(b, l); |
| if(ok) |
| free(ok); |
| ok = l->key; |
| free(l); |
| } |
| |
| l = newline(&m->b); |
| if(l == 0) { |
| nn--; |
| mmp[0] = mmp[nn]; |
| break; |
| } |
| m->line = l; |
| m->key = l->key; |
| if(nn > 1 && kcmp(mmp[0]->key, mmp[1]->key) > 0) |
| break; |
| } |
| } |
| if(ok) |
| free(ok); |
| |
| m = mp; |
| for(i=0; i<n; i++,m++) { |
| Bterm(&m->b); |
| close(m->fd); |
| } |
| |
| free(mp); |
| free(mmp); |
| } |
| |
| int |
| kcmp(Key *ka, Key *kb) |
| { |
| int n, m; |
| |
| /* |
| * set n to length of smaller key |
| */ |
| n = ka->klen; |
| m = kb->klen; |
| if(n > m) |
| n = m; |
| return memcmp(ka->key, kb->key, n); |
| } |
| |
| void |
| printout(Biobuf *b) |
| { |
| long n; |
| Line **lp, *l; |
| Key *ok; |
| |
| sort4(args.linep, args.nline); |
| lp = args.linep; |
| ok = 0; |
| for(n=args.nline; n>0; n--) { |
| l = *lp++; |
| if(args.uflag && ok && kcmp(ok, l->key) == 0) |
| continue; |
| lineout(b, l); |
| ok = l->key; |
| } |
| } |
| |
| void |
| setfield(int n, int c) |
| { |
| Field *f; |
| |
| f = &args.field[n]; |
| switch(c) { |
| default: |
| fprint(2, "sort: unknown option: field.%C\n", c); |
| done("option"); |
| case 'b': /* skip blanks */ |
| f->flags |= Bflag; |
| break; |
| case 'd': /* directory order */ |
| f->flags |= Dflag; |
| break; |
| case 'f': /* fold case */ |
| f->flags |= Fflag; |
| break; |
| case 'g': /* floating point -n case */ |
| f->flags |= Gflag; |
| break; |
| case 'i': /* ignore non-ascii */ |
| f->flags |= Iflag; |
| break; |
| case 'M': /* month */ |
| f->flags |= Mflag; |
| break; |
| case 'n': /* numbers */ |
| f->flags |= Nflag; |
| break; |
| case 'r': /* reverse */ |
| f->flags |= Rflag; |
| break; |
| case 'w': /* ignore white */ |
| f->flags |= Wflag; |
| break; |
| } |
| } |
| |
| void |
| dofield(char *s, int *n1, int *n2, int off1, int off2) |
| { |
| int c, n; |
| |
| c = *s++; |
| if(c >= '0' && c <= '9') { |
| n = 0; |
| while(c >= '0' && c <= '9') { |
| n = n*10 + (c-'0'); |
| c = *s++; |
| } |
| n -= off1; /* posix committee: rot in hell */ |
| if(n < 0) { |
| fprint(2, "sort: field offset must be positive\n"); |
| done("option"); |
| } |
| *n1 = n; |
| } |
| if(c == '.') { |
| c = *s++; |
| if(c >= '0' && c <= '9') { |
| n = 0; |
| while(c >= '0' && c <= '9') { |
| n = n*10 + (c-'0'); |
| c = *s++; |
| } |
| n -= off2; |
| if(n < 0) { |
| fprint(2, "sort: character offset must be positive\n"); |
| done("option"); |
| } |
| *n2 = n; |
| } |
| } |
| while(c != 0) { |
| setfield(args.nfield, c); |
| c = *s++; |
| } |
| } |
| |
| void |
| printargs(void) |
| { |
| int i, n; |
| Field *f; |
| char *prefix; |
| |
| fprint(2, "sort"); |
| for(i=0; i<=args.nfield; i++) { |
| f = &args.field[i]; |
| prefix = " -"; |
| if(i) { |
| n = f->beg1; |
| if(n >= 0) |
| fprint(2, " +%d", n); |
| else |
| fprint(2, " +*"); |
| n = f->beg2; |
| if(n >= 0) |
| fprint(2, ".%d", n); |
| else |
| fprint(2, ".*"); |
| |
| if(f->flags & B1flag) |
| fprint(2, "b"); |
| |
| n = f->end1; |
| if(n >= 0) |
| fprint(2, " -%d", n); |
| else |
| fprint(2, " -*"); |
| n = f->end2; |
| if(n >= 0) |
| fprint(2, ".%d", n); |
| else |
| fprint(2, ".*"); |
| prefix = ""; |
| } |
| if(f->flags & Bflag) |
| fprint(2, "%sb", prefix); |
| if(f->flags & Dflag) |
| fprint(2, "%sd", prefix); |
| if(f->flags & Fflag) |
| fprint(2, "%sf", prefix); |
| if(f->flags & Gflag) |
| fprint(2, "%sg", prefix); |
| if(f->flags & Iflag) |
| fprint(2, "%si", prefix); |
| if(f->flags & Mflag) |
| fprint(2, "%sM", prefix); |
| if(f->flags & Nflag) |
| fprint(2, "%sn", prefix); |
| if(f->flags & Rflag) |
| fprint(2, "%sr", prefix); |
| if(f->flags & Wflag) |
| fprint(2, "%sw", prefix); |
| } |
| if(args.cflag) |
| fprint(2, " -c"); |
| if(args.uflag) |
| fprint(2, " -u"); |
| if(args.ofile) |
| fprint(2, " -o %s", args.ofile); |
| if(args.mline != Nline) |
| fprint(2, " -l %ld", args.mline); |
| fprint(2, "\n"); |
| } |
| |
| void |
| newfield(void) |
| { |
| int n; |
| Field *f; |
| |
| n = args.nfield + 1; |
| if(n >= Nfield) { |
| fprint(2, "sort: too many fields specified\n"); |
| done("option"); |
| } |
| args.nfield = n; |
| f = &args.field[n]; |
| f->beg1 = -1; |
| f->beg2 = -1; |
| f->end1 = -1; |
| f->end2 = -1; |
| } |
| |
| void |
| doargs(int argc, char *argv[]) |
| { |
| int i, c, hadplus; |
| char *s, *p, *q; |
| Field *f; |
| |
| hadplus = 0; |
| args.mline = Nline; |
| for(i=1; i<argc; i++) { |
| s = argv[i]; |
| c = *s++; |
| if(c == '-') { |
| c = *s; |
| if(c == 0) /* forced end of arg marker */ |
| break; |
| argv[i] = 0; /* clobber args processed */ |
| if(c == '.' || (c >= '0' && c <= '9')) { |
| if(!hadplus) |
| newfield(); |
| f = &args.field[args.nfield]; |
| dofield(s, &f->end1, &f->end2, 0, 0); |
| hadplus = 0; |
| continue; |
| } |
| |
| while(c = *s++) |
| switch(c) { |
| case '-': /* end of options */ |
| i = argc; |
| continue; |
| case 'T': /* temp directory */ |
| if(*s == 0) { |
| i++; |
| if(i < argc) { |
| args.tname = argv[i]; |
| argv[i] = 0; |
| } |
| } else |
| args.tname = s; |
| s = strchr(s, 0); |
| break; |
| case 'o': /* output file */ |
| if(*s == 0) { |
| i++; |
| if(i < argc) { |
| args.ofile = argv[i]; |
| argv[i] = 0; |
| } |
| } else |
| args.ofile = s; |
| s = strchr(s, 0); |
| break; |
| case 'k': /* posix key (what were they thinking?) */ |
| p = 0; |
| if(*s == 0) { |
| i++; |
| if(i < argc) { |
| p = argv[i]; |
| argv[i] = 0; |
| } |
| } else |
| p = s; |
| s = strchr(s, 0); |
| if(p == 0) |
| break; |
| |
| newfield(); |
| q = strchr(p, ','); |
| if(q) |
| *q++ = 0; |
| f = &args.field[args.nfield]; |
| dofield(p, &f->beg1, &f->beg2, 1, 1); |
| if(f->flags & Bflag) { |
| f->flags |= B1flag; |
| f->flags &= ~Bflag; |
| } |
| if(q) { |
| dofield(q, &f->end1, &f->end2, 1, 0); |
| if(f->end2 <= 0) |
| f->end1++; |
| } |
| hadplus = 0; |
| break; |
| case 't': /* tab character */ |
| if(*s == 0) { |
| i++; |
| if(i < argc) { |
| chartorune(&args.tabchar, argv[i]); |
| argv[i] = 0; |
| } |
| } else |
| s += chartorune(&args.tabchar, s); |
| if(args.tabchar == '\n') { |
| fprint(2, "aw come on, rob\n"); |
| done("rob"); |
| } |
| break; |
| case 'c': /* check order */ |
| args.cflag = 1; |
| break; |
| case 'u': /* unique */ |
| args.uflag = 1; |
| break; |
| case 'v': /* debugging noise */ |
| args.vflag = 1; |
| break; |
| case 'l': |
| if(*s == 0) { |
| i++; |
| if(i < argc) { |
| args.mline = atol(argv[i]); |
| argv[i] = 0; |
| } |
| } else |
| args.mline = atol(s); |
| s = strchr(s, 0); |
| break; |
| |
| case 'M': /* month */ |
| case 'b': /* skip blanks */ |
| case 'd': /* directory order */ |
| case 'f': /* fold case */ |
| case 'g': /* floating numbers */ |
| case 'i': /* ignore non-ascii */ |
| case 'n': /* numbers */ |
| case 'r': /* reverse */ |
| case 'w': /* ignore white */ |
| if(args.nfield > 0) |
| fprint(2, "sort: global field set after -k\n"); |
| setfield(0, c); |
| break; |
| case 'm': |
| /* option m silently ignored but required by posix */ |
| break; |
| default: |
| fprint(2, "sort: unknown option: -%C\n", c); |
| done("option"); |
| } |
| continue; |
| } |
| if(c == '+') { |
| argv[i] = 0; /* clobber args processed */ |
| c = *s; |
| if(c == '.' || (c >= '0' && c <= '9')) { |
| newfield(); |
| f = &args.field[args.nfield]; |
| dofield(s, &f->beg1, &f->beg2, 0, 0); |
| if(f->flags & Bflag) { |
| f->flags |= B1flag; |
| f->flags &= ~Bflag; |
| } |
| hadplus = 1; |
| continue; |
| } |
| fprint(2, "sort: unknown option: +%C\n", c); |
| done("option"); |
| } |
| args.nfile++; |
| } |
| |
| for(i=0; i<=args.nfield; i++) { |
| f = &args.field[i]; |
| |
| /* |
| * global options apply to fields that |
| * specify no options |
| */ |
| if(f->flags == 0) { |
| f->flags = args.field[0].flags; |
| if(args.field[0].flags & Bflag) |
| f->flags |= B1flag; |
| } |
| |
| |
| /* |
| * build buildkey specification |
| */ |
| switch(f->flags & ~(Bflag|B1flag)) { |
| default: |
| fprint(2, "sort: illegal combination of flags: %lx\n", f->flags); |
| done("option"); |
| case 0: |
| f->dokey = dokey_; |
| break; |
| case Rflag: |
| f->dokey = dokey_r; |
| break; |
| case Gflag: |
| case Nflag: |
| case Gflag|Nflag: |
| case Gflag|Rflag: |
| case Nflag|Rflag: |
| case Gflag|Nflag|Rflag: |
| f->dokey = dokey_gn; |
| break; |
| case Mflag: |
| case Mflag|Rflag: |
| f->dokey = dokey_m; |
| makemapm(f); |
| break; |
| case Dflag: |
| case Dflag|Fflag: |
| case Dflag|Fflag|Iflag: |
| case Dflag|Fflag|Iflag|Rflag: |
| case Dflag|Fflag|Iflag|Rflag|Wflag: |
| case Dflag|Fflag|Iflag|Wflag: |
| case Dflag|Fflag|Rflag: |
| case Dflag|Fflag|Rflag|Wflag: |
| case Dflag|Fflag|Wflag: |
| case Dflag|Iflag: |
| case Dflag|Iflag|Rflag: |
| case Dflag|Iflag|Rflag|Wflag: |
| case Dflag|Iflag|Wflag: |
| case Dflag|Rflag: |
| case Dflag|Rflag|Wflag: |
| case Dflag|Wflag: |
| case Fflag: |
| case Fflag|Iflag: |
| case Fflag|Iflag|Rflag: |
| case Fflag|Iflag|Rflag|Wflag: |
| case Fflag|Iflag|Wflag: |
| case Fflag|Rflag: |
| case Fflag|Rflag|Wflag: |
| case Fflag|Wflag: |
| case Iflag: |
| case Iflag|Rflag: |
| case Iflag|Rflag|Wflag: |
| case Iflag|Wflag: |
| case Wflag: |
| f->dokey = dokey_dfi; |
| makemapd(f); |
| break; |
| } |
| } |
| |
| /* |
| * random spot checks |
| */ |
| if(args.nfile > 1 && args.cflag) { |
| fprint(2, "sort: -c can have at most one input file\n"); |
| done("option"); |
| } |
| return; |
| } |
| |
| uchar* |
| skip(uchar *l, int n1, int n2, int bflag, int endfield) |
| { |
| int i, c, tc; |
| Rune r; |
| |
| if(endfield && n1 < 0) |
| return 0; |
| |
| c = *l++; |
| tc = args.tabchar; |
| if(tc) { |
| if(tc < Runeself) { |
| for(i=n1; i>0; i--) { |
| while(c != tc) { |
| if(c == '\n') |
| return 0; |
| c = *l++; |
| } |
| if(!(endfield && i == 1)) |
| c = *l++; |
| } |
| } else { |
| l--; |
| l += chartorune(&r, (char*)l); |
| for(i=n1; i>0; i--) { |
| while(r != tc) { |
| if(r == '\n') |
| return 0; |
| l += chartorune(&r, (char*)l); |
| } |
| if(!(endfield && i == 1)) |
| l += chartorune(&r, (char*)l); |
| } |
| c = r; |
| } |
| } else { |
| for(i=n1; i>0; i--) { |
| while(c == ' ' || c == '\t') |
| c = *l++; |
| while(c != ' ' && c != '\t') { |
| if(c == '\n') |
| return 0; |
| c = *l++; |
| } |
| } |
| } |
| |
| if(bflag) |
| while(c == ' ' || c == '\t') |
| c = *l++; |
| |
| l--; |
| for(i=n2; i>0; i--) { |
| c = *l; |
| if(c < Runeself) { |
| if(c == '\n') |
| return 0; |
| l++; |
| continue; |
| } |
| l += chartorune(&r, (char*)l); |
| } |
| return l; |
| } |
| |
| void |
| dokey_gn(Key *k, uchar *lp, uchar *lpe, Field *f) |
| { |
| uchar *kp; |
| int c, cl, dp; |
| int state, nzero, exp, expsign, rflag; |
| |
| cl = k->klen + 3; |
| kp = k->key + cl; /* skip place for sign, exponent[2] */ |
| |
| nzero = 0; /* number of trailing zeros */ |
| exp = 0; /* value of the exponent */ |
| expsign = 0; /* sign of the exponent */ |
| dp = 0x4040; /* location of decimal point */ |
| rflag = f->flags&Rflag; /* xor of rflag and - sign */ |
| state = NSstart; |
| |
| for(;; lp++) { |
| if(lp >= lpe) |
| break; |
| c = *lp; |
| |
| if(c == ' ' || c == '\t') { |
| switch(state) { |
| case NSstart: |
| case NSsign: |
| continue; |
| } |
| break; |
| } |
| if(c == '+' || c == '-') { |
| switch(state) { |
| case NSstart: |
| state = NSsign; |
| if(c == '-') |
| rflag = !rflag; |
| continue; |
| case NSexp: |
| state = NSexpsign; |
| if(c == '-') |
| expsign = 1; |
| continue; |
| } |
| break; |
| } |
| if(c == '0') { |
| switch(state) { |
| case NSdigit: |
| if(rflag) |
| c = ~c; |
| *kp++ = c; |
| cl++; |
| nzero++; |
| dp++; |
| state = NSdigit; |
| continue; |
| case NSfract: |
| if(rflag) |
| c = ~c; |
| *kp++ = c; |
| cl++; |
| nzero++; |
| state = NSfract; |
| continue; |
| case NSstart: |
| case NSsign: |
| case NSzero: |
| state = NSzero; |
| continue; |
| case NSzerofract: |
| case NSpoint: |
| dp--; |
| state = NSzerofract; |
| continue; |
| case NSexpsign: |
| case NSexp: |
| case NSexpdigit: |
| exp = exp*10 + (c - '0'); |
| state = NSexpdigit; |
| continue; |
| } |
| break; |
| } |
| if(c >= '1' && c <= '9') { |
| switch(state) { |
| case NSzero: |
| case NSstart: |
| case NSsign: |
| case NSdigit: |
| if(rflag) |
| c = ~c; |
| *kp++ = c; |
| cl++; |
| nzero = 0; |
| dp++; |
| state = NSdigit; |
| continue; |
| case NSzerofract: |
| case NSpoint: |
| case NSfract: |
| if(rflag) |
| c = ~c; |
| *kp++ = c; |
| cl++; |
| nzero = 0; |
| state = NSfract; |
| continue; |
| case NSexpsign: |
| case NSexp: |
| case NSexpdigit: |
| exp = exp*10 + (c - '0'); |
| state = NSexpdigit; |
| continue; |
| } |
| break; |
| } |
| if(c == '.') { |
| switch(state) { |
| case NSstart: |
| case NSsign: |
| state = NSpoint; |
| continue; |
| case NSzero: |
| state = NSzerofract; |
| continue; |
| case NSdigit: |
| state = NSfract; |
| continue; |
| } |
| break; |
| } |
| if((f->flags & Gflag) && (c == 'e' || c == 'E')) { |
| switch(state) { |
| case NSdigit: |
| case NSfract: |
| state = NSexp; |
| continue; |
| } |
| break; |
| } |
| break; |
| } |
| |
| switch(state) { |
| /* |
| * result is zero |
| */ |
| case NSstart: |
| case NSsign: |
| case NSzero: |
| case NSzerofract: |
| case NSpoint: |
| kp = k->key + k->klen; |
| k->klen += 2; |
| kp[0] = 0x20; /* between + and - */ |
| kp[1] = 0; |
| return; |
| /* |
| * result has exponent |
| */ |
| case NSexpsign: |
| case NSexp: |
| case NSexpdigit: |
| if(expsign) |
| exp = -exp; |
| dp += exp; |
| |
| /* |
| * result is fixed point number |
| */ |
| case NSdigit: |
| case NSfract: |
| kp -= nzero; |
| cl -= nzero; |
| break; |
| } |
| |
| /* |
| * end of number |
| */ |
| c = 0; |
| if(rflag) |
| c = ~c; |
| *kp = c; |
| |
| /* |
| * sign and exponent |
| */ |
| c = 0x30; |
| if(rflag) { |
| c = 0x10; |
| dp = ~dp; |
| } |
| kp = k->key + k->klen; |
| kp[0] = c; |
| kp[1] = (dp >> 8); |
| kp[2] = dp; |
| k->klen = cl+1; |
| } |
| |
| void |
| dokey_m(Key *k, uchar *lp, uchar *lpe, Field *f) |
| { |
| uchar *kp; |
| Rune r, place[3]; |
| int c, cl, pc; |
| int rflag; |
| |
| rflag = f->flags&Rflag; |
| pc = 0; |
| |
| cl = k->klen; |
| kp = k->key + cl; |
| |
| for(;;) { |
| /* |
| * get the character |
| */ |
| if(lp >= lpe) |
| break; |
| c = *lp; |
| if(c >= Runeself) { |
| lp += chartorune(&r, (char*)lp); |
| c = r; |
| } else |
| lp++; |
| |
| if(c < nelem(f->mapto)) { |
| c = f->mapto[c]; |
| if(c == 0) |
| continue; |
| } |
| place[pc++] = c; |
| if(pc < 3) |
| continue; |
| for(c=11; c>=0; c--) |
| if(memcmp(month[c], place, sizeof(place)) == 0) |
| break; |
| c += 10; |
| if(rflag) |
| c = ~c; |
| *kp++ = c; |
| cl++; |
| break; |
| } |
| |
| c = 0; |
| if(rflag) |
| c = ~c; |
| *kp = c; |
| k->klen = cl+1; |
| } |
| |
| void |
| dokey_dfi(Key *k, uchar *lp, uchar *lpe, Field *f) |
| { |
| uchar *kp; |
| Rune r; |
| int c, cl, n, rflag; |
| |
| cl = k->klen; |
| kp = k->key + cl; |
| rflag = f->flags & Rflag; |
| |
| for(;;) { |
| /* |
| * get the character |
| */ |
| if(lp >= lpe) |
| break; |
| c = *lp; |
| if(c >= Runeself) { |
| lp += chartorune(&r, (char*)lp); |
| c = r; |
| } else |
| lp++; |
| |
| /* |
| * do the various mappings. |
| * the common case is handled |
| * completely by the table. |
| */ |
| if(c != 0 && c < Runeself) { |
| c = f->mapto[c]; |
| if(c) { |
| *kp++ = c; |
| cl++; |
| } |
| continue; |
| } |
| |
| /* |
| * for characters out of range, |
| * the table does not do Rflag. |
| * ignore is based on mapto[255] |
| */ |
| if(c != 0 && c < nelem(f->mapto)) { |
| c = f->mapto[c]; |
| if(c == 0) |
| continue; |
| } else |
| if(f->mapto[nelem(f->mapto)-1] == 0) |
| continue; |
| |
| /* |
| * put it in the key |
| */ |
| r = c; |
| n = runetochar((char*)kp, &r); |
| kp += n; |
| cl += n; |
| if(rflag) |
| while(n > 0) { |
| kp[-n] = ~kp[-n]; |
| n--; |
| } |
| } |
| |
| /* |
| * end of key |
| */ |
| k->klen = cl+1; |
| if(rflag) { |
| *kp = ~0; |
| return; |
| } |
| *kp = 0; |
| } |
| |
| void |
| dokey_r(Key *k, uchar *lp, uchar *lpe, Field *f) |
| { |
| int cl, n; |
| uchar *kp; |
| |
| USED(f); |
| n = lpe - lp; |
| if(n < 0) |
| n = 0; |
| cl = k->klen; |
| kp = k->key + cl; |
| k->klen = cl+n+1; |
| |
| lpe -= 3; |
| while(lp < lpe) { |
| kp[0] = ~lp[0]; |
| kp[1] = ~lp[1]; |
| kp[2] = ~lp[2]; |
| kp[3] = ~lp[3]; |
| kp += 4; |
| lp += 4; |
| } |
| |
| lpe += 3; |
| while(lp < lpe) |
| *kp++ = ~*lp++; |
| *kp = ~0; |
| } |
| |
| void |
| dokey_(Key *k, uchar *lp, uchar *lpe, Field *f) |
| { |
| int n, cl; |
| uchar *kp; |
| |
| USED(f); |
| n = lpe - lp; |
| if(n < 0) |
| n = 0; |
| cl = k->klen; |
| kp = k->key + cl; |
| k->klen = cl+n+1; |
| memmove(kp, lp, n); |
| kp[n] = 0; |
| } |
| |
| void |
| buildkey(Line *l) |
| { |
| Key *k; |
| uchar *lp, *lpe; |
| int ll, kl, cl, i, n; |
| Field *f; |
| |
| ll = l->llen - 1; |
| kl = 0; /* allocated length */ |
| cl = 0; /* current length */ |
| k = 0; |
| |
| for(i=1; i<=args.nfield; i++) { |
| f = &args.field[i]; |
| lp = skip(l->line, f->beg1, f->beg2, f->flags&B1flag, 0); |
| if(lp == 0) |
| lp = l->line + ll; |
| lpe = skip(l->line, f->end1, f->end2, f->flags&Bflag, 1); |
| if(lpe == 0) |
| lpe = l->line + ll; |
| n = (lpe - lp) + 1; |
| if(n <= 0) |
| n = 1; |
| if(cl+(n+4) > kl) { |
| kl = cl+(n+4); |
| k = realloc(k, sizeof(Key) + |
| (kl-1)*sizeof(k->key[0])); |
| if(k == 0) |
| nomem(); |
| } |
| k->klen = cl; |
| (*f->dokey)(k, lp, lpe, f); |
| cl = k->klen; |
| } |
| |
| /* |
| * global comparisons |
| */ |
| if(!(args.uflag && cl > 0)) { |
| f = &args.field[0]; |
| if(cl+(ll+4) > kl) { |
| kl = cl+(ll+4); |
| k = realloc(k, sizeof(Key) + |
| (kl-1)*sizeof(k->key[0])); |
| if(k == 0) |
| nomem(); |
| } |
| k->klen = cl; |
| (*f->dokey)(k, l->line, l->line+ll, f); |
| cl = k->klen; |
| } |
| |
| l->key = k; |
| k->klen = cl; |
| |
| if(args.vflag) { |
| write(2, l->line, l->llen); |
| for(i=0; i<k->klen; i++) { |
| fprint(2, " %.2x", k->key[i]); |
| if(k->key[i] == 0x00 || k->key[i] == 0xff) |
| fprint(2, "\n"); |
| } |
| } |
| } |
| |
| void |
| makemapm(Field *f) |
| { |
| int i, c; |
| |
| for(i=0; i<nelem(f->mapto); i++) { |
| c = 1; |
| if(i == ' ' || i == '\t') |
| c = 0; |
| if(i >= 'a' && i <= 'z') |
| c = i + ('A' - 'a'); |
| if(i >= 'A' && i <= 'Z') |
| c = i; |
| f->mapto[i] = c; |
| if(args.vflag) { |
| if((i & 15) == 0) |
| fprint(2, " "); |
| fprint(2, " %.2x", c); |
| if((i & 15) == 15) |
| fprint(2, "\n"); |
| } |
| } |
| } |
| |
| void |
| makemapd(Field *f) |
| { |
| int i, j, c; |
| |
| for(i=0; i<nelem(f->mapto); i++) { |
| c = i; |
| if(f->flags & Iflag) |
| if(c < 040 || c > 0176) |
| c = -1; |
| if((f->flags & Wflag) && c >= 0) |
| if(c == ' ' || c == '\t') |
| c = -1; |
| if((f->flags & Dflag) && c >= 0) |
| if(!(c == ' ' || c == '\t' || |
| (c >= 'a' && c <= 'z') || |
| (c >= 'A' && c <= 'Z') || |
| (c >= '0' && c <= '9'))) { |
| for(j=0; latinmap[j]; j+=3) |
| if(c == latinmap[j+0] || |
| c == latinmap[j+1]) |
| break; |
| if(latinmap[j] == 0) |
| c = -1; |
| } |
| if((f->flags & Fflag) && c >= 0) { |
| if(c >= 'a' && c <= 'z') |
| c += 'A' - 'a'; |
| for(j=0; latinmap[j]; j+=3) |
| if(c == latinmap[j+0] || |
| c == latinmap[j+1]) { |
| c = latinmap[j+2]; |
| break; |
| } |
| } |
| if((f->flags & Rflag) && c >= 0 && i > 0 && i < Runeself) |
| c = ~c & 0xff; |
| if(c < 0) |
| c = 0; |
| f->mapto[i] = c; |
| if(args.vflag) { |
| if((i & 15) == 0) |
| fprint(2, " "); |
| fprint(2, " %.2x", c); |
| if((i & 15) == 15) |
| fprint(2, "\n"); |
| } |
| } |
| } |
| |
| int latinmap[] = |
| { |
| /* lcase ucase fold */ |
| 0xe0, 0xc0, 0x41, /* L'à', L'À', L'A', */ |
| 0xe1, 0xc1, 0x41, /* L'á', L'Á', L'A', */ |
| 0xe2, 0xc2, 0x41, /* L'â', L'Â', L'A', */ |
| 0xe4, 0xc4, 0x41, /* L'ä', L'Ä', L'A', */ |
| 0xe3, 0xc3, 0x41, /* L'ã', L'Ã', L'A', */ |
| 0xe5, 0xc5, 0x41, /* L'å', L'Å', L'A', */ |
| 0xe8, 0xc8, 0x45, /* L'è', L'È', L'E', */ |
| 0xe9, 0xc9, 0x45, /* L'é', L'É', L'E', */ |
| 0xea, 0xca, 0x45, /* L'ê', L'Ê', L'E', */ |
| 0xeb, 0xcb, 0x45, /* L'ë', L'Ë', L'E', */ |
| 0xec, 0xcc, 0x49, /* L'ì', L'Ì', L'I', */ |
| 0xed, 0xcd, 0x49, /* L'í', L'Í', L'I', */ |
| 0xee, 0xce, 0x49, /* L'î', L'Î', L'I', */ |
| 0xef, 0xcf, 0x49, /* L'ï', L'Ï', L'I', */ |
| 0xf2, 0xd2, 0x4f, /* L'ò', L'Ò', L'O', */ |
| 0xf3, 0xd3, 0x4f, /* L'ó', L'Ó', L'O', */ |
| 0xf4, 0xd4, 0x4f, /* L'ô', L'Ô', L'O', */ |
| 0xf6, 0xd6, 0x4f, /* L'ö', L'Ö', L'O', */ |
| 0xf5, 0xd5, 0x4f, /* L'õ', L'Õ', L'O', */ |
| 0xf8, 0xd8, 0x4f, /* L'ø', L'Ø', L'O', */ |
| 0xf9, 0xd9, 0x55, /* L'ù', L'Ù', L'U', */ |
| 0xfa, 0xda, 0x55, /* L'ú', L'Ú', L'U', */ |
| 0xfb, 0xdb, 0x55, /* L'û', L'Û', L'U', */ |
| 0xfc, 0xdc, 0x55, /* L'ü', L'Ü', L'U', */ |
| 0xe6, 0xc6, 0x41, /* L'æ', L'Æ', L'A', */ |
| 0xf0, 0xd0, 0x44, /* L'ð', L'Ð', L'D', */ |
| 0xf1, 0xd1, 0x4e, /* L'ñ', L'Ñ', L'N', */ |
| 0xfd, 0xdd, 0x59, /* L'ý', L'Ý', L'Y', */ |
| 0xe7, 0xc7, 0x43, /* L'ç', L'Ç', L'C', */ |
| 0, |
| }; |
| |
| Rune LJAN[] = { 'J', 'A', 'N', 0 }; |
| Rune LFEB[] = { 'F', 'E', 'B', 0 }; |
| Rune LMAR[] = { 'M', 'A', 'R', 0 }; |
| Rune LAPR[] = { 'A', 'P', 'R', 0 }; |
| Rune LMAY[] = { 'M', 'A', 'Y', 0 }; |
| Rune LJUN[] = { 'J', 'U', 'N', 0 }; |
| Rune LJUL[] = { 'J', 'U', 'L', 0 }; |
| Rune LAUG[] = { 'A', 'U', 'G', 0 }; |
| Rune LSEP[] = { 'S', 'E', 'P', 0 }; |
| Rune LOCT[] = { 'O', 'C', 'T', 0 }; |
| Rune LNOV[] = { 'N', 'O', 'V', 0 }; |
| Rune LDEC[] = { 'D', 'E', 'C', 0 }; |
| |
| Rune* month[12] = |
| { |
| LJAN, |
| LFEB, |
| LMAR, |
| LAPR, |
| LMAY, |
| LJUN, |
| LJUL, |
| LAUG, |
| LSEP, |
| LOCT, |
| LNOV, |
| LDEC, |
| }; |
| |
| /************** radix sort ***********/ |
| |
| enum |
| { |
| Threshold = 14 |
| }; |
| |
| void rsort4(Key***, ulong, int); |
| void bsort4(Key***, ulong, int); |
| |
| void |
| sort4(void *a, ulong n) |
| { |
| if(n > Threshold) |
| rsort4((Key***)a, n, 0); |
| else |
| bsort4((Key***)a, n, 0); |
| } |
| |
| void |
| rsort4(Key ***a, ulong n, int b) |
| { |
| Key ***ea, ***t, ***u, **t1, **u1, *k; |
| Key ***part[257]; |
| static long count[257]; |
| long clist[257+257], *cp, *cp1; |
| int c, lowc, higc; |
| |
| /* |
| * pass 1 over all keys, |
| * count the number of each key[b]. |
| * find low count and high count. |
| */ |
| lowc = 256; |
| higc = 0; |
| ea = a+n; |
| for(t=a; t<ea; t++) { |
| k = **t; |
| n = k->klen; |
| if(b >= n) { |
| count[256]++; |
| continue; |
| } |
| c = k->key[b]; |
| n = count[c]++; |
| if(n == 0) { |
| if(c < lowc) |
| lowc = c; |
| if(c > higc) |
| higc = c; |
| } |
| } |
| |
| /* |
| * pass 2 over all counts, |
| * put partition pointers in part[c]. |
| * save compacted indexes and counts |
| * in clist[]. |
| */ |
| t = a; |
| n = count[256]; |
| clist[0] = n; |
| part[256] = t; |
| t += n; |
| |
| cp1 = clist+1; |
| cp = count+lowc; |
| for(c=lowc; c<=higc; c++,cp++) { |
| n = *cp; |
| if(n) { |
| cp1[0] = n; |
| cp1[1] = c; |
| cp1 += 2; |
| part[c] = t; |
| t += n; |
| } |
| } |
| *cp1 = 0; |
| |
| /* |
| * pass 3 over all counts. |
| * chase lowest pointer in each partition |
| * around a permutation until it comes |
| * back and is stored where it started. |
| * static array, count[], should be |
| * reduced to zero entries except maybe |
| * count[256]. |
| */ |
| for(cp1=clist+1; cp1[0]; cp1+=2) { |
| c = cp1[1]; |
| cp = count+c; |
| while(*cp) { |
| t1 = *part[c]; |
| for(;;) { |
| k = *t1; |
| n = 256; |
| if(b < k->klen) |
| n = k->key[b]; |
| u = part[n]++; |
| count[n]--; |
| u1 = *u; |
| *u = t1; |
| if(n == c) |
| break; |
| t1 = u1; |
| } |
| } |
| } |
| |
| /* |
| * pass 4 over all partitions. |
| * call recursively. |
| */ |
| b++; |
| t = a + clist[0]; |
| count[256] = 0; |
| for(cp1=clist+1; n=cp1[0]; cp1+=2) { |
| if(n > Threshold) |
| rsort4(t, n, b); |
| else |
| if(n > 1) |
| bsort4(t, n, b); |
| t += n; |
| } |
| } |
| |
| /* |
| * bubble sort to pick up |
| * the pieces. |
| */ |
| void |
| bsort4(Key ***a, ulong n, int b) |
| { |
| Key ***i, ***j, ***k, ***l, **t; |
| Key *ka, *kb; |
| int n1, n2; |
| |
| l = a+n; |
| j = a; |
| |
| loop: |
| i = j; |
| j++; |
| if(j >= l) |
| return; |
| |
| ka = **i; |
| kb = **j; |
| n1 = ka->klen - b; |
| n2 = kb->klen - b; |
| if(n1 > n2) |
| n1 = n2; |
| if(n1 <= 0) |
| goto loop; |
| n2 = ka->key[b] - kb->key[b]; |
| if(n2 == 0) |
| n2 = memcmp(ka->key+b, kb->key+b, n1); |
| if(n2 <= 0) |
| goto loop; |
| |
| for(;;) { |
| k = i+1; |
| |
| t = *k; |
| *k = *i; |
| *i = t; |
| |
| if(i <= a) |
| goto loop; |
| |
| i--; |
| ka = **i; |
| kb = *t; |
| n1 = ka->klen - b; |
| n2 = kb->klen - b; |
| if(n1 > n2) |
| n1 = n2; |
| if(n1 <= 0) |
| goto loop; |
| n2 = ka->key[b] - kb->key[b]; |
| if(n2 == 0) |
| n2 = memcmp(ka->key+b, kb->key+b, n1); |
| if(n2 <= 0) |
| goto loop; |
| } |
| } |