From bc7cb1a15a67c859c8c71c4b52bb35fe9425a63d Mon Sep 17 00:00:00 2001 From: rsc Date: Sun, 23 Nov 2003 18:04:47 +0000 Subject: new utilities. the .C files compile but are renamed to avoid building automatically. --- src/cmd/sort.c | 1767 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 1767 insertions(+) create mode 100644 src/cmd/sort.c (limited to 'src/cmd/sort.c') diff --git a/src/cmd/sort.c b/src/cmd/sort.c new file mode 100644 index 00000000..359965e8 --- /dev/null +++ b/src/cmd/sort.c @@ -0,0 +1,1767 @@ +#include +#include +#include + +/* +bugs: + 00/ff for end of file can conflict with 00/ff characters +*/ + +enum +{ + Nline = 100000, /* default max number of lines saved in memory */ + Nmerge = 10, /* max number of temporary files merged */ + Nfield = 20, /* max number of argument fields */ + + Bflag = 1<<0, /* flags per field */ + B1flag = 1<<1, + + Dflag = 1<<2, + Fflag = 1<<3, + Gflag = 1<<4, + Iflag = 1<<5, + Mflag = 1<<6, + Nflag = 1<<7, + Rflag = 1<<8, + Wflag = 1<<9, + + NSstart = 0, /* states for number to key decoding */ + NSsign, + NSzero, + NSdigit, + NSpoint, + NSfract, + NSzerofract, + NSexp, + NSexpsign, + NSexpdigit, +}; + +typedef struct Line Line; +typedef struct Key Key; +typedef struct Merge Merge; +typedef struct Field Field; + +struct Line +{ + Key* key; + int llen; /* always >= 1 */ + uchar line[1]; /* always ends in '\n' */ +}; + +struct Merge +{ + Key* key; /* copy of line->key so (Line*) looks like (Merge*) */ + Line* line; /* line at the head of a merged temp file */ + int fd; /* file descriptor */ + Biobuf b; /* iobuf for reading a temp file */ +}; + +struct Key +{ + int klen; + uchar key[1]; +}; + +struct Field +{ + int beg1; + int beg2; + int end1; + int end2; + + long flags; + uchar mapto[256]; + + void (*dokey)(Key*, uchar*, uchar*, Field*); +}; + +struct args +{ + char* ofile; + char* tname; + Rune tabchar; + char cflag; + char uflag; + char vflag; + int nfield; + int nfile; + Field field[Nfield]; + + Line** linep; + long nline; /* number of lines in this temp file */ + long lineno; /* overall ordinal for -s option */ + int ntemp; + long mline; /* max lines per file */ +} args; + +extern int latinmap[]; +extern Rune* month[12]; + +void buildkey(Line*); +void doargs(int, char*[]); +void dofield(char*, int*, int*, int, int); +void dofile(Biobuf*); +void dokey_(Key*, uchar*, uchar*, Field*); +void dokey_dfi(Key*, uchar*, uchar*, Field*); +void dokey_gn(Key*, uchar*, uchar*, Field*); +void dokey_m(Key*, uchar*, uchar*, Field*); +void dokey_r(Key*, uchar*, uchar*, Field*); +void done(char*); +int kcmp(Key*, Key*); +void makemapd(Field*); +void makemapm(Field*); +void mergefiles(int, int, Biobuf*); +void mergeout(Biobuf*); +void newfield(void); +Line* newline(Biobuf*); +void nomem(void); +void notifyf(void*, char*); +void printargs(void); +void printout(Biobuf*); +void setfield(int, int); +uchar* skip(uchar*, int, int, int, int); +void sort4(void*, ulong); +char* tempfile(int); +void tempout(void); +void lineout(Biobuf*, Line*); + +void +main(int argc, char *argv[]) +{ + int i, f; + char *s; + Biobuf bbuf; + + notify(notifyf); /**/ + doargs(argc, argv); + if(args.vflag) + printargs(); + + for(i=1; ikey, l->key); + if(n > 0 || (n == 0 && args.uflag)) { + fprint(2, "sort: -c file not in sort\n"); /**/ + done("order"); + } + free(ol->key); + free(ol); + ol = l; + } + return; + } + + if(args.linep == 0) { + args.linep = malloc(args.mline * sizeof(args.linep)); + if(args.linep == 0) + nomem(); + } + for(;;) { + l = newline(b); + if(l == 0) + break; + if(args.nline >= args.mline) + tempout(); + args.linep[args.nline] = l; + args.nline++; + args.lineno++; + } +} + +void +notifyf(void *a, char *s) +{ + USED(a); + if(strcmp(s, "interrupt") == 0) + done(0); + if(strcmp(s, "hangup") == 0) + done(0); + if(strcmp(s, "kill") == 0) + done(0); + if(strncmp(s, "sys: write on closed pipe", 25) == 0) + done(0); + fprint(2, "sort: note: %s\n", s); + abort(); +} + +Line* +newline(Biobuf *b) +{ + Line *l; + char *p; + int n, c; + + p = Brdline(b, '\n'); + n = Blinelen(b); + if(p == 0) { + if(n == 0) + return 0; + l = 0; + for(n=0;;) { + if((n & 31) == 0) { + l = realloc(l, sizeof(Line) + + (n+31)*sizeof(l->line[0])); + if(l == 0) + nomem(); + } + c = Bgetc(b); + if(c < 0) { + fprint(2, "sort: newline added\n"); + c = '\n'; + } + l->line[n++] = c; + if(c == '\n') + break; + } + l->llen = n; + buildkey(l); + return l; + } + l = malloc(sizeof(Line) + + (n-1)*sizeof(l->line[0])); + if(l == 0) + nomem(); + l->llen = n; + memmove(l->line, p, n); + buildkey(l); + return l; +} + +void +lineout(Biobuf *b, Line *l) +{ + int n, m; + + n = l->llen; + m = Bwrite(b, l->line, n); + if(n != m) + exits("write"); +} + +void +tempout(void) +{ + long n; + Line **lp, *l; + char *tf; + int f; + Biobuf tb; + + sort4(args.linep, args.nline); + tf = tempfile(args.ntemp); + args.ntemp++; + f = create(tf, OWRITE, 0666); + if(f < 0) { + fprint(2, "sort: create %s: %r\n", tf); + done("create"); + } + + Binit(&tb, f, OWRITE); + lp = args.linep; + for(n=args.nline; n>0; n--) { + l = *lp++; + lineout(&tb, l); + free(l->key); + free(l); + } + args.nline = 0; + Bterm(&tb); + close(f); +} + +void +done(char *xs) +{ + int i; + + for(i=0; i= nelem(file)-20) { + fprint(2, "temp file directory name is too long: %s\n", dir); + done("tdir"); + } + + if(pid == 0) { + pid = getpid(); + if(pid == 0) { + pid = time(0); + if(pid == 0) + pid = 1; + } + } + + sprint(file, "%s/sort.%.4d.%.4d", dir, pid%10000, n); + return file; +} + +void +mergeout(Biobuf *b) +{ + int n, i, f; + char *tf; + Biobuf tb; + + for(i=0; i Nmerge) { + tf = tempfile(args.ntemp); + args.ntemp++; + f = create(tf, OWRITE, 0666); + if(f < 0) { + fprint(2, "sort: create %s: %r\n", tf); + done("create"); + } + Binit(&tb, f, OWRITE); + + n = Nmerge; + mergefiles(i, n, &tb); + + Bterm(&tb); + close(f); + } else + mergefiles(i, n, b); + } +} + +void +mergefiles(int t, int n, Biobuf *b) +{ + Merge *m, *mp, **mmp; + Key *ok; + Line *l; + char *tf; + int i, f, nn; + + mmp = malloc(n*sizeof(*mmp)); + mp = malloc(n*sizeof(*mp)); + if(mmp == 0 || mp == 0) + nomem(); + + nn = 0; + m = mp; + for(i=0; ifd = f; + Binit(&m->b, f, OREAD); + mmp[nn] = m; + + l = newline(&m->b); + if(l == 0) + continue; + nn++; + m->line = l; + m->key = l->key; + } + + ok = 0; + for(;;) { + sort4(mmp, nn); + m = *mmp; + if(nn == 0) + break; + for(;;) { + l = m->line; + if(args.uflag && ok && kcmp(ok, l->key) == 0) { + free(l->key); + free(l); + } else { + lineout(b, l); + if(ok) + free(ok); + ok = l->key; + free(l); + } + + l = newline(&m->b); + if(l == 0) { + nn--; + mmp[0] = mmp[nn]; + break; + } + m->line = l; + m->key = l->key; + if(nn > 1 && kcmp(mmp[0]->key, mmp[1]->key) > 0) + break; + } + } + if(ok) + free(ok); + + m = mp; + for(i=0; ib); + close(m->fd); + } + + free(mp); + free(mmp); +} + +int +kcmp(Key *ka, Key *kb) +{ + int n, m; + + /* + * set n to length of smaller key + */ + n = ka->klen; + m = kb->klen; + if(n > m) + n = m; + return memcmp(ka->key, kb->key, n); +} + +void +printout(Biobuf *b) +{ + long n; + Line **lp, *l; + Key *ok; + + sort4(args.linep, args.nline); + lp = args.linep; + ok = 0; + for(n=args.nline; n>0; n--) { + l = *lp++; + if(args.uflag && ok && kcmp(ok, l->key) == 0) + continue; + lineout(b, l); + ok = l->key; + } +} + +void +setfield(int n, int c) +{ + Field *f; + + f = &args.field[n]; + switch(c) { + default: + fprint(2, "sort: unknown option: field.%C\n", c); + done("option"); + case 'b': /* skip blanks */ + f->flags |= Bflag; + break; + case 'd': /* directory order */ + f->flags |= Dflag; + break; + case 'f': /* fold case */ + f->flags |= Fflag; + break; + case 'g': /* floating point -n case */ + f->flags |= Gflag; + break; + case 'i': /* ignore non-ascii */ + f->flags |= Iflag; + break; + case 'M': /* month */ + f->flags |= Mflag; + break; + case 'n': /* numbers */ + f->flags |= Nflag; + break; + case 'r': /* reverse */ + f->flags |= Rflag; + break; + case 'w': /* ignore white */ + f->flags |= Wflag; + break; + } +} + +void +dofield(char *s, int *n1, int *n2, int off1, int off2) +{ + int c, n; + + c = *s++; + if(c >= '0' && c <= '9') { + n = 0; + while(c >= '0' && c <= '9') { + n = n*10 + (c-'0'); + c = *s++; + } + n -= off1; /* posix committee: rot in hell */ + if(n < 0) { + fprint(2, "sort: field offset must be positive\n"); + done("option"); + } + *n1 = n; + } + if(c == '.') { + c = *s++; + if(c >= '0' && c <= '9') { + n = 0; + while(c >= '0' && c <= '9') { + n = n*10 + (c-'0'); + c = *s++; + } + n -= off2; + if(n < 0) { + fprint(2, "sort: character offset must be positive\n"); + done("option"); + } + *n2 = n; + } + } + while(c != 0) { + setfield(args.nfield, c); + c = *s++; + } +} + +void +printargs(void) +{ + int i, n; + Field *f; + char *prefix; + + fprint(2, "sort"); + for(i=0; i<=args.nfield; i++) { + f = &args.field[i]; + prefix = " -"; + if(i) { + n = f->beg1; + if(n >= 0) + fprint(2, " +%d", n); + else + fprint(2, " +*"); + n = f->beg2; + if(n >= 0) + fprint(2, ".%d", n); + else + fprint(2, ".*"); + + if(f->flags & B1flag) + fprint(2, "b"); + + n = f->end1; + if(n >= 0) + fprint(2, " -%d", n); + else + fprint(2, " -*"); + n = f->end2; + if(n >= 0) + fprint(2, ".%d", n); + else + fprint(2, ".*"); + prefix = ""; + } + if(f->flags & Bflag) + fprint(2, "%sb", prefix); + if(f->flags & Dflag) + fprint(2, "%sd", prefix); + if(f->flags & Fflag) + fprint(2, "%sf", prefix); + if(f->flags & Gflag) + fprint(2, "%sg", prefix); + if(f->flags & Iflag) + fprint(2, "%si", prefix); + if(f->flags & Mflag) + fprint(2, "%sM", prefix); + if(f->flags & Nflag) + fprint(2, "%sn", prefix); + if(f->flags & Rflag) + fprint(2, "%sr", prefix); + if(f->flags & Wflag) + fprint(2, "%sw", prefix); + } + if(args.cflag) + fprint(2, " -c"); + if(args.uflag) + fprint(2, " -u"); + if(args.ofile) + fprint(2, " -o %s", args.ofile); + if(args.mline != Nline) + fprint(2, " -l %ld", args.mline); + fprint(2, "\n"); +} + +void +newfield(void) +{ + int n; + Field *f; + + n = args.nfield + 1; + if(n >= Nfield) { + fprint(2, "sort: too many fields specified\n"); + done("option"); + } + args.nfield = n; + f = &args.field[n]; + f->beg1 = -1; + f->beg2 = -1; + f->end1 = -1; + f->end2 = -1; +} + +void +doargs(int argc, char *argv[]) +{ + int i, c, hadplus; + char *s, *p, *q; + Field *f; + + hadplus = 0; + args.mline = Nline; + for(i=1; i= '0' && c <= '9')) { + if(!hadplus) + newfield(); + f = &args.field[args.nfield]; + dofield(s, &f->end1, &f->end2, 0, 0); + hadplus = 0; + continue; + } + + while(c = *s++) + switch(c) { + case '-': /* end of options */ + i = argc; + continue; + case 'T': /* temp directory */ + if(*s == 0) { + i++; + if(i < argc) { + args.tname = argv[i]; + argv[i] = 0; + } + } else + args.tname = s; + s = strchr(s, 0); + break; + case 'o': /* output file */ + if(*s == 0) { + i++; + if(i < argc) { + args.ofile = argv[i]; + argv[i] = 0; + } + } else + args.ofile = s; + s = strchr(s, 0); + break; + case 'k': /* posix key (what were they thinking?) */ + p = 0; + if(*s == 0) { + i++; + if(i < argc) { + p = argv[i]; + argv[i] = 0; + } + } else + p = s; + s = strchr(s, 0); + if(p == 0) + break; + + newfield(); + q = strchr(p, ','); + if(q) + *q++ = 0; + f = &args.field[args.nfield]; + dofield(p, &f->beg1, &f->beg2, 1, 1); + if(f->flags & Bflag) { + f->flags |= B1flag; + f->flags &= ~Bflag; + } + if(q) { + dofield(q, &f->end1, &f->end2, 1, 0); + if(f->end2 <= 0) + f->end1++; + } + hadplus = 0; + break; + case 't': /* tab character */ + if(*s == 0) { + i++; + if(i < argc) { + chartorune(&args.tabchar, argv[i]); + argv[i] = 0; + } + } else + s += chartorune(&args.tabchar, s); + if(args.tabchar == '\n') { + fprint(2, "aw come on, rob\n"); + done("rob"); + } + break; + case 'c': /* check order */ + args.cflag = 1; + break; + case 'u': /* unique */ + args.uflag = 1; + break; + case 'v': /* debugging noise */ + args.vflag = 1; + break; + case 'l': + if(*s == 0) { + i++; + if(i < argc) { + args.mline = atol(argv[i]); + argv[i] = 0; + } + } else + args.mline = atol(s); + s = strchr(s, 0); + break; + + case 'M': /* month */ + case 'b': /* skip blanks */ + case 'd': /* directory order */ + case 'f': /* fold case */ + case 'g': /* floating numbers */ + case 'i': /* ignore non-ascii */ + case 'n': /* numbers */ + case 'r': /* reverse */ + case 'w': /* ignore white */ + if(args.nfield > 0) + fprint(2, "sort: global field set after -k\n"); + setfield(0, c); + break; + case 'm': + /* option m silently ignored but required by posix */ + break; + default: + fprint(2, "sort: unknown option: -%C\n", c); + done("option"); + } + continue; + } + if(c == '+') { + argv[i] = 0; /* clobber args processed */ + c = *s; + if(c == '.' || (c >= '0' && c <= '9')) { + newfield(); + f = &args.field[args.nfield]; + dofield(s, &f->beg1, &f->beg2, 0, 0); + if(f->flags & Bflag) { + f->flags |= B1flag; + f->flags &= ~Bflag; + } + hadplus = 1; + continue; + } + fprint(2, "sort: unknown option: +%C\n", c); + done("option"); + } + args.nfile++; + } + + for(i=0; i<=args.nfield; i++) { + f = &args.field[i]; + + /* + * global options apply to fields that + * specify no options + */ + if(f->flags == 0) { + f->flags = args.field[0].flags; + if(args.field[0].flags & Bflag) + f->flags |= B1flag; + } + + + /* + * build buildkey specification + */ + switch(f->flags & ~(Bflag|B1flag)) { + default: + fprint(2, "sort: illegal combination of flags: %lx\n", f->flags); + done("option"); + case 0: + f->dokey = dokey_; + break; + case Rflag: + f->dokey = dokey_r; + break; + case Gflag: + case Nflag: + case Gflag|Nflag: + case Gflag|Rflag: + case Nflag|Rflag: + case Gflag|Nflag|Rflag: + f->dokey = dokey_gn; + break; + case Mflag: + case Mflag|Rflag: + f->dokey = dokey_m; + makemapm(f); + break; + case Dflag: + case Dflag|Fflag: + case Dflag|Fflag|Iflag: + case Dflag|Fflag|Iflag|Rflag: + case Dflag|Fflag|Iflag|Rflag|Wflag: + case Dflag|Fflag|Iflag|Wflag: + case Dflag|Fflag|Rflag: + case Dflag|Fflag|Rflag|Wflag: + case Dflag|Fflag|Wflag: + case Dflag|Iflag: + case Dflag|Iflag|Rflag: + case Dflag|Iflag|Rflag|Wflag: + case Dflag|Iflag|Wflag: + case Dflag|Rflag: + case Dflag|Rflag|Wflag: + case Dflag|Wflag: + case Fflag: + case Fflag|Iflag: + case Fflag|Iflag|Rflag: + case Fflag|Iflag|Rflag|Wflag: + case Fflag|Iflag|Wflag: + case Fflag|Rflag: + case Fflag|Rflag|Wflag: + case Fflag|Wflag: + case Iflag: + case Iflag|Rflag: + case Iflag|Rflag|Wflag: + case Iflag|Wflag: + case Wflag: + f->dokey = dokey_dfi; + makemapd(f); + break; + } + } + + /* + * random spot checks + */ + if(args.nfile > 1 && args.cflag) { + fprint(2, "sort: -c can have at most one input file\n"); + done("option"); + } + return; +} + +uchar* +skip(uchar *l, int n1, int n2, int bflag, int endfield) +{ + int i, c, tc; + Rune r; + + if(endfield && n1 < 0) + return 0; + + c = *l++; + tc = args.tabchar; + if(tc) { + if(tc < Runeself) { + for(i=n1; i>0; i--) { + while(c != tc) { + if(c == '\n') + return 0; + c = *l++; + } + if(!(endfield && i == 1)) + c = *l++; + } + } else { + l--; + l += chartorune(&r, (char*)l); + for(i=n1; i>0; i--) { + while(r != tc) { + if(r == '\n') + return 0; + l += chartorune(&r, (char*)l); + } + if(!(endfield && i == 1)) + l += chartorune(&r, (char*)l); + } + c = r; + } + } else { + for(i=n1; i>0; i--) { + while(c == ' ' || c == '\t') + c = *l++; + while(c != ' ' && c != '\t') { + if(c == '\n') + return 0; + c = *l++; + } + } + } + + if(bflag) + while(c == ' ' || c == '\t') + c = *l++; + + l--; + for(i=n2; i>0; i--) { + c = *l; + if(c < Runeself) { + if(c == '\n') + return 0; + l++; + continue; + } + l += chartorune(&r, (char*)l); + } + return l; +} + +void +dokey_gn(Key *k, uchar *lp, uchar *lpe, Field *f) +{ + uchar *kp; + int c, cl, dp; + int state, nzero, exp, expsign, rflag; + + cl = k->klen + 3; + kp = k->key + cl; /* skip place for sign, exponent[2] */ + + nzero = 0; /* number of trailing zeros */ + exp = 0; /* value of the exponent */ + expsign = 0; /* sign of the exponent */ + dp = 0x4040; /* location of decimal point */ + rflag = f->flags&Rflag; /* xor of rflag and - sign */ + state = NSstart; + + for(;; lp++) { + if(lp >= lpe) + break; + c = *lp; + + if(c == ' ' || c == '\t') { + switch(state) { + case NSstart: + case NSsign: + continue; + } + break; + } + if(c == '+' || c == '-') { + switch(state) { + case NSstart: + state = NSsign; + if(c == '-') + rflag = !rflag; + continue; + case NSexp: + state = NSexpsign; + if(c == '-') + expsign = 1; + continue; + } + break; + } + if(c == '0') { + switch(state) { + case NSdigit: + if(rflag) + c = ~c; + *kp++ = c; + cl++; + nzero++; + dp++; + state = NSdigit; + continue; + case NSfract: + if(rflag) + c = ~c; + *kp++ = c; + cl++; + nzero++; + state = NSfract; + continue; + case NSstart: + case NSsign: + case NSzero: + state = NSzero; + continue; + case NSzerofract: + case NSpoint: + dp--; + state = NSzerofract; + continue; + case NSexpsign: + case NSexp: + case NSexpdigit: + exp = exp*10 + (c - '0'); + state = NSexpdigit; + continue; + } + break; + } + if(c >= '1' && c <= '9') { + switch(state) { + case NSzero: + case NSstart: + case NSsign: + case NSdigit: + if(rflag) + c = ~c; + *kp++ = c; + cl++; + nzero = 0; + dp++; + state = NSdigit; + continue; + case NSzerofract: + case NSpoint: + case NSfract: + if(rflag) + c = ~c; + *kp++ = c; + cl++; + nzero = 0; + state = NSfract; + continue; + case NSexpsign: + case NSexp: + case NSexpdigit: + exp = exp*10 + (c - '0'); + state = NSexpdigit; + continue; + } + break; + } + if(c == '.') { + switch(state) { + case NSstart: + case NSsign: + state = NSpoint; + continue; + case NSzero: + state = NSzerofract; + continue; + case NSdigit: + state = NSfract; + continue; + } + break; + } + if((f->flags & Gflag) && (c == 'e' || c == 'E')) { + switch(state) { + case NSdigit: + case NSfract: + state = NSexp; + continue; + } + break; + } + break; + } + + switch(state) { + /* + * result is zero + */ + case NSstart: + case NSsign: + case NSzero: + case NSzerofract: + case NSpoint: + kp = k->key + k->klen; + k->klen += 2; + kp[0] = 0x20; /* between + and - */ + kp[1] = 0; + return; + /* + * result has exponent + */ + case NSexpsign: + case NSexp: + case NSexpdigit: + if(expsign) + exp = -exp; + dp += exp; + + /* + * result is fixed point number + */ + case NSdigit: + case NSfract: + kp -= nzero; + cl -= nzero; + break; + } + + /* + * end of number + */ + c = 0; + if(rflag) + c = ~c; + *kp = c; + + /* + * sign and exponent + */ + c = 0x30; + if(rflag) { + c = 0x10; + dp = ~dp; + } + kp = k->key + k->klen; + kp[0] = c; + kp[1] = (dp >> 8); + kp[2] = dp; + k->klen = cl+1; +} + +void +dokey_m(Key *k, uchar *lp, uchar *lpe, Field *f) +{ + uchar *kp; + Rune r, place[3]; + int c, cl, pc; + int rflag; + + rflag = f->flags&Rflag; + pc = 0; + + cl = k->klen; + kp = k->key + cl; + + for(;;) { + /* + * get the character + */ + if(lp >= lpe) + break; + c = *lp; + if(c >= Runeself) { + lp += chartorune(&r, (char*)lp); + c = r; + } else + lp++; + + if(c < nelem(f->mapto)) { + c = f->mapto[c]; + if(c == 0) + continue; + } + place[pc++] = c; + if(pc < 3) + continue; + for(c=11; c>=0; c--) + if(memcmp(month[c], place, sizeof(place)) == 0) + break; + c += 10; + if(rflag) + c = ~c; + *kp++ = c; + cl++; + break; + } + + c = 0; + if(rflag) + c = ~c; + *kp = c; + k->klen = cl+1; +} + +void +dokey_dfi(Key *k, uchar *lp, uchar *lpe, Field *f) +{ + uchar *kp; + Rune r; + int c, cl, n, rflag; + + cl = k->klen; + kp = k->key + cl; + rflag = f->flags & Rflag; + + for(;;) { + /* + * get the character + */ + if(lp >= lpe) + break; + c = *lp; + if(c >= Runeself) { + lp += chartorune(&r, (char*)lp); + c = r; + } else + lp++; + + /* + * do the various mappings. + * the common case is handled + * completely by the table. + */ + if(c != 0 && c < Runeself) { + c = f->mapto[c]; + if(c) { + *kp++ = c; + cl++; + } + continue; + } + + /* + * for characters out of range, + * the table does not do Rflag. + * ignore is based on mapto[255] + */ + if(c != 0 && c < nelem(f->mapto)) { + c = f->mapto[c]; + if(c == 0) + continue; + } else + if(f->mapto[nelem(f->mapto)-1] == 0) + continue; + + /* + * put it in the key + */ + r = c; + n = runetochar((char*)kp, &r); + kp += n; + cl += n; + if(rflag) + while(n > 0) { + kp[-n] = ~kp[-n]; + n--; + } + } + + /* + * end of key + */ + k->klen = cl+1; + if(rflag) { + *kp = ~0; + return; + } + *kp = 0; +} + +void +dokey_r(Key *k, uchar *lp, uchar *lpe, Field *f) +{ + int cl, n; + uchar *kp; + + USED(f); + n = lpe - lp; + if(n < 0) + n = 0; + cl = k->klen; + kp = k->key + cl; + k->klen = cl+n+1; + + lpe -= 3; + while(lp < lpe) { + kp[0] = ~lp[0]; + kp[1] = ~lp[1]; + kp[2] = ~lp[2]; + kp[3] = ~lp[3]; + kp += 4; + lp += 4; + } + + lpe += 3; + while(lp < lpe) + *kp++ = ~*lp++; + *kp = ~0; +} + +void +dokey_(Key *k, uchar *lp, uchar *lpe, Field *f) +{ + int n, cl; + uchar *kp; + + USED(f); + n = lpe - lp; + if(n < 0) + n = 0; + cl = k->klen; + kp = k->key + cl; + k->klen = cl+n+1; + memmove(kp, lp, n); + kp[n] = 0; +} + +void +buildkey(Line *l) +{ + Key *k; + uchar *lp, *lpe; + int ll, kl, cl, i, n; + Field *f; + + ll = l->llen - 1; + kl = 0; /* allocated length */ + cl = 0; /* current length */ + k = 0; + + for(i=1; i<=args.nfield; i++) { + f = &args.field[i]; + lp = skip(l->line, f->beg1, f->beg2, f->flags&B1flag, 0); + if(lp == 0) + lp = l->line + ll; + lpe = skip(l->line, f->end1, f->end2, f->flags&Bflag, 1); + if(lpe == 0) + lpe = l->line + ll; + n = (lpe - lp) + 1; + if(n <= 0) + n = 1; + if(cl+(n+4) > kl) { + kl = cl+(n+4); + k = realloc(k, sizeof(Key) + + (kl-1)*sizeof(k->key[0])); + if(k == 0) + nomem(); + } + k->klen = cl; + (*f->dokey)(k, lp, lpe, f); + cl = k->klen; + } + + /* + * global comparisons + */ + if(!(args.uflag && cl > 0)) { + f = &args.field[0]; + if(cl+(ll+4) > kl) { + kl = cl+(ll+4); + k = realloc(k, sizeof(Key) + + (kl-1)*sizeof(k->key[0])); + if(k == 0) + nomem(); + } + k->klen = cl; + (*f->dokey)(k, l->line, l->line+ll, f); + cl = k->klen; + } + + l->key = k; + k->klen = cl; + + if(args.vflag) { + write(2, l->line, l->llen); + for(i=0; iklen; i++) { + fprint(2, " %.2x", k->key[i]); + if(k->key[i] == 0x00 || k->key[i] == 0xff) + fprint(2, "\n"); + } + } +} + +void +makemapm(Field *f) +{ + int i, c; + + for(i=0; imapto); i++) { + c = 1; + if(i == ' ' || i == '\t') + c = 0; + if(i >= 'a' && i <= 'z') + c = i + ('A' - 'a'); + if(i >= 'A' && i <= 'Z') + c = i; + f->mapto[i] = c; + if(args.vflag) { + if((i & 15) == 0) + fprint(2, " "); + fprint(2, " %.2x", c); + if((i & 15) == 15) + fprint(2, "\n"); + } + } +} + +void +makemapd(Field *f) +{ + int i, j, c; + + for(i=0; imapto); i++) { + c = i; + if(f->flags & Iflag) + if(c < 040 || c > 0176) + c = -1; + if((f->flags & Wflag) && c >= 0) + if(c == ' ' || c == '\t') + c = -1; + if((f->flags & Dflag) && c >= 0) + if(!(c == ' ' || c == '\t' || + (c >= 'a' && c <= 'z') || + (c >= 'A' && c <= 'Z') || + (c >= '0' && c <= '9'))) { + for(j=0; latinmap[j]; j+=3) + if(c == latinmap[j+0] || + c == latinmap[j+1]) + break; + if(latinmap[j] == 0) + c = -1; + } + if((f->flags & Fflag) && c >= 0) { + if(c >= 'a' && c <= 'z') + c += 'A' - 'a'; + for(j=0; latinmap[j]; j+=3) + if(c == latinmap[j+0] || + c == latinmap[j+1]) { + c = latinmap[j+2]; + break; + } + } + if((f->flags & Rflag) && c >= 0 && i > 0 && i < Runeself) + c = ~c & 0xff; + if(c < 0) + c = 0; + f->mapto[i] = c; + if(args.vflag) { + if((i & 15) == 0) + fprint(2, " "); + fprint(2, " %.2x", c); + if((i & 15) == 15) + fprint(2, "\n"); + } + } +} + +int latinmap[] = +{ +/* lcase ucase fold */ + 0xe0, 0xc0, 0x41, /* L'à', L'À', L'A', */ + 0xe1, 0xc1, 0x41, /* L'á', L'Á', L'A', */ + 0xe2, 0xc2, 0x41, /* L'â', L'Â', L'A', */ + 0xe4, 0xc4, 0x41, /* L'ä', L'Ä', L'A', */ + 0xe3, 0xc3, 0x41, /* L'ã', L'Ã', L'A', */ + 0xe5, 0xc5, 0x41, /* L'å', L'Å', L'A', */ + 0xe8, 0xc8, 0x45, /* L'è', L'È', L'E', */ + 0xe9, 0xc9, 0x45, /* L'é', L'É', L'E', */ + 0xea, 0xca, 0x45, /* L'ê', L'Ê', L'E', */ + 0xeb, 0xcb, 0x45, /* L'ë', L'Ë', L'E', */ + 0xec, 0xcc, 0x49, /* L'ì', L'Ì', L'I', */ + 0xed, 0xcd, 0x49, /* L'í', L'Í', L'I', */ + 0xee, 0xce, 0x49, /* L'î', L'Î', L'I', */ + 0xef, 0xcf, 0x49, /* L'ï', L'Ï', L'I', */ + 0xf2, 0xd2, 0x4f, /* L'ò', L'Ò', L'O', */ + 0xf3, 0xd3, 0x4f, /* L'ó', L'Ó', L'O', */ + 0xf4, 0xd4, 0x4f, /* L'ô', L'Ô', L'O', */ + 0xf6, 0xd6, 0x4f, /* L'ö', L'Ö', L'O', */ + 0xf5, 0xd5, 0x4f, /* L'õ', L'Õ', L'O', */ + 0xf8, 0xd8, 0x4f, /* L'ø', L'Ø', L'O', */ + 0xf9, 0xd9, 0x55, /* L'ù', L'Ù', L'U', */ + 0xfa, 0xda, 0x55, /* L'ú', L'Ú', L'U', */ + 0xfb, 0xdb, 0x55, /* L'û', L'Û', L'U', */ + 0xfc, 0xdc, 0x55, /* L'ü', L'Ü', L'U', */ + 0xe6, 0xc6, 0x41, /* L'æ', L'Æ', L'A', */ + 0xf0, 0xd0, 0x44, /* L'ð', L'Ð', L'D', */ + 0xf1, 0xd1, 0x4e, /* L'ñ', L'Ñ', L'N', */ + 0xfd, 0xdd, 0x59, /* L'ý', L'Ý', L'Y', */ + 0xe7, 0xc7, 0x43, /* L'ç', L'Ç', L'C', */ + 0, +}; + +Rune LJAN[] = { 'J', 'A', 'N', 0 }; +Rune LFEB[] = { 'F', 'E', 'B', 0 }; +Rune LMAR[] = { 'M', 'A', 'R', 0 }; +Rune LAPR[] = { 'A', 'P', 'R', 0 }; +Rune LMAY[] = { 'M', 'A', 'Y', 0 }; +Rune LJUN[] = { 'J', 'U', 'N', 0 }; +Rune LJUL[] = { 'J', 'U', 'L', 0 }; +Rune LAUG[] = { 'A', 'U', 'G', 0 }; +Rune LSEP[] = { 'S', 'E', 'P', 0 }; +Rune LOCT[] = { 'O', 'C', 'T', 0 }; +Rune LNOV[] = { 'N', 'O', 'V', 0 }; +Rune LDEC[] = { 'D', 'E', 'C', 0 }; + +Rune* month[12] = +{ + LJAN, + LFEB, + LMAR, + LAPR, + LMAY, + LJUN, + LJUL, + LAUG, + LSEP, + LOCT, + LNOV, + LDEC, +}; + +/************** radix sort ***********/ + +enum +{ + Threshold = 14, +}; + +void rsort4(Key***, ulong, int); +void bsort4(Key***, ulong, int); + +void +sort4(void *a, ulong n) +{ + if(n > Threshold) + rsort4((Key***)a, n, 0); + else + bsort4((Key***)a, n, 0); +} + +void +rsort4(Key ***a, ulong n, int b) +{ + Key ***ea, ***t, ***u, **t1, **u1, *k; + Key ***part[257]; + static long count[257]; + long clist[257+257], *cp, *cp1; + int c, lowc, higc; + + /* + * pass 1 over all keys, + * count the number of each key[b]. + * find low count and high count. + */ + lowc = 256; + higc = 0; + ea = a+n; + for(t=a; tklen; + if(b >= n) { + count[256]++; + continue; + } + c = k->key[b]; + n = count[c]++; + if(n == 0) { + if(c < lowc) + lowc = c; + if(c > higc) + higc = c; + } + } + + /* + * pass 2 over all counts, + * put partition pointers in part[c]. + * save compacted indexes and counts + * in clist[]. + */ + t = a; + n = count[256]; + clist[0] = n; + part[256] = t; + t += n; + + cp1 = clist+1; + cp = count+lowc; + for(c=lowc; c<=higc; c++,cp++) { + n = *cp; + if(n) { + cp1[0] = n; + cp1[1] = c; + cp1 += 2; + part[c] = t; + t += n; + } + } + *cp1 = 0; + + /* + * pass 3 over all counts. + * chase lowest pointer in each partition + * around a permutation until it comes + * back and is stored where it started. + * static array, count[], should be + * reduced to zero entries except maybe + * count[256]. + */ + for(cp1=clist+1; cp1[0]; cp1+=2) { + c = cp1[1]; + cp = count+c; + while(*cp) { + t1 = *part[c]; + for(;;) { + k = *t1; + n = 256; + if(b < k->klen) + n = k->key[b]; + u = part[n]++; + count[n]--; + u1 = *u; + *u = t1; + if(n == c) + break; + t1 = u1; + } + } + } + + /* + * pass 4 over all partitions. + * call recursively. + */ + b++; + t = a + clist[0]; + count[256] = 0; + for(cp1=clist+1; n=cp1[0]; cp1+=2) { + if(n > Threshold) + rsort4(t, n, b); + else + if(n > 1) + bsort4(t, n, b); + t += n; + } +} + +/* + * bubble sort to pick up + * the pieces. + */ +void +bsort4(Key ***a, ulong n, int b) +{ + Key ***i, ***j, ***k, ***l, **t; + Key *ka, *kb; + int n1, n2; + + l = a+n; + j = a; + +loop: + i = j; + j++; + if(j >= l) + return; + + ka = **i; + kb = **j; + n1 = ka->klen - b; + n2 = kb->klen - b; + if(n1 > n2) + n1 = n2; + if(n1 <= 0) + goto loop; + n2 = ka->key[b] - kb->key[b]; + if(n2 == 0) + n2 = memcmp(ka->key+b, kb->key+b, n1); + if(n2 <= 0) + goto loop; + + for(;;) { + k = i+1; + + t = *k; + *k = *i; + *i = t; + + if(i <= a) + goto loop; + + i--; + ka = **i; + kb = *t; + n1 = ka->klen - b; + n2 = kb->klen - b; + if(n1 > n2) + n1 = n2; + if(n1 <= 0) + goto loop; + n2 = ka->key[b] - kb->key[b]; + if(n2 == 0) + n2 = memcmp(ka->key+b, kb->key+b, n1); + if(n2 <= 0) + goto loop; + } +} -- cgit v1.2.3