aboutsummaryrefslogtreecommitdiff
path: root/src/cmd/sort.c
diff options
context:
space:
mode:
authorrsc <devnull@localhost>2003-11-23 18:04:47 +0000
committerrsc <devnull@localhost>2003-11-23 18:04:47 +0000
commitbc7cb1a15a67c859c8c71c4b52bb35fe9425a63d (patch)
tree8ca0fe4e2418e6aa18dc74a236c577a719f6c6ed /src/cmd/sort.c
parentf08fdedcee12c06e3ce9ac9bec363915978e8289 (diff)
downloadplan9port-bc7cb1a15a67c859c8c71c4b52bb35fe9425a63d.tar.gz
plan9port-bc7cb1a15a67c859c8c71c4b52bb35fe9425a63d.tar.bz2
plan9port-bc7cb1a15a67c859c8c71c4b52bb35fe9425a63d.zip
new utilities.
the .C files compile but are renamed to avoid building automatically.
Diffstat (limited to 'src/cmd/sort.c')
-rw-r--r--src/cmd/sort.c1767
1 files changed, 1767 insertions, 0 deletions
diff --git a/src/cmd/sort.c b/src/cmd/sort.c
new file mode 100644
index 00000000..359965e8
--- /dev/null
+++ b/src/cmd/sort.c
@@ -0,0 +1,1767 @@
+#include <u.h>
+#include <libc.h>
+#include <bio.h>
+
+/*
+bugs:
+ 00/ff for end of file can conflict with 00/ff characters
+*/
+
+enum
+{
+ Nline = 100000, /* default max number of lines saved in memory */
+ Nmerge = 10, /* max number of temporary files merged */
+ Nfield = 20, /* max number of argument fields */
+
+ Bflag = 1<<0, /* flags per field */
+ B1flag = 1<<1,
+
+ Dflag = 1<<2,
+ Fflag = 1<<3,
+ Gflag = 1<<4,
+ Iflag = 1<<5,
+ Mflag = 1<<6,
+ Nflag = 1<<7,
+ Rflag = 1<<8,
+ Wflag = 1<<9,
+
+ NSstart = 0, /* states for number to key decoding */
+ NSsign,
+ NSzero,
+ NSdigit,
+ NSpoint,
+ NSfract,
+ NSzerofract,
+ NSexp,
+ NSexpsign,
+ NSexpdigit,
+};
+
+typedef struct Line Line;
+typedef struct Key Key;
+typedef struct Merge Merge;
+typedef struct Field Field;
+
+struct Line
+{
+ Key* key;
+ int llen; /* always >= 1 */
+ uchar line[1]; /* always ends in '\n' */
+};
+
+struct Merge
+{
+ Key* key; /* copy of line->key so (Line*) looks like (Merge*) */
+ Line* line; /* line at the head of a merged temp file */
+ int fd; /* file descriptor */
+ Biobuf b; /* iobuf for reading a temp file */
+};
+
+struct Key
+{
+ int klen;
+ uchar key[1];
+};
+
+struct Field
+{
+ int beg1;
+ int beg2;
+ int end1;
+ int end2;
+
+ long flags;
+ uchar mapto[256];
+
+ void (*dokey)(Key*, uchar*, uchar*, Field*);
+};
+
+struct args
+{
+ char* ofile;
+ char* tname;
+ Rune tabchar;
+ char cflag;
+ char uflag;
+ char vflag;
+ int nfield;
+ int nfile;
+ Field field[Nfield];
+
+ Line** linep;
+ long nline; /* number of lines in this temp file */
+ long lineno; /* overall ordinal for -s option */
+ int ntemp;
+ long mline; /* max lines per file */
+} args;
+
+extern int latinmap[];
+extern Rune* month[12];
+
+void buildkey(Line*);
+void doargs(int, char*[]);
+void dofield(char*, int*, int*, int, int);
+void dofile(Biobuf*);
+void dokey_(Key*, uchar*, uchar*, Field*);
+void dokey_dfi(Key*, uchar*, uchar*, Field*);
+void dokey_gn(Key*, uchar*, uchar*, Field*);
+void dokey_m(Key*, uchar*, uchar*, Field*);
+void dokey_r(Key*, uchar*, uchar*, Field*);
+void done(char*);
+int kcmp(Key*, Key*);
+void makemapd(Field*);
+void makemapm(Field*);
+void mergefiles(int, int, Biobuf*);
+void mergeout(Biobuf*);
+void newfield(void);
+Line* newline(Biobuf*);
+void nomem(void);
+void notifyf(void*, char*);
+void printargs(void);
+void printout(Biobuf*);
+void setfield(int, int);
+uchar* skip(uchar*, int, int, int, int);
+void sort4(void*, ulong);
+char* tempfile(int);
+void tempout(void);
+void lineout(Biobuf*, Line*);
+
+void
+main(int argc, char *argv[])
+{
+ int i, f;
+ char *s;
+ Biobuf bbuf;
+
+ notify(notifyf); /**/
+ doargs(argc, argv);
+ if(args.vflag)
+ printargs();
+
+ for(i=1; i<argc; i++) {
+ s = argv[i];
+ if(s == 0)
+ continue;
+ if(strcmp(s, "-") == 0) {
+ Binit(&bbuf, 0, OREAD);
+ dofile(&bbuf);
+ Bterm(&bbuf);
+ continue;
+ }
+ f = open(s, OREAD);
+ if(f < 0) {
+ fprint(2, "sort: open %s: %r\n", s);
+ done("open");
+ }
+ Binit(&bbuf, f, OREAD);
+ dofile(&bbuf);
+ Bterm(&bbuf);
+ close(f);
+ }
+ if(args.nfile == 0) {
+ Binit(&bbuf, 0, OREAD);
+ dofile(&bbuf);
+ Bterm(&bbuf);
+ }
+ if(args.cflag)
+ done(0);
+ if(args.vflag)
+ fprint(2, "=========\n");
+
+ f = 1;
+ if(args.ofile) {
+ f = create(args.ofile, OWRITE, 0666);
+ if(f < 0) {
+ fprint(2, "sort: create %s: %r\n", args.ofile);
+ done("create");
+ }
+ }
+
+ Binit(&bbuf, f, OWRITE);
+ if(args.ntemp) {
+ tempout();
+ mergeout(&bbuf);
+ } else {
+ printout(&bbuf);
+ }
+ Bterm(&bbuf);
+ done(0);
+}
+
+void
+dofile(Biobuf *b)
+{
+ Line *l, *ol;
+ int n;
+
+ if(args.cflag) {
+ ol = newline(b);
+ if(ol == 0)
+ return;
+ for(;;) {
+ l = newline(b);
+ if(l == 0)
+ break;
+ n = kcmp(ol->key, l->key);
+ if(n > 0 || (n == 0 && args.uflag)) {
+ fprint(2, "sort: -c file not in sort\n"); /**/
+ done("order");
+ }
+ free(ol->key);
+ free(ol);
+ ol = l;
+ }
+ return;
+ }
+
+ if(args.linep == 0) {
+ args.linep = malloc(args.mline * sizeof(args.linep));
+ if(args.linep == 0)
+ nomem();
+ }
+ for(;;) {
+ l = newline(b);
+ if(l == 0)
+ break;
+ if(args.nline >= args.mline)
+ tempout();
+ args.linep[args.nline] = l;
+ args.nline++;
+ args.lineno++;
+ }
+}
+
+void
+notifyf(void *a, char *s)
+{
+ USED(a);
+ if(strcmp(s, "interrupt") == 0)
+ done(0);
+ if(strcmp(s, "hangup") == 0)
+ done(0);
+ if(strcmp(s, "kill") == 0)
+ done(0);
+ if(strncmp(s, "sys: write on closed pipe", 25) == 0)
+ done(0);
+ fprint(2, "sort: note: %s\n", s);
+ abort();
+}
+
+Line*
+newline(Biobuf *b)
+{
+ Line *l;
+ char *p;
+ int n, c;
+
+ p = Brdline(b, '\n');
+ n = Blinelen(b);
+ if(p == 0) {
+ if(n == 0)
+ return 0;
+ l = 0;
+ for(n=0;;) {
+ if((n & 31) == 0) {
+ l = realloc(l, sizeof(Line) +
+ (n+31)*sizeof(l->line[0]));
+ if(l == 0)
+ nomem();
+ }
+ c = Bgetc(b);
+ if(c < 0) {
+ fprint(2, "sort: newline added\n");
+ c = '\n';
+ }
+ l->line[n++] = c;
+ if(c == '\n')
+ break;
+ }
+ l->llen = n;
+ buildkey(l);
+ return l;
+ }
+ l = malloc(sizeof(Line) +
+ (n-1)*sizeof(l->line[0]));
+ if(l == 0)
+ nomem();
+ l->llen = n;
+ memmove(l->line, p, n);
+ buildkey(l);
+ return l;
+}
+
+void
+lineout(Biobuf *b, Line *l)
+{
+ int n, m;
+
+ n = l->llen;
+ m = Bwrite(b, l->line, n);
+ if(n != m)
+ exits("write");
+}
+
+void
+tempout(void)
+{
+ long n;
+ Line **lp, *l;
+ char *tf;
+ int f;
+ Biobuf tb;
+
+ sort4(args.linep, args.nline);
+ tf = tempfile(args.ntemp);
+ args.ntemp++;
+ f = create(tf, OWRITE, 0666);
+ if(f < 0) {
+ fprint(2, "sort: create %s: %r\n", tf);
+ done("create");
+ }
+
+ Binit(&tb, f, OWRITE);
+ lp = args.linep;
+ for(n=args.nline; n>0; n--) {
+ l = *lp++;
+ lineout(&tb, l);
+ free(l->key);
+ free(l);
+ }
+ args.nline = 0;
+ Bterm(&tb);
+ close(f);
+}
+
+void
+done(char *xs)
+{
+ int i;
+
+ for(i=0; i<args.ntemp; i++)
+ remove(tempfile(i));
+ exits(xs);
+}
+
+void
+nomem(void)
+{
+ fprint(2, "sort: out of memory\n");
+ done("mem");
+}
+
+char*
+tempfile(int n)
+{
+ static char file[100];
+ static uint pid;
+ char *dir;
+
+ dir = "/tmp";
+ if(args.tname)
+ dir = args.tname;
+ if(strlen(dir) >= nelem(file)-20) {
+ fprint(2, "temp file directory name is too long: %s\n", dir);
+ done("tdir");
+ }
+
+ if(pid == 0) {
+ pid = getpid();
+ if(pid == 0) {
+ pid = time(0);
+ if(pid == 0)
+ pid = 1;
+ }
+ }
+
+ sprint(file, "%s/sort.%.4d.%.4d", dir, pid%10000, n);
+ return file;
+}
+
+void
+mergeout(Biobuf *b)
+{
+ int n, i, f;
+ char *tf;
+ Biobuf tb;
+
+ for(i=0; i<args.ntemp; i+=n) {
+ n = args.ntemp - i;
+ if(n > Nmerge) {
+ tf = tempfile(args.ntemp);
+ args.ntemp++;
+ f = create(tf, OWRITE, 0666);
+ if(f < 0) {
+ fprint(2, "sort: create %s: %r\n", tf);
+ done("create");
+ }
+ Binit(&tb, f, OWRITE);
+
+ n = Nmerge;
+ mergefiles(i, n, &tb);
+
+ Bterm(&tb);
+ close(f);
+ } else
+ mergefiles(i, n, b);
+ }
+}
+
+void
+mergefiles(int t, int n, Biobuf *b)
+{
+ Merge *m, *mp, **mmp;
+ Key *ok;
+ Line *l;
+ char *tf;
+ int i, f, nn;
+
+ mmp = malloc(n*sizeof(*mmp));
+ mp = malloc(n*sizeof(*mp));
+ if(mmp == 0 || mp == 0)
+ nomem();
+
+ nn = 0;
+ m = mp;
+ for(i=0; i<n; i++,m++) {
+ tf = tempfile(t+i);
+ f = open(tf, OREAD);
+ if(f < 0) {
+ fprint(2, "sort: reopen %s: %r\n", tf);
+ done("open");
+ }
+ m->fd = f;
+ Binit(&m->b, f, OREAD);
+ mmp[nn] = m;
+
+ l = newline(&m->b);
+ if(l == 0)
+ continue;
+ nn++;
+ m->line = l;
+ m->key = l->key;
+ }
+
+ ok = 0;
+ for(;;) {
+ sort4(mmp, nn);
+ m = *mmp;
+ if(nn == 0)
+ break;
+ for(;;) {
+ l = m->line;
+ if(args.uflag && ok && kcmp(ok, l->key) == 0) {
+ free(l->key);
+ free(l);
+ } else {
+ lineout(b, l);
+ if(ok)
+ free(ok);
+ ok = l->key;
+ free(l);
+ }
+
+ l = newline(&m->b);
+ if(l == 0) {
+ nn--;
+ mmp[0] = mmp[nn];
+ break;
+ }
+ m->line = l;
+ m->key = l->key;
+ if(nn > 1 && kcmp(mmp[0]->key, mmp[1]->key) > 0)
+ break;
+ }
+ }
+ if(ok)
+ free(ok);
+
+ m = mp;
+ for(i=0; i<n; i++,m++) {
+ Bterm(&m->b);
+ close(m->fd);
+ }
+
+ free(mp);
+ free(mmp);
+}
+
+int
+kcmp(Key *ka, Key *kb)
+{
+ int n, m;
+
+ /*
+ * set n to length of smaller key
+ */
+ n = ka->klen;
+ m = kb->klen;
+ if(n > m)
+ n = m;
+ return memcmp(ka->key, kb->key, n);
+}
+
+void
+printout(Biobuf *b)
+{
+ long n;
+ Line **lp, *l;
+ Key *ok;
+
+ sort4(args.linep, args.nline);
+ lp = args.linep;
+ ok = 0;
+ for(n=args.nline; n>0; n--) {
+ l = *lp++;
+ if(args.uflag && ok && kcmp(ok, l->key) == 0)
+ continue;
+ lineout(b, l);
+ ok = l->key;
+ }
+}
+
+void
+setfield(int n, int c)
+{
+ Field *f;
+
+ f = &args.field[n];
+ switch(c) {
+ default:
+ fprint(2, "sort: unknown option: field.%C\n", c);
+ done("option");
+ case 'b': /* skip blanks */
+ f->flags |= Bflag;
+ break;
+ case 'd': /* directory order */
+ f->flags |= Dflag;
+ break;
+ case 'f': /* fold case */
+ f->flags |= Fflag;
+ break;
+ case 'g': /* floating point -n case */
+ f->flags |= Gflag;
+ break;
+ case 'i': /* ignore non-ascii */
+ f->flags |= Iflag;
+ break;
+ case 'M': /* month */
+ f->flags |= Mflag;
+ break;
+ case 'n': /* numbers */
+ f->flags |= Nflag;
+ break;
+ case 'r': /* reverse */
+ f->flags |= Rflag;
+ break;
+ case 'w': /* ignore white */
+ f->flags |= Wflag;
+ break;
+ }
+}
+
+void
+dofield(char *s, int *n1, int *n2, int off1, int off2)
+{
+ int c, n;
+
+ c = *s++;
+ if(c >= '0' && c <= '9') {
+ n = 0;
+ while(c >= '0' && c <= '9') {
+ n = n*10 + (c-'0');
+ c = *s++;
+ }
+ n -= off1; /* posix committee: rot in hell */
+ if(n < 0) {
+ fprint(2, "sort: field offset must be positive\n");
+ done("option");
+ }
+ *n1 = n;
+ }
+ if(c == '.') {
+ c = *s++;
+ if(c >= '0' && c <= '9') {
+ n = 0;
+ while(c >= '0' && c <= '9') {
+ n = n*10 + (c-'0');
+ c = *s++;
+ }
+ n -= off2;
+ if(n < 0) {
+ fprint(2, "sort: character offset must be positive\n");
+ done("option");
+ }
+ *n2 = n;
+ }
+ }
+ while(c != 0) {
+ setfield(args.nfield, c);
+ c = *s++;
+ }
+}
+
+void
+printargs(void)
+{
+ int i, n;
+ Field *f;
+ char *prefix;
+
+ fprint(2, "sort");
+ for(i=0; i<=args.nfield; i++) {
+ f = &args.field[i];
+ prefix = " -";
+ if(i) {
+ n = f->beg1;
+ if(n >= 0)
+ fprint(2, " +%d", n);
+ else
+ fprint(2, " +*");
+ n = f->beg2;
+ if(n >= 0)
+ fprint(2, ".%d", n);
+ else
+ fprint(2, ".*");
+
+ if(f->flags & B1flag)
+ fprint(2, "b");
+
+ n = f->end1;
+ if(n >= 0)
+ fprint(2, " -%d", n);
+ else
+ fprint(2, " -*");
+ n = f->end2;
+ if(n >= 0)
+ fprint(2, ".%d", n);
+ else
+ fprint(2, ".*");
+ prefix = "";
+ }
+ if(f->flags & Bflag)
+ fprint(2, "%sb", prefix);
+ if(f->flags & Dflag)
+ fprint(2, "%sd", prefix);
+ if(f->flags & Fflag)
+ fprint(2, "%sf", prefix);
+ if(f->flags & Gflag)
+ fprint(2, "%sg", prefix);
+ if(f->flags & Iflag)
+ fprint(2, "%si", prefix);
+ if(f->flags & Mflag)
+ fprint(2, "%sM", prefix);
+ if(f->flags & Nflag)
+ fprint(2, "%sn", prefix);
+ if(f->flags & Rflag)
+ fprint(2, "%sr", prefix);
+ if(f->flags & Wflag)
+ fprint(2, "%sw", prefix);
+ }
+ if(args.cflag)
+ fprint(2, " -c");
+ if(args.uflag)
+ fprint(2, " -u");
+ if(args.ofile)
+ fprint(2, " -o %s", args.ofile);
+ if(args.mline != Nline)
+ fprint(2, " -l %ld", args.mline);
+ fprint(2, "\n");
+}
+
+void
+newfield(void)
+{
+ int n;
+ Field *f;
+
+ n = args.nfield + 1;
+ if(n >= Nfield) {
+ fprint(2, "sort: too many fields specified\n");
+ done("option");
+ }
+ args.nfield = n;
+ f = &args.field[n];
+ f->beg1 = -1;
+ f->beg2 = -1;
+ f->end1 = -1;
+ f->end2 = -1;
+}
+
+void
+doargs(int argc, char *argv[])
+{
+ int i, c, hadplus;
+ char *s, *p, *q;
+ Field *f;
+
+ hadplus = 0;
+ args.mline = Nline;
+ for(i=1; i<argc; i++) {
+ s = argv[i];
+ c = *s++;
+ if(c == '-') {
+ c = *s;
+ if(c == 0) /* forced end of arg marker */
+ break;
+ argv[i] = 0; /* clobber args processed */
+ if(c == '.' || (c >= '0' && c <= '9')) {
+ if(!hadplus)
+ newfield();
+ f = &args.field[args.nfield];
+ dofield(s, &f->end1, &f->end2, 0, 0);
+ hadplus = 0;
+ continue;
+ }
+
+ while(c = *s++)
+ switch(c) {
+ case '-': /* end of options */
+ i = argc;
+ continue;
+ case 'T': /* temp directory */
+ if(*s == 0) {
+ i++;
+ if(i < argc) {
+ args.tname = argv[i];
+ argv[i] = 0;
+ }
+ } else
+ args.tname = s;
+ s = strchr(s, 0);
+ break;
+ case 'o': /* output file */
+ if(*s == 0) {
+ i++;
+ if(i < argc) {
+ args.ofile = argv[i];
+ argv[i] = 0;
+ }
+ } else
+ args.ofile = s;
+ s = strchr(s, 0);
+ break;
+ case 'k': /* posix key (what were they thinking?) */
+ p = 0;
+ if(*s == 0) {
+ i++;
+ if(i < argc) {
+ p = argv[i];
+ argv[i] = 0;
+ }
+ } else
+ p = s;
+ s = strchr(s, 0);
+ if(p == 0)
+ break;
+
+ newfield();
+ q = strchr(p, ',');
+ if(q)
+ *q++ = 0;
+ f = &args.field[args.nfield];
+ dofield(p, &f->beg1, &f->beg2, 1, 1);
+ if(f->flags & Bflag) {
+ f->flags |= B1flag;
+ f->flags &= ~Bflag;
+ }
+ if(q) {
+ dofield(q, &f->end1, &f->end2, 1, 0);
+ if(f->end2 <= 0)
+ f->end1++;
+ }
+ hadplus = 0;
+ break;
+ case 't': /* tab character */
+ if(*s == 0) {
+ i++;
+ if(i < argc) {
+ chartorune(&args.tabchar, argv[i]);
+ argv[i] = 0;
+ }
+ } else
+ s += chartorune(&args.tabchar, s);
+ if(args.tabchar == '\n') {
+ fprint(2, "aw come on, rob\n");
+ done("rob");
+ }
+ break;
+ case 'c': /* check order */
+ args.cflag = 1;
+ break;
+ case 'u': /* unique */
+ args.uflag = 1;
+ break;
+ case 'v': /* debugging noise */
+ args.vflag = 1;
+ break;
+ case 'l':
+ if(*s == 0) {
+ i++;
+ if(i < argc) {
+ args.mline = atol(argv[i]);
+ argv[i] = 0;
+ }
+ } else
+ args.mline = atol(s);
+ s = strchr(s, 0);
+ break;
+
+ case 'M': /* month */
+ case 'b': /* skip blanks */
+ case 'd': /* directory order */
+ case 'f': /* fold case */
+ case 'g': /* floating numbers */
+ case 'i': /* ignore non-ascii */
+ case 'n': /* numbers */
+ case 'r': /* reverse */
+ case 'w': /* ignore white */
+ if(args.nfield > 0)
+ fprint(2, "sort: global field set after -k\n");
+ setfield(0, c);
+ break;
+ case 'm':
+ /* option m silently ignored but required by posix */
+ break;
+ default:
+ fprint(2, "sort: unknown option: -%C\n", c);
+ done("option");
+ }
+ continue;
+ }
+ if(c == '+') {
+ argv[i] = 0; /* clobber args processed */
+ c = *s;
+ if(c == '.' || (c >= '0' && c <= '9')) {
+ newfield();
+ f = &args.field[args.nfield];
+ dofield(s, &f->beg1, &f->beg2, 0, 0);
+ if(f->flags & Bflag) {
+ f->flags |= B1flag;
+ f->flags &= ~Bflag;
+ }
+ hadplus = 1;
+ continue;
+ }
+ fprint(2, "sort: unknown option: +%C\n", c);
+ done("option");
+ }
+ args.nfile++;
+ }
+
+ for(i=0; i<=args.nfield; i++) {
+ f = &args.field[i];
+
+ /*
+ * global options apply to fields that
+ * specify no options
+ */
+ if(f->flags == 0) {
+ f->flags = args.field[0].flags;
+ if(args.field[0].flags & Bflag)
+ f->flags |= B1flag;
+ }
+
+
+ /*
+ * build buildkey specification
+ */
+ switch(f->flags & ~(Bflag|B1flag)) {
+ default:
+ fprint(2, "sort: illegal combination of flags: %lx\n", f->flags);
+ done("option");
+ case 0:
+ f->dokey = dokey_;
+ break;
+ case Rflag:
+ f->dokey = dokey_r;
+ break;
+ case Gflag:
+ case Nflag:
+ case Gflag|Nflag:
+ case Gflag|Rflag:
+ case Nflag|Rflag:
+ case Gflag|Nflag|Rflag:
+ f->dokey = dokey_gn;
+ break;
+ case Mflag:
+ case Mflag|Rflag:
+ f->dokey = dokey_m;
+ makemapm(f);
+ break;
+ case Dflag:
+ case Dflag|Fflag:
+ case Dflag|Fflag|Iflag:
+ case Dflag|Fflag|Iflag|Rflag:
+ case Dflag|Fflag|Iflag|Rflag|Wflag:
+ case Dflag|Fflag|Iflag|Wflag:
+ case Dflag|Fflag|Rflag:
+ case Dflag|Fflag|Rflag|Wflag:
+ case Dflag|Fflag|Wflag:
+ case Dflag|Iflag:
+ case Dflag|Iflag|Rflag:
+ case Dflag|Iflag|Rflag|Wflag:
+ case Dflag|Iflag|Wflag:
+ case Dflag|Rflag:
+ case Dflag|Rflag|Wflag:
+ case Dflag|Wflag:
+ case Fflag:
+ case Fflag|Iflag:
+ case Fflag|Iflag|Rflag:
+ case Fflag|Iflag|Rflag|Wflag:
+ case Fflag|Iflag|Wflag:
+ case Fflag|Rflag:
+ case Fflag|Rflag|Wflag:
+ case Fflag|Wflag:
+ case Iflag:
+ case Iflag|Rflag:
+ case Iflag|Rflag|Wflag:
+ case Iflag|Wflag:
+ case Wflag:
+ f->dokey = dokey_dfi;
+ makemapd(f);
+ break;
+ }
+ }
+
+ /*
+ * random spot checks
+ */
+ if(args.nfile > 1 && args.cflag) {
+ fprint(2, "sort: -c can have at most one input file\n");
+ done("option");
+ }
+ return;
+}
+
+uchar*
+skip(uchar *l, int n1, int n2, int bflag, int endfield)
+{
+ int i, c, tc;
+ Rune r;
+
+ if(endfield && n1 < 0)
+ return 0;
+
+ c = *l++;
+ tc = args.tabchar;
+ if(tc) {
+ if(tc < Runeself) {
+ for(i=n1; i>0; i--) {
+ while(c != tc) {
+ if(c == '\n')
+ return 0;
+ c = *l++;
+ }
+ if(!(endfield && i == 1))
+ c = *l++;
+ }
+ } else {
+ l--;
+ l += chartorune(&r, (char*)l);
+ for(i=n1; i>0; i--) {
+ while(r != tc) {
+ if(r == '\n')
+ return 0;
+ l += chartorune(&r, (char*)l);
+ }
+ if(!(endfield && i == 1))
+ l += chartorune(&r, (char*)l);
+ }
+ c = r;
+ }
+ } else {
+ for(i=n1; i>0; i--) {
+ while(c == ' ' || c == '\t')
+ c = *l++;
+ while(c != ' ' && c != '\t') {
+ if(c == '\n')
+ return 0;
+ c = *l++;
+ }
+ }
+ }
+
+ if(bflag)
+ while(c == ' ' || c == '\t')
+ c = *l++;
+
+ l--;
+ for(i=n2; i>0; i--) {
+ c = *l;
+ if(c < Runeself) {
+ if(c == '\n')
+ return 0;
+ l++;
+ continue;
+ }
+ l += chartorune(&r, (char*)l);
+ }
+ return l;
+}
+
+void
+dokey_gn(Key *k, uchar *lp, uchar *lpe, Field *f)
+{
+ uchar *kp;
+ int c, cl, dp;
+ int state, nzero, exp, expsign, rflag;
+
+ cl = k->klen + 3;
+ kp = k->key + cl; /* skip place for sign, exponent[2] */
+
+ nzero = 0; /* number of trailing zeros */
+ exp = 0; /* value of the exponent */
+ expsign = 0; /* sign of the exponent */
+ dp = 0x4040; /* location of decimal point */
+ rflag = f->flags&Rflag; /* xor of rflag and - sign */
+ state = NSstart;
+
+ for(;; lp++) {
+ if(lp >= lpe)
+ break;
+ c = *lp;
+
+ if(c == ' ' || c == '\t') {
+ switch(state) {
+ case NSstart:
+ case NSsign:
+ continue;
+ }
+ break;
+ }
+ if(c == '+' || c == '-') {
+ switch(state) {
+ case NSstart:
+ state = NSsign;
+ if(c == '-')
+ rflag = !rflag;
+ continue;
+ case NSexp:
+ state = NSexpsign;
+ if(c == '-')
+ expsign = 1;
+ continue;
+ }
+ break;
+ }
+ if(c == '0') {
+ switch(state) {
+ case NSdigit:
+ if(rflag)
+ c = ~c;
+ *kp++ = c;
+ cl++;
+ nzero++;
+ dp++;
+ state = NSdigit;
+ continue;
+ case NSfract:
+ if(rflag)
+ c = ~c;
+ *kp++ = c;
+ cl++;
+ nzero++;
+ state = NSfract;
+ continue;
+ case NSstart:
+ case NSsign:
+ case NSzero:
+ state = NSzero;
+ continue;
+ case NSzerofract:
+ case NSpoint:
+ dp--;
+ state = NSzerofract;
+ continue;
+ case NSexpsign:
+ case NSexp:
+ case NSexpdigit:
+ exp = exp*10 + (c - '0');
+ state = NSexpdigit;
+ continue;
+ }
+ break;
+ }
+ if(c >= '1' && c <= '9') {
+ switch(state) {
+ case NSzero:
+ case NSstart:
+ case NSsign:
+ case NSdigit:
+ if(rflag)
+ c = ~c;
+ *kp++ = c;
+ cl++;
+ nzero = 0;
+ dp++;
+ state = NSdigit;
+ continue;
+ case NSzerofract:
+ case NSpoint:
+ case NSfract:
+ if(rflag)
+ c = ~c;
+ *kp++ = c;
+ cl++;
+ nzero = 0;
+ state = NSfract;
+ continue;
+ case NSexpsign:
+ case NSexp:
+ case NSexpdigit:
+ exp = exp*10 + (c - '0');
+ state = NSexpdigit;
+ continue;
+ }
+ break;
+ }
+ if(c == '.') {
+ switch(state) {
+ case NSstart:
+ case NSsign:
+ state = NSpoint;
+ continue;
+ case NSzero:
+ state = NSzerofract;
+ continue;
+ case NSdigit:
+ state = NSfract;
+ continue;
+ }
+ break;
+ }
+ if((f->flags & Gflag) && (c == 'e' || c == 'E')) {
+ switch(state) {
+ case NSdigit:
+ case NSfract:
+ state = NSexp;
+ continue;
+ }
+ break;
+ }
+ break;
+ }
+
+ switch(state) {
+ /*
+ * result is zero
+ */
+ case NSstart:
+ case NSsign:
+ case NSzero:
+ case NSzerofract:
+ case NSpoint:
+ kp = k->key + k->klen;
+ k->klen += 2;
+ kp[0] = 0x20; /* between + and - */
+ kp[1] = 0;
+ return;
+ /*
+ * result has exponent
+ */
+ case NSexpsign:
+ case NSexp:
+ case NSexpdigit:
+ if(expsign)
+ exp = -exp;
+ dp += exp;
+
+ /*
+ * result is fixed point number
+ */
+ case NSdigit:
+ case NSfract:
+ kp -= nzero;
+ cl -= nzero;
+ break;
+ }
+
+ /*
+ * end of number
+ */
+ c = 0;
+ if(rflag)
+ c = ~c;
+ *kp = c;
+
+ /*
+ * sign and exponent
+ */
+ c = 0x30;
+ if(rflag) {
+ c = 0x10;
+ dp = ~dp;
+ }
+ kp = k->key + k->klen;
+ kp[0] = c;
+ kp[1] = (dp >> 8);
+ kp[2] = dp;
+ k->klen = cl+1;
+}
+
+void
+dokey_m(Key *k, uchar *lp, uchar *lpe, Field *f)
+{
+ uchar *kp;
+ Rune r, place[3];
+ int c, cl, pc;
+ int rflag;
+
+ rflag = f->flags&Rflag;
+ pc = 0;
+
+ cl = k->klen;
+ kp = k->key + cl;
+
+ for(;;) {
+ /*
+ * get the character
+ */
+ if(lp >= lpe)
+ break;
+ c = *lp;
+ if(c >= Runeself) {
+ lp += chartorune(&r, (char*)lp);
+ c = r;
+ } else
+ lp++;
+
+ if(c < nelem(f->mapto)) {
+ c = f->mapto[c];
+ if(c == 0)
+ continue;
+ }
+ place[pc++] = c;
+ if(pc < 3)
+ continue;
+ for(c=11; c>=0; c--)
+ if(memcmp(month[c], place, sizeof(place)) == 0)
+ break;
+ c += 10;
+ if(rflag)
+ c = ~c;
+ *kp++ = c;
+ cl++;
+ break;
+ }
+
+ c = 0;
+ if(rflag)
+ c = ~c;
+ *kp = c;
+ k->klen = cl+1;
+}
+
+void
+dokey_dfi(Key *k, uchar *lp, uchar *lpe, Field *f)
+{
+ uchar *kp;
+ Rune r;
+ int c, cl, n, rflag;
+
+ cl = k->klen;
+ kp = k->key + cl;
+ rflag = f->flags & Rflag;
+
+ for(;;) {
+ /*
+ * get the character
+ */
+ if(lp >= lpe)
+ break;
+ c = *lp;
+ if(c >= Runeself) {
+ lp += chartorune(&r, (char*)lp);
+ c = r;
+ } else
+ lp++;
+
+ /*
+ * do the various mappings.
+ * the common case is handled
+ * completely by the table.
+ */
+ if(c != 0 && c < Runeself) {
+ c = f->mapto[c];
+ if(c) {
+ *kp++ = c;
+ cl++;
+ }
+ continue;
+ }
+
+ /*
+ * for characters out of range,
+ * the table does not do Rflag.
+ * ignore is based on mapto[255]
+ */
+ if(c != 0 && c < nelem(f->mapto)) {
+ c = f->mapto[c];
+ if(c == 0)
+ continue;
+ } else
+ if(f->mapto[nelem(f->mapto)-1] == 0)
+ continue;
+
+ /*
+ * put it in the key
+ */
+ r = c;
+ n = runetochar((char*)kp, &r);
+ kp += n;
+ cl += n;
+ if(rflag)
+ while(n > 0) {
+ kp[-n] = ~kp[-n];
+ n--;
+ }
+ }
+
+ /*
+ * end of key
+ */
+ k->klen = cl+1;
+ if(rflag) {
+ *kp = ~0;
+ return;
+ }
+ *kp = 0;
+}
+
+void
+dokey_r(Key *k, uchar *lp, uchar *lpe, Field *f)
+{
+ int cl, n;
+ uchar *kp;
+
+ USED(f);
+ n = lpe - lp;
+ if(n < 0)
+ n = 0;
+ cl = k->klen;
+ kp = k->key + cl;
+ k->klen = cl+n+1;
+
+ lpe -= 3;
+ while(lp < lpe) {
+ kp[0] = ~lp[0];
+ kp[1] = ~lp[1];
+ kp[2] = ~lp[2];
+ kp[3] = ~lp[3];
+ kp += 4;
+ lp += 4;
+ }
+
+ lpe += 3;
+ while(lp < lpe)
+ *kp++ = ~*lp++;
+ *kp = ~0;
+}
+
+void
+dokey_(Key *k, uchar *lp, uchar *lpe, Field *f)
+{
+ int n, cl;
+ uchar *kp;
+
+ USED(f);
+ n = lpe - lp;
+ if(n < 0)
+ n = 0;
+ cl = k->klen;
+ kp = k->key + cl;
+ k->klen = cl+n+1;
+ memmove(kp, lp, n);
+ kp[n] = 0;
+}
+
+void
+buildkey(Line *l)
+{
+ Key *k;
+ uchar *lp, *lpe;
+ int ll, kl, cl, i, n;
+ Field *f;
+
+ ll = l->llen - 1;
+ kl = 0; /* allocated length */
+ cl = 0; /* current length */
+ k = 0;
+
+ for(i=1; i<=args.nfield; i++) {
+ f = &args.field[i];
+ lp = skip(l->line, f->beg1, f->beg2, f->flags&B1flag, 0);
+ if(lp == 0)
+ lp = l->line + ll;
+ lpe = skip(l->line, f->end1, f->end2, f->flags&Bflag, 1);
+ if(lpe == 0)
+ lpe = l->line + ll;
+ n = (lpe - lp) + 1;
+ if(n <= 0)
+ n = 1;
+ if(cl+(n+4) > kl) {
+ kl = cl+(n+4);
+ k = realloc(k, sizeof(Key) +
+ (kl-1)*sizeof(k->key[0]));
+ if(k == 0)
+ nomem();
+ }
+ k->klen = cl;
+ (*f->dokey)(k, lp, lpe, f);
+ cl = k->klen;
+ }
+
+ /*
+ * global comparisons
+ */
+ if(!(args.uflag && cl > 0)) {
+ f = &args.field[0];
+ if(cl+(ll+4) > kl) {
+ kl = cl+(ll+4);
+ k = realloc(k, sizeof(Key) +
+ (kl-1)*sizeof(k->key[0]));
+ if(k == 0)
+ nomem();
+ }
+ k->klen = cl;
+ (*f->dokey)(k, l->line, l->line+ll, f);
+ cl = k->klen;
+ }
+
+ l->key = k;
+ k->klen = cl;
+
+ if(args.vflag) {
+ write(2, l->line, l->llen);
+ for(i=0; i<k->klen; i++) {
+ fprint(2, " %.2x", k->key[i]);
+ if(k->key[i] == 0x00 || k->key[i] == 0xff)
+ fprint(2, "\n");
+ }
+ }
+}
+
+void
+makemapm(Field *f)
+{
+ int i, c;
+
+ for(i=0; i<nelem(f->mapto); i++) {
+ c = 1;
+ if(i == ' ' || i == '\t')
+ c = 0;
+ if(i >= 'a' && i <= 'z')
+ c = i + ('A' - 'a');
+ if(i >= 'A' && i <= 'Z')
+ c = i;
+ f->mapto[i] = c;
+ if(args.vflag) {
+ if((i & 15) == 0)
+ fprint(2, " ");
+ fprint(2, " %.2x", c);
+ if((i & 15) == 15)
+ fprint(2, "\n");
+ }
+ }
+}
+
+void
+makemapd(Field *f)
+{
+ int i, j, c;
+
+ for(i=0; i<nelem(f->mapto); i++) {
+ c = i;
+ if(f->flags & Iflag)
+ if(c < 040 || c > 0176)
+ c = -1;
+ if((f->flags & Wflag) && c >= 0)
+ if(c == ' ' || c == '\t')
+ c = -1;
+ if((f->flags & Dflag) && c >= 0)
+ if(!(c == ' ' || c == '\t' ||
+ (c >= 'a' && c <= 'z') ||
+ (c >= 'A' && c <= 'Z') ||
+ (c >= '0' && c <= '9'))) {
+ for(j=0; latinmap[j]; j+=3)
+ if(c == latinmap[j+0] ||
+ c == latinmap[j+1])
+ break;
+ if(latinmap[j] == 0)
+ c = -1;
+ }
+ if((f->flags & Fflag) && c >= 0) {
+ if(c >= 'a' && c <= 'z')
+ c += 'A' - 'a';
+ for(j=0; latinmap[j]; j+=3)
+ if(c == latinmap[j+0] ||
+ c == latinmap[j+1]) {
+ c = latinmap[j+2];
+ break;
+ }
+ }
+ if((f->flags & Rflag) && c >= 0 && i > 0 && i < Runeself)
+ c = ~c & 0xff;
+ if(c < 0)
+ c = 0;
+ f->mapto[i] = c;
+ if(args.vflag) {
+ if((i & 15) == 0)
+ fprint(2, " ");
+ fprint(2, " %.2x", c);
+ if((i & 15) == 15)
+ fprint(2, "\n");
+ }
+ }
+}
+
+int latinmap[] =
+{
+/* lcase ucase fold */
+ 0xe0, 0xc0, 0x41, /* L'à', L'À', L'A', */
+ 0xe1, 0xc1, 0x41, /* L'á', L'Á', L'A', */
+ 0xe2, 0xc2, 0x41, /* L'â', L'Â', L'A', */
+ 0xe4, 0xc4, 0x41, /* L'ä', L'Ä', L'A', */
+ 0xe3, 0xc3, 0x41, /* L'ã', L'Ã', L'A', */
+ 0xe5, 0xc5, 0x41, /* L'å', L'Å', L'A', */
+ 0xe8, 0xc8, 0x45, /* L'è', L'È', L'E', */
+ 0xe9, 0xc9, 0x45, /* L'é', L'É', L'E', */
+ 0xea, 0xca, 0x45, /* L'ê', L'Ê', L'E', */
+ 0xeb, 0xcb, 0x45, /* L'ë', L'Ë', L'E', */
+ 0xec, 0xcc, 0x49, /* L'ì', L'Ì', L'I', */
+ 0xed, 0xcd, 0x49, /* L'í', L'Í', L'I', */
+ 0xee, 0xce, 0x49, /* L'î', L'Î', L'I', */
+ 0xef, 0xcf, 0x49, /* L'ï', L'Ï', L'I', */
+ 0xf2, 0xd2, 0x4f, /* L'ò', L'Ò', L'O', */
+ 0xf3, 0xd3, 0x4f, /* L'ó', L'Ó', L'O', */
+ 0xf4, 0xd4, 0x4f, /* L'ô', L'Ô', L'O', */
+ 0xf6, 0xd6, 0x4f, /* L'ö', L'Ö', L'O', */
+ 0xf5, 0xd5, 0x4f, /* L'õ', L'Õ', L'O', */
+ 0xf8, 0xd8, 0x4f, /* L'ø', L'Ø', L'O', */
+ 0xf9, 0xd9, 0x55, /* L'ù', L'Ù', L'U', */
+ 0xfa, 0xda, 0x55, /* L'ú', L'Ú', L'U', */
+ 0xfb, 0xdb, 0x55, /* L'û', L'Û', L'U', */
+ 0xfc, 0xdc, 0x55, /* L'ü', L'Ü', L'U', */
+ 0xe6, 0xc6, 0x41, /* L'æ', L'Æ', L'A', */
+ 0xf0, 0xd0, 0x44, /* L'ð', L'Ð', L'D', */
+ 0xf1, 0xd1, 0x4e, /* L'ñ', L'Ñ', L'N', */
+ 0xfd, 0xdd, 0x59, /* L'ý', L'Ý', L'Y', */
+ 0xe7, 0xc7, 0x43, /* L'ç', L'Ç', L'C', */
+ 0,
+};
+
+Rune LJAN[] = { 'J', 'A', 'N', 0 };
+Rune LFEB[] = { 'F', 'E', 'B', 0 };
+Rune LMAR[] = { 'M', 'A', 'R', 0 };
+Rune LAPR[] = { 'A', 'P', 'R', 0 };
+Rune LMAY[] = { 'M', 'A', 'Y', 0 };
+Rune LJUN[] = { 'J', 'U', 'N', 0 };
+Rune LJUL[] = { 'J', 'U', 'L', 0 };
+Rune LAUG[] = { 'A', 'U', 'G', 0 };
+Rune LSEP[] = { 'S', 'E', 'P', 0 };
+Rune LOCT[] = { 'O', 'C', 'T', 0 };
+Rune LNOV[] = { 'N', 'O', 'V', 0 };
+Rune LDEC[] = { 'D', 'E', 'C', 0 };
+
+Rune* month[12] =
+{
+ LJAN,
+ LFEB,
+ LMAR,
+ LAPR,
+ LMAY,
+ LJUN,
+ LJUL,
+ LAUG,
+ LSEP,
+ LOCT,
+ LNOV,
+ LDEC,
+};
+
+/************** radix sort ***********/
+
+enum
+{
+ Threshold = 14,
+};
+
+void rsort4(Key***, ulong, int);
+void bsort4(Key***, ulong, int);
+
+void
+sort4(void *a, ulong n)
+{
+ if(n > Threshold)
+ rsort4((Key***)a, n, 0);
+ else
+ bsort4((Key***)a, n, 0);
+}
+
+void
+rsort4(Key ***a, ulong n, int b)
+{
+ Key ***ea, ***t, ***u, **t1, **u1, *k;
+ Key ***part[257];
+ static long count[257];
+ long clist[257+257], *cp, *cp1;
+ int c, lowc, higc;
+
+ /*
+ * pass 1 over all keys,
+ * count the number of each key[b].
+ * find low count and high count.
+ */
+ lowc = 256;
+ higc = 0;
+ ea = a+n;
+ for(t=a; t<ea; t++) {
+ k = **t;
+ n = k->klen;
+ if(b >= n) {
+ count[256]++;
+ continue;
+ }
+ c = k->key[b];
+ n = count[c]++;
+ if(n == 0) {
+ if(c < lowc)
+ lowc = c;
+ if(c > higc)
+ higc = c;
+ }
+ }
+
+ /*
+ * pass 2 over all counts,
+ * put partition pointers in part[c].
+ * save compacted indexes and counts
+ * in clist[].
+ */
+ t = a;
+ n = count[256];
+ clist[0] = n;
+ part[256] = t;
+ t += n;
+
+ cp1 = clist+1;
+ cp = count+lowc;
+ for(c=lowc; c<=higc; c++,cp++) {
+ n = *cp;
+ if(n) {
+ cp1[0] = n;
+ cp1[1] = c;
+ cp1 += 2;
+ part[c] = t;
+ t += n;
+ }
+ }
+ *cp1 = 0;
+
+ /*
+ * pass 3 over all counts.
+ * chase lowest pointer in each partition
+ * around a permutation until it comes
+ * back and is stored where it started.
+ * static array, count[], should be
+ * reduced to zero entries except maybe
+ * count[256].
+ */
+ for(cp1=clist+1; cp1[0]; cp1+=2) {
+ c = cp1[1];
+ cp = count+c;
+ while(*cp) {
+ t1 = *part[c];
+ for(;;) {
+ k = *t1;
+ n = 256;
+ if(b < k->klen)
+ n = k->key[b];
+ u = part[n]++;
+ count[n]--;
+ u1 = *u;
+ *u = t1;
+ if(n == c)
+ break;
+ t1 = u1;
+ }
+ }
+ }
+
+ /*
+ * pass 4 over all partitions.
+ * call recursively.
+ */
+ b++;
+ t = a + clist[0];
+ count[256] = 0;
+ for(cp1=clist+1; n=cp1[0]; cp1+=2) {
+ if(n > Threshold)
+ rsort4(t, n, b);
+ else
+ if(n > 1)
+ bsort4(t, n, b);
+ t += n;
+ }
+}
+
+/*
+ * bubble sort to pick up
+ * the pieces.
+ */
+void
+bsort4(Key ***a, ulong n, int b)
+{
+ Key ***i, ***j, ***k, ***l, **t;
+ Key *ka, *kb;
+ int n1, n2;
+
+ l = a+n;
+ j = a;
+
+loop:
+ i = j;
+ j++;
+ if(j >= l)
+ return;
+
+ ka = **i;
+ kb = **j;
+ n1 = ka->klen - b;
+ n2 = kb->klen - b;
+ if(n1 > n2)
+ n1 = n2;
+ if(n1 <= 0)
+ goto loop;
+ n2 = ka->key[b] - kb->key[b];
+ if(n2 == 0)
+ n2 = memcmp(ka->key+b, kb->key+b, n1);
+ if(n2 <= 0)
+ goto loop;
+
+ for(;;) {
+ k = i+1;
+
+ t = *k;
+ *k = *i;
+ *i = t;
+
+ if(i <= a)
+ goto loop;
+
+ i--;
+ ka = **i;
+ kb = *t;
+ n1 = ka->klen - b;
+ n2 = kb->klen - b;
+ if(n1 > n2)
+ n1 = n2;
+ if(n1 <= 0)
+ goto loop;
+ n2 = ka->key[b] - kb->key[b];
+ if(n2 == 0)
+ n2 = memcmp(ka->key+b, kb->key+b, n1);
+ if(n2 <= 0)
+ goto loop;
+ }
+}