aboutsummaryrefslogtreecommitdiff
path: root/src/cmd/spell
diff options
context:
space:
mode:
Diffstat (limited to 'src/cmd/spell')
-rw-r--r--src/cmd/spell/code.h28
-rw-r--r--src/cmd/spell/mkfile12
-rwxr-xr-xsrc/cmd/spell/spell.rc21
-rw-r--r--src/cmd/spell/sprog.c1381
4 files changed, 1442 insertions, 0 deletions
diff --git a/src/cmd/spell/code.h b/src/cmd/spell/code.h
new file mode 100644
index 00000000..73fff2e2
--- /dev/null
+++ b/src/cmd/spell/code.h
@@ -0,0 +1,28 @@
+
+/*
+ * affix codes
+ *
+ * Each macro below is one bit of an affix-class mask; COMP, VERB and
+ * ALL are combinations of the single-bit codes.  The trailing comments
+ * list the affixes associated with each class.
+ */
+
+#define ED (1<<0) /* +ed, +ing */
+#define ADJ (1<<1) /* (nce) -t+ce, +ize, +al, +ness, -t+cy, +ity, +ly */
+#define NOUN (1<<2) /* +s (+es), +make, +hood, +ship +less */
+#define PROP_COLLECT (1<<3) /* +'s, +an, +ship(for -manship) +less */
+#define ACTOR (1<<4) /* +er */
+#define EST (1<<5) /* +est */
+#define COMP (EST|ACTOR) /* +er,+est */
+#define DONT_TOUCH (1<<6)
+#define ION (1<<7) /* +ion, +or */
+#define N_AFFIX (1<<8) /* +ic, +ive, +ize, +like, +al, +ful, +ism, +ist, -t+cy, +c (maniac) */
+#define V_AFFIX (1<<9) /* +able, +ive, +ity (-bility), +ment */
+#define V_IRREG (1<<10) /* +ing, +es, +s */
+#define VERB (V_IRREG|ED)
+#define MAN (1<<11) /* +man, +men, +women, +woman */
+#define ADV (1<<12) /* +hood, +ness */
+#define STOP (1<<14) /* stop list */
+#define NOPREF (1<<13) /* no prefix */
+
+#define MONO (1<<15) /* double final consonant as in fib->fibbing */
+#define IN (1<<16) /* in- im- ir, not un- */
+#define _Y (1<<17) /* +y */
+
+#define ALL (~(NOPREF|STOP|DONT_TOUCH|MONO|IN)) /* anything goes: every bit except NOPREF, STOP, DONT_TOUCH, MONO and IN */
diff --git a/src/cmd/spell/mkfile b/src/cmd/spell/mkfile
new file mode 100644
index 00000000..e9e600f9
--- /dev/null
+++ b/src/cmd/spell/mkfile
@@ -0,0 +1,12 @@
+PLAN9=../../..
+<$PLAN9/src/mkhdr
+# one program, sprog, built from sprog.c and code.h (build rules presumably come from the included mkhdr/mkone - confirm)
+TARG=sprog
+OFILES=sprog.$O\
+
+HFILES =\
+ code.h\
+
+SHORTLIB=bio 9
+<$PLAN9/src/mkone
+
diff --git a/src/cmd/spell/spell.rc b/src/cmd/spell/spell.rc
new file mode 100755
index 00000000..073ab2df
--- /dev/null
+++ b/src/cmd/spell/spell.rc
@@ -0,0 +1,21 @@
+#!/bin/rc
+# spell driver: sort the arguments into flags for sprog and arguments for deroff, then run the pipeline
+spellflags=()
+deroffargs=()
+fflag=''
+for(x){
+ switch($x){
+ case -[abcvx]
+ spellflags=($spellflags $x)
+ case -f
+ fflag=$x
+ case *
+ if(~ $fflag -f) {
+ spellflags=($spellflags -f $x)
+ fflag=''
+ }
+ if not deroffargs=($deroffargs $x)
+ }
+}
+# deroff -w feeds sort -u, whose output is checked by sprog (presumably one word per line - confirm)
+deroff -w $deroffargs | sort -u | aux/sprog $spellflags
diff --git a/src/cmd/spell/sprog.c b/src/cmd/spell/sprog.c
new file mode 100644
index 00000000..e63fbb87
--- /dev/null
+++ b/src/cmd/spell/sprog.c
@@ -0,0 +1,1381 @@
+#include <u.h>
+#include <libc.h>
+#include <bio.h>
+#include <ctype.h>
+#include "code.h"
+/*
+ * Character-class helpers.  Each one masks its argument with 0xff
+ * before calling the <ctype.h> routine or indexing voweltab.  Tolower
+ * expands its argument more than once, so do not pass an expression
+ * with side effects.
+ */
+/* fig leaves for possibly signed char quantities */
+#define ISUPPER(c) isupper((c)&0xff)
+#define ISLOWER(c) islower((c)&0xff)
+#define ISALPHA(c) isalpha((c)&0xff)
+#define ISDIGIT(c) isdigit((c)&0xff)
+#define ISVOWEL(c) voweltab[(c)&0xff] /* nonzero for a e i o u y in either case; main fills the table */
+#define Tolower(c) (ISUPPER(c)? (c)-'A'+'a': (c))
+#define pair(a,b) (((a)<<8) | (b)) /* two characters packed into one int, used as switch labels in cstrip */
+#define DLEV 2 /* amount trysuff adds to the derivation level */
+#define DSIZ 40 /* only levels below this are recorded in deriv[] */
+
+typedef long Bits; /* a set of the affix-code bits from code.h */
+#define Set(h, f) ((long)(h) & (f)) /* the bits of h that are also in f */
+/*
+ * Suffix routines referenced from the Suftab tables.  They all take
+ * (ep, d, a, lev, flag): a position in word[], two derivation
+ * messages, the derivation level and the acceptable affix classes.
+ */
+Bits nop(char*, char*, char*, int, int);
+Bits strip(char*, char*, char*, int, int);
+Bits ize(char*, char*, char*, int, int);
+Bits i_to_y(char*, char*, char*, int, int);
+Bits ily(char*, char*, char*, int, int);
+Bits subst(char*, char*, char*, int, int);
+Bits CCe(char*, char*, char*, int, int);
+Bits tion(char*, char*, char*, int, int);
+Bits an(char*, char*, char*, int, int);
+Bits s(char*, char*, char*, int, int);
+Bits es(char*, char*, char*, int, int);
+Bits bility(char*, char*, char*, int, int);
+Bits y_to_e(char*, char*, char*, int, int);
+Bits VCe(char*, char*, char*, int, int);
+
+Bits trypref(char*, char*, int, int);
+Bits tryword(char*, char*, int, int);
+Bits trysuff(char*, int, int);
+Bits dict(char*, char*);
+void typeprint(Bits);
+void pcomma(char*);
+
+void ise(void);
+int ordinal(void);
+char* skipv(char*);
+int inun(char*, Bits);
+char* ztos(char*);
+void readdict(char*);
+/*
+ * One entry of a prefix table.
+ */
+typedef struct Ptab Ptab;
+struct Ptab
+{
+ char* s; /* prefix text; an entry with s == 0 ends a table */
+ int flag; /* 0 or IN; trypref checks IN entries with inun() */
+};
+/*
+ * One suffix rule, as used by trysuff.  p1/n1/d1/a1 describe the
+ * first attempt, p2/n2/d2/a2 an optional second attempt that is made
+ * only when the first returns 0.
+ */
+typedef struct Suftab Suftab;
+struct Suftab
+{
+ char *suf; /* the suffix spelled backwards; matched from the end of the word */
+ Bits (*p1)(char*, char*, char*, int, int);
+ int n1; /* p1 is called with ep-n1 */
+ char *d1; /* passed to p1 as d */
+ char *a1; /* passed to p1 as a */
+ int flag; /* passed, with STOP or'ed in, as the flag of p1 and p2 */
+ int affixable; /* must share a bit with the caller's flag or trysuff gives up */
+ Bits (*p2)(char*, char*, char*, int, int); /* may be 0 */
+ int n2;
+ char *d2;
+ char *a2;
+};
+/*
+ * Suffix rules, one table per final letter of the word.  suftab[] at
+ * the end maps 'a'..'z' to these tables (stabz is the empty one).
+ * trysuff scans a table from the top and acts on the first entry that
+ * matches and leaves a vowel in the stem, so the order of entries
+ * matters.  Each table ends with an entry whose suf is 0.
+ */
+Suftab staba[] = {
+ {"aibohp",subst,1,"-e+ia","",NOUN, NOUN},
+ 0
+};
+
+Suftab stabc[] =
+{
+ {"cai",strip,1,"","+c",N_AFFIX, ADJ|NOUN},
+ {"citsi",strip,2,"","+ic",N_AFFIX, ADJ | N_AFFIX | NOUN},
+ {"citi",ize,1,"-e+ic","",N_AFFIX, ADJ },
+ {"cihparg",i_to_y,1,"-y+ic","",NOUN, ADJ|NOUN },
+ {"cipocs",ize,1,"-e+ic","",NOUN, ADJ },
+ {"cirtem",i_to_y,1,"-y+ic","",NOUN, ADJ },
+ {"cigol",i_to_y,1,"-y+ic","",NOUN, ADJ },
+ {"cimono",i_to_y,1,"-y+ic","",NOUN, ADJ },
+ {"cibohp",subst,1,"-e+ic","",NOUN, ADJ },
+ 0
+};
+Suftab stabd[] =
+{
+ {"de",strip,1,"","+d",ED,ADJ |COMP,i_to_y,2,"-y+ied","+ed"},
+ {"dooh",ily,4,"-y+ihood","+hood",NOUN | ADV, NOUN},
+ 0
+};
+Suftab stabe[] =
+{
+ /*
+ * V_affix for comment ->commence->commentment??
+ */
+ {"ecna",subst,1,"-t+ce","",ADJ,N_AFFIX|_Y|NOUN|VERB|ACTOR|V_AFFIX},
+ {"ecne",subst,1,"-t+ce","",ADJ,N_AFFIX|_Y|NOUN|VERB|ACTOR|V_AFFIX},
+ {"elbaif",i_to_y,4,"-y+iable","",V_IRREG,ADJ},
+ {"elba",CCe,4,"-e+able","+able",V_AFFIX,ADJ},
+ {"evi",subst,0,"-ion+ive","",N_AFFIX | V_AFFIX,NOUN | N_AFFIX| ADJ},
+ {"ezi",CCe,3,"-e+ize","+ize",N_AFFIX|ADJ ,V_AFFIX | VERB |ION | COMP},
+ {"ekil",strip,4,"","+like",N_AFFIX ,ADJ},
+ 0
+};
+Suftab stabg[] =
+{
+ {"gniee",strip,3,"","+ing",V_IRREG ,ADJ|NOUN},
+ {"gnikam",strip,6,"","+making",NOUN,NOUN},
+ {"gnipeek",strip,7,"","+keeping",NOUN,NOUN},
+ {"gni",CCe,3,"-e+ing","+ing",V_IRREG ,ADJ|ED|NOUN},
+ 0
+};
+Suftab stabl[] =
+{
+ {"ladio",strip,2,"","+al",NOUN |ADJ,ADJ},
+ {"laci",strip,2,"","+al",NOUN |ADJ,ADJ |NOUN|N_AFFIX},
+ {"latnem",strip,2,"","+al",N_AFFIX,ADJ},
+ {"lanoi",strip,2,"","+al",N_AFFIX,ADJ|NOUN},
+ {"luf",ily,3,"-y+iful","+ful",N_AFFIX,ADJ | NOUN},
+ 0
+};
+Suftab stabm[] =
+{
+ /* congregational + ism */
+ {"msi",CCe,3,"-e+ism","ism",N_AFFIX|ADJ,NOUN}, /* NOTE(review): a1 lacks the leading '+' the other rules have - confirm */
+ {"margo",subst,-1,"-ph+m","",NOUN,NOUN},
+ 0
+};
+Suftab stabn[] =
+{
+ {"noitacifi",i_to_y,6,"-y+ication","",ION,NOUN | N_AFFIX},
+ {"noitazi",ize,4,"-e+ation","",ION,NOUN| N_AFFIX},
+ {"noit",tion,3,"-e+ion","+ion",ION,NOUN| N_AFFIX | V_AFFIX |VERB|ACTOR},
+ {"naino",an,3,"","+ian",NOUN|PROP_COLLECT,NOUN| N_AFFIX},
+ {"namow",strip,5,"","+woman",MAN,PROP_COLLECT|N_AFFIX},
+ {"nam",strip,3,"","+man",MAN,PROP_COLLECT | N_AFFIX | VERB},
+ {"na",an,1,"","+n",NOUN|PROP_COLLECT,NOUN | N_AFFIX},
+ {"nemow",strip,5,"","+women",MAN,PROP_COLLECT},
+ {"nem",strip,3,"","+man",MAN,PROP_COLLECT}, /* NOTE(review): message reads +man for the -men suffix - confirm */
+ {"nosrep",strip,6,"","+person",MAN,PROP_COLLECT},
+ 0
+};
+Suftab stabp[] =
+{
+ {"pihs",strip,4,"","+ship",NOUN|PROP_COLLECT,NOUN| N_AFFIX},
+ 0
+};
+Suftab stabr[] =
+{
+ {"rehparg",subst,1,"-y+er","",ACTOR,NOUN,strip,2,"","+er"},
+ {"reyhparg",nop,0,"","",0,NOUN},
+ {"reyl",nop,0,"","",0,NOUN},
+ {"rekam",strip,5,"","+maker",NOUN,NOUN},
+ {"repeek",strip,6,"","+keeper",NOUN,NOUN},
+ {"re",strip,1,"","+r",ACTOR,NOUN | N_AFFIX|VERB|ADJ, i_to_y,2,"-y+ier","+er"},
+ {"rota",tion,2,"-e+or","",ION,NOUN| N_AFFIX|_Y},
+ {"rotc",tion,2,"","+or",ION,NOUN| N_AFFIX},
+ {"rotp",tion,2,"","+or",ION,NOUN| N_AFFIX},
+ 0
+};
+Suftab stabs[] =
+{
+ {"ssen",ily,4,"-y+iness","+ness",ADJ|ADV,NOUN| N_AFFIX},
+ {"ssel",ily,4,"-y+iless","+less",NOUN | PROP_COLLECT,ADJ },
+ {"se",s,1,"","+s",NOUN | V_IRREG,DONT_TOUCH , es,2,"-y+ies","+es"},
+ {"s'",s,2,"","+'s",PROP_COLLECT | NOUN,DONT_TOUCH },
+ {"s",s,1,"","+s",NOUN | V_IRREG,DONT_TOUCH },
+ 0
+};
+Suftab stabt[] =
+{
+ {"tnem",strip,4,"","+ment",V_AFFIX,NOUN | N_AFFIX | ADJ|VERB},
+ {"tse",strip,2,"","+st",EST,DONT_TOUCH, i_to_y,3,"-y+iest","+est" },
+ {"tsigol",i_to_y,2,"-y+ist","",N_AFFIX,NOUN | N_AFFIX},
+ {"tsi",CCe,3,"-e+ist","+ist",N_AFFIX|ADJ,NOUN | N_AFFIX|COMP},
+ 0
+};
+Suftab staby[] =
+{
+ {"ycna",subst,1,"-t+cy","",ADJ | N_AFFIX,NOUN | N_AFFIX},
+ {"ycne",subst,1,"-t+cy","",ADJ | N_AFFIX,NOUN | N_AFFIX},
+ {"ytilib",bility,5,"-le+ility","",ADJ | V_AFFIX,NOUN | N_AFFIX},
+ {"ytisuo",nop,0,"","",NOUN},
+ {"ytilb",nop,0,"","",0,NOUN},
+ {"yti",CCe,3,"-e+ity","+ity",ADJ ,NOUN | N_AFFIX },
+ {"ylb",y_to_e,1,"-e+y","",ADJ,ADV},
+ {"ylc",nop,0,"","",0},
+ {"ylelb",nop,0,"","",0},
+ {"ylelp",nop,0,"","",0},
+ {"yl",ily,2,"-y+ily","+ly",ADJ,ADV|COMP},
+ {"yrtem",subst,0,"-er+ry","",NOUN,NOUN | N_AFFIX},
+ {"y",CCe,1,"-e+y","+y",_Y,ADJ|COMP},
+ 0
+};
+Suftab stabz[] =
+{
+ 0
+};
+Suftab* suftab[] =
+{
+ staba,
+ stabz,
+ stabc,
+ stabd,
+ stabe,
+ stabz,
+ stabg,
+ stabz,
+ stabz,
+ stabz,
+ stabz,
+ stabl,
+ stabm,
+ stabn,
+ stabz,
+ stabp,
+ stabz,
+ stabr,
+ stabs,
+ stabt,
+ stabz,
+ stabz,
+ stabz,
+ stabz,
+ staby,
+ stabz,
+};
+/*
+ * Prefixes, one table per initial letter; preftab[] at the end maps
+ * 'a'..'z' to them.  lookuppref scans a table from the top and takes
+ * the first prefix that matches and is followed by a vowel, so a
+ * longer prefix has to be listed before a shorter one it starts with
+ * ("under" before "un").  Entries flagged IN are checked further by
+ * inun() in trypref.  Each table ends with an entry whose s is 0.
+ */
+Ptab ptaba[] =
+{
+ "anti", 0,
+ "auto", 0,
+ 0
+};
+Ptab ptabb[] =
+{
+ "bio", 0,
+ 0
+};
+Ptab ptabc[] =
+{
+ "counter", 0,
+ 0
+};
+Ptab ptabd[] =
+{
+ "dis", 0,
+ 0
+};
+Ptab ptabe[] =
+{
+ "electro", 0,
+ 0
+};
+Ptab ptabf[] =
+{
+ "femto", 0,
+ 0
+};
+Ptab ptabg[] =
+{
+ "geo", 0,
+ "giga", 0,
+ 0
+};
+Ptab ptabh[] =
+{
+ "hyper", 0,
+ 0
+};
+Ptab ptabi[] =
+{
+ "immuno", 0,
+ "im", IN,
+ "intra", 0,
+ "inter", 0,
+ "in", IN,
+ "ir", IN,
+ "iso", 0,
+ 0
+};
+Ptab ptabj[] =
+{
+ 0
+};
+Ptab ptabk[] =
+{
+ "kilo", 0,
+ 0
+};
+Ptab ptabl[] =
+{
+ 0
+};
+Ptab ptabm[] =
+{
+ "magneto", 0,
+ "mega", 0,
+ "meta", 0,
+ "micro", 0,
+ "mid", 0,
+ "milli", 0,
+ "mini", 0,
+ "mis", 0,
+ "mono", 0,
+ "multi", 0,
+ 0
+};
+Ptab ptabn[] =
+{
+ "nano", 0,
+ "neuro", 0,
+ "non", 0,
+ 0
+};
+Ptab ptabo[] =
+{
+ "out", 0,
+ "over", 0,
+ 0
+};
+Ptab ptabp[] =
+{
+ "para", 0,
+ "photo", 0,
+ "pico", 0,
+ "poly", 0,
+ "pre", 0,
+ "pseudo", 0,
+ "psycho", 0,
+ 0
+};
+Ptab ptabq[] =
+{
+ "quasi", 0,
+ 0
+};
+Ptab ptabr[] =
+{
+ "radio", 0,
+ "re", 0,
+ 0
+};
+Ptab ptabs[] =
+{
+ "semi", 0,
+ "stereo", 0,
+ "sub", 0,
+ "super", 0,
+ 0
+};
+Ptab ptabt[] =
+{
+ "tele", 0,
+ "tera", 0,
+ "thermo", 0,
+ 0
+};
+Ptab ptabu[] =
+{
+ "ultra", 0,
+ "under", 0, /*must precede un*/
+ "un", IN,
+ 0
+};
+Ptab ptabv[] =
+{
+ 0
+};
+Ptab ptabw[] =
+{
+ 0
+};
+Ptab ptabx[] =
+{
+ 0
+};
+Ptab ptaby[] =
+{
+ 0
+};
+Ptab ptabz[] =
+{
+ 0
+};
+
+Ptab* preftab[] =
+{
+ ptaba,
+ ptabb,
+ ptabc,
+ ptabd,
+ ptabe,
+ ptabf,
+ ptabg,
+ ptabh,
+ ptabi,
+ ptabj,
+ ptabk,
+ ptabl,
+ ptabm,
+ ptabn,
+ ptabo,
+ ptabp,
+ ptabq,
+ ptabr,
+ ptabs,
+ ptabt,
+ ptabu,
+ ptabv,
+ ptabw,
+ ptabx,
+ ptaby,
+ ptabz,
+};
+/* one recorded step of a derivation; see deriv[] below */
+typedef struct {
+ char *mesg; /* text that tryword appends to affix[] */
+ enum { NONE, SUFF, PREF} type; /* trypref adds PREF to this once per stripped prefix */
+} Deriv;
+
+int aflag; /* -a: input is "id:...:word"; the id part is echoed before each reported word */
+int cflag; /* -c or -C: print one status character per word */
+int fflag; /* set once -f has named a dictionary */
+int vflag; /* -v or -C: collect derivations in affix[] */
+int xflag; /* -x: dict traces every lookup on file descriptor 2 */
+int nflag; /* not referenced elsewhere in this file */
+char word[500]; /* working copy of the current word; the affix routines edit it in place */
+char* original; /* the word as read from the input */
+Deriv emptyderiv; /* all-zero value used to clear deriv[] entries */
+Deriv deriv[DSIZ+3]; /* derivation steps, indexed by level */
+char affix[DSIZ*10]; /* 10 is longest affix message */
+int prefcount; /* set by tryword when vflag is on */
+int suffcount; /* set by tryword when vflag is on */
+char* acmeid; /* with -a, the text before the word on the current line */
+char space[300000]; /* must be as large as "words"+"space" in pcode run */
+Bits encode[2048]; /* must be as long as "codes" in pcode run */
+int nencode; /* number of entries of encode[] read from the dictionary */
+char voweltab[256]; /* see ISVOWEL */
+char* spacep[128*128+1]; /* pointer to words starting with 'xx' */
+Biobuf bin; /* file descriptor 0 */
+Biobuf bout; /* file descriptor 1 */
+
+char* codefile = "#9/lib/amspell"; /* default dictionary; replaced by -b or -f */
+char* brfile = "#9/lib/brspell"; /* dictionary selected by -b unless -f came first */
+char* Usage = "usage"; /* exit status for argument errors */
+
+/*
+ * Read words from standard input, one per line, and report those that
+ * cannot be derived from the dictionary.
+ *
+ *	-a	each line is "id:...:word"; reports are prefixed with the id
+ *	-b	British spelling: ise() rewrites the suffix rules and, unless
+ *		-f was given earlier, brfile replaces the default dictionary
+ *	-c	print one character per word: '-' for an unknown or STOP word,
+ *		otherwise '+'
+ *	-C	as -c, but the character for a known word encodes the number
+ *		of prefixes and whether a suffix was stripped
+ *	-v	also print the derivation of each derived word
+ *	-x	trace dictionary lookups on file descriptor 2
+ *	-f file	use file as the dictionary
+ *
+ * A word is first tried as given.  An all-capitals word is retried
+ * with everything after the first letter lowered, and a capitalized
+ * word is retried once more with the first letter lowered as well.
+ */
+void
+main(int argc, char *argv[])
+{
+	char *ep, *cp;
+	char *dp;
+	int j, i, c;
+	int low;
+	Bits h;
+
+	Binit(&bin, 0, OREAD);
+	Binit(&bout, 1, OWRITE);
+	for(i=0; c = "aeiouyAEIOUY"[i]; i++)
+		voweltab[c] = 1;
+	while(argc > 1) {
+		if(argv[1][0] != '-')
+			break;
+		for(i=1; c = argv[1][i]; i++)
+			switch(c) {
+			default:
+				fprint(2, "usage: spell [-abcCvx] [-f file]\n");
+				exits(Usage);
+
+			case 'a':
+				aflag++;
+				continue;
+
+			case 'b':
+				ise();
+				if(!fflag)
+					codefile = brfile;
+				continue;
+
+			case 'C':		/* for "correct" */
+				vflag++;
+				/* fall through: -C implies -c */
+			case 'c':		/* for ocr */
+				cflag++;
+				continue;
+
+			case 'v':
+				vflag++;
+				continue;
+
+			case 'x':
+				xflag++;
+				continue;
+
+			case 'f':
+				if(argc <= 2) {
+					fprint(2, "spell: -f requires another argument\n");
+					exits(Usage);
+				}
+				/* the dictionary name is the next argument */
+				argv++;
+				argc--;
+				codefile = argv[1];
+				fflag++;
+				goto brk;
+			}
+	brk:
+		argv++;
+		argc--;
+	}
+	readdict(codefile);
+	if(argc > 1) {
+		fprint(2, "usage: spell [-abcCvx] [-f file]\n");
+		exits(Usage);
+	}
+	if(aflag)
+		cflag = vflag = 0;
+
+	for(;;) {
+		affix[0] = 0;
+		original = Brdline(&bin, '\n');
+		if(original == 0)
+			exits(0);
+		original[Blinelen(&bin)-1] = 0;
+		low = 0;
+
+		if(aflag) {
+			/* skip to just past the second ':' and cut the line there */
+			acmeid = original;
+			while(*original != ':')
+				if(*original++ == 0)
+					exits(0);
+			while(*++original != ':')
+				if(*original == 0)
+					exits(0);
+			*original++ = 0;
+		}
+		/* copy at most sizeof(word)-1 bytes, counting lower-case letters */
+		for(ep=word,dp=original; j = *dp; ep++,dp++) {
+			if(ISLOWER(j))
+				low++;
+			if(ep >= word+sizeof(word)-1)
+				break;
+			*ep = j;
+		}
+		*ep = 0;
+
+		if(ISDIGIT(word[0]) && ordinal())
+			continue;
+
+		h = 0;
+		if(!low && !(h = trypref(ep,".",0,ALL|STOP|DONT_TOUCH)))
+			for(cp=original+1,dp=word+1; dp<ep; dp++,cp++)
+				*dp = Tolower(*cp);
+		if(!h)
+		for(;;) {	/* at most twice */
+			if(h = trypref(ep,".",0,ALL|STOP|DONT_TOUCH))
+				break;
+			if(h = trysuff(ep,0,ALL|STOP|DONT_TOUCH))
+				break;
+			if(!ISUPPER(word[0]))
+				break;
+			/*
+			 * Start again from the original spelling (the affix
+			 * routines may have edited word[]), with the first
+			 * letter lowered.  Copy only as far as ep: word[]
+			 * holds at most sizeof(word)-1 bytes while original
+			 * can be longer, so an unbounded copy would run off
+			 * the end of word[].
+			 */
+			for(cp=original,dp=word; dp<ep; cp++,dp++)
+				*dp = low? *cp: Tolower(*cp);
+			*ep = 0;
+			word[0] = Tolower(word[0]);
+		}
+
+		if(cflag) {
+			if(!h || Set(h,STOP))
+				print("-");
+			else if(!vflag)
+				print("+");
+			else
+				print("%c",'0' + (suffcount>0) +
+				   (prefcount>4? 8: 2*prefcount));
+		} else if(!h || Set(h,STOP)) {
+			if(aflag)
+				Bprint(&bout, "%s:%s\n", acmeid, original);
+			else
+				Bprint(&bout, "%s\n", original);
+		} else if(affix[0] != 0 && affix[0] != '.')
+			print("%s\t%s\n", affix, original);
+	}
+	exits(0);
+}
+
+/* strip exactly one suffix and do
+ * indicated routine(s), which may recursively
+ * strip suffixes
+ *
+ * ep points just past the last letter of the candidate in word[],
+ * lev is the caller's derivation level and flag the affix classes the
+ * caller accepts.  The table for the final letter is scanned for the
+ * first suffix that matches and leaves a vowel in the stem.  Returns
+ * the bits reported by that rule's routine, or 0.
+ */
+Bits
+trysuff(char* ep, int lev, int flag)
+{
+ Suftab *t;
+ char *cp, *sp;
+ Bits h = 0;
+ int initchar = ep[-1];
+
+ flag &= ~MONO; /* a MONO request is not passed on */
+ lev += DLEV;
+ if(lev < DSIZ) {
+ deriv[lev] = emptyderiv;
+ deriv[lev-1] = emptyderiv;
+ }
+ if(!ISLOWER(initchar))
+ return h;
+ for(t=suftab[initchar-'a']; sp=t->suf; t++) {
+ cp = ep;
+ while(*sp)
+ if(*--cp != *sp++) /* suf is stored backwards */
+ goto next;
+ for(sp=ep-t->n1; --sp >= word && !ISVOWEL(*sp);)
+ ;
+ if(sp < word) /* no vowel left in the stem: try the next rule */
+ continue;
+ if(!(t->affixable & flag))
+ return 0; /* the caller does not accept this kind of suffix; later rules are not tried */
+ h = (*t->p1)(ep-t->n1, t->d1, t->a1, lev+1, t->flag|STOP);
+ if(!h && t->p2!=0) {
+ if(lev < DSIZ) {
+ deriv[lev] = emptyderiv;
+ deriv[lev+1] = emptyderiv;
+ }
+ h = (*t->p2)(ep-t->n2, t->d2, t->a2, lev, t->flag|STOP); /* second attempt */
+ }
+ break;
+ next:;
+ }
+ return h;
+}
+
+/*
+ * suffix routine that never succeeds; used by table entries that
+ * exist only to stop a shorter suffix from being tried.
+ */
+Bits
+nop(char* ep, char* d, char* a, int lev, int flag)
+{
+	/* every argument is deliberately ignored */
+	USED(flag);
+	USED(lev);
+	USED(a);
+	USED(d);
+	USED(ep);
+	return (Bits)0;
+}
+
+/*
+ * strip, after a plausibility check on the join between stem and
+ * suffix.  If the letter at ep and the one before it are both vowels,
+ * the pairs aa ae ai ea ee ei ii oa are rejected; otherwise a letter
+ * repeated at ep[-2], ep[-1] and ep[0] is rejected.  Returns 0 on
+ * rejection, else whatever strip returns.
+ */
+Bits
+cstrip(char* ep, char* d, char* a, int lev, int flag)
+{
+	int cur = ep[0];
+	int prev = ep[-1];
+
+	if(!ISVOWEL(cur) || !ISVOWEL(prev)) {
+		/* not a vowel pair: only a tripled letter is refused */
+		if(cur==prev && cur==ep[-2])
+			return 0;
+		return strip(ep,d,a,lev,flag);
+	}
+	switch(prev) {
+	case 'a':
+	case 'e':
+		if(cur=='a' || cur=='e' || cur=='i')
+			return 0;
+		break;
+	case 'i':
+		if(cur=='i')
+			return 0;
+		break;
+	case 'o':
+		if(cur=='a')
+			return 0;
+		break;
+	}
+	return strip(ep,d,a,lev,flag);
+}
+
+/*
+ * Basic suffix routine: look up the stem that ends at ep.
+ *
+ * The stem is tried as it stands (with prefix stripping) first.  A hit
+ * on a MONO entry is discarded when the letter at ep and the letter at
+ * ep[-2] are both vowels.  Next, if the letter at ep is a vowel and the
+ * stem ends in a doubled non-vowel, the stem without the second copy
+ * is tried with MONO required.  Failing that, another suffix is
+ * stripped with trysuff.  d is unused.
+ */
+Bits
+strip(char* ep, char* d, char* a, int lev, int flag)
+{
+	Bits h;
+	int vowelnext;
+
+	USED(d);
+	vowelnext = ISVOWEL(*ep);
+	h = trypref(ep, a, lev, flag);
+	if(h != 0 && !(Set(h,MONO) && vowelnext && ISVOWEL(ep[-2])))
+		return h;
+	if(vowelnext && !ISVOWEL(ep[-1]) && ep[-1]==ep[-2]) {
+		h = trypref(ep-1, a, lev, flag|MONO);
+		if(h != 0)
+			return h;
+	}
+	return trysuff(ep, lev, flag);
+}
+/*
+ * suffix routine of the "s", "'s" and "es" rules.  Gives up when lev
+ * is above DLEV+1.  When the letter at ep is 's' it refuses stems
+ * ending in x, z, s, ch or sh, and stems ending in y unless the letter
+ * before the y is a vowel or the word starts with a capital.
+ * Everything else goes to strip.
+ */
+Bits
+s(char* ep, char* d, char* a, int lev, int flag)
+{
+ if(lev > DLEV+1)
+ return 0;
+ if(*ep=='s') {
+ switch(ep[-1]) {
+ case 'y':
+ if(ISVOWEL(ep[-2])||ISUPPER(*word))
+ break; /*says Kennedys*/
+ case 'x': /* a rejected y falls through to here */
+ case 'z':
+ case 's':
+ return 0;
+ case 'h':
+ switch(ep[-2]) {
+ case 'c':
+ case 's':
+ return 0;
+ }
+ }
+ }
+ return strip(ep,d,a,lev,flag);
+}
+
+/*
+ * suffix routine of the "an" and "ian" rules.  Applies only when the
+ * word starts with an upper-case letter; the stem is then looked up
+ * with trypref.  d is unused.
+ */
+Bits
+an(char* ep, char* d, char* a, int lev, int flag)
+{
+	USED(d);
+	if(ISUPPER(*word))
+		return trypref(ep,a,lev,flag);
+	return 0;	/* must be proper name */
+}
+
+/*
+ * suffix routine that tries the stem with its last letter replaced by
+ * 'e'.  The letter is put back after strip returns; d is handed to
+ * strip as the derivation message and a is unused.
+ */
+Bits
+ize(char* ep, char* d, char* a, int lev, int flag)
+{
+	Bits h;
+	int saved;
+
+	USED(a);
+	saved = ep[-1];
+	ep[-1] = 'e';
+	h = strip(ep, "", d, lev, flag);
+	ep[-1] = saved;
+	return h;
+}
+
+/*
+ * suffix routine that tries the stem with an 'e' added.  Not attempted
+ * when the stem ends in a, e or i.  The letter at ep is overwritten
+ * with 'e' for the duration of the strip call, which is given the
+ * position after it and d as the derivation message, and is then put
+ * back.  a is unused.
+ */
+Bits
+y_to_e(char* ep, char* d, char* a, int lev, int flag)
+{
+	Bits h;
+	int saved;
+
+	USED(a);
+	if(ep[-1]=='a' || ep[-1]=='e' || ep[-1]=='i')
+		return 0;
+	saved = ep[0];
+	ep[0] = 'e';
+	h = strip(ep+1, "", d, lev, flag);
+	ep[0] = saved;
+	return h;
+}
+/*
+ * suffix routine of the "yl", "ssen", "ssel", "dooh" and "luf" rules.
+ * Rejects a letter repeated at ep[-2], ep[-1] and ep[0], and a stem
+ * ending in non-vowel + y that has a vowel anywhere before that
+ * non-vowel.  A stem ending in i goes to i_to_y, anything else to
+ * cstrip.
+ */
+Bits
+ily(char* ep, char* d, char* a, int lev, int flag)
+{
+ int temp = ep[0];
+ char *cp = ep;
+
+ if(temp==ep[-1]&&temp==ep[-2]) /* sillly */
+ return 0;
+ if(*--cp=='y' && !ISVOWEL(*--cp)) /* happyly */
+ while(cp>word)
+ if(ISVOWEL(*--cp)) /* shyness */
+ return 0;
+ if(ep[-1]=='i')
+ return i_to_y(ep,d,a,lev,flag);
+ return cstrip(ep,d,a,lev,flag);
+}
+
+/*
+ * suffix routine of the "ytilib" rule (-le+ility).  It is called with
+ * ep at the i that follows the b: that letter is replaced by 'l' and
+ * y_to_e supplies the final e, so "...bility" is looked up as "...ble".
+ *
+ * The overwritten letter is put back before returning, the way ize,
+ * y_to_e and i_to_y restore what they change.  Without that, a failed
+ * attempt left word[] altered for every rule tried afterwards.
+ */
+Bits
+bility(char* ep, char* d, char* a, int lev, int flag)
+{
+	Bits h;
+	int temp;
+
+	temp = *ep;
+	*ep = 'l';
+	h = y_to_e(ep+1,d,a,lev,flag);
+	*ep = temp;
+	return h;
+}
+/*
+ * suffix routine for stems whose final y became i.  Never applies to a
+ * word that starts with an upper-case letter.  If the stem ends in i
+ * after a non-vowel, the i is turned into y for the lookup and d is
+ * used as the derivation message in place of a.  The last stem letter
+ * is restored after cstrip returns.
+ */
+Bits
+i_to_y(char* ep, char* d, char* a, int lev, int flag)
+{
+ Bits h;
+ int temp;
+
+ if(ISUPPER(*word))
+ return 0;
+ if((temp=ep[-1])=='i' && !ISVOWEL(ep[-2])) {
+ ep[-1] = 'y';
+ a = d;
+ }
+ h = cstrip(ep,"",a,lev,flag);
+ ep[-1] = temp;
+ return h;
+}
+/*
+ * second routine of the "se" rule.  Gives up when lev is above DLEV.
+ * A stem ending in i goes to i_to_y; a stem ending in s, z, x, ch or
+ * sh goes to strip; any other ending fails.
+ */
+Bits
+es(char* ep, char* d, char* a, int lev, int flag)
+{
+ if(lev>DLEV)
+ return 0;
+ switch(ep[-1]) {
+ default:
+ return 0;
+ case 'i':
+ return i_to_y(ep,d,a,lev,flag);
+ case 'h':
+ switch(ep[-2]) {
+ default:
+ return 0;
+ case 'c':
+ case 's':
+ break;
+ } /* ch and sh fall through to the strip cases */
+ case 's':
+ case 'z':
+ case 'x':
+ return strip(ep,d,a,lev,flag);
+ }
+}
+/*
+ * suffix routine for rules whose d has the form "-old+new".  The text
+ * between '-' and '+' is copied into word[] so that it ends just
+ * before ep, strip is called with d as the derivation message, and
+ * then the text after '+' is written back starting where the copy
+ * began.  Fails without touching word[] when skipv(skipv(ep-1)) runs
+ * off the front of the word.  a is unused.
+ * NOTE(review): the restore writes strlen(new) bytes, which differs
+ * from strlen(old) for "-ph+m" - confirm the byte left behind is
+ * harmless.
+ */
+Bits
+subst(char* ep, char* d, char* a, int lev, int flag)
+{
+ char *u,*t;
+ Bits h;
+
+ USED(a);
+ if(skipv(skipv(ep-1)) < word)
+ return 0;
+ for(t=d; *t!='+'; t++)
+ continue;
+ for(u=ep; *--t!='-';) /* copy "old" backwards, ending just before ep */
+ *--u = *t;
+ h = strip(ep,"",d,lev,flag);
+ while(*++t != '+')
+ continue;
+ while(*++t) /* put "new" back */
+ *u++ = *t;
+ return h;
+}
+
+/*
+ * suffix routine of the "noit" and "rot?" rules.  When the letter at
+ * ep[-2] is a, e, i, o or u the stem is tried with an e added
+ * (y_to_e); otherwise it is looked up as it stands with trypref.
+ */
+Bits
+tion(char* ep, char* d, char* a, int lev, int flag)
+{
+	int c = ep[-2];
+
+	if(c=='a' || c=='e' || c=='i' || c=='o' || c=='u')
+		return y_to_e(ep,d,a,lev,flag);
+	return trypref(ep,a,lev,flag);
+}
+
+/*
+ * possible consonant-consonant-e ending
+ *
+ * Depending on the last two stem letters and the letter at ep this
+ * gives up, tries the stem with an e added (y_to_e), tries VCe, or
+ * tries y_to_e and then VCe.
+ */
+Bits
+CCe(char* ep, char* d, char* a, int lev, int flag)
+{
+ Bits h;
+
+ switch(ep[-1]) {
+ case 'l':
+ if(ISVOWEL(ep[-2]))
+ break;
+ switch(ep[-2]) {
+ case 'l':
+ case 'r':
+ case 'w':
+ break;
+ default:
+ return y_to_e(ep,d,a,lev,flag);
+ }
+ break;
+ case 'c':
+ case 'g':
+ if(*ep == 'a') /* prevent -able for -eable */
+ return 0;
+ case 's': /* c and g continue here when the letter at ep is not a */
+ case 'v':
+ case 'z':
+ if(ep[-2]==ep[-1])
+ break;
+ if(ISVOWEL(ep[-2]))
+ break;
+ case 'u': /* also reached by falling through from the cases above */
+ if(h = y_to_e(ep,d,a,lev,flag))
+ return h;
+ if(!(ep[-2]=='n' && ep[-1]=='g'))
+ return 0;
+ }
+ return VCe(ep,d,a,lev,flag);
+}
+
+/*
+ * possible consonant-vowel-consonant-e ending
+ *
+ * Fails if the stem ends in e.  If it ends in vowel + non-vowel, an e
+ * is written at ep and the lengthened stem is tried with trypref and
+ * then trysuff; if both fail the overwritten letter is restored.
+ * Finally falls back to cstrip on the unchanged stem.
+ */
+Bits
+VCe(char* ep, char* d, char* a, int lev, int flag)
+{
+ int c;
+ Bits h;
+
+ c = ep[-1];
+ if(c=='e')
+ return 0;
+ if(!ISVOWEL(c) && ISVOWEL(ep[-2])) {
+ c = *ep; /* c now holds the letter about to be overwritten */
+ *ep++ = 'e';
+ h = trypref(ep,d,lev,flag);
+ if(!h)
+ h = trysuff(ep,lev,flag);
+ if(h)
+ return h;
+ ep--;
+ *ep = c;
+ }
+ return cstrip(ep,d,a,lev,flag);
+}
+/*
+ * Find a prefix of the text at *wp.  The table for the lower-cased
+ * first letter is scanned from the top; an entry matches when its
+ * text begins *wp and at least one vowel lies between the end of the
+ * prefix and ep.  On a match *wp is advanced past the prefix and the
+ * entry is returned; otherwise 0 is returned and *wp is unchanged.
+ */
+Ptab*
+lookuppref(uchar** wp, char* ep)
+{
+ Ptab *sp;
+ uchar *bp,*cp;
+ unsigned int initchar = Tolower(**wp);
+
+ if(!ISALPHA(initchar))
+ return 0;
+ for(sp=preftab[initchar-'a'];sp->s;sp++) {
+ bp = *wp;
+ for(cp= (uchar*)sp->s;*cp; )
+ if(*bp++!=*cp++)
+ goto next;
+ for(cp=bp;cp<(uchar*)ep;cp++)
+ if(ISVOWEL(*cp)) {
+ *wp = bp;
+ return sp;
+ }
+ next:;
+ }
+ return 0;
+}
+
+/* while word is not in dictionary try stripping
+ * prefixes. Fail if no more prefixes.
+ *
+ * ep is the end of the candidate in word[], a the derivation message
+ * recorded for level lev, and flag the affix classes that make a
+ * dictionary entry acceptable.  An entry is accepted when it shares a
+ * bit with flag (MONO aside) and, if flag asks for MONO, carries MONO
+ * itself.  Returns the accepted entry's bits, or 0.
+ *
+ * The text of the stripped prefixes ("+un+re...") is collected in a
+ * small local buffer for tryword to report.  The buffer is always
+ * left terminated and text that does not fit is dropped, so a word
+ * made of many stacked prefixes cannot write past it.  deriv[] is
+ * touched only for levels below DSIZ.
+ */
+Bits
+trypref(char* ep, char* a, int lev, int flag)
+{
+	Ptab *tp;
+	char *bp, *cp;
+	char *pp, *pe;
+	Bits h;
+	char prefmsg[20];
+
+	if(lev<DSIZ) {
+		deriv[lev].mesg = a;
+		deriv[lev].type = *a=='.'? NONE: SUFF;
+	}
+	if(h = tryword(word,ep,lev,flag)) {
+		if(Set(h, flag&~MONO) && (flag&MONO) <= Set(h, MONO))
+			return h;
+		h = 0;
+	}
+	bp = word;
+	pp = prefmsg;
+	pe = prefmsg+sizeof(prefmsg)-1;	/* last byte is kept for the terminator */
+	*pp = 0;
+	if(lev<DSIZ) {
+		deriv[lev+1].mesg = prefmsg;
+		deriv[lev+1].type = 0;
+	}
+	while(tp=lookuppref((uchar**)&bp,ep)) {
+		if(pp < pe)
+			*pp++ = '+';
+		for(cp=tp->s; *cp && pp<pe; cp++)
+			*pp++ = *cp;
+		*pp = 0;
+		if(lev<DSIZ)
+			deriv[lev+1].type += PREF;
+		h = tryword(bp,ep,lev+1,flag);
+		/* bp-2 is the start of a two-letter IN prefix */
+		if(Set(h,NOPREF) ||
+		   ((tp->flag&IN) && inun(bp-2,h)==0)) {
+			h = 0;
+			break;
+		}
+		if(Set(h,flag&~MONO) && (flag&MONO) <= Set(h, MONO))
+			break;
+		h = 0;
+	}
+	/* prefmsg is about to go out of scope: drop the references to it */
+	if(lev < DSIZ) {
+		deriv[lev+1] = emptyderiv;
+		deriv[lev+2] = emptyderiv;
+	}
+	return h;
+}
+
+/*
+ * Look up the bytes from bp up to ep in the dictionary.  Candidates of
+ * one letter or less fail.  When flag contains MONO, "+" followed by
+ * the letter at ep is recorded as one more suffix step (the doubled
+ * consonant).
+ *
+ * When vflag is set and the lookup succeeds, the messages recorded in
+ * deriv[] from level lev downwards are appended to affix[] and
+ * prefcount and suffcount are set.  The append is bounded by the size
+ * of affix[]; text that does not fit is dropped.
+ *
+ * Returns the dictionary bits of the candidate, or 0.
+ */
+Bits
+tryword(char* bp, char* ep, int lev, int flag)
+{
+	int j;
+	Bits h = 0;
+	char duple[3];
+
+	if(ep-bp <= 1)
+		return h;
+	if(flag&MONO) {
+		if(lev<DSIZ) {
+			deriv[++lev].mesg = duple;
+			deriv[lev].type = SUFF;
+		}
+		duple[0] = '+';
+		duple[1] = *ep;
+		duple[2] = 0;
+	}
+	h = dict(bp, ep);
+	if(vflag==0 || h==0)
+		return h;
+	/*
+	 * when derivations are wanted, collect them
+	 * for printing
+	 */
+	j = lev;
+	prefcount = suffcount = 0;
+	do {
+		if(j<DSIZ && deriv[j].type) {
+			strncat(affix, deriv[j].mesg, sizeof(affix)-strlen(affix)-1);
+			if(deriv[j].type == SUFF)
+				suffcount++;
+			else if(deriv[j].type != NONE)
+				prefcount = deriv[j].type/PREF;
+		}
+	} while(--j > 0);
+	return h;
+}
+
+/*
+ * Decide whether a negative prefix fits a word.  bp points at the
+ * prefix and h holds the dictionary bits of the word without it.
+ * A prefix starting with u is right only for a word without IN.  Any
+ * other prefix needs IN, and its second letter must agree with the
+ * first letter of the stem: r before r, m before m or p, n otherwise.
+ * Returns nonzero when the prefix fits.
+ */
+int
+inun(char* bp, Bits h)
+{
+	int hasin = Set(h, IN) != 0;
+	int want;
+
+	if(*bp == 'u')
+		return !hasin;
+	/* *bp == 'i' */
+	if(!hasin)
+		return 0;
+	if(bp[2] == 'r')
+		want = 'r';
+	else if(bp[2] == 'm' || bp[2] == 'p')
+		want = 'm';
+	else
+		want = 'n';
+	return bp[1] == want;
+}
+
+/*
+ * Walk backwards through word[] from s: step over one vowel if s is
+ * on one, then over any non-vowels.  Returns the position of the
+ * vowel where the walk stopped, or a pointer below word if it ran off
+ * the front.
+ */
+char*
+skipv(char *s)
+{
+	if(s >= word && ISVOWEL(*s))
+		s--;
+	for(; s >= word; s--)
+		if(ISVOWEL(*s))
+			break;
+	return s;
+}
+
+/*
+ * crummy way to Britishise:
+ * pass the suffix text and the first pair of messages of every
+ * suffix rule through ztos, which turns each z into s.
+ */
+void
+ise(void)
+{
+	Suftab **tab, *p;
+
+	for(tab = suftab; tab < suftab+26; tab++)
+		for(p = *tab; p->suf != 0; p++) {
+			p->suf = ztos(p->suf);
+			p->d1 = ztos(p->d1);
+			p->a1 = ztos(p->a1);
+		}
+}
+
+/*
+ * Return as with every 'z' replaced by 's'.  A string without a 'z'
+ * is returned unchanged; otherwise the result is a fresh copy from
+ * strdup, which is never freed here.  If the copy cannot be
+ * allocated the program reports it and exits rather than write
+ * through a nil pointer.
+ */
+char*
+ztos(char *as)
+{
+	char *s, *ds;
+
+	for(s=as; *s; s++)
+		if(*s == 'z')
+			break;
+	if(*s == 0)
+		return as;
+
+	ds = strdup(as);
+	if(ds == 0) {
+		fprint(2, "spell: out of memory\n");
+		exits("mem");
+	}
+	for(s=ds; *s; s++)
+		if(*s == 'z')
+			*s = 's';
+	return ds;
+}
+/*
+ * Look up the candidate that runs from bp up to ep in the packed
+ * dictionary.  A candidate of one letter or less is reported as NOUN
+ * without a lookup.  The first two bytes, masked to 7 bits each,
+ * select the slice spacep[f]..spacep[f+1].  That slice is searched by
+ * bisection: each probe backs up to the start of an entry (a byte
+ * with 0x80 set) and compares from the third letter on.  Returns the
+ * encode[] value selected by the entry's affix code, or 0 when the
+ * word is not there.  With xflag every lookup is traced on file
+ * descriptor 2.
+ */
+Bits
+dict(char* bp, char* ep)
+{
+ char *cp, *cp1, *w, *wp, *we;
+ int n, f;
+
+ w = bp;
+ we = ep;
+ n = ep-bp;
+ if(n <= 1)
+ return NOUN;
+
+ f = w[0] & 0x7f;
+ f *= 128;
+ f += w[1] & 0x7f;
+ bp = spacep[f];
+ ep = spacep[f+1];
+
+loop:
+ if(bp >= ep) {
+ if(xflag)
+ fprint(2, "=%.*s\n", utfnlen(w, n), w);
+ return 0;
+ }
+ /*
+ * find the beginning of some word in the middle
+ */
+ cp = bp + (ep-bp)/2;
+
+ while(cp > bp && !(*cp & 0x80))
+ cp--;
+ while(cp > bp && (cp[-1] & 0x80))
+ cp--;
+
+ wp = w + 2; /* skip two letters */
+ cp1 = cp + 2; /* skip affix code */
+ for(;;) {
+ if(wp >= we) {
+ if(*cp1 & 0x80)
+ goto found;
+ else
+ f = 1; /* the entry is longer than the candidate */
+ break;
+ }
+ if(*cp1 & 0x80) {
+ f = -1; /* the entry ended first */
+ break;
+ }
+ f = *cp1++ - *wp++;
+ if(f != 0)
+ break;
+ }
+
+ if(f < 0) {
+ while(!(*cp1 & 0x80))
+ cp1++;
+ bp = cp1; /* continue in the part after this entry */
+ goto loop;
+ }
+ ep = cp; /* continue in the part before this entry */
+ goto loop;
+
+found:
+ f = ((cp[0] & 0x7) << 8) |
+ (cp[1] & 0xff);
+ if(xflag) {
+ fprint(2, "=%.*s ", utfnlen(w, n), w);
+ typeprint(encode[f]);
+ }
+ return encode[f];
+}
+
+/*
+ * Print the affix classes in h on file descriptor 2 as a
+ * comma-separated list of short names, followed by a newline.
+ * VERB and COMP are two-bit groups: the name printed depends on
+ * which of their bits are set.
+ */
+void
+typeprint(Bits h)
+{
+	pcomma("");	/* start a fresh list */
+	if(h & NOUN)
+		pcomma("n");
+	if(h & PROP_COLLECT)
+		pcomma("pc");
+	switch(h & VERB) {
+	case VERB:
+		pcomma("v");
+		break;
+	case V_IRREG:
+		pcomma("vi");
+		break;
+	case ED:
+		pcomma("ed");
+		break;
+	}
+	if(h & ADJ)
+		pcomma("a");
+	switch(h & COMP) {
+	case 0:
+		break;
+	case ACTOR:
+		pcomma("er");
+		break;
+	default:
+		pcomma("comp");
+		break;
+	}
+	if(h & DONT_TOUCH)
+		pcomma("d");
+	if(h & N_AFFIX)
+		pcomma("na");
+	if(h & ADV)
+		pcomma("adv");
+	if(h & ION)
+		pcomma("ion");
+	if(h & V_AFFIX)
+		pcomma("va");
+	if(h & MAN)
+		pcomma("man");
+	if(h & NOPREF)
+		pcomma("nopref");
+	if(h & MONO)
+		pcomma("ms");
+	if(h & IN)
+		pcomma("in");
+	if(h & _Y)
+		pcomma("y");
+	if(h & STOP)
+		pcomma("s");
+	fprint(2, "\n");
+}
+
+/*
+ * Print s on file descriptor 2 as the next item of a comma-separated
+ * list.  An empty s prints nothing and starts a new list, so the item
+ * after it is printed without a leading comma.
+ */
+void
+pcomma(char *s)
+{
+	static int started;
+
+	if(s[0] == '\0') {
+		started = 0;
+		return;
+	}
+	fprint(2, started? ",%s": "%s", s);
+	started = 1;
+}
+
+/*
+ * is the word one of the following
+ *	12th	teen
+ *	21st	end in 1
+ *	23rd	end in 3
+ *	77th	default
+ * called knowing word[0] is a digit
+ *
+ * Returns nonzero when the text after the digits is exactly the
+ * ending that the digits call for; an ending written in capitals is
+ * accepted too.  The tens digit is inspected only when the number
+ * has at least two digits, so nothing in front of word[] is read.
+ */
+int
+ordinal(void)
+{
+	char *cp = word;
+	char *ending;
+	static char sp[4];	/* sp[3] is never written and so stays 0 */
+
+	while(ISDIGIT(*cp))
+		cp++;
+	strncpy(sp,cp,3);
+	if(ISUPPER(cp[0]) && ISUPPER(cp[1])) {
+		sp[0] = Tolower(cp[0]);
+		sp[1] = Tolower(cp[1]);
+	}
+	if(cp-word >= 2 && cp[-2]=='1')
+		ending = "th";		/* 11th, 12th, 13th, ... */
+	else
+		switch(cp[-1]) {
+		case '1':
+			ending = "st";
+			break;
+		case '2':
+			ending = "nd";
+			break;
+		case '3':
+			ending = "rd";
+			break;
+		default:
+			ending = "th";
+			break;
+		}
+	return strncmp(sp, ending, 3) == 0;
+}
+
+/*
+ * read in the dictionary.
+ * format is
+ * {
+ *	short	nencode;
+ *	long	encode[nencode];
+ *	char	space[*];
+ * };
+ *
+ * the encodings are a table all different
+ * affixes.
+ * the dictionary proper has 2 bytes
+ * that demark and then the rest of the
+ * word. the 2 bytes have the following
+ *	0x80 0x00	flag
+ *	0x78 0x00	count of prefix bytes
+ *			common with prev word
+ *	0x07 0xff	affix code
+ *
+ * all ints are big endians in the file.
+ *
+ * The words are read to the top of space[] and expanded in place
+ * towards the bottom, filling in spacep[] on the way.  Any problem
+ * is reported on file descriptor 2 and ends the program.  Values
+ * taken from the file are checked before they are used as sizes: an
+ * encoding count larger than encode[], a failed or empty read of the
+ * word data, and a first entry that claims to share a prefix with a
+ * previous word are all treated as a bad dictionary.
+ */
+void
+readdict(char *file)
+{
+	char *s, *is, *lasts, *ls;
+	int c, i, sp, p;
+	int f;
+	long l;
+
+	lasts = 0;
+	f = open(file, 0);
+	if(f == -1) {
+		fprint(2, "cannot open %s\n", file);
+		exits("open");
+	}
+	if(read(f, space, 2) != 2)
+		goto bad;
+	nencode = ((space[0]&0xff)<<8) | (space[1]&0xff);
+	if(nencode > (int)(sizeof(encode)/sizeof(encode[0])))
+		goto bad;
+	if(read(f, space, 4*nencode) != 4*nencode)
+		goto bad;
+	s = space;
+	for(i=0; i<nencode; i++) {
+		l = (long)(s[0] & 0xff) << 24;
+		l |= (s[1] & 0xff) << 16;
+		l |= (s[2] & 0xff) << 8;
+		l |= s[3] & 0xff;
+		encode[i] = (Bits)l;
+		s += 4;
+	}
+	l = read(f, space, sizeof(space));
+	if(l <= 0)
+		goto bad;
+	if(l == sizeof(space))
+		goto noroom;
+	/* move the packed words to the top; they are expanded from the bottom */
+	is = space + (sizeof(space) - l);
+	memmove(is, space, l);
+
+	s = space;
+	c = *is++ & 0xff;
+	sp = -1;
+	i = 0;
+
+loop:
+	if(s > is)
+		goto noroom;
+	if(c < 0) {
+		close(f);
+		while(sp < 128*128)
+			spacep[++sp] = s;
+		*s = 0x80;		/* fence */
+		return;
+	}
+	p = (c>>3) & 0xf;
+	*s++ = c;
+	*s++ = *is++ & 0xff;
+	if(p <= 0)
+		i = (*is++ & 0xff)*128;
+	if(p <= 1) {
+		if(!(*is & 0x80))
+			i = i/128*128 + (*is++ & 0xff);
+		if(i <= sp) {
+			fprint(2, "the dict isnt sorted or \n");
+			fprint(2, "memmove didn't work\n");
+			goto bad;
+		}
+		while(sp < i)
+			spacep[++sp] = s-2;
+	}
+	ls = lasts;
+	lasts = s;
+	p -= 2;
+	if(p > 0 && ls == 0)	/* shared prefix but no previous word */
+		goto bad;
+	for(; p>0; p--)
+		*s++ = *ls++;
+	for(;;) {
+		if(is >= space+sizeof(space)) {
+			c = -1;
+			break;
+		}
+		c = *is++ & 0xff;
+		if(c & 0x80)
+			break;
+		*s++ = c;
+	}
+	*s = 0;
+	goto loop;
+
+bad:
+	fprint(2, "trouble reading %s\n", file);
+	exits("read");
+noroom:
+	fprint(2, "not enough space for dictionary\n");
+	exits("space");
+}