diff options
Diffstat (limited to 'src/cmd/spell')
-rw-r--r-- | src/cmd/spell/code.h | 28 | ||||
-rw-r--r-- | src/cmd/spell/mkfile | 12 | ||||
-rwxr-xr-x | src/cmd/spell/spell.rc | 21 | ||||
-rw-r--r-- | src/cmd/spell/sprog.c | 1381 |
4 files changed, 1442 insertions, 0 deletions
diff --git a/src/cmd/spell/code.h b/src/cmd/spell/code.h new file mode 100644 index 00000000..73fff2e2 --- /dev/null +++ b/src/cmd/spell/code.h @@ -0,0 +1,28 @@ + +/* + * affix codes: bit flags, or'ed together in the suffix and prefix tables of sprog.c + */ + +#define ED (1<<0) /* +ed, +ing */ +#define ADJ (1<<1) /* -t+ce (-nce), +ize, +al, +ness, -t+cy, +ity, +ly */ +#define NOUN (1<<2) /* +s (+es), +make, +hood, +ship, +less */ +#define PROP_COLLECT (1<<3) /* +'s, +an, +ship (for -manship), +less */ +#define ACTOR (1<<4) /* +er */ +#define EST (1<<5) /* +est */ +#define COMP (EST|ACTOR) /* +er, +est */ +#define DONT_TOUCH (1<<6) /* affixable class of +s, +'s, +est in sprog.c: only the top-level call asks for it */ +#define ION (1<<7) /* +ion, +or */ +#define N_AFFIX (1<<8) /* +ic, +ive, +ize, +like, +al, +ful, +ism, +ist, -t+cy, +c (maniac) */ +#define V_AFFIX (1<<9) /* +able, +ive, +ity (-bility), +ment */ +#define V_IRREG (1<<10) /* +ing, +es, +s */ +#define VERB (V_IRREG|ED) +#define MAN (1<<11) /* +man, +men, +women, +woman */ +#define ADV (1<<12) /* +hood, +ness */ +#define STOP (1<<14) /* stop list */ +#define NOPREF (1<<13) /* no prefix */ + +#define MONO (1<<15) /* double final consonant as in fib->fibbing */ +#define IN (1<<16) /* takes in-, im-, ir- rather than un- */ +#define _Y (1<<17) /* +y */ + +#define ALL (~(NOPREF|STOP|DONT_TOUCH|MONO|IN)) /* anything goes: every code except NOPREF, STOP, DONT_TOUCH, MONO, IN */ diff --git a/src/cmd/spell/mkfile b/src/cmd/spell/mkfile new file mode 100644 index 00000000..e9e600f9 --- /dev/null +++ b/src/cmd/spell/mkfile @@ -0,0 +1,12 @@ +PLAN9=../../.. 
+<$PLAN9/src/mkhdr + +TARG=sprog +OFILES=sprog.$O\ + +HFILES =\ + code.h\ + +SHORTLIB=bio 9 +<$PLAN9/src/mkone + diff --git a/src/cmd/spell/spell.rc b/src/cmd/spell/spell.rc new file mode 100755 index 00000000..073ab2df --- /dev/null +++ b/src/cmd/spell/spell.rc @@ -0,0 +1,21 @@ +#!/bin/rc + +spellflags=() +deroffargs=() +fflag='' +for(x){ + switch($x){ + case -[abcvx] + spellflags=($spellflags $x) + case -f + fflag=$x + case * + if(~ $fflag -f) { + spellflags=($spellflags -f $x) + fflag='' + } + if not deroffargs=($deroffargs $x) + } +} + +deroff -w $deroffargs | sort -u | aux/sprog $spellflags diff --git a/src/cmd/spell/sprog.c b/src/cmd/spell/sprog.c new file mode 100644 index 00000000..e63fbb87 --- /dev/null +++ b/src/cmd/spell/sprog.c @@ -0,0 +1,1381 @@ +#include <u.h> +#include <libc.h> +#include <bio.h> +#include <ctype.h> +#include "code.h" + +/* fig leaves for possibly signed char quantities */ +#define ISUPPER(c) isupper((c)&0xff) +#define ISLOWER(c) islower((c)&0xff) +#define ISALPHA(c) isalpha((c)&0xff) +#define ISDIGIT(c) isdigit((c)&0xff) +#define ISVOWEL(c) voweltab[(c)&0xff] +#define Tolower(c) (ISUPPER(c)? 
(c)-'A'+'a': (c)) +#define pair(a,b) (((a)<<8) | (b)) +#define DLEV 2 +#define DSIZ 40 + +typedef long Bits; +#define Set(h, f) ((long)(h) & (f)) + +Bits nop(char*, char*, char*, int, int); +Bits strip(char*, char*, char*, int, int); +Bits ize(char*, char*, char*, int, int); +Bits i_to_y(char*, char*, char*, int, int); +Bits ily(char*, char*, char*, int, int); +Bits subst(char*, char*, char*, int, int); +Bits CCe(char*, char*, char*, int, int); +Bits tion(char*, char*, char*, int, int); +Bits an(char*, char*, char*, int, int); +Bits s(char*, char*, char*, int, int); +Bits es(char*, char*, char*, int, int); +Bits bility(char*, char*, char*, int, int); +Bits y_to_e(char*, char*, char*, int, int); +Bits VCe(char*, char*, char*, int, int); + +Bits trypref(char*, char*, int, int); +Bits tryword(char*, char*, int, int); +Bits trysuff(char*, int, int); +Bits dict(char*, char*); +void typeprint(Bits); +void pcomma(char*); + +void ise(void); +int ordinal(void); +char* skipv(char*); +int inun(char*, Bits); +char* ztos(char*); +void readdict(char*); + +typedef struct Ptab Ptab; +struct Ptab +{ + char* s; + int flag; +}; + +typedef struct Suftab Suftab; +struct Suftab +{ + char *suf; + Bits (*p1)(char*, char*, char*, int, int); + int n1; + char *d1; + char *a1; + int flag; + int affixable; + Bits (*p2)(char*, char*, char*, int, int); + int n2; + char *d2; + char *a2; +}; + +Suftab staba[] = { + {"aibohp",subst,1,"-e+ia","",NOUN, NOUN}, + 0 +}; + +Suftab stabc[] = +{ + {"cai",strip,1,"","+c",N_AFFIX, ADJ|NOUN}, + {"citsi",strip,2,"","+ic",N_AFFIX, ADJ | N_AFFIX | NOUN}, + {"citi",ize,1,"-e+ic","",N_AFFIX, ADJ }, + {"cihparg",i_to_y,1,"-y+ic","",NOUN, ADJ|NOUN }, + {"cipocs",ize,1,"-e+ic","",NOUN, ADJ }, + {"cirtem",i_to_y,1,"-y+ic","",NOUN, ADJ }, + {"cigol",i_to_y,1,"-y+ic","",NOUN, ADJ }, + {"cimono",i_to_y,1,"-y+ic","",NOUN, ADJ }, + {"cibohp",subst,1,"-e+ic","",NOUN, ADJ }, + 0 +}; +Suftab stabd[] = +{ + {"de",strip,1,"","+d",ED,ADJ |COMP,i_to_y,2,"-y+ied","+ed"}, + 
{"dooh",ily,4,"-y+ihood","+hood",NOUN | ADV, NOUN}, + 0 +}; +Suftab stabe[] = +{ + /* + * V_affix for comment ->commence->commentment?? + */ + {"ecna",subst,1,"-t+ce","",ADJ,N_AFFIX|_Y|NOUN|VERB|ACTOR|V_AFFIX}, + {"ecne",subst,1,"-t+ce","",ADJ,N_AFFIX|_Y|NOUN|VERB|ACTOR|V_AFFIX}, + {"elbaif",i_to_y,4,"-y+iable","",V_IRREG,ADJ}, + {"elba",CCe,4,"-e+able","+able",V_AFFIX,ADJ}, + {"evi",subst,0,"-ion+ive","",N_AFFIX | V_AFFIX,NOUN | N_AFFIX| ADJ}, + {"ezi",CCe,3,"-e+ize","+ize",N_AFFIX|ADJ ,V_AFFIX | VERB |ION | COMP}, + {"ekil",strip,4,"","+like",N_AFFIX ,ADJ}, + 0 +}; +Suftab stabg[] = +{ + {"gniee",strip,3,"","+ing",V_IRREG ,ADJ|NOUN}, + {"gnikam",strip,6,"","+making",NOUN,NOUN}, + {"gnipeek",strip,7,"","+keeping",NOUN,NOUN}, + {"gni",CCe,3,"-e+ing","+ing",V_IRREG ,ADJ|ED|NOUN}, + 0 +}; +Suftab stabl[] = +{ + {"ladio",strip,2,"","+al",NOUN |ADJ,ADJ}, + {"laci",strip,2,"","+al",NOUN |ADJ,ADJ |NOUN|N_AFFIX}, + {"latnem",strip,2,"","+al",N_AFFIX,ADJ}, + {"lanoi",strip,2,"","+al",N_AFFIX,ADJ|NOUN}, + {"luf",ily,3,"-y+iful","+ful",N_AFFIX,ADJ | NOUN}, + 0 +}; +Suftab stabm[] = +{ + /* congregational + ism */ + {"msi",CCe,3,"-e+ism","ism",N_AFFIX|ADJ,NOUN}, + {"margo",subst,-1,"-ph+m","",NOUN,NOUN}, + 0 +}; +Suftab stabn[] = +{ + {"noitacifi",i_to_y,6,"-y+ication","",ION,NOUN | N_AFFIX}, + {"noitazi",ize,4,"-e+ation","",ION,NOUN| N_AFFIX}, + {"noit",tion,3,"-e+ion","+ion",ION,NOUN| N_AFFIX | V_AFFIX |VERB|ACTOR}, + {"naino",an,3,"","+ian",NOUN|PROP_COLLECT,NOUN| N_AFFIX}, + {"namow",strip,5,"","+woman",MAN,PROP_COLLECT|N_AFFIX}, + {"nam",strip,3,"","+man",MAN,PROP_COLLECT | N_AFFIX | VERB}, + {"na",an,1,"","+n",NOUN|PROP_COLLECT,NOUN | N_AFFIX}, + {"nemow",strip,5,"","+women",MAN,PROP_COLLECT}, + {"nem",strip,3,"","+man",MAN,PROP_COLLECT}, + {"nosrep",strip,6,"","+person",MAN,PROP_COLLECT}, + 0 +}; +Suftab stabp[] = +{ + {"pihs",strip,4,"","+ship",NOUN|PROP_COLLECT,NOUN| N_AFFIX}, + 0 +}; +Suftab stabr[] = +{ + {"rehparg",subst,1,"-y+er","",ACTOR,NOUN,strip,2,"","+er"}, 
+ {"reyhparg",nop,0,"","",0,NOUN}, + {"reyl",nop,0,"","",0,NOUN}, + {"rekam",strip,5,"","+maker",NOUN,NOUN}, + {"repeek",strip,6,"","+keeper",NOUN,NOUN}, + {"re",strip,1,"","+r",ACTOR,NOUN | N_AFFIX|VERB|ADJ, i_to_y,2,"-y+ier","+er"}, + {"rota",tion,2,"-e+or","",ION,NOUN| N_AFFIX|_Y}, + {"rotc",tion,2,"","+or",ION,NOUN| N_AFFIX}, + {"rotp",tion,2,"","+or",ION,NOUN| N_AFFIX}, + 0 +}; +Suftab stabs[] = +{ + {"ssen",ily,4,"-y+iness","+ness",ADJ|ADV,NOUN| N_AFFIX}, + {"ssel",ily,4,"-y+iless","+less",NOUN | PROP_COLLECT,ADJ }, + {"se",s,1,"","+s",NOUN | V_IRREG,DONT_TOUCH , es,2,"-y+ies","+es"}, + {"s'",s,2,"","+'s",PROP_COLLECT | NOUN,DONT_TOUCH }, + {"s",s,1,"","+s",NOUN | V_IRREG,DONT_TOUCH }, + 0 +}; +Suftab stabt[] = +{ + {"tnem",strip,4,"","+ment",V_AFFIX,NOUN | N_AFFIX | ADJ|VERB}, + {"tse",strip,2,"","+st",EST,DONT_TOUCH, i_to_y,3,"-y+iest","+est" }, + {"tsigol",i_to_y,2,"-y+ist","",N_AFFIX,NOUN | N_AFFIX}, + {"tsi",CCe,3,"-e+ist","+ist",N_AFFIX|ADJ,NOUN | N_AFFIX|COMP}, + 0 +}; +Suftab staby[] = +{ + {"ycna",subst,1,"-t+cy","",ADJ | N_AFFIX,NOUN | N_AFFIX}, + {"ycne",subst,1,"-t+cy","",ADJ | N_AFFIX,NOUN | N_AFFIX}, + {"ytilib",bility,5,"-le+ility","",ADJ | V_AFFIX,NOUN | N_AFFIX}, + {"ytisuo",nop,0,"","",NOUN}, + {"ytilb",nop,0,"","",0,NOUN}, + {"yti",CCe,3,"-e+ity","+ity",ADJ ,NOUN | N_AFFIX }, + {"ylb",y_to_e,1,"-e+y","",ADJ,ADV}, + {"ylc",nop,0,"","",0}, + {"ylelb",nop,0,"","",0}, + {"ylelp",nop,0,"","",0}, + {"yl",ily,2,"-y+ily","+ly",ADJ,ADV|COMP}, + {"yrtem",subst,0,"-er+ry","",NOUN,NOUN | N_AFFIX}, + {"y",CCe,1,"-e+y","+y",_Y,ADJ|COMP}, + 0 +}; +Suftab stabz[] = +{ + 0 +}; +Suftab* suftab[] = +{ + staba, + stabz, + stabc, + stabd, + stabe, + stabz, + stabg, + stabz, + stabz, + stabz, + stabz, + stabl, + stabm, + stabn, + stabz, + stabp, + stabz, + stabr, + stabs, + stabt, + stabz, + stabz, + stabz, + stabz, + staby, + stabz, +}; + +Ptab ptaba[] = +{ + "anti", 0, + "auto", 0, + 0 +}; +Ptab ptabb[] = +{ + "bio", 0, + 0 +}; +Ptab ptabc[] = +{ + "counter", 
0, + 0 +}; +Ptab ptabd[] = +{ + "dis", 0, + 0 +}; +Ptab ptabe[] = +{ + "electro", 0, + 0 +}; +Ptab ptabf[] = +{ + "femto", 0, + 0 +}; +Ptab ptabg[] = +{ + "geo", 0, + "giga", 0, + 0 +}; +Ptab ptabh[] = +{ + "hyper", 0, + 0 +}; +Ptab ptabi[] = +{ + "immuno", 0, + "im", IN, + "intra", 0, + "inter", 0, + "in", IN, + "ir", IN, + "iso", 0, + 0 +}; +Ptab ptabj[] = +{ + 0 +}; +Ptab ptabk[] = +{ + "kilo", 0, + 0 +}; +Ptab ptabl[] = +{ + 0 +}; +Ptab ptabm[] = +{ + "magneto", 0, + "mega", 0, + "meta", 0, + "micro", 0, + "mid", 0, + "milli", 0, + "mini", 0, + "mis", 0, + "mono", 0, + "multi", 0, + 0 +}; +Ptab ptabn[] = +{ + "nano", 0, + "neuro", 0, + "non", 0, + 0 +}; +Ptab ptabo[] = +{ + "out", 0, + "over", 0, + 0 +}; +Ptab ptabp[] = +{ + "para", 0, + "photo", 0, + "pico", 0, + "poly", 0, + "pre", 0, + "pseudo", 0, + "psycho", 0, + 0 +}; +Ptab ptabq[] = +{ + "quasi", 0, + 0 +}; +Ptab ptabr[] = +{ + "radio", 0, + "re", 0, + 0 +}; +Ptab ptabs[] = +{ + "semi", 0, + "stereo", 0, + "sub", 0, + "super", 0, + 0 +}; +Ptab ptabt[] = +{ + "tele", 0, + "tera", 0, + "thermo", 0, + 0 +}; +Ptab ptabu[] = +{ + "ultra", 0, + "under", 0, /*must precede un*/ + "un", IN, + 0 +}; +Ptab ptabv[] = +{ + 0 +}; +Ptab ptabw[] = +{ + 0 +}; +Ptab ptabx[] = +{ + 0 +}; +Ptab ptaby[] = +{ + 0 +}; +Ptab ptabz[] = +{ + 0 +}; + +Ptab* preftab[] = +{ + ptaba, + ptabb, + ptabc, + ptabd, + ptabe, + ptabf, + ptabg, + ptabh, + ptabi, + ptabj, + ptabk, + ptabl, + ptabm, + ptabn, + ptabo, + ptabp, + ptabq, + ptabr, + ptabs, + ptabt, + ptabu, + ptabv, + ptabw, + ptabx, + ptaby, + ptabz, +}; + +typedef struct { + char *mesg; + enum { NONE, SUFF, PREF} type; +} Deriv; + +int aflag; +int cflag; +int fflag; +int vflag; +int xflag; +int nflag; +char word[500]; +char* original; +Deriv emptyderiv; +Deriv deriv[DSIZ+3]; +char affix[DSIZ*10]; /* 10 is longest affix message */ +int prefcount; +int suffcount; +char* acmeid; +char space[300000]; /* must be as large as "words"+"space" in pcode run */ +Bits encode[2048]; /* must 
be as long as "codes" in pcode run */ +int nencode; +char voweltab[256]; +char* spacep[128*128+1]; /* pointer to words starting with 'xx' */ +Biobuf bin; +Biobuf bout; + +char* codefile = "#9/lib/amspell"; +char* brfile = "#9/lib/brspell"; +char* Usage = "usage"; + +void +main(int argc, char *argv[]) +{ + char *ep, *cp; + char *dp; + int j, i, c; + int low; + Bits h; + + Binit(&bin, 0, OREAD); + Binit(&bout, 1, OWRITE); + for(i=0; c = "aeiouyAEIOUY"[i]; i++) + voweltab[c] = 1; + while(argc > 1) { + if(argv[1][0] != '-') + break; + for(i=1; c = argv[1][i]; i++) + switch(c) { + default: + fprint(2, "usage: spell [-bcCvx] [-f file]\n"); + exits(Usage); + + case 'a': + aflag++; + continue; + + case 'b': + ise(); + if(!fflag) + codefile = brfile; + continue; + + case 'C': /* for "correct" */ + vflag++; + case 'c': /* for ocr */ + cflag++; + continue; + + case 'v': + vflag++; + continue; + + case 'x': + xflag++; + continue; + + case 'f': + if(argc <= 2) { + fprint(2, "spell: -f requires another argument\n"); + exits(Usage); + } + argv++; + argc--; + codefile = argv[1]; + fflag++; + goto brk; + } + brk: + argv++; + argc--; + } + readdict(codefile); + if(argc > 1) { + fprint(2, "usage: spell [-bcCvx] [-f file]\n"); + exits(Usage); + } + if(aflag) + cflag = vflag = 0; + + for(;;) { + affix[0] = 0; + original = Brdline(&bin, '\n'); + if(original == 0) + exits(0); + original[Blinelen(&bin)-1] = 0; + low = 0; + + if(aflag) { + acmeid = original; + while(*original != ':') + if(*original++ == 0) + exits(0); + while(*++original != ':') + if(*original == 0) + exits(0); + *original++ = 0; + } + for(ep=word,dp=original; j = *dp; ep++,dp++) { + if(ISLOWER(j)) + low++; + if(ep >= word+sizeof(word)-1) + break; + *ep = j; + } + *ep = 0; + + if(ISDIGIT(word[0]) && ordinal()) + continue; + + h = 0; + if(!low && !(h = trypref(ep,".",0,ALL|STOP|DONT_TOUCH))) + for(cp=original+1,dp=word+1; dp<ep; dp++,cp++) + *dp = Tolower(*cp); + if(!h) + for(;;) { /* at most twice */ + if(h = 
trypref(ep,".",0,ALL|STOP|DONT_TOUCH)) + break; + if(h = trysuff(ep,0,ALL|STOP|DONT_TOUCH)) + break; + if(!ISUPPER(word[0])) + break; + cp = original; + dp = word; + while(*dp = *cp++) { + if(!low) + *dp = Tolower(*dp); + dp++; + } + word[0] = Tolower(word[0]); + } + + if(cflag) { + if(!h || Set(h,STOP)) + print("-"); + else if(!vflag) + print("+"); + else + print("%c",'0' + (suffcount>0) + + (prefcount>4? 8: 2*prefcount)); + } else if(!h || Set(h,STOP)) { + if(aflag) + Bprint(&bout, "%s:%s\n", acmeid, original); + else + Bprint(&bout, "%s\n", original); + } else if(affix[0] != 0 && affix[0] != '.') + print("%s\t%s\n", affix, original); + } + exits(0); +} + +/* strip exactly one suffix and do + * indicated routine(s), which may recursively + * strip suffixes + */ +Bits +trysuff(char* ep, int lev, int flag) +{ + Suftab *t; + char *cp, *sp; + Bits h = 0; + int initchar = ep[-1]; + + flag &= ~MONO; + lev += DLEV; + if(lev < DSIZ) { + deriv[lev] = emptyderiv; + deriv[lev-1] = emptyderiv; + } + if(!ISLOWER(initchar)) + return h; + for(t=suftab[initchar-'a']; sp=t->suf; t++) { + cp = ep; + while(*sp) + if(*--cp != *sp++) + goto next; + for(sp=ep-t->n1; --sp >= word && !ISVOWEL(*sp);) + ; + if(sp < word) + continue; + if(!(t->affixable & flag)) + return 0; + h = (*t->p1)(ep-t->n1, t->d1, t->a1, lev+1, t->flag|STOP); + if(!h && t->p2!=0) { + if(lev < DSIZ) { + deriv[lev] = emptyderiv; + deriv[lev+1] = emptyderiv; + } + h = (*t->p2)(ep-t->n2, t->d2, t->a2, lev, t->flag|STOP); + } + break; + next:; + } + return h; +} + +Bits +nop(char* ep, char* d, char* a, int lev, int flag) +{ + USED(ep); + USED(d); + USED(a); + USED(lev); + USED(flag); + return 0; +} + +Bits +cstrip(char* ep, char* d, char* a, int lev, int flag) +{ + int temp = ep[0]; + + if(ISVOWEL(temp) && ISVOWEL(ep[-1])) { + switch(pair(ep[-1],ep[0])) { + case pair('a', 'a'): + case pair('a', 'e'): + case pair('a', 'i'): + case pair('e', 'a'): + case pair('e', 'e'): + case pair('e', 'i'): + case pair('i', 'i'): + case 
pair('o', 'a'): + return 0; + } + } else + if(temp==ep[-1]&&temp==ep[-2]) + return 0; + return strip(ep,d,a,lev,flag); +} + +Bits +strip(char* ep, char* d, char* a, int lev, int flag) +{ + Bits h = trypref(ep, a, lev, flag); + + USED(d); + if(Set(h,MONO) && ISVOWEL(*ep) && ISVOWEL(ep[-2])) + h = 0; + if(h) + return h; + if(ISVOWEL(*ep) && !ISVOWEL(ep[-1]) && ep[-1]==ep[-2]) { + h = trypref(ep-1,a,lev,flag|MONO); + if(h) + return h; + } + return trysuff(ep,lev,flag); +} + +Bits +s(char* ep, char* d, char* a, int lev, int flag) +{ + if(lev > DLEV+1) + return 0; + if(*ep=='s') { + switch(ep[-1]) { + case 'y': + if(ISVOWEL(ep[-2])||ISUPPER(*word)) + break; /*says Kennedys*/ + case 'x': + case 'z': + case 's': + return 0; + case 'h': + switch(ep[-2]) { + case 'c': + case 's': + return 0; + } + } + } + return strip(ep,d,a,lev,flag); +} + +Bits +an(char* ep, char* d, char* a, int lev, int flag) +{ + USED(d); + if(!ISUPPER(*word)) /*must be proper name*/ + return 0; + return trypref(ep,a,lev,flag); +} + +Bits +ize(char* ep, char* d, char* a, int lev, int flag) +{ + int temp = ep[-1]; + Bits h; + + USED(a); + ep[-1] = 'e'; + h = strip(ep,"",d,lev,flag); + ep[-1] = temp; + return h; +} + +Bits +y_to_e(char* ep, char* d, char* a, int lev, int flag) +{ + Bits h; + int temp; + + USED(a); + switch(ep[-1]) { + case 'a': + case 'e': + case 'i': + return 0; + } + temp = *ep; + *ep++ = 'e'; + h = strip(ep,"",d,lev,flag); + ep[-1] = temp; + return h; +} + +Bits +ily(char* ep, char* d, char* a, int lev, int flag) +{ + int temp = ep[0]; + char *cp = ep; + + if(temp==ep[-1]&&temp==ep[-2]) /* sillly */ + return 0; + if(*--cp=='y' && !ISVOWEL(*--cp)) /* happyly */ + while(cp>word) + if(ISVOWEL(*--cp)) /* shyness */ + return 0; + if(ep[-1]=='i') + return i_to_y(ep,d,a,lev,flag); + return cstrip(ep,d,a,lev,flag); +} + +Bits +bility(char* ep, char* d, char* a, int lev, int flag) +{ + *ep++ = 'l'; + return y_to_e(ep,d,a,lev,flag); +} + +Bits +i_to_y(char* ep, char* d, char* a, int lev, int 
flag) +{ + Bits h; + int temp; + + if(ISUPPER(*word)) + return 0; + if((temp=ep[-1])=='i' && !ISVOWEL(ep[-2])) { + ep[-1] = 'y'; + a = d; + } + h = cstrip(ep,"",a,lev,flag); + ep[-1] = temp; + return h; +} + +Bits +es(char* ep, char* d, char* a, int lev, int flag) +{ + if(lev>DLEV) + return 0; + switch(ep[-1]) { + default: + return 0; + case 'i': + return i_to_y(ep,d,a,lev,flag); + case 'h': + switch(ep[-2]) { + default: + return 0; + case 'c': + case 's': + break; + } + case 's': + case 'z': + case 'x': + return strip(ep,d,a,lev,flag); + } +} + +Bits +subst(char* ep, char* d, char* a, int lev, int flag) +{ + char *u,*t; + Bits h; + + USED(a); + if(skipv(skipv(ep-1)) < word) + return 0; + for(t=d; *t!='+'; t++) + continue; + for(u=ep; *--t!='-';) + *--u = *t; + h = strip(ep,"",d,lev,flag); + while(*++t != '+') + continue; + while(*++t) + *u++ = *t; + return h; +} + +Bits +tion(char* ep, char* d, char* a, int lev, int flag) +{ + switch(ep[-2]) { + default: + return trypref(ep,a,lev,flag); + case 'a': + case 'e': + case 'i': + case 'o': + case 'u': + return y_to_e(ep,d,a,lev,flag); + } +} + +/* + * possible consonant-consonant-e ending + */ +Bits +CCe(char* ep, char* d, char* a, int lev, int flag) +{ + Bits h; + + switch(ep[-1]) { + case 'l': + if(ISVOWEL(ep[-2])) + break; + switch(ep[-2]) { + case 'l': + case 'r': + case 'w': + break; + default: + return y_to_e(ep,d,a,lev,flag); + } + break; + case 'c': + case 'g': + if(*ep == 'a') /* prevent -able for -eable */ + return 0; + case 's': + case 'v': + case 'z': + if(ep[-2]==ep[-1]) + break; + if(ISVOWEL(ep[-2])) + break; + case 'u': + if(h = y_to_e(ep,d,a,lev,flag)) + return h; + if(!(ep[-2]=='n' && ep[-1]=='g')) + return 0; + } + return VCe(ep,d,a,lev,flag); +} + +/* + * possible consonant-vowel-consonant-e ending + */ +Bits +VCe(char* ep, char* d, char* a, int lev, int flag) +{ + int c; + Bits h; + + c = ep[-1]; + if(c=='e') + return 0; + if(!ISVOWEL(c) && ISVOWEL(ep[-2])) { + c = *ep; + *ep++ = 'e'; + h = 
trypref(ep,d,lev,flag);
+		if(!h)
+			h = trysuff(ep,lev,flag);
+		if(h)
+			return h;
+		ep--;
+		*ep = c;
+	}
+	return cstrip(ep,d,a,lev,flag);
+}
+
+/*
+ * find a prefix of the text at *wp in preftab. the table is
+ * selected by the case-folded first letter; the prefix itself
+ * is compared byte for byte. a prefix only counts if a vowel
+ * remains between its end and ep. on success *wp is advanced
+ * past the prefix and the table entry is returned; otherwise 0.
+ */
+Ptab*
+lookuppref(uchar** wp, char* ep)
+{
+	Ptab *sp;
+	uchar *bp,*cp;
+	unsigned int initchar = Tolower(**wp);
+
+	if(!ISALPHA(initchar))
+		return 0;
+	for(sp=preftab[initchar-'a'];sp->s;sp++) {
+		bp = *wp;
+		for(cp= (uchar*)sp->s;*cp; )
+			if(*bp++!=*cp++)
+				goto next;
+		for(cp=bp;cp<(uchar*)ep;cp++)
+			if(ISVOWEL(*cp)) {
+				*wp = bp;
+				return sp;
+			}
+	next:;
+	}
+	return 0;
+}
+
+/* while word is not in dictionary try stripping
+ * prefixes. Fail if no more prefixes.
+ *
+ * ep is the end of the candidate stem in word, a is the
+ * derivation message recorded in deriv[lev], and flag is the
+ * set of affix codes the dictionary entry must intersect
+ * (if MONO is set in flag the entry must have MONO too).
+ * returns the entry's codes, or 0 on failure.
+ *
+ * the "+prefix" message is built in pbuf, which is kept
+ * NUL-terminated at all times; text that no longer fits is
+ * dropped from the message instead of being written past
+ * the end of the buffer.
+ */
+Bits
+trypref(char* ep, char* a, int lev, int flag)
+{
+	Ptab *tp;
+	char *bp, *cp;
+	char *pp, *pend;
+	Bits h;
+	char pbuf[20];
+
+	if(lev<DSIZ) {
+		deriv[lev].mesg = a;
+		deriv[lev].type = *a=='.'? NONE: SUFF;
+	}
+	if(h = tryword(word,ep,lev,flag)) {
+		if(Set(h, flag&~MONO) && (flag&MONO) <= Set(h, MONO))
+			return h;
+		h = 0;
+	}
+	bp = word;
+	pp = pbuf;
+	pend = pbuf+sizeof(pbuf)-1;	/* last slot is reserved for the NUL */
+	*pp = 0;
+	if(lev<DSIZ) {
+		deriv[lev+1].mesg = pp;
+		deriv[lev+1].type = 0;
+	}
+	while(tp=lookuppref((uchar**)&bp,ep)) {
+		if(pp < pend)
+			*pp++ = '+';
+		for(cp=tp->s; pp<pend && *cp; )
+			*pp++ = *cp++;
+		*pp = 0;
+		/* same bound as every other deriv[] access */
+		if(lev<DSIZ)
+			deriv[lev+1].type += PREF;
+		h = tryword(bp,ep,lev+1,flag);
+		if(Set(h,NOPREF) ||
+		   ((tp->flag&IN) && inun(bp-2,h)==0)) {
+			h = 0;
+			break;
+		}
+		if(Set(h,flag&~MONO) && (flag&MONO) <= Set(h, MONO))
+			break;
+		h = 0;
+	}
+	/* pbuf is about to go out of scope: drop the references to it */
+	if(lev < DSIZ) {
+		deriv[lev+1] = emptyderiv;
+		deriv[lev+2] = emptyderiv;
+	}
+	return h;
+}
+
+/*
+ * look up the text from bp up to ep in the dictionary.
+ * text of one letter or less is never looked up (returns 0).
+ * with MONO set in flag, a "+x" message for the doubled
+ * consonant *ep is recorded first. when vflag is set and the
+ * word is found, the recorded derivation messages are appended
+ * to affix and prefcount/suffcount are recomputed.
+ */
+Bits
+tryword(char* bp, char* ep, int lev, int flag)
+{
+	int j;
+	Bits h = 0;
+	char duple[3];
+
+	if(ep-bp <= 1)
+		return h;
+	if(flag&MONO) {
+		if(lev<DSIZ) {
+			deriv[++lev].mesg = duple;
+			deriv[lev].type = SUFF;
+		}
+		duple[0] = '+';
+		duple[1] = *ep;
+		duple[2] = 0;
+	}
+	h = dict(bp, ep);
+	if(vflag==0 || h==0)
+		return h;
+	/*
+	 * when derivations are wanted, collect them
+	 * for printing
+	 */
+	j = lev;
+	prefcount = suffcount = 0;
+	do {
+		if(j<DSIZ && deriv[j].type) {
+			
strcat(affix, deriv[j].mesg); + if(deriv[j].type == SUFF) + suffcount++; + else if(deriv[j].type != NONE) + prefcount = deriv[j].type/PREF; + } + } while(--j > 0); + return h; +} + +int +inun(char* bp, Bits h) +{ + if(*bp == 'u') + return Set(h, IN) == 0; + /* *bp == 'i' */ + if(Set(h, IN) == 0) + return 0; + switch(bp[2]) { + case 'r': + return bp[1] == 'r'; + case 'm': + case 'p': + return bp[1] == 'm'; + } + return bp[1] == 'n'; +} + +char* +skipv(char *s) +{ + if(s >= word && ISVOWEL(*s)) + s--; + while(s >= word && !ISVOWEL(*s)) + s--; + return s; +} + +/* + * crummy way to Britishise + */ +void +ise(void) +{ + Suftab *p; + int i; + + for(i=0; i<26; i++) + for(p = suftab[i]; p->suf; p++) { + p->suf = ztos(p->suf); + p->d1 = ztos(p->d1); + p->a1 = ztos(p->a1); + } +} + +char* +ztos(char *as) +{ + char *s, *ds; + + for(s=as; *s; s++) + if(*s == 'z') + goto copy; + return as; + +copy: + ds = strdup(as); + for(s=ds; *s; s++) + if(*s == 'z') + *s = 's'; + return ds; +} + +Bits +dict(char* bp, char* ep) +{ + char *cp, *cp1, *w, *wp, *we; + int n, f; + + w = bp; + we = ep; + n = ep-bp; + if(n <= 1) + return NOUN; + + f = w[0] & 0x7f; + f *= 128; + f += w[1] & 0x7f; + bp = spacep[f]; + ep = spacep[f+1]; + +loop: + if(bp >= ep) { + if(xflag) + fprint(2, "=%.*s\n", utfnlen(w, n), w); + return 0; + } + /* + * find the beginning of some word in the middle + */ + cp = bp + (ep-bp)/2; + + while(cp > bp && !(*cp & 0x80)) + cp--; + while(cp > bp && (cp[-1] & 0x80)) + cp--; + + wp = w + 2; /* skip two letters */ + cp1 = cp + 2; /* skip affix code */ + for(;;) { + if(wp >= we) { + if(*cp1 & 0x80) + goto found; + else + f = 1; + break; + } + if(*cp1 & 0x80) { + f = -1; + break; + } + f = *cp1++ - *wp++; + if(f != 0) + break; + } + + if(f < 0) { + while(!(*cp1 & 0x80)) + cp1++; + bp = cp1; + goto loop; + } + ep = cp; + goto loop; + +found: + f = ((cp[0] & 0x7) << 8) | + (cp[1] & 0xff); + if(xflag) { + fprint(2, "=%.*s ", utfnlen(w, n), w); + typeprint(encode[f]); + } + return 
encode[f]; +} + +void +typeprint(Bits h) +{ + + pcomma(""); + if(h & NOUN) + pcomma("n"); + if(h & PROP_COLLECT) + pcomma("pc"); + if(h & VERB) { + if((h & VERB) == VERB) + pcomma("v"); + else + if((h & VERB) == V_IRREG) + pcomma("vi"); + else + if(h & ED) + pcomma("ed"); + } + if(h & ADJ) + pcomma("a"); + if(h & COMP) { + if((h & COMP) == ACTOR) + pcomma("er"); + else + pcomma("comp"); + } + if(h & DONT_TOUCH) + pcomma("d"); + if(h & N_AFFIX) + pcomma("na"); + if(h & ADV) + pcomma("adv"); + if(h & ION) + pcomma("ion"); + if(h & V_AFFIX) + pcomma("va"); + if(h & MAN) + pcomma("man"); + if(h & NOPREF) + pcomma("nopref"); + if(h & MONO) + pcomma("ms"); + if(h & IN) + pcomma("in"); + if(h & _Y) + pcomma("y"); + if(h & STOP) + pcomma("s"); + fprint(2, "\n"); +} + +void +pcomma(char *s) +{ + static int flag; + + if(*s == 0) { + flag = 0; + return; + } + if(!flag) { + fprint(2, "%s", s); + flag = 1; + } else + fprint(2, ",%s", s); +} + +/* + * is the word on of the following + * 12th teen + * 21st end in 1 + * 23rd end in 3 + * 77th default + * called knowing word[0] is a digit + */ +int +ordinal(void) +{ + char *cp = word; + static char sp[4]; + + while(ISDIGIT(*cp)) + cp++; + strncpy(sp,cp,3); + if(ISUPPER(cp[0]) && ISUPPER(cp[1])) { + sp[0] = Tolower(cp[0]); + sp[1] = Tolower(cp[1]); + } + return 0 == strncmp(sp, + cp[-2]=='1'? "th": /* out of bounds if 1 digit */ + *--cp=='1'? "st": /* harmless */ + *cp=='2'? "nd": + *cp=='3'? "rd": + "th", 3); +} + +/* + * read in the dictionary. + * format is + * { + * short nencode; + * long encode[nencode]; + * char space[*]; + * }; + * + * the encodings are a table all different + * affixes. + * the dictionary proper has 2 bytes + * that demark and then the rest of the + * word. the 2 bytes have the following + * 0x80 0x00 flag + * 0x78 0x00 count of prefix bytes + * common with prev word + * 0x07 0xff affix code + * + * all ints are big endians in the file. 
+ */ +void +readdict(char *file) +{ + char *s, *is, *lasts, *ls; + int c, i, sp, p; + int f; + long l; + + lasts = 0; + f = open(file, 0); + if(f == -1) { + fprint(2, "cannot open %s\n", file); + exits("open"); + } + if(read(f, space, 2) != 2) + goto bad; + nencode = ((space[0]&0xff)<<8) | (space[1]&0xff); + if(read(f, space, 4*nencode) != 4*nencode) + goto bad; + s = space; + for(i=0; i<nencode; i++) { + l = (long)(s[0] & 0xff) << 24; + l |= (s[1] & 0xff) << 16; + l |= (s[2] & 0xff) << 8; + l |= s[3] & 0xff; + encode[i] = (Bits)l; + s += 4; + } + l = read(f, space, sizeof(space)); + if(l == sizeof(space)) + goto noroom; + is = space + (sizeof(space) - l); + memmove(is, space, l); + + s = space; + c = *is++ & 0xff; + sp = -1; + i = 0; + +loop: + if(s > is) + goto noroom; + if(c < 0) { + close(f); + while(sp < 128*128) + spacep[++sp] = s; + *s = 0x80; /* fence */ + return; + } + p = (c>>3) & 0xf; + *s++ = c; + *s++ = *is++ & 0xff; + if(p <= 0) + i = (*is++ & 0xff)*128; + if(p <= 1) { + if(!(*is & 0x80)) + i = i/128*128 + (*is++ & 0xff); + if(i <= sp) { + fprint(2, "the dict isnt sorted or \n"); + fprint(2, "memmove didn't work\n"); + goto bad; + } + while(sp < i) + spacep[++sp] = s-2; + } + ls = lasts; + lasts = s; + for(p-=2; p>0; p--) + *s++ = *ls++; + for(;;) { + if(is >= space+sizeof(space)) { + c = -1; + break; + } + c = *is++ & 0xff; + if(c & 0x80) + break; + *s++ = c; + } + *s = 0; + goto loop; + +bad: + fprint(2, "trouble reading %s\n", file); + exits("read"); +noroom: + fprint(2, "not enough space for dictionary\n"); + exits("space"); +} |