Various fixes, add spell.

author: rsc <devnull@localhost> 2004-03-09 12:45:12 +0000
committer: rsc <devnull@localhost> 2004-03-09 12:45:12 +0000
commit: d49a2e4801752c8a1211c7fac8cc08055a6b6fa5 (patch)
tree: 509d01c729341305f12abc34521276a4ccdb27ce /src/cmd/spell/sprog.c
parent: fb7cc74a929c86ef1c84a971d7a05deaf4d30c85 (diff)
download: plan9port-d49a2e4801752c8a1211c7fac8cc08055a6b6fa5.tar.gz
plan9port-d49a2e4801752c8a1211c7fac8cc08055a6b6fa5.tar.bz2
plan9port-d49a2e4801752c8a1211c7fac8cc08055a6b6fa5.zip
1 files changed, 1381 insertions, 0 deletions
diff --git a/src/cmd/spell/sprog.c b/src/cmd/spell/sprog.c
new file mode 100644
index 00000000..e63fbb87
--- /dev/null
+++ b/src/cmd/spell/sprog.c
@@ -0,0 +1,1381 @@
+#include <u.h>
+#include <libc.h>
+#include <bio.h>
+#include <ctype.h>
+#include "code.h"
+
+/* fig leaves for possibly signed char quantities */
+/* each ctype wrapper masks its argument to 0..255 before the call */
+#define ISUPPER(c)	isupper((c)&0xff)
+#define ISLOWER(c)	islower((c)&0xff)
+#define	ISALPHA(c)	isalpha((c)&0xff)
+#define	ISDIGIT(c)	isdigit((c)&0xff)
+/* table lookup; main() marks aeiouy and AEIOUY in voweltab */
+#define ISVOWEL(c)	voweltab[(c)&0xff]
+/* maps only characters for which ISUPPER is true; evaluates c more than once */
+#define Tolower(c)	(ISUPPER(c)? (c)-'A'+'a': (c))
+/* packs two characters into one integer, usable as a case label */
+#define pair(a,b)	(((a)<<8) | (b))
+/* DLEV is added to the level on each trysuff call; DSIZ bounds indices into deriv[] */
+#define DLEV		2
+#define DSIZ		40
+
+/* bit set of word-class/affix flags (flag names come from code.h) */
+typedef	long	Bits;
+/* non-zero when any bit of f is present in h */
+#define	Set(h, f)	((long)(h) & (f))
+
+/*
+ * suffix handlers stored in Suftab.p1/p2; trysuff calls them as
+ * (end of stem, d message, a message, level, flags)
+ */
+Bits 	nop(char*, char*, char*, int, int);
+Bits 	strip(char*, char*, char*, int, int);
+Bits 	ize(char*, char*, char*, int, int);
+Bits 	i_to_y(char*, char*, char*, int, int);
+Bits 	ily(char*, char*, char*, int, int);
+Bits 	subst(char*, char*, char*, int, int);
+Bits 	CCe(char*, char*, char*, int, int);
+Bits 	tion(char*, char*, char*, int, int);
+Bits 	an(char*, char*, char*, int, int);
+Bits 	s(char*, char*, char*, int, int);
+Bits 	es(char*, char*, char*, int, int);
+Bits 	bility(char*, char*, char*, int, int);
+Bits 	y_to_e(char*, char*, char*, int, int);
+Bits 	VCe(char*, char*, char*, int, int);
+
+/* lookup drivers and derivation printing */
+Bits 	trypref(char*, char*, int, int);
+Bits	tryword(char*, char*, int, int);
+Bits 	trysuff(char*, int, int);
+Bits	dict(char*, char*);
+void	typeprint(Bits);
+void	pcomma(char*);
+
+/* helpers and dictionary loading */
+void	ise(void);
+int	ordinal(void);
+char*	skipv(char*);
+int	inun(char*, Bits);
+char*	ztos(char*);
+void	readdict(char*);
+
+/* one prefix-table entry; each table ends with an entry whose s is 0 */
+typedef	struct	Ptab	Ptab;
+struct	Ptab
+{
+	char*	s;	/* prefix text, compared at the start of the word by lookuppref */
+	int	flag;	/* 0 or IN; trypref calls inun() for entries with IN set */
+};
+
+/*
+ * one suffix rule, consumed by trysuff; each table ends with an
+ * entry whose suf is 0
+ */
+typedef	struct	Suftab	Suftab;
+struct	Suftab
+{
+	char	*suf;		/* suffix spelled backwards; matched from the end of the word */
+	Bits	(*p1)(char*, char*, char*, int, int);	/* first handler tried */
+	int	n1;		/* p1 is called with the end pointer moved back n1 bytes */
+	char	*d1;		/* passed to p1 as d */
+	char	*a1;		/* passed to p1 as a */
+	int	flag;		/* passed to the handlers, or'ed with STOP */
+	int	affixable;	/* rule is abandoned unless this shares a bit with the caller's flag */
+	Bits	(*p2)(char*, char*, char*, int, int);	/* optional fallback, tried when p1 returns 0 */
+	int	n2;		/* as n1, for p2 */
+	char	*d2;		/* as d1, for p2 */
+	char	*a2;		/* as a1, for p2 */
+};
+
+/*
+ * Suffix rules, one table per final letter of the word (trysuff picks
+ * the table from the last character).  Suffix strings are spelled
+ * backwards.  trysuff stops at the first entry whose suffix matches, so
+ * the order of entries within a table matters.  Letters with no rules
+ * share the empty table stabz.
+ * NOTE(review): the "msi" entry's a1 is "ism" while every other entry
+ * writes a leading '+'; and "ytisuo" supplies NOUN in the flag slot with
+ * no affixable value, unlike the neighbouring nop entries -- confirm
+ * both are intended.
+ */
+Suftab	staba[] = {
+	{"aibohp",subst,1,"-e+ia","",NOUN, NOUN},
+	0
+};
+
+Suftab	stabc[] =
+{
+	{"cai",strip,1,"","+c",N_AFFIX, ADJ|NOUN},
+	{"citsi",strip,2,"","+ic",N_AFFIX, ADJ | N_AFFIX | NOUN},
+	{"citi",ize,1,"-e+ic","",N_AFFIX, ADJ },
+	{"cihparg",i_to_y,1,"-y+ic","",NOUN, ADJ|NOUN },
+	{"cipocs",ize,1,"-e+ic","",NOUN, ADJ },
+	{"cirtem",i_to_y,1,"-y+ic","",NOUN, ADJ },
+	{"cigol",i_to_y,1,"-y+ic","",NOUN, ADJ },
+	{"cimono",i_to_y,1,"-y+ic","",NOUN, ADJ },
+	{"cibohp",subst,1,"-e+ic","",NOUN, ADJ },
+	0
+};
+Suftab	stabd[] =
+{
+	{"de",strip,1,"","+d",ED,ADJ |COMP,i_to_y,2,"-y+ied","+ed"},
+	{"dooh",ily,4,"-y+ihood","+hood",NOUN | ADV, NOUN},
+	0
+};
+Suftab	stabe[] =
+{
+	/*
+	 * V_affix for comment ->commence->commentment??
+	 */
+	{"ecna",subst,1,"-t+ce","",ADJ,N_AFFIX|_Y|NOUN|VERB|ACTOR|V_AFFIX},
+	{"ecne",subst,1,"-t+ce","",ADJ,N_AFFIX|_Y|NOUN|VERB|ACTOR|V_AFFIX},
+	{"elbaif",i_to_y,4,"-y+iable","",V_IRREG,ADJ},
+	{"elba",CCe,4,"-e+able","+able",V_AFFIX,ADJ},
+	{"evi",subst,0,"-ion+ive","",N_AFFIX | V_AFFIX,NOUN | N_AFFIX| ADJ},
+	{"ezi",CCe,3,"-e+ize","+ize",N_AFFIX|ADJ ,V_AFFIX | VERB |ION | COMP},
+	{"ekil",strip,4,"","+like",N_AFFIX ,ADJ},
+	0
+};
+Suftab	stabg[] =
+{
+	{"gniee",strip,3,"","+ing",V_IRREG ,ADJ|NOUN},
+	{"gnikam",strip,6,"","+making",NOUN,NOUN},
+	{"gnipeek",strip,7,"","+keeping",NOUN,NOUN},
+	{"gni",CCe,3,"-e+ing","+ing",V_IRREG ,ADJ|ED|NOUN},
+	0
+};
+Suftab	stabl[] =
+{
+	{"ladio",strip,2,"","+al",NOUN |ADJ,ADJ},
+	{"laci",strip,2,"","+al",NOUN |ADJ,ADJ |NOUN|N_AFFIX},
+	{"latnem",strip,2,"","+al",N_AFFIX,ADJ},
+	{"lanoi",strip,2,"","+al",N_AFFIX,ADJ|NOUN},
+	{"luf",ily,3,"-y+iful","+ful",N_AFFIX,ADJ | NOUN},
+	0
+};
+Suftab	stabm[] =
+{
+		/* congregational + ism */
+	{"msi",CCe,3,"-e+ism","ism",N_AFFIX|ADJ,NOUN},
+	{"margo",subst,-1,"-ph+m","",NOUN,NOUN},
+	0
+};
+Suftab	stabn[] =
+{
+	{"noitacifi",i_to_y,6,"-y+ication","",ION,NOUN | N_AFFIX},
+	{"noitazi",ize,4,"-e+ation","",ION,NOUN| N_AFFIX},
+	{"noit",tion,3,"-e+ion","+ion",ION,NOUN| N_AFFIX | V_AFFIX |VERB|ACTOR},
+	{"naino",an,3,"","+ian",NOUN|PROP_COLLECT,NOUN| N_AFFIX},
+	{"namow",strip,5,"","+woman",MAN,PROP_COLLECT|N_AFFIX},
+	{"nam",strip,3,"","+man",MAN,PROP_COLLECT | N_AFFIX | VERB},
+	{"na",an,1,"","+n",NOUN|PROP_COLLECT,NOUN | N_AFFIX},
+	{"nemow",strip,5,"","+women",MAN,PROP_COLLECT},
+	{"nem",strip,3,"","+man",MAN,PROP_COLLECT},
+	{"nosrep",strip,6,"","+person",MAN,PROP_COLLECT},
+	0
+};
+Suftab	stabp[] =
+{
+	{"pihs",strip,4,"","+ship",NOUN|PROP_COLLECT,NOUN| N_AFFIX},
+	0
+};
+Suftab	stabr[] =
+{
+	{"rehparg",subst,1,"-y+er","",ACTOR,NOUN,strip,2,"","+er"},
+	{"reyhparg",nop,0,"","",0,NOUN},
+	{"reyl",nop,0,"","",0,NOUN},
+	{"rekam",strip,5,"","+maker",NOUN,NOUN},
+	{"repeek",strip,6,"","+keeper",NOUN,NOUN},
+	{"re",strip,1,"","+r",ACTOR,NOUN | N_AFFIX|VERB|ADJ,	i_to_y,2,"-y+ier","+er"},
+	{"rota",tion,2,"-e+or","",ION,NOUN| N_AFFIX|_Y},
+	{"rotc",tion,2,"","+or",ION,NOUN| N_AFFIX},
+	{"rotp",tion,2,"","+or",ION,NOUN| N_AFFIX},
+	0
+};
+Suftab	stabs[] =
+{
+	{"ssen",ily,4,"-y+iness","+ness",ADJ|ADV,NOUN| N_AFFIX},
+	{"ssel",ily,4,"-y+iless","+less",NOUN | PROP_COLLECT,ADJ },
+	{"se",s,1,"","+s",NOUN | V_IRREG,DONT_TOUCH ,	es,2,"-y+ies","+es"},
+	{"s'",s,2,"","+'s",PROP_COLLECT | NOUN,DONT_TOUCH },
+	{"s",s,1,"","+s",NOUN | V_IRREG,DONT_TOUCH  },
+	0
+};
+Suftab	stabt[] =
+{
+	{"tnem",strip,4,"","+ment",V_AFFIX,NOUN | N_AFFIX | ADJ|VERB},
+	{"tse",strip,2,"","+st",EST,DONT_TOUCH,	i_to_y,3,"-y+iest","+est" },
+	{"tsigol",i_to_y,2,"-y+ist","",N_AFFIX,NOUN | N_AFFIX},
+	{"tsi",CCe,3,"-e+ist","+ist",N_AFFIX|ADJ,NOUN | N_AFFIX|COMP},
+	0
+};
+Suftab	staby[] =
+{
+	{"ycna",subst,1,"-t+cy","",ADJ | N_AFFIX,NOUN | N_AFFIX},
+	{"ycne",subst,1,"-t+cy","",ADJ | N_AFFIX,NOUN | N_AFFIX},
+	{"ytilib",bility,5,"-le+ility","",ADJ | V_AFFIX,NOUN | N_AFFIX},
+	{"ytisuo",nop,0,"","",NOUN},
+	{"ytilb",nop,0,"","",0,NOUN},
+	{"yti",CCe,3,"-e+ity","+ity",ADJ ,NOUN | N_AFFIX },
+	{"ylb",y_to_e,1,"-e+y","",ADJ,ADV},
+	{"ylc",nop,0,"","",0},
+	{"ylelb",nop,0,"","",0},
+	{"ylelp",nop,0,"","",0},
+	{"yl",ily,2,"-y+ily","+ly",ADJ,ADV|COMP},
+	{"yrtem",subst,0,"-er+ry","",NOUN,NOUN | N_AFFIX},
+	{"y",CCe,1,"-e+y","+y",_Y,ADJ|COMP},
+	0
+};
+Suftab	stabz[] =
+{
+	0
+};
+/*
+ * suffix tables indexed by (last letter of word) - 'a'; 26 entries,
+ * with stabz standing in for letters that have no rules
+ */
+Suftab*	suftab[] =
+{
+	staba,
+	stabz,
+	stabc,
+	stabd,
+	stabe,
+	stabz,
+	stabg,
+	stabz,
+	stabz,
+	stabz,
+	stabz,
+	stabl,
+	stabm,
+	stabn,
+	stabz,
+	stabp,
+	stabz,
+	stabr,
+	stabs,
+	stabt,
+	stabz,
+	stabz,
+	stabz,
+	stabz,
+	staby,
+	stabz,
+};
+
+/*
+ * Prefix tables, one per initial letter (lookuppref picks the table
+ * from the lower-cased first character).  Entries are tried in order
+ * and each table ends with a 0 entry.  Entries flagged IN are checked
+ * further by inun() in trypref.
+ */
+Ptab	ptaba[] =
+{
+	"anti", 0,
+	"auto", 0,
+	0
+};
+Ptab	ptabb[] =
+{
+	"bio", 0,
+	0
+};
+Ptab	ptabc[] =
+{
+	"counter", 0,
+	0
+};
+Ptab	ptabd[] =
+{
+	"dis", 0,
+	0
+};
+Ptab	ptabe[] =
+{
+	"electro", 0,
+	0
+};
+Ptab	ptabf[] =
+{
+	"femto", 0,
+	0
+};
+Ptab	ptabg[] =
+{
+	"geo", 0,
+	"giga", 0,
+	0
+};
+Ptab	ptabh[] =
+{
+	"hyper", 0,
+	0
+};
+Ptab	ptabi[] =
+{
+	"immuno", 0,
+	"im", IN,
+	"intra", 0,
+	"inter", 0,
+	"in", IN,
+	"ir", IN,
+	"iso", 0,
+	0
+};
+Ptab	ptabj[] =
+{
+	0
+};
+Ptab	ptabk[] =
+{
+	"kilo", 0,
+	0
+};
+Ptab	ptabl[] =
+{
+	0
+};
+Ptab	ptabm[] =
+{
+	"magneto", 0,
+	"mega", 0,
+	"meta", 0,
+	"micro", 0,
+	"mid", 0,
+	"milli", 0,
+	"mini", 0,
+	"mis", 0,
+	"mono", 0,
+	"multi", 0,
+	0
+};
+Ptab	ptabn[] =
+{
+	"nano", 0,
+	"neuro", 0,
+	"non", 0,
+	0
+};
+Ptab	ptabo[] =
+{
+	"out", 0,
+	"over", 0,
+	0
+};
+Ptab	ptabp[] =
+{
+	"para", 0,
+	"photo", 0,
+	"pico", 0,
+	"poly", 0,
+	"pre", 0,
+	"pseudo", 0,
+	"psycho", 0,
+	0
+};
+Ptab	ptabq[] =
+{
+	"quasi", 0,
+	0
+};
+Ptab	ptabr[] =
+{
+	"radio", 0,
+	"re", 0,
+	0
+};
+Ptab	ptabs[] =
+{
+	"semi", 0,
+	"stereo", 0,
+	"sub", 0,
+	"super", 0,
+	0
+};
+Ptab	ptabt[] =
+{
+	"tele", 0,
+	"tera", 0,
+	"thermo", 0,
+	0
+};
+Ptab	ptabu[] =
+{
+	"ultra", 0,
+	"under", 0,	/*must precede un*/
+	"un", IN,
+	0
+};
+Ptab	ptabv[] =
+{
+	0
+};
+Ptab	ptabw[] =
+{
+	0
+};
+Ptab	ptabx[] =
+{
+	0
+};
+Ptab	ptaby[] =
+{
+	0
+};
+Ptab	ptabz[] =
+{
+	0
+};
+
+/* prefix tables indexed by (lower-cased first letter) - 'a'; 26 entries */
+Ptab*	preftab[] =
+{
+	ptaba,
+	ptabb,
+	ptabc,
+	ptabd,
+	ptabe,
+	ptabf,
+	ptabg,
+	ptabh,
+	ptabi,
+	ptabj,
+	ptabk,
+	ptabl,
+	ptabm,
+	ptabn,
+	ptabo,
+	ptabp,
+	ptabq,
+	ptabr,
+	ptabs,
+	ptabt,
+	ptabu,
+	ptabv,
+	ptabw,
+	ptabx,
+	ptaby,
+	ptabz,
+};
+
+/*
+ * one step of the derivation being attempted; tryword concatenates
+ * the mesg strings into affix[] when -v output is wanted
+ */
+typedef struct {
+	char *mesg;	/* text appended to affix[] for this step */
+	enum { NONE, SUFF, PREF} type;	/* trypref adds PREF once per stripped prefix, so the value may exceed PREF */
+} Deriv;
+
+/* option flags set in main(): -a, -c (also set by -C), -f, -v (also set by -C), -x */
+int	aflag;
+int	cflag;
+int	fflag;
+int	vflag;
+int	xflag;
+int 	nflag;
+char	word[500];	/* working copy of the current word; the handlers edit it in place */
+char*	original;	/* the input word as read; this is what gets printed */
+Deriv	emptyderiv;	/* all-zero value used to clear deriv[] slots */
+Deriv	deriv[DSIZ+3];
+char	affix[DSIZ*10];	/* 10 is longest affix message */
+int	prefcount;	/* set by tryword when a dictionary hit is recorded */
+int 	suffcount;	/* likewise */
+char*	acmeid;		/* with -a, the leading part of the input line up to the second ':' */
+char	space[300000];	/* must be as large as "words"+"space" in pcode run */
+Bits	encode[2048];	/* must be as long as "codes" in pcode run */
+int	nencode;
+char	voweltab[256];	/* non-zero for aeiouy and AEIOUY; see ISVOWEL */
+char*	spacep[128*128+1];	/* pointer to words starting with 'xx' */
+Biobuf	bin;		/* standard input */
+Biobuf	bout;		/* standard output */
+
+char*	codefile = "#9/lib/amspell";	/* default dictionary; replaced by -b (unless -f was given) or by -f */
+char*	brfile = "#9/lib/brspell";
+char*	Usage = "usage";	/* exit status string for usage errors */
+
+/*
+ * Parse the flags, load the dictionary, then read one word per line
+ * from standard input and look each one up.
+ *
+ * Flags: -a treats each line as text containing two ':' separators and
+ * checks only what follows the second one; -b calls ise() and, unless
+ * -f was already seen, switches to brfile; -c prints one character per
+ * word ('-' for unknown or STOP, otherwise '+' or, with -v, a digit
+ * built from the affix counts); -C is -c plus -v; -v prints
+ * derivations; -x traces dictionary probes on fd 2; -f file names the
+ * dictionary.
+ *
+ * A word is tried as written, then by prefix and suffix stripping; a
+ * word starting with an upper-case letter is retried in lower case.
+ */
+void
+main(int argc, char *argv[])
+{
+	char *ep, *cp;
+	char *dp;
+	int j, i, c;
+	int low;
+	Bits h;
+
+	Binit(&bin, 0, OREAD);
+	Binit(&bout, 1, OWRITE);
+	for(i=0; c = "aeiouyAEIOUY"[i]; i++)
+		voweltab[c] = 1;
+	while(argc > 1) {
+		if(argv[1][0] != '-')
+			break;
+		for(i=1; c = argv[1][i]; i++)
+		switch(c) {
+		default:
+			fprint(2, "usage: spell [-bcCvx] [-f file]\n");
+			exits(Usage);
+
+		case 'a':
+			aflag++;
+			continue;
+
+		case 'b':
+			ise();
+			if(!fflag)
+				codefile = brfile;
+			continue;
+
+		case 'C':		/* for "correct" */
+			vflag++;
+			/* fall through */
+		case 'c':		/* for ocr */
+			cflag++;
+			continue;
+
+		case 'v':
+			vflag++;
+			continue;
+
+		case 'x':
+			xflag++;
+			continue;
+
+		case 'f':
+			if(argc <= 2) {
+				fprint(2, "spell: -f requires another argument\n");
+				exits(Usage);
+			}
+			argv++;
+			argc--;
+			codefile = argv[1];
+			fflag++;
+			goto brk;
+		}
+	brk:
+		argv++;
+		argc--;
+	}
+	readdict(codefile);
+	if(argc > 1) {
+		fprint(2, "usage: spell [-bcCvx] [-f file]\n");
+		exits(Usage);
+	}
+	if(aflag)
+		cflag = vflag = 0;
+
+	for(;;) {
+		affix[0] = 0;
+		original = Brdline(&bin, '\n');
+		if(original == 0)
+			exits(0);
+		original[Blinelen(&bin)-1] = 0;
+		low = 0;
+
+		if(aflag) {
+			/* keep everything up to the second ':' as acmeid */
+			acmeid = original;
+			while(*original != ':')
+				if(*original++ == 0)
+					exits(0);
+			while(*++original != ':')
+				if(*original == 0)
+					exits(0);
+			*original++ = 0;
+		}
+		/* copy into word[], truncating to fit, and count lower-case letters */
+		for(ep=word,dp=original; j = *dp; ep++,dp++) {
+			if(ISLOWER(j))
+				low++;
+			if(ep >= word+sizeof(word)-1)
+				break;
+			*ep = j;
+		}
+		*ep = 0;
+
+		if(ISDIGIT(word[0]) && ordinal())
+			continue;
+
+		h = 0;
+		/* no lower-case letters: try as written, else fold all but the first letter */
+		if(!low && !(h = trypref(ep,".",0,ALL|STOP|DONT_TOUCH)))
+			for(cp=original+1,dp=word+1; dp<ep; dp++,cp++)
+				*dp = Tolower(*cp);
+		if(!h)
+		for(;;) {	/* at most twice */
+			if(h = trypref(ep,".",0,ALL|STOP|DONT_TOUCH))
+				break;
+			if(h = trysuff(ep,0,ALL|STOP|DONT_TOUCH))
+				break;
+			if(!ISUPPER(word[0]))
+				break;
+			/*
+			 * refresh word[] from the input and fold the first
+			 * letter.  The copy stops at ep: word[] may hold a
+			 * truncated copy of a long line, and copying the
+			 * whole line would run past the end of word[].
+			 */
+			cp = original;
+			dp = word;
+			while(dp < ep && (*dp = *cp++)) {
+				if(!low)
+					*dp = Tolower(*dp);
+				dp++;
+			}
+			*dp = 0;
+			word[0] = Tolower(word[0]);
+		}
+
+		if(cflag) {
+			if(!h || Set(h,STOP))
+				print("-");
+			else if(!vflag)
+				print("+");
+			else 
+				print("%c",'0' + (suffcount>0) +
+				   (prefcount>4? 8: 2*prefcount));
+		} else if(!h || Set(h,STOP)) {
+			if(aflag)
+				Bprint(&bout, "%s:%s\n", acmeid, original);
+			else
+				Bprint(&bout, "%s\n", original);
+		} else if(affix[0] != 0 && affix[0] != '.')
+			print("%s\t%s\n", affix, original);
+	}
+	exits(0);
+}
+
+/*	strip exactly one suffix and do
+ *	indicated routine(s), which may recursively
+ *	strip suffixes
+ *
+ *	ep points just past the part of word[] under consideration.
+ *	The table is chosen by the last letter; the first entry whose
+ *	suffix matches decides the outcome.  An entry is skipped when no
+ *	vowel would remain in the stem, and the search is abandoned
+ *	(returns 0) when the entry's affixable bits do not meet flag.
+ *	Returns the handler's result, or 0.
+ */
+Bits
+trysuff(char* ep, int lev, int flag)
+{
+	Suftab *t;
+	char *cp, *sp;
+	Bits h = 0;
+	int initchar;
+
+	flag &= ~MONO;
+	lev += DLEV;
+	if(lev < DSIZ) {
+		deriv[lev]  = emptyderiv;
+		deriv[lev-1] = emptyderiv;
+	}
+	/* nothing to inspect: ep[-1] would lie in front of word[] */
+	if(ep <= word)
+		return h;
+	initchar = ep[-1];
+	if(!ISLOWER(initchar))
+		return h;
+	for(t=suftab[initchar-'a']; sp=t->suf; t++) {
+		cp = ep;
+		/* compare backwards, never stepping in front of word[] */
+		while(*sp)
+			if(cp <= word || *--cp != *sp++)
+				goto next;
+		/* the stem left after stripping must contain a vowel */
+		for(sp=ep-t->n1; --sp >= word && !ISVOWEL(*sp);)
+			;
+		if(sp < word)
+			continue;
+		if(!(t->affixable & flag))
+			return 0;
+		h = (*t->p1)(ep-t->n1, t->d1, t->a1, lev+1, t->flag|STOP);
+		if(!h && t->p2!=0) {
+			if(lev < DSIZ) {
+				deriv[lev] = emptyderiv;
+				deriv[lev+1] = emptyderiv;
+			}
+			h = (*t->p2)(ep-t->n2, t->d2, t->a2, lev, t->flag|STOP);
+		}
+		break;
+	next:;
+	}
+	return h;
+}
+
+/*
+ * suffix handler that never succeeds: ignores every argument and
+ * returns 0
+ */
+Bits
+nop(char* ep, char* d, char* a, int lev, int flag)
+{
+	USED(flag);
+	USED(lev);
+	USED(a);
+	USED(d);
+	USED(ep);
+
+	return 0;
+}
+
+/*
+ * strip with extra checks at the join: refuse (return 0) when the
+ * stem's last letter and the letter at ep form one of the listed vowel
+ * pairs, or when the letter at ep would be the third identical letter
+ * in a row; otherwise defer to strip.
+ */
+Bits
+cstrip(char* ep, char* d, char* a, int lev, int flag)
+{
+	int next, prev;
+
+	next = ep[0];
+	prev = ep[-1];
+	if(!(ISVOWEL(next) && ISVOWEL(prev))) {
+		/* not two vowels: only a triple letter is refused */
+		if(next==prev && next==ep[-2])
+			return 0;
+		return strip(ep,d,a,lev,flag);
+	}
+	switch(pair(prev,next)) {
+	case pair('a', 'a'):
+	case pair('a', 'e'):
+	case pair('a', 'i'):
+	case pair('e', 'a'):
+	case pair('e', 'e'):
+	case pair('e', 'i'):
+	case pair('i', 'i'):
+	case pair('o', 'a'):
+		return 0;
+	}
+	return strip(ep,d,a,lev,flag);
+}
+
+/*
+ * Accept the stem word..ep if trypref finds it; otherwise try the stem
+ * with a doubled final consonant undone, and finally try stripping
+ * another suffix.  d is unused.
+ */
+Bits
+strip(char* ep, char* d, char* a, int lev, int flag)
+{
+	Bits h = trypref(ep, a, lev, flag);
+
+	USED(d);
+	/* reject a MONO-flagged hit when vowels sit at ep and at ep[-2] */
+	if(Set(h,MONO) && ISVOWEL(*ep) && ISVOWEL(ep[-2]))
+		h = 0;
+	if(h)
+		return h;
+	/* vowel follows a doubled consonant: retry one letter shorter, requiring MONO */
+	if(ISVOWEL(*ep) && !ISVOWEL(ep[-1]) && ep[-1]==ep[-2]) {
+		h = trypref(ep-1,a,lev,flag|MONO);
+		if(h)
+			return h;
+	}
+	return trysuff(ep,lev,flag);
+}
+
+/*
+ * Handler for the "s" endings.  Refuses when called deeper than
+ * DLEV+1, and when the character at ep is 's' and the stem ends in
+ * x, z, s, ch or sh, or in a consonant+y of a word not starting with
+ * an upper-case letter.  Otherwise defers to strip.
+ */
+Bits
+s(char* ep, char* d, char* a, int lev, int flag)
+{
+	if(lev > DLEV+1)
+		return 0;
+	if(*ep=='s') {
+		switch(ep[-1]) {
+		case 'y':
+			if(ISVOWEL(ep[-2])||ISUPPER(*word))
+				break;	/*says Kennedys*/
+			/* otherwise falls into the refusing cases below */
+		case 'x':
+		case 'z':
+		case 's':
+			return 0;
+		case 'h':
+			switch(ep[-2]) {
+			case 'c':
+			case 's':
+				return 0;
+			}
+		}
+	}
+	return strip(ep,d,a,lev,flag);
+}
+
+/*
+ * Handler for the "an"/"ian" endings: only words whose first letter is
+ * upper case qualify; for those the stem is looked up with trypref.
+ * d is unused.
+ */
+Bits
+an(char* ep, char* d, char* a, int lev, int flag)
+{
+	USED(d);
+	if(ISUPPER(*word))	/* proper names only */
+		return trypref(ep,a,lev,flag);
+	return 0;
+}
+
+/*
+ * Overwrite the stem's last letter with 'e', try strip on the result
+ * (passing d as the message), then put the original letter back.
+ * a is unused.
+ */
+Bits
+ize(char* ep, char* d, char* a, int lev, int flag)
+{
+	char *last;
+	int saved;
+	Bits h;
+
+	USED(a);
+	last = ep-1;
+	saved = *last;
+	*last = 'e';
+	h = strip(ep,"",d,lev,flag);
+	*last = saved;
+	return h;
+}
+
+/*
+ * Try the stem with an 'e' appended: refuse if the stem ends in a, e
+ * or i; otherwise overwrite the character at ep with 'e', run strip on
+ * the lengthened stem (passing d as the message) and restore the
+ * overwritten character.  a is unused.
+ */
+Bits
+y_to_e(char* ep, char* d, char* a, int lev, int flag)
+{
+	char *slot;
+	int saved, prev;
+	Bits h;
+
+	USED(a);
+	prev = ep[-1];
+	if(prev=='a' || prev=='e' || prev=='i')
+		return 0;
+	slot = ep;
+	saved = *slot;
+	*slot = 'e';
+	h = strip(slot+1,"",d,lev,flag);
+	*slot = saved;
+	return h;
+}
+
+/*
+ * Handler for endings attached to a stem that may have turned a final
+ * y into i.  Refuses a triple letter at the join, and refuses a stem
+ * ending in consonant+y when any earlier letter after the first is a
+ * vowel.  A stem ending in i goes to i_to_y, anything else to cstrip.
+ */
+Bits
+ily(char* ep, char* d, char* a, int lev, int flag)
+{
+	int temp = ep[0];
+	char *cp = ep;
+
+	if(temp==ep[-1]&&temp==ep[-2])		/* sillly */
+		return 0;
+	/* note: the condition itself moves cp back by one or two letters */
+	if(*--cp=='y' && !ISVOWEL(*--cp))	/* happyly */
+		while(cp>word)
+			if(ISVOWEL(*--cp))	/* shyness */
+				return 0;
+	if(ep[-1]=='i')
+		return i_to_y(ep,d,a,lev,flag);
+	return cstrip(ep,d,a,lev,flag);
+}
+
+/*
+ * Handler for "-bility": overwrite the character at ep with 'l' and
+ * let y_to_e try the stem with "le" in place (as the table's
+ * "-le+ility" message describes).  The overwritten character is put
+ * back afterwards so word[] is left as it was found, matching the
+ * other handlers that edit word[] in place.
+ */
+Bits
+bility(char* ep, char* d, char* a, int lev, int flag)
+{
+	int temp = *ep;
+	Bits h;
+
+	*ep++ = 'l';
+	h = y_to_e(ep,d,a,lev,flag);
+	ep[-1] = temp;
+	return h;
+}
+
+/*
+ * Undo a y -> i change: when the stem ends in consonant+i, replace the
+ * i with y and report d instead of a as the message; then try cstrip
+ * and restore the stem's last letter.  Words starting with an
+ * upper-case letter are refused.
+ */
+Bits
+i_to_y(char* ep, char* d, char* a, int lev, int flag)
+{
+	int saved;
+	Bits h;
+
+	if(ISUPPER(*word))
+		return 0;
+	saved = ep[-1];
+	if(saved=='i' && !ISVOWEL(ep[-2])) {
+		ep[-1] = 'y';
+		a = d;
+	}
+	h = cstrip(ep,"",a,lev,flag);
+	ep[-1] = saved;
+	return h;
+}
+
+/*
+ * Handler for the "es" ending.  Refuses when called deeper than DLEV.
+ * A stem ending in i goes to i_to_y; a stem ending in s, z, x, ch or
+ * sh goes to strip; anything else is refused.
+ */
+Bits
+es(char* ep, char* d, char* a, int lev, int flag)
+{
+	if(lev>DLEV)
+		return 0;
+	switch(ep[-1]) {
+	default:
+		return 0;
+	case 'i':
+		return i_to_y(ep,d,a,lev,flag);
+	case 'h':
+		switch(ep[-2]) {
+		default:
+			return 0;
+		case 'c':
+		case 's':
+			break;
+		}
+		/* ch and sh fall into the strip case below */
+	case 's':
+	case 'z':
+	case 'x':
+		return strip(ep,d,a,lev,flag);
+	}
+}
+
+/*
+ * Substitution handler driven by a message of the form "-old+new":
+ * writes "old" into word[] so that it ends at ep, tries strip on the
+ * result, then writes "new" back starting where "old" began.
+ * Refuses when two skipv steps back from ep-1 leave word[].
+ * a is unused.
+ */
+Bits
+subst(char* ep, char* d, char* a, int lev, int flag)
+{
+	char *u,*t;
+	Bits h;
+
+	USED(a);
+	if(skipv(skipv(ep-1)) < word)
+		return 0;
+	/* find the '+' that separates the two halves of d */
+	for(t=d; *t!='+'; t++)
+		continue;
+	/* copy the text between '-' and '+' backwards, ending at ep */
+	for(u=ep; *--t!='-';)
+		*--u = *t;
+	h = strip(ep,"",d,lev,flag);
+	/* put back the text after '+', starting at u */
+	while(*++t != '+')
+		continue;
+	while(*++t)
+		*u++ = *t;
+	return h;
+}
+
+/*
+ * Handler for the "-ion"/"-or" endings: when the letter two back from
+ * ep is one of a, e, i, o, u the stem is tried with an 'e' appended
+ * (y_to_e); otherwise it is looked up as it stands with trypref.
+ */
+Bits
+tion(char* ep, char* d, char* a, int lev, int flag)
+{
+	int c;
+
+	c = ep[-2];
+	if(c=='a' || c=='e' || c=='i' || c=='o' || c=='u')
+		return y_to_e(ep,d,a,lev,flag);
+	return trypref(ep,a,lev,flag);
+}
+
+/*
+ * possible consonant-consonant-e ending
+ *
+ * Chooses, from the stem's last two letters, between trying the stem
+ * with an 'e' appended (y_to_e) and handing over to VCe.
+ */
+Bits
+CCe(char* ep, char* d, char* a, int lev, int flag)
+{
+	Bits h;
+
+	switch(ep[-1]) {
+	case 'l':
+		if(ISVOWEL(ep[-2]))
+			break;
+		switch(ep[-2]) {
+		case 'l':
+		case 'r':
+		case 'w':
+			break;
+		default:
+			return y_to_e(ep,d,a,lev,flag);
+		}
+		break;
+	case 'c':
+	case 'g':
+		if(*ep == 'a')	/* prevent -able for -eable */
+			return 0;
+		/* falls through to the s/v/z checks */
+	case 's':
+	case 'v':
+	case 'z':
+		if(ep[-2]==ep[-1])
+			break;
+		if(ISVOWEL(ep[-2]))
+			break;
+		/* consonant before the final letter: falls through */
+	case 'u':
+		if(h = y_to_e(ep,d,a,lev,flag))
+			return h;
+		/* only a stem ending in "ng" goes on to VCe after y_to_e fails */
+		if(!(ep[-2]=='n' && ep[-1]=='g'))
+			return 0;
+	}
+	return VCe(ep,d,a,lev,flag);
+}
+
+/*
+ * possible consonant-vowel-consonant-e ending
+ *
+ * Refuses a stem ending in e.  When the stem ends vowel+consonant, it
+ * is first tried with an 'e' appended; if that fails the overwritten
+ * character is restored and cstrip tries the stem as it stands.
+ */
+Bits
+VCe(char* ep, char* d, char* a, int lev, int flag)
+{
+	int c;
+	Bits h;
+
+	c = ep[-1];
+	if(c=='e')
+		return 0;
+	if(!ISVOWEL(c) && ISVOWEL(ep[-2])) {
+		c = *ep;	/* c now holds the character the 'e' replaces */
+		*ep++ = 'e';
+		h = trypref(ep,d,lev,flag);
+		if(!h)
+			h = trysuff(ep,lev,flag);
+		if(h)
+			return h;	/* word[] keeps the 'e' on this path */
+		ep--;
+		*ep = c;
+	}
+	return cstrip(ep,d,a,lev,flag);
+}
+
+/*
+ * Find a prefix of the text at *wp.  The table is chosen by the
+ * lower-cased first character; an entry matches when its text equals
+ * the start of *wp and at least one vowel remains between the end of
+ * the prefix and ep.  On a match *wp is advanced past the prefix and
+ * the entry is returned; otherwise 0 is returned and *wp is unchanged.
+ */
+Ptab*
+lookuppref(uchar** wp, char* ep)
+{
+	Ptab *sp;
+	uchar *bp,*cp;
+	unsigned int initchar = Tolower(**wp);
+
+	if(!ISALPHA(initchar))
+		return 0;
+	for(sp=preftab[initchar-'a'];sp->s;sp++) {
+		bp = *wp;
+		/* byte-for-byte comparison against the word */
+		for(cp= (uchar*)sp->s;*cp; )
+			if(*bp++!=*cp++)
+				goto next;
+		/* require a vowel in what is left before ep */
+		for(cp=bp;cp<(uchar*)ep;cp++)
+			if(ISVOWEL(*cp)) {
+				*wp = bp;
+				return sp;
+			}
+	next:;
+	}
+	return 0;
+}
+
+/*	while word is not in dictionary try stripping
+ *	prefixes. Fail if no more prefixes.
+ *
+ *	ep marks the end of the candidate inside word[]; a is the
+ *	derivation message recorded for this level.  A dictionary hit
+ *	counts only if it shares a bit with flag (MONO aside) and, when
+ *	flag has MONO, the hit has MONO too.  Returns the hit's bits or 0.
+ *
+ *	The stripped prefixes are collected as "+pre+pre..." in a small
+ *	local buffer (it shadows the global space[]); text that does not
+ *	fit is dropped and the buffer always stays NUL-terminated.
+ */
+Bits
+trypref(char* ep, char* a, int lev, int flag)
+{
+	Ptab *tp;
+	char *bp, *cp;
+	char *pp;
+	Bits h;
+	char space[20];
+
+	if(lev<DSIZ) {
+		deriv[lev].mesg = a;
+		deriv[lev].type = *a=='.'? NONE: SUFF;
+	}
+	if(h = tryword(word,ep,lev,flag)) {
+		if(Set(h, flag&~MONO) && (flag&MONO) <= Set(h, MONO))
+			return h;
+		h = 0;
+	}
+	bp = word;
+	pp = space;
+	if(lev<DSIZ) {
+		deriv[lev+1].mesg = pp;
+		deriv[lev+1].type = 0;
+	}
+	while(tp=lookuppref((uchar**)&bp,ep)) {
+		/* append "+prefix", always leaving room for the terminator */
+		if(pp < space+sizeof(space)-1)
+			*pp++ = '+';
+		cp = tp->s;
+		while(pp < space+sizeof(space)-1 && (*pp = *cp++))
+			pp++;
+		*pp = 0;
+		/* same bound as every other write to deriv[] */
+		if(lev<DSIZ)
+			deriv[lev+1].type += PREF;
+		h = tryword(bp,ep,lev+1,flag);
+		if(Set(h,NOPREF) ||
+		   ((tp->flag&IN) && inun(bp-2,h)==0)) {
+			h = 0;
+			break;
+		}
+		if(Set(h,flag&~MONO) && (flag&MONO) <= Set(h, MONO))
+			break;
+		h = 0;
+	}
+	/* the message pointers refer to locals; clear them before returning */
+	if(lev < DSIZ) {
+		deriv[lev+1] = emptyderiv;
+		deriv[lev+2] = emptyderiv;
+	}
+	return h;
+}
+
+/*
+ * Look up word bytes bp..ep in the dictionary.  Candidates of one
+ * letter or less are rejected.  With MONO in flag, a "+x" step (x being
+ * the character at ep) is recorded in deriv[].  When -v is in effect
+ * and the word is found, the recorded derivation messages are appended
+ * to affix[] and the prefix/suffix counts are set.  Returns the
+ * dictionary bits, or 0.
+ */
+Bits
+tryword(char* bp, char* ep, int lev, int flag)
+{
+	int  j;
+	Bits h = 0;
+	char duple[3];
+
+	if(ep-bp <= 1)
+		return h;
+	if(flag&MONO) {
+		if(lev<DSIZ) {
+			deriv[++lev].mesg = duple;
+			deriv[lev].type = SUFF;
+		}
+		duple[0] = '+';
+		duple[1] = *ep;
+		duple[2] = 0;
+	}
+	h = dict(bp, ep);
+	if(vflag==0 || h==0)
+		return h;
+	/*
+	 * when derivations are wanted, collect them
+	 * for printing
+	 */
+	j = lev;
+	prefcount = suffcount = 0;
+	do {
+		if(j<DSIZ && deriv[j].type) {
+			/* bounded append: several hits for one word may accumulate */
+			strncat(affix, deriv[j].mesg,
+				sizeof(affix)-1-strlen(affix));
+			if(deriv[j].type == SUFF)
+				suffcount++;
+			else if(deriv[j].type != NONE)
+				prefcount = deriv[j].type/PREF;
+		}
+	} while(--j > 0);
+	return h;
+}
+
+/*
+ * Check a negating prefix against the stem's dictionary bits h.
+ * bp points at the two-letter prefix.  A prefix starting with 'u' is
+ * accepted when h lacks IN.  Otherwise h must have IN and the prefix's
+ * second letter must suit the stem's first letter: 'r' before r, 'm'
+ * before m or p, 'n' before anything else.  Returns non-zero to accept.
+ */
+int
+inun(char* bp, Bits h)
+{
+	int has_in, want;
+
+	has_in = Set(h, IN) != 0;
+	if(*bp == 'u')
+		return !has_in;
+	/* *bp == 'i' */
+	if(!has_in)
+		return 0;
+	if(bp[2] == 'r')
+		want = 'r';
+	else if(bp[2] == 'm' || bp[2] == 'p')
+		want = 'm';
+	else
+		want = 'n';
+	return bp[1] == want;
+}
+
+/*
+ * Scan backwards in word[] from s: step over one vowel if s is on one,
+ * then back over non-vowels.  Returns a pointer to the vowel reached,
+ * or a pointer before word[] when none is found.
+ */
+char*
+skipv(char *s)
+{
+	if(s >= word && ISVOWEL(*s))
+		s--;
+	for(; s >= word; s--)
+		if(ISVOWEL(*s))
+			break;
+	return s;
+}
+
+/*
+ * crummy way to Britishise
+ *
+ * Passes the suffix text and the first pair of messages of every
+ * suffix-table entry through ztos, which turns each 'z' into 's'.
+ */
+void
+ise(void)
+{
+	Suftab **tab, *rule;
+
+	for(tab = suftab; tab < suftab+26; tab++) {
+		rule = *tab;
+		while(rule->suf) {
+			rule->suf = ztos(rule->suf);
+			rule->d1 = ztos(rule->d1);
+			rule->a1 = ztos(rule->a1);
+			rule++;
+		}
+	}
+}
+
+/*
+ * Return as with every 'z' replaced by 's'.  A string without 'z' is
+ * returned unchanged; otherwise the result is a fresh strdup copy,
+ * which is never freed here.  Exits if the copy cannot be allocated.
+ */
+char*
+ztos(char *as)
+{
+	char *s, *ds;
+
+	for(s=as; *s; s++)
+		if(*s == 'z')
+			goto copy;
+	return as;
+
+copy:
+	ds = strdup(as);
+	if(ds == 0) {
+		fprint(2, "spell: out of memory\n");
+		exits("mem");
+	}
+	for(s=ds; *s; s++)
+		if(*s == 'z')
+			*s = 's';
+	return ds;
+}
+
+/*
+ * Look up the bytes bp..ep in the loaded dictionary and return the
+ * entry's encode[] value, or 0 if absent.  Candidates of one byte or
+ * less return NOUN without a lookup.  The first two bytes (masked to
+ * 7 bits) select a slice of space[] through spacep[]; the slice is
+ * then binary-searched.  With -x, each probe result is traced on fd 2.
+ */
+Bits
+dict(char* bp, char* ep)
+{
+	char *cp, *cp1, *w, *wp, *we;
+	int n, f;
+
+	w = bp;
+	we = ep;
+	n = ep-bp;
+	if(n <= 1)
+		return NOUN;
+
+	/* from here on bp/ep delimit the dictionary slice, w/we the word */
+	f = w[0] & 0x7f;
+	f *= 128;
+	f += w[1] & 0x7f;
+	bp = spacep[f];
+	ep = spacep[f+1];
+
+loop:
+	if(bp >= ep) {
+		if(xflag) 
+			fprint(2, "=%.*s\n", utfnlen(w, n), w);
+		return 0;
+	}
+	/*
+	 * find the beginning of some word in the middle
+	 */
+	cp = bp + (ep-bp)/2;
+
+	/* back up to a byte with 0x80 set, then to the first of such a run */
+	while(cp > bp && !(*cp & 0x80))
+		cp--;
+	while(cp > bp && (cp[-1] & 0x80))
+		cp--;
+
+	wp = w + 2;	/* skip two letters */
+	cp1 = cp + 2;	/* skip affix code */
+	for(;;) {
+		if(wp >= we) {
+			/* word used up: a match only if the entry ends here too */
+			if(*cp1 & 0x80)
+				goto found;
+			else
+				f = 1;
+			break;
+		}
+		if(*cp1 & 0x80) {
+			/* entry ended first */
+			f = -1;
+			break;
+		}
+		f = *cp1++ - *wp++;
+		if(f != 0)
+			break;
+	}
+
+	if(f < 0) {
+		/* continue in the upper half, starting at the next entry */
+		while(!(*cp1 & 0x80))
+			cp1++;
+		bp = cp1;
+		goto loop;
+	}
+	ep = cp;
+	goto loop;
+
+found:
+	/* index into encode[]: low 3 bits of the first byte, then the second byte */
+	f = ((cp[0] & 0x7) << 8) |
+		(cp[1] & 0xff);
+	if(xflag) {
+		fprint(2, "=%.*s ", utfnlen(w, n), w);
+		typeprint(encode[f]);
+	}
+	return encode[f];
+}
+
+/*
+ * Print on fd 2 a comma-separated list of short names for the bits
+ * set in h, followed by a newline.
+ */
+void
+typeprint(Bits h)
+{
+	/* flags after the verb/adjective/comparative group, in print order */
+	static struct {
+		Bits	bit;
+		char	*name;
+	} tail[] = {
+		{DONT_TOUCH,	"d"},
+		{N_AFFIX,	"na"},
+		{ADV,		"adv"},
+		{ION,		"ion"},
+		{V_AFFIX,	"va"},
+		{MAN,		"man"},
+		{NOPREF,	"nopref"},
+		{MONO,		"ms"},
+		{IN,		"in"},
+		{_Y,		"y"},
+		{STOP,		"s"},
+	};
+	Bits verb, comp;
+	int k;
+
+	pcomma("");	/* start a new list */
+	if(h & NOUN)
+		pcomma("n");
+	if(h & PROP_COLLECT)
+		pcomma("pc");
+
+	verb = h & VERB;
+	if(verb) {
+		if(verb == VERB)
+			pcomma("v");
+		else if(verb == V_IRREG)
+			pcomma("vi");
+		else if(h & ED)
+			pcomma("ed");
+	}
+
+	if(h & ADJ)
+		pcomma("a");
+
+	comp = h & COMP;
+	if(comp)
+		pcomma(comp == ACTOR? "er": "comp");
+
+	for(k = 0; k < (int)(sizeof(tail)/sizeof(tail[0])); k++)
+		if(h & tail[k].bit)
+			pcomma(tail[k].name);
+	fprint(2, "\n");
+}
+
+/*
+ * Print s on fd 2 as the next item of a comma-separated list.
+ * An empty string prints nothing and starts a new list.
+ */
+void
+pcomma(char *s)
+{
+	static int started;	/* non-zero once the current list has an item */
+
+	if(*s == 0) {
+		started = 0;
+		return;
+	}
+	if(started)
+		fprint(2, ",%s", s);
+	else
+		fprint(2, "%s", s);
+	started = 1;
+}
+
+/*
+ * is the word one of the following
+ *	12th	teen
+ *	21st	end in 1
+ *	23rd	end in 3
+ *	77th	default
+ * called knowing word[0] is a digit
+ *
+ * Returns non-zero when the text after the digits is exactly the
+ * ordinal ending that suits them; a two-letter upper-case ending is
+ * folded to lower case first.
+ */
+int
+ordinal(void)
+{
+	char *cp = word;
+	char *want;
+	static char sp[4];	/* sp[3] is never written, so sp stays terminated */
+
+	while(ISDIGIT(*cp))
+		cp++;
+	strncpy(sp,cp,3);
+	if(ISUPPER(cp[0]) && ISUPPER(cp[1])) {
+		sp[0] = Tolower(cp[0]);
+		sp[1] = Tolower(cp[1]);
+	}
+	/*
+	 * cp is at least word+1 here.  The tens digit is examined only
+	 * when there are two or more digits, so a one-digit number no
+	 * longer reads the byte in front of word[].
+	 */
+	if(cp-word >= 2 && cp[-2]=='1')
+		want = "th";
+	else if(cp[-1]=='1')
+		want = "st";
+	else if(cp[-1]=='2')
+		want = "nd";
+	else if(cp[-1]=='3')
+		want = "rd";
+	else
+		want = "th";
+	return 0 == strncmp(sp, want, 3);
+}
+
+/*
+ * read in the dictionary.
+ * format is
+ * {
+ *	short	nencode;
+ *	long	encode[nencode];
+ *	char	space[*];
+ * };
+ *
+ * the encodings are a table of all the different
+ * affixes.
+ * the dictionary proper has 2 bytes
+ * that demark and then the rest of the
+ * word. the 2 bytes have the following
+ *	0x80 0x00	flag
+ *	0x78 0x00	count of prefix bytes
+ *			common with prev word
+ *	0x07 0xff	affix code
+ *
+ * all ints are big endians in the file.
+ *
+ * The word list is read into space[], moved to the top of the buffer
+ * and expanded in place from the bottom, filling spacep[] with the
+ * start of each two-letter group.  Exits with a message if the file
+ * cannot be opened or read, is malformed, or does not fit.
+ */
+void
+readdict(char *file)
+{
+	char *s, *is, *lasts, *ls;
+	int c, i, sp, p;
+	int f;
+	long l;
+
+	lasts = 0;
+	f = open(file, 0);
+	if(f == -1) {
+		fprint(2, "cannot open %s\n", file);
+		exits("open");
+	}
+	if(read(f, space, 2) != 2)
+		goto bad;
+	nencode = ((space[0]&0xff)<<8) | (space[1]&0xff);
+	/* the count comes from the file; it must fit encode[] */
+	if(nencode > (int)(sizeof(encode)/sizeof(encode[0])))
+		goto bad;
+	if(read(f, space, 4*nencode) != 4*nencode)
+		goto bad;
+	s = space;
+	for(i=0; i<nencode; i++) {
+		l = (long)(s[0] & 0xff) << 24;
+		l |= (s[1] & 0xff) << 16;
+		l |= (s[2] & 0xff) << 8;
+		l |= s[3] & 0xff;
+		encode[i] = (Bits)l;
+		s += 4;
+	}
+	l = read(f, space, sizeof(space));
+	/* a read error or an empty word list would corrupt the arithmetic below */
+	if(l <= 0)
+		goto bad;
+	if(l == sizeof(space))
+		goto noroom;
+	/* move the packed data to the top; it is expanded from the bottom */
+	is = space + (sizeof(space) - l);
+	memmove(is, space, l);
+
+	s = space;
+	c = *is++ & 0xff;
+	sp = -1;
+	i = 0;
+
+loop:
+	if(s > is)
+		goto noroom;
+	if(c < 0) {
+		/* end of input: point the remaining groups at the end */
+		close(f);
+		while(sp < 128*128)
+			spacep[++sp] = s;
+		*s = 0x80;		/* fence */
+		return;
+	}
+	p = (c>>3) & 0xf;	/* count of bytes shared with the previous word */
+	*s++ = c;
+	*s++ = *is++ & 0xff;
+	if(p <= 0)
+		i = (*is++ & 0xff)*128;
+	if(p <= 1) {
+		if(!(*is & 0x80))
+			i = i/128*128 + (*is++ & 0xff);
+		if(i <= sp) {
+			fprint(2, "the dict isnt sorted or \n");
+			fprint(2, "memmove didn't work\n");
+			goto bad;
+		}
+		while(sp < i)
+			spacep[++sp] = s-2;
+	}
+	ls = lasts;
+	lasts = s;
+	/* a shared prefix needs a previous word to copy from */
+	if(p > 2 && ls == 0)
+		goto bad;
+	for(p-=2; p>0; p--)
+		*s++ = *ls++;
+	for(;;) {
+		if(is >= space+sizeof(space)) {
+			c = -1;
+			break;
+		}
+		c = *is++ & 0xff;
+		if(c & 0x80)
+			break;
+		*s++ = c;
+	}
+	*s = 0;
+	goto loop;
+
+bad:
+	fprint(2, "trouble reading %s\n", file);
+	exits("read");
+noroom:
+	fprint(2, "not enough space for dictionary\n");
+	exits("space");
+}
author	rsc <devnull@localhost>	2004-03-09 12:45:12 +0000
committer	rsc <devnull@localhost>	2004-03-09 12:45:12 +0000
commit	d49a2e4801752c8a1211c7fac8cc08055a6b6fa5 (patch)
tree	509d01c729341305f12abc34521276a4ccdb27ce /src/cmd/spell/sprog.c
parent	fb7cc74a929c86ef1c84a971d7a05deaf4d30c85 (diff)
download	plan9port-d49a2e4801752c8a1211c7fac8cc08055a6b6fa5.tar.gz plan9port-d49a2e4801752c8a1211c7fac8cc08055a6b6fa5.tar.bz2 plan9port-d49a2e4801752c8a1211c7fac8cc08055a6b6fa5.zip