From 08708877939323c1e1cb87210193ec25fc472ff7 Mon Sep 17 00:00:00 2001 From: rsc Date: Tue, 25 Nov 2003 03:37:45 +0000 Subject: add dict --- src/cmd/dict/utils.c | 577 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 577 insertions(+) create mode 100644 src/cmd/dict/utils.c (limited to 'src/cmd/dict/utils.c') diff --git a/src/cmd/dict/utils.c b/src/cmd/dict/utils.c new file mode 100644 index 00000000..8e4db9e9 --- /dev/null +++ b/src/cmd/dict/utils.c @@ -0,0 +1,577 @@ +#include +#include +#include +#include "dict.h" + +Dict dicts[] = { + {"oed", "Oxford English Dictionary, 2nd Ed.", + "dict/oed2", "dict/oed2index", + oednextoff, oedprintentry, oedprintkey}, + {"ahd", "American Heritage Dictionary, 2nd College Ed.", + "ahd/DICT.DB", "ahd/index", + ahdnextoff, ahdprintentry, ahdprintkey}, + {"pgw", "Project Gutenberg Webster Dictionary", + "dict/pgw", "dict/pgwindex", + pgwnextoff, pgwprintentry, pgwprintkey}, + {"thesaurus", "Collins Thesaurus", + "dict/thesaurus", "dict/thesindex", + thesnextoff, thesprintentry, thesprintkey}, + + {"ce", "Gendai Chinese->English", + "dict/world/sansdata/sandic24.dat", + "dict/world/sansdata/ceindex", + worldnextoff, worldprintentry, worldprintkey}, + {"ceh", "Gendai Chinese->English (Hanzi index)", + "dict/world/sansdata/sandic24.dat", + "dict/world/sansdata/cehindex", + worldnextoff, worldprintentry, worldprintkey}, + {"ec", "Gendai English->Chinese", + "dict/world/sansdata/sandic24.dat", + "dict/world/sansdata/ecindex", + worldnextoff, worldprintentry, worldprintkey}, + + {"dae", "Gyldendal Danish->English", + "dict/world/gylddata/sandic30.dat", + "dict/world/gylddata/daeindex", + worldnextoff, worldprintentry, worldprintkey}, + {"eda", "Gyldendal English->Danish", + "dict/world/gylddata/sandic29.dat", + "dict/world/gylddata/edaindex", + worldnextoff, worldprintentry, worldprintkey}, + + {"due", "Wolters-Noordhoff Dutch->English", + "dict/world/woltdata/sandic07.dat", + "dict/world/woltdata/deindex", + worldnextoff, worldprintentry, worldprintkey}, + {"edu", "Wolters-Noordhoff English->Dutch", + "dict/world/woltdata/sandic06.dat", + "dict/world/woltdata/edindex", + worldnextoff, worldprintentry, worldprintkey}, + + {"fie", "WSOY Finnish->English", + "dict/world/werndata/sandic32.dat", + "dict/world/werndata/fieindex", + worldnextoff, worldprintentry, worldprintkey}, + {"efi", "WSOY English->Finnish", + "dict/world/werndata/sandic31.dat", + "dict/world/werndata/efiindex", + worldnextoff, worldprintentry, worldprintkey}, + + {"fe", "Collins French->English", + "dict/fe", "dict/feindex", + pcollnextoff, pcollprintentry, pcollprintkey}, + {"ef", "Collins English->French", + "dict/ef", "dict/efindex", + pcollnextoff, pcollprintentry, pcollprintkey}, + + {"ge", "Collins German->English", + "dict/ge", "dict/geindex", + pcollgnextoff, pcollgprintentry, pcollgprintkey}, + {"eg", "Collins English->German", + "dict/eg", "dict/egindex", + pcollgnextoff, pcollgprintentry, pcollgprintkey}, + + {"ie", "Collins Italian->English", + "dict/ie", "dict/ieindex", + pcollnextoff, pcollprintentry, pcollprintkey}, + {"ei", "Collins English->Italian", + "dict/ei", "dict/eiindex", + pcollnextoff, pcollprintentry, pcollprintkey}, + + {"je", "Sanshusha Japanese->English", + "dict/world/sansdata/sandic18.dat", + "dict/world/sansdata/jeindex", + worldnextoff, worldprintentry, worldprintkey}, + {"jek", "Sanshusha Japanese->English (Kanji index)", + "dict/world/sansdata/sandic18.dat", + "dict/world/sansdata/jekindex", + worldnextoff, worldprintentry, worldprintkey}, + {"ej", "Sanshusha English->Japanese", + "dict/world/sansdata/sandic18.dat", + "dict/world/sansdata/ejindex", + worldnextoff, worldprintentry, worldprintkey}, + + {"tjeg", "Sanshusha technical Japanese->English,German", + "dict/world/sansdata/sandic16.dat", + "dict/world/sansdata/tjegindex", + worldnextoff, worldprintentry, worldprintkey}, + {"tjegk", "Sanshusha technical Japanese->English,German (Kanji index)", + "dict/world/sansdata/sandic16.dat", + "dict/world/sansdata/tjegkindex", + worldnextoff, worldprintentry, worldprintkey}, + {"tegj", "Sanshusha technical English->German,Japanese", + "dict/world/sansdata/sandic16.dat", + "dict/world/sansdata/tegjindex", + worldnextoff, worldprintentry, worldprintkey}, + {"tgje", "Sanshusha technical German->Japanese,English", + "dict/world/sansdata/sandic16.dat", + "dict/world/sansdata/tgjeindex", + worldnextoff, worldprintentry, worldprintkey}, + + {"ne", "Kunnskapforlaget Norwegian->English", + "dict/world/kunndata/sandic28.dat", + "dict/world/kunndata/neindex", + worldnextoff, worldprintentry, worldprintkey}, + {"en", "Kunnskapforlaget English->Norwegian", + "dict/world/kunndata/sandic27.dat", + "dict/world/kunndata/enindex", + worldnextoff, worldprintentry, worldprintkey}, + + {"re", "Leon Ungier Russian->English", + "dict/re", "dict/reindex", + simplenextoff, simpleprintentry, simpleprintkey}, + {"er", "Leon Ungier English->Russian", + "dict/re", "dict/erindex", + simplenextoff, simpleprintentry, simpleprintkey}, + + {"se", "Collins Spanish->English", + "dict/se", "dict/seindex", + pcollnextoff, pcollprintentry, pcollprintkey}, + {"es", "Collins English->Spanish", + "dict/es", "dict/esindex", + pcollnextoff, pcollprintentry, pcollprintkey}, + + {"swe", "Esselte Studium Swedish->English", + "dict/world/essedata/sandic34.dat", + "dict/world/essedata/sweindex", + worldnextoff, worldprintentry, worldprintkey}, + {"esw", "Esselte Studium English->Swedish", + "dict/world/essedata/sandic33.dat", + "dict/world/essedata/eswindex", + worldnextoff, worldprintentry, worldprintkey}, + + {"movie", "Movies -- by title", + "movie/data", "dict/movtindex", + movienextoff, movieprintentry, movieprintkey}, + {"moviea", "Movies -- by actor", + "movie/data", "dict/movaindex", + movienextoff, movieprintentry, movieprintkey}, + {"movied", "Movies -- by director", + "movie/data", "dict/movdindex", + movienextoff, movieprintentry, movieprintkey}, + + {"slang", "English Slang", + "dict/slang", "dict/slangindex", + slangnextoff, slangprintentry, slangprintkey}, + + {"robert", "Robert Électronique", + "dict/robert/_pointers", "dict/robert/_index", + robertnextoff, robertindexentry, robertprintkey}, + {"robertv", "Robert Électronique - formes des verbes", + "dict/robert/flex.rob", "dict/robert/_flexindex", + robertnextflex, robertflexentry, robertprintkey}, + + {0, 0, 0, 0, 0} +}; + +typedef struct Lig Lig; +struct Lig { + Rune start; /* accent rune */ + Rune pairs[100]; /* pairs */ +}; + +static Lig ligtab[Nligs] = { +[LACU-LIGS] {0xb4, {0x41, 0xc1, 0x61, 0xe1, 0x43, 0x106, 0x63, 0x107, 0x45, 0xc9, 0x65, 0xe9, 0x67, 0x123, 0x49, 0xcd, 0x69, 0xed, 0x131, 0xed, 0x4c, 0x139, 0x6c, 0x13a, 0x4e, 0x143, 0x6e, 0x144, 0x4f, 0xd3, 0x6f, 0xf3, 0x52, 0x154, 0x72, 0x155, 0x53, 0x15a, 0x73, 0x15b, 0x55, 0xda, 0x75, 0xfa, 0x59, 0xdd, 0x79, 0xfd, 0x5a, 0x179, 0x7a, 0x17a, 0}}, +[LGRV-LIGS] {0x2cb, {0x41, 0xc0, 0x61, 0xe0, 0x45, 0xc8, 0x65, 0xe8, 0x49, 0xcc, 0x69, 0xec, 0x131, 0xec, 0x4f, 0xd2, 0x6f, 0xf2, 0x55, 0xd9, 0x75, 0xf9, 0}}, +[LUML-LIGS] {0xa8, {0x41, 0xc4, 0x61, 0xe4, 0x45, 0xcb, 0x65, 0xeb, 0x49, 0xcf, 0x69, 0xef, 0x4f, 0xd6, 0x6f, 0xf6, 0x55, 0xdc, 0x75, 0xfc, 0x59, 0x178, 0x79, 0xff, 0}}, +[LCED-LIGS] {0xb8, {0x43, 0xc7, 0x63, 0xe7, 0x47, 0x122, 0x4b, 0x136, 0x6b, 0x137, 0x4c, 0x13b, 0x6c, 0x13c, 0x4e, 0x145, 0x6e, 0x146, 0x52, 0x156, 0x72, 0x157, 0x53, 0x15e, 0x73, 0x15f, 0x54, 0x162, 0x74, 0x163, 0}}, +[LTIL-LIGS] {0x2dc, {0x41, 0xc3, 0x61, 0xe3, 0x49, 0x128, 0x69, 0x129, 0x131, 0x129, 0x4e, 0xd1, 0x6e, 0xf1, 0x4f, 0xd5, 0x6f, 0xf5, 0x55, 0x168, 0x75, 0x169, 0}}, +[LBRV-LIGS] {0x2d8, {0x41, 0x102, 0x61, 0x103, 0x45, 0x114, 0x65, 0x115, 0x47, 0x11e, 0x67, 0x11f, 0x49, 0x12c, 0x69, 0x12d, 0x131, 0x12d, 0x4f, 0x14e, 0x6f, 0x14f, 0x55, 0x16c, 0x75, 0x16d, 0}}, +[LRNG-LIGS] {0x2da, {0x41, 0xc5, 0x61, 0xe5, 0x55, 0x16e, 0x75, 0x16f, 0}}, +[LDOT-LIGS] {0x2d9, {0x43, 0x10a, 0x63, 0x10b, 0x45, 0x116, 0x65, 0x117, 0x47, 0x120, 0x67, 0x121, 0x49, 0x130, 0x4c, 0x13f, 0x6c, 0x140, 0x5a, 0x17b, 0x7a, 0x17c, 0}}, +[LDTB-LIGS] {0x2e, {0}}, +[LFRN-LIGS] {0x2322, {0x41, 0xc2, 0x61, 0xe2, 0x43, 0x108, 0x63, 0x109, 0x45, 0xca, 0x65, 0xea, 0x47, 0x11c, 0x67, 0x11d, 0x48, 0x124, 0x68, 0x125, 0x49, 0xce, 0x69, 0xee, 0x131, 0xee, 0x4a, 0x134, 0x6a, 0x135, 0x4f, 0xd4, 0x6f, 0xf4, 0x53, 0x15c, 0x73, 0x15d, 0x55, 0xdb, 0x75, 0xfb, 0x57, 0x174, 0x77, 0x175, 0x59, 0x176, 0x79, 0x177, 0}}, +[LFRB-LIGS] {0x32f, {0}}, +[LOGO-LIGS] {0x2db, {0x41, 0x104, 0x61, 0x105, 0x45, 0x118, 0x65, 0x119, 0x49, 0x12e, 0x69, 0x12f, 0x131, 0x12f, 0x55, 0x172, 0x75, 0x173, 0}}, +[LMAC-LIGS] {0xaf, {0x41, 0x100, 0x61, 0x101, 0x45, 0x112, 0x65, 0x113, 0x49, 0x12a, 0x69, 0x12b, 0x131, 0x12b, 0x4f, 0x14c, 0x6f, 0x14d, 0x55, 0x16a, 0x75, 0x16b, 0}}, +[LHCK-LIGS] {0x2c7, {0x43, 0x10c, 0x63, 0x10d, 0x44, 0x10e, 0x64, 0x10f, 0x45, 0x11a, 0x65, 0x11b, 0x4c, 0x13d, 0x6c, 0x13e, 0x4e, 0x147, 0x6e, 0x148, 0x52, 0x158, 0x72, 0x159, 0x53, 0x160, 0x73, 0x161, 0x54, 0x164, 0x74, 0x165, 0x5a, 0x17d, 0x7a, 0x17e, 0}}, +[LASP-LIGS] {0x2bd, {0}}, +[LLEN-LIGS] {0x2bc, {0}}, +[LBRB-LIGS] {0x32e, {0}} +}; + +Rune multitab[Nmulti][5] = { +[MAAS-MULTI] {0x2bd, 0x3b1, 0}, +[MALN-MULTI] {0x2bc, 0x3b1, 0}, +[MAND-MULTI] {0x61, 0x6e, 0x64, 0}, +[MAOQ-MULTI] {0x61, 0x2f, 0x71, 0}, +[MBRA-MULTI] {0x3c, 0x7c, 0}, +[MDD-MULTI] {0x2e, 0x2e, 0}, +[MDDD-MULTI] {0x2e, 0x2e, 0x2e, 0}, +[MEAS-MULTI] {0x2bd, 0x3b5, 0}, +[MELN-MULTI] {0x2bc, 0x3b5, 0}, +[MEMM-MULTI] {0x2014, 0x2014, 0}, +[MHAS-MULTI] {0x2bd, 0x3b7, 0}, +[MHLN-MULTI] {0x2bc, 0x3b7, 0}, +[MIAS-MULTI] {0x2bd, 0x3b9, 0}, +[MILN-MULTI] {0x2bc, 0x3b9, 0}, +[MLCT-MULTI] {0x63, 0x74, 0}, +[MLFF-MULTI] {0x66, 0x66, 0}, +[MLFFI-MULTI] {0x66, 0x66, 0x69, 0}, +[MLFFL-MULTI] {0x66, 0x66, 0x6c, 0}, +[MLFL-MULTI] {0x66, 0x6c, 0}, +[MLFI-MULTI] {0x66, 0x69, 0}, +[MLLS-MULTI] {0x26b, 0x26b, 0}, +[MLST-MULTI] {0x73, 0x74, 0}, +[MOAS-MULTI] {0x2bd, 0x3bf, 0}, +[MOLN-MULTI] {0x2bc, 0x3bf, 0}, +[MOR-MULTI] {0x6f, 0x72, 0}, +[MRAS-MULTI] {0x2bd, 0x3c1, 0}, +[MRLN-MULTI] {0x2bc, 0x3c1, 0}, +[MTT-MULTI] {0x7e, 0x7e, 0}, +[MUAS-MULTI] {0x2bd, 0x3c5, 0}, +[MULN-MULTI] {0x2bc, 0x3c5, 0}, +[MWAS-MULTI] {0x2bd, 0x3c9, 0}, +[MWLN-MULTI] {0x2bc, 0x3c9, 0}, +[MOE-MULTI] {0x6f, 0x65, 0}, +[MES-MULTI] {0x20, 0x20, 0}, +}; + +#define risupper(r) (0x41 <= (r) && (r) <= 0x5a) +#define rislatin1(r) (0xC0 <= (r) && (r) <= 0xFF) +#define rtolower(r) ((r)-'A'+'a') + +static Rune latin_fold_tab[] = +{ +/* Table to fold latin 1 characters to ASCII equivalents + based at Rune value 0xc0 + + À Á Â Ã Ä Å Æ Ç + È É Ê Ë Ì Í Î Ï + Ð Ñ Ò Ó Ô Õ Ö × + Ø Ù Ú Û Ü Ý Þ ß + à á â ã ä å æ ç + è é ê ë ì í î ï + ð ñ ò ó ô õ ö ÷ + ø ù ú û ü ý þ ÿ +*/ + 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'c', + 'e', 'e', 'e', 'e', 'i', 'i', 'i', 'i', + 'd', 'n', 'o', 'o', 'o', 'o', 'o', 0 , + 'o', 'u', 'u', 'u', 'u', 'y', 0 , 0 , + 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'c', + 'e', 'e', 'e', 'e', 'i', 'i', 'i', 'i', + 'd', 'n', 'o', 'o', 'o', 'o', 'o', 0 , + 'o', 'u', 'u', 'u', 'u', 'y', 0 , 'y', +}; + +static Rune *ttabstack[20]; +static int ntt; + +/* + * tab is an array of n Assoc's, sorted by key. + * Look for key in tab, and return corresponding val + * or -1 if not there + */ +long +lookassoc(Assoc *tab, int n, char *key) +{ + Assoc *q; + long i, low, high; + int r; + + for(low = -1, high = n; high > low+1; ){ + i = (high+low)/2; + q = &tab[i]; + if((r=strcmp(key, q->key))<0) + high = i; + else if(r == 0) + return q->val; + else + low=i; + } + return -1; +} + +long +looknassoc(Nassoc *tab, int n, long key) +{ + Nassoc *q; + long i, low, high; + + for(low = -1, high = n; high > low+1; ){ + i = (high+low)/2; + q = &tab[i]; + if(key < q->key) + high = i; + else if(key == q->key) + return q->val; + else + low=i; + } + return -1; +} + +void +err(char *fmt, ...) +{ + char buf[1000]; + va_list v; + + va_start(v, fmt); + vsnprint(buf, sizeof(buf), fmt, v); + va_end(v); + fprint(2, "%s: %s\n", argv0, buf); +} + +/* + * Write the rune r to bout, keeping track of line length + * and breaking the lines (at blanks) when they get too long + */ +void +outrune(long r) +{ + if(outinhibit) + return; + if(++linelen > breaklen && r == 0x20) { + Bputc(bout, '\n'); + linelen = 0; + } else + Bputrune(bout, r); +} + +void +outrunes(Rune *rp) +{ + Rune r; + + while((r = *rp++) != 0) + outrune(r); +} + +/* like outrune, but when arg is know to be a char */ +void +outchar(int c) +{ + if(outinhibit) + return; + if(++linelen > breaklen && c == ' ') { + c ='\n'; + linelen = 0; + } + Bputc(bout, c); +} + +void +outchars(char *s) +{ + char c; + + while((c = *s++) != 0) + outchar(c); +} + +void +outprint(char *fmt, ...) +{ + char buf[1000]; + va_list v; + + va_start(v, fmt); + vsnprint(buf, sizeof(buf), fmt, v); + va_end(v); + outchars(buf); +} + +void +outpiece(char *b, char *e) +{ + int c, lastc; + + lastc = 0; + while(b < e) { + c = *b++; + if(c == '\n') + c = ' '; + if(!(c == ' ' && lastc == ' ')) + outchar(c); + lastc = c; + } +} + +/* + * Go to new line if not already there; indent if ind != 0. + * If ind > 1, leave a blank line too. + * Slight hack: assume if current line is only one or two + * characters long, then they were spaces. + */ +void +outnl(int ind) +{ + if(outinhibit) + return; + if(ind) { + if(ind > 1) { + if(linelen > 2) + Bputc(bout, '\n'); + Bprint(bout, "\n "); + } else if(linelen == 0) + Bprint(bout, " "); + else if(linelen == 1) + Bputc(bout, ' '); + else if(linelen != 2) + Bprint(bout, "\n "); + linelen = 2; + } else { + if(linelen) { + Bputc(bout, '\n'); + linelen = 0; + } + } +} + +/* + * Fold the runes in null-terminated rp. + * Use the sort(1) definition of folding (uppercase to lowercase, + * latin1-accented characters to corresponding unaccented chars) + */ +void +fold(Rune *rp) +{ + Rune r; + + while((r = *rp) != 0) { + if (rislatin1(r) && latin_fold_tab[r-0xc0]) + r = latin_fold_tab[r-0xc0]; + if(risupper(r)) + r = rtolower(r); + *rp++ = r; + } +} + +/* + * Like fold, but put folded result into new + * (assumed to have enough space). + * old is a regular expression, but we know that + * metacharacters aren't affected + */ +void +foldre(char *new, char *old) +{ + Rune r; + + while(*old) { + old += chartorune(&r, old); + if (rislatin1(r) && latin_fold_tab[r-0xc0]) + r = latin_fold_tab[r-0xc0]; + if(risupper(r)) + r = rtolower(r); + new += runetochar(new, &r); + } + *new = 0; +} + +/* + * acomp(s, t) returns: + * -2 if s strictly precedes t + * -1 if s is a prefix of t + * 0 if s is the same as t + * 1 if t is a prefix of s + * 2 if t strictly precedes s + */ + +int +acomp(Rune *s, Rune *t) +{ + int cs, ct; + + for(;;) { + cs = *s; + ct = *t; + if(cs != ct) + break; + if(cs == 0) + return 0; + s++; + t++; + } + if(cs == 0) + return -1; + if(ct == 0) + return 1; + if(cs < ct) + return -2; + return 2; +} + +/* + * Copy null terminated Runes from 'from' to 'to'. + */ +void +runescpy(Rune *to, Rune *from) +{ + while((*to++ = *from++) != 0) + continue; +} + +/* + * Conversion of unsigned number to long, no overflow detection + */ +long +runetol(Rune *r) +{ + int c; + long n; + + n = 0; + for(;; r++){ + c = *r; + if(0x30<=c && c<=0x39) + c -= '0'; + else + break; + n = n*10 + c; + } + return n; +} + +/* + * See if there is a rune corresponding to the accented + * version of r with accent acc (acc in [LIGS..LIGE-1]), + * and return it if so, else return NONE. + */ +Rune +liglookup(Rune acc, Rune r) +{ + Rune *p; + + if(acc < LIGS || acc >= LIGE) + return NONE; + for(p = ligtab[acc-LIGS].pairs; *p; p += 2) + if(*p == r) + return *(p+1); + return NONE; +} + +/* + * Maintain a translation table stack (a translation table + * is an array of Runes indexed by bytes or 7-bit bytes). + * If starting is true, push the curtab onto the stack + * and return newtab; else pop the top of the stack and + * return it. + * If curtab is 0, initialize the stack and return. + */ +Rune * +changett(Rune *curtab, Rune *newtab, int starting) +{ + if(curtab == 0) { + ntt = 0; + return 0; + } + if(starting) { + if(ntt >= asize(ttabstack)) { + if(debug) + err("translation stack overflow"); + return curtab; + } + ttabstack[ntt++] = curtab; + return newtab; + } else { + if(ntt == 0) { + if(debug) + err("translation stack underflow"); + return curtab; + } + return ttabstack[--ntt]; + } +} -- cgit v1.2.3