diff options
-rw-r--r-- | src/cmd/tcs/conv.h | 2 | ||||
-rw-r--r-- | src/cmd/tcs/conv_big5.c | 1 | ||||
-rw-r--r-- | src/cmd/tcs/conv_gb.c | 1 | ||||
-rw-r--r-- | src/cmd/tcs/conv_jis.c | 1 | ||||
-rw-r--r-- | src/cmd/tcs/conv_ksc.c | 1 | ||||
-rw-r--r-- | src/cmd/tcs/html.c | 128 | ||||
-rw-r--r-- | src/cmd/tcs/mkfile | 6 | ||||
-rw-r--r-- | src/cmd/tcs/tcs.c | 212 | ||||
-rw-r--r-- | src/cmd/tcs/utf.c | 30 |
9 files changed, 320 insertions, 62 deletions
diff --git a/src/cmd/tcs/conv.h b/src/cmd/tcs/conv.h index fc35a105..5e18a065 100644 --- a/src/cmd/tcs/conv.h +++ b/src/cmd/tcs/conv.h @@ -13,6 +13,8 @@ void uksc_in(int fd, long *notused, struct convert *out); void uksc_out(Rune *base, int n, long *notused); void html_in(int fd, long *notused, struct convert *out); void html_out(Rune *base, int n, long *notused); +void tune_in(int fd, long *notused, struct convert *out); +void tune_out(Rune *base, int n, long *notused); #define emit(x) *(*r)++ = (x) #define NRUNE 65536 diff --git a/src/cmd/tcs/conv_big5.c b/src/cmd/tcs/conv_big5.c index 111bf5c4..496cae66 100644 --- a/src/cmd/tcs/conv_big5.c +++ b/src/cmd/tcs/conv_big5.c @@ -110,6 +110,7 @@ big5_in(int fd, long *notused, struct convert *out) big5proc(-1, &r, nin); if(r > ob) OUT(out, ob, r-ob); + OUT(out, ob, 0); } void diff --git a/src/cmd/tcs/conv_gb.c b/src/cmd/tcs/conv_gb.c index 70835257..6838b774 100644 --- a/src/cmd/tcs/conv_gb.c +++ b/src/cmd/tcs/conv_gb.c @@ -88,6 +88,7 @@ gb_in(int fd, long *notused, struct convert *out) gbproc(-1, &r, nin); if(r > ob) OUT(out, ob, r-ob); + OUT(out, ob, 0); } void diff --git a/src/cmd/tcs/conv_jis.c b/src/cmd/tcs/conv_jis.c index 18579d70..86275141 100644 --- a/src/cmd/tcs/conv_jis.c +++ b/src/cmd/tcs/conv_jis.c @@ -363,6 +363,7 @@ do_in(int fd, void (*procfn)(int, Rune **, long), struct convert *out) (*procfn)(-1, &r, nin); if(r > ob) OUT(out, ob, r-ob); + OUT(out, ob, 0); } void diff --git a/src/cmd/tcs/conv_ksc.c b/src/cmd/tcs/conv_ksc.c index cbc17f5b..293ffad1 100644 --- a/src/cmd/tcs/conv_ksc.c +++ b/src/cmd/tcs/conv_ksc.c @@ -109,6 +109,7 @@ uksc_in(int fd, long *notused, struct convert *out) ukscproc(-1, &r, nin); if(r > ob) OUT(out, ob, r-ob); + OUT(out, ob, 0); } void diff --git a/src/cmd/tcs/html.c b/src/cmd/tcs/html.c index 8a27f1c2..89436060 100644 --- a/src/cmd/tcs/html.c +++ b/src/cmd/tcs/html.c @@ -19,132 +19,251 @@ static Hchar byname[] = {"Aacute", 193}, {"Acirc", 194}, {"Agrave", 192}, + {"Alpha", 913}, {"Aring", 197}, {"Atilde", 195}, {"Auml", 196}, + {"Beta", 914}, {"Ccedil", 199}, + {"Chi", 935}, + {"Dagger", 8225}, + {"Delta", 916}, {"ETH", 208}, {"Eacute", 201}, {"Ecirc", 202}, {"Egrave", 200}, + {"Epsilon", 917}, + {"Eta", 919}, {"Euml", 203}, + {"Gamma", 915}, {"Iacute", 205}, {"Icirc", 206}, {"Igrave", 204}, + {"Iota", 921}, {"Iuml", 207}, + {"Kappa", 922}, + {"Lambda", 923}, + {"Mu", 924}, {"Ntilde", 209}, + {"Nu", 925}, + {"OElig", 338}, {"Oacute", 211}, {"Ocirc", 212}, {"Ograve", 210}, + {"Omega", 937}, + {"Omicron", 927}, {"Oslash", 216}, {"Otilde", 213}, {"Ouml", 214}, + {"Phi", 934}, + {"Pi", 928}, + {"Prime", 8243}, + {"Psi", 936}, + {"Rho", 929}, + {"Scaron", 352}, + {"Sigma", 931}, {"THORN", 222}, + {"Tau", 932}, + {"Theta", 920}, {"Uacute", 218}, {"Ucirc", 219}, {"Ugrave", 217}, + {"Upsilon", 933}, {"Uuml", 220}, + {"Xi", 926}, {"Yacute", 221}, + {"Yuml", 376}, + {"Zeta", 918}, {"aacute", 225}, {"acirc", 226}, {"acute", 180}, {"aelig", 230}, {"agrave", 224}, + {"alefsym", 8501}, {"alpha", 945}, + {"amp", 38}, + {"and", 8743}, + {"ang", 8736}, {"aring", 229}, + {"asymp", 8776}, {"atilde", 227}, {"auml", 228}, + {"bdquo", 8222}, {"beta", 946}, {"brvbar", 166}, + {"bull", 8226}, + {"cap", 8745}, {"ccedil", 231}, {"cdots", 8943}, {"cedil", 184}, {"cent", 162}, {"chi", 967}, + {"circ", 710}, + {"clubs", 9827}, + {"cong", 8773}, {"copy", 169}, + {"crarr", 8629}, + {"cup", 8746}, {"curren", 164}, + {"dArr", 8659}, + {"dagger", 8224}, + {"darr", 8595}, {"ddots", 8945}, {"deg", 176}, {"delta", 948}, + {"diams", 9830}, {"divide", 247}, {"eacute", 233}, {"ecirc", 234}, {"egrave", 232}, {"emdash", 8212}, /* non-standard but commonly used */ + {"empty", 8709}, {"emsp", 8195}, {"endash", 8211}, /* non-standard but commonly used */ {"ensp", 8194}, {"epsilon", 949}, + {"equiv", 8801}, {"eta", 951}, {"eth", 240}, {"euml", 235}, + {"euro", 8364}, + {"exist", 8707}, + {"fnof", 402}, + {"forall", 8704}, {"frac12", 189}, {"frac14", 188}, {"frac34", 190}, + {"frasl", 8260}, {"gamma", 947}, + {"ge", 8805}, + {"gt", 62}, + {"hArr", 8660}, + {"harr", 8596}, + {"hearts", 9829}, + {"hellip", 8230}, {"iacute", 237}, {"icirc", 238}, {"iexcl", 161}, {"igrave", 236}, + {"image", 8465}, + {"infin", 8734}, + {"int", 8747}, {"iota", 953}, {"iquest", 191}, + {"isin", 8712}, {"iuml", 239}, {"kappa", 954}, + {"lArr", 8656}, {"lambda", 955}, + {"lang", 9001}, {"laquo", 171}, - {"ldquo", 8220}, + {"larr", 8592}, + {"lceil", 8968}, {"ldots", 8230}, + {"ldquo", 8220}, + {"le", 8804}, + {"lfloor", 8970}, + {"lowast", 8727}, + {"loz", 9674}, + {"lrm", 8206}, + {"lsaquo", 8249}, {"lsquo", 8216}, + {"lt", 60}, {"macr", 175}, {"mdash", 8212}, {"micro", 181}, {"middot", 183}, + {"minus", 8722}, {"mu", 956}, + {"nabla", 8711}, {"nbsp", 160}, {"ndash", 8211}, + {"ne", 8800}, + {"ni", 8715}, {"not", 172}, + {"notin", 8713}, + {"nsub", 8836}, {"ntilde", 241}, {"nu", 957}, {"oacute", 243}, {"ocirc", 244}, + {"oelig", 339}, {"ograve", 242}, + {"oline", 8254}, {"omega", 969}, {"omicron", 959}, + {"oplus", 8853}, + {"or", 8744}, {"ordf", 170}, {"ordm", 186}, {"oslash", 248}, {"otilde", 245}, + {"otimes", 8855}, {"ouml", 246}, {"para", 182}, + {"part", 8706}, + {"permil", 8240}, + {"perp", 8869}, {"phi", 966}, {"pi", 960}, + {"piv", 982}, {"plusmn", 177}, {"pound", 163}, + {"prime", 8242}, + {"prod", 8719}, + {"prop", 8733}, {"psi", 968}, {"quad", 8193}, + {"quot", 34}, + {"rArr", 8658}, + {"radic", 8730}, + {"rang", 9002}, {"raquo", 187}, + {"rarr", 8594}, + {"rceil", 8969}, {"rdquo", 8221}, + {"real", 8476}, {"reg", 174}, + {"rfloor", 8971}, {"rho", 961}, + {"rlm", 8207}, + {"rsaquo", 8250}, {"rsquo", 8217}, + {"sbquo", 8218}, + {"scaron", 353}, + {"sdot", 8901}, {"sect", 167}, {"shy", 173}, {"sigma", 963}, + {"sigmaf", 962}, + {"sim", 8764}, {"sp", 8194}, + {"spades", 9824}, + {"sub", 8834}, + {"sube", 8838}, + {"sum", 8721}, + {"sup", 8835}, {"sup1", 185}, {"sup2", 178}, {"sup3", 179}, + {"supe", 8839}, {"szlig", 223}, {"tau", 964}, + {"there4", 8756}, {"theta", 952}, + {"thetasym", 977}, {"thinsp", 8201}, {"thorn", 254}, + {"tilde", 732}, {"times", 215}, {"trade", 8482}, + {"uArr", 8657}, {"uacute", 250}, + {"uarr", 8593}, {"ucirc", 251}, {"ugrave", 249}, {"uml", 168}, + {"upsih", 978}, {"upsilon", 965}, {"uuml", 252}, {"varepsilon", 8712}, @@ -154,11 +273,14 @@ static Hchar byname[] = {"vdots", 8942}, {"vsigma", 962}, {"vtheta", 977}, + {"weierp", 8472}, {"xi", 958}, {"yacute", 253}, {"yen", 165}, {"yuml", 255}, - {"zeta", 950} + {"zeta", 950}, + {"zwj", 8205}, + {"zwnj", 8204} }; static Hchar byrune[nelem(byname)]; @@ -302,6 +424,7 @@ html_in(int fd, long *x, struct convert *out) } if(r > rbuf) OUT(out, rbuf, r-rbuf); + OUT(out, rbuf, 0); } /* @@ -314,6 +437,7 @@ html_out(Rune *r, int n, long *x) Biobuf b; Rune *er; + USED(x); html_init(); Binit(&b, 1, OWRITE); er = r+n; diff --git a/src/cmd/tcs/mkfile b/src/cmd/tcs/mkfile index fc1ce5ab..1b86ab94 100644 --- a/src/cmd/tcs/mkfile +++ b/src/cmd/tcs/mkfile @@ -11,7 +11,8 @@ OFILES=tcs.$O\ kuten208.$O\ gb.$O\ ksc.$O\ - big5.$O + big5.$O\ + tune.$O\ <$PLAN9/src/mkone CFLAGS= -DPLAN9 $CFLAGS @@ -23,6 +24,9 @@ tcs.$O big5.$O: big5.h tcs.$O gb.$O: gb.h tcs.$O: cyrillic.h tcs.$O: conv.h +tcs.$O: 8859.h +tcs.$O: ms.h +tcs.$O: misc.h conv%.$O: conv.h conv_ksc.$O: ksc.h diff --git a/src/cmd/tcs/tcs.c b/src/cmd/tcs/tcs.c index bb2f61f7..d7d18e41 100644 --- a/src/cmd/tcs/tcs.c +++ b/src/cmd/tcs/tcs.c @@ -54,7 +54,7 @@ main(int argc, char **argv) clean = 1; break; case 'f': - from = ARGF(); + from = EARGF(usage()); break; case 'l': listem = 1; @@ -63,7 +63,7 @@ main(int argc, char **argv) squawk = 0; break; case 't': - to = ARGF(); + to = EARGF(usage()); break; case 'v': verbose = 1; @@ -160,7 +160,7 @@ conv(char *name, int from) struct convert *c; for(c = convert; c->name; c++){ - if(strcmp(c->name, name) != 0) + if(cistrcmp(c->name, name) != 0) continue; if(c->flags&Table) return(c); @@ -208,23 +208,79 @@ unicode_in(int fd, long *notused, struct convert *out) } while((n = read(fd, (char *)buf, 2*N)) > 0){ ninput += n; + if(swabme) + swab2((char *)buf, n); if(n&1){ if(squawk) EPR "%s: odd byte count in %s\n", argv0, file); nerrors++; if(clean) n--; - else { - n++; - buf[n/2] = Runeerror; - if(swabme) /* swab so later swab undoes it */ - swab2((char *)&buf[n/2], 2); - } + else + buf[n++/2] = Runeerror; + } + OUT(out, buf, n/2); + } +} + +void +unicode_in_be(int fd, long *notused, struct convert *out) +{ + int i, n; + Rune buf[N], r; + uchar *p; + + USED(notused); + while((n = read(fd, (char *)buf, 2*N)) > 0){ + ninput += n; + p = (uchar*)buf; + for(i=0; i<n/2; i++){ + r = *p++<<8; + r |= *p++; + buf[i] = r; + } + if(n&1){ + if(squawk) + EPR "%s: odd byte count in %s\n", argv0, file); + nerrors++; + if(clean) + n--; + else + buf[n++/2] = Runeerror; } - if(swabme) - swab2((char *)buf, n); OUT(out, buf, n/2); } + OUT(out, buf, 0); +} + +void +unicode_in_le(int fd, long *notused, struct convert *out) +{ + int i, n; + Rune buf[N], r; + uchar *p; + + USED(notused); + while((n = read(fd, (char *)buf, 2*N)) > 0){ + ninput += n; + p = (uchar*)buf; + for(i=0; i<n/2; i++){ + r = *p++; + r |= *p++<<8; + buf[i] = r; + } + if(n&1){ + if(squawk) + EPR "%s: odd byte count in %s\n", argv0, file); + nerrors++; + if(clean) + n--; + else + buf[n++/2] = Runeerror; + } + OUT(out, buf, n/2); + } + OUT(out, buf, 0); } void @@ -245,6 +301,44 @@ unicode_out(Rune *base, int n, long *notused) } void +unicode_out_be(Rune *base, int n, long *notused) +{ + int i; + uchar *p; + Rune r; + + USED(notused); + p = (uchar*)base; + for(i=0; i<n; i++){ + r = base[i]; + *p++ = r>>8; + *p++ = r; + } + nrunes += n; + noutput += 2*n; + write(1, (char *)base, 2*n); +} + +void +unicode_out_le(Rune *base, int n, long *notused) +{ + int i; + uchar *p; + Rune r; + + USED(notused); + p = (uchar*)base; + for(i=0; i<n; i++){ + r = base[i]; + *p++ = r; + *p++ = r>>8; + } + nrunes += n; + noutput += 2*n; + write(1, (char *)base, 2*n); +} + +void intable(int fd, long *table, struct convert *out) { uchar buf[N]; @@ -270,6 +364,7 @@ intable(int fd, long *table, struct convert *out) } OUT(out, runes, r-runes); } + OUT(out, runes, 0); if(n < 0){ #ifdef PLAN9 EPR "%s: input read: %r\n", argv0); @@ -403,64 +498,91 @@ struct convert convert[] = { "av", "Alternativnyj Variant", Table, (void *)tabav }, { "big5", "Big 5 (HKU)", From|Func, 0, (Fnptr)big5_in }, { "big5", "Big 5 (HKU)", Func, 0, (Fnptr)big5_out }, - { "cp437", "Code Page 437 (US)", Table, (void*)tabcp437 }, - { "cp720", "Code Page 720 (Arabic)", Table, (void*)tabcp720 }, - { "cp737", "Code Page 737 (Greek)", Table, (void*)tabcp737 }, - { "cp775", "Code Page 775 (Baltic)", Table, (void*)tabcp775 }, - { "cp850", "Code Page 850 (Multilingual Latin I)", Table, (void*)tabcp850 }, - { "cp852", "Code Page 852 (Latin II)", Table, (void*)tabcp852 }, - { "cp855", "Code Page 855 (Cyrillic)", Table, (void*)tabcp855 }, - { "cp857", "Code Page 857 (Turkish)", Table, (void*)tabcp857 }, - { "cp858", "Code Page 858 (Multilingual Latin I+Euro)", Table, (void*)tabcp858 }, - { "cp862", "Code Page 862 (Hebrew)", Table, (void*)tabcp862 }, - { "cp866", "Code Page 866 (Russian)", Table, (void*)tabcp866 }, - { "cp874", "Code Page 874 (Thai)", Table, (void*)tabcp874 }, - { "cp1250", "Code Page 1250 (Central Europe)", Table, (void *)tabcp1250 }, - { "cp1251", "Code Page 1251 (Cyrillic)", Table, (void *)tabcp1251 }, - { "cp1252", "Code Page 1252 (Latin I)", Table, (void *)tabcp1252 }, - { "cp1253", "Code Page 1253 (Greek)", Table, (void *)tabcp1253 }, - { "cp1254", "Code Page 1254 (Turkish)", Table, (void *)tabcp1254 }, - { "cp1255", "Code Page 1255 (Hebrew)", Table, (void *)tabcp1255 }, - { "cp1256", "Code Page 1256 (Arabic)", Table, (void *)tabcp1256 }, - { "cp1257", "Code Page 1257 (Baltic)", Table, (void *)tabcp1257 }, - { "cp1258", "Code Page 1258 (Vietnam)", Table, (void *)tabcp1258 }, { "ebcdic", "EBCDIC", Table, (void *)tabebcdic }, /* 6f is recommended bad map */ { "euc-k", "Korean EUC: ASCII+KS C 5601 1987", From|Func, 0, (Fnptr)uksc_in }, { "euc-k", "Korean EUC: ASCII+KS C 5601 1987", Func, 0, (Fnptr)uksc_out }, - { "gb", "GB2312-80 (Chinese)", From|Func, 0, (Fnptr)gb_in }, - { "gb", "GB2312-80 (Chinese)", Func, 0, (Fnptr)gb_out }, + { "gb2312", "GB2312-80 (Chinese)", From|Func, 0, (Fnptr)gb_in }, + { "gb2312", "GB2312-80 (Chinese)", Func, 0, (Fnptr)gb_out }, { "html", "HTML", From|Func, 0, (Fnptr)html_in }, { "html", "HTML", Func, 0, (Fnptr)html_out }, + { "ibm437", "IBM Code Page 437 (US)", Table, (void*)tabcp437 }, + { "ibm720", "IBM Code Page 720 (Arabic)", Table, (void*)tabcp720 }, + { "ibm737", "IBM Code Page 737 (Greek)", Table, (void*)tabcp737 }, + { "ibm775", "IBM Code Page 775 (Baltic)", Table, (void*)tabcp775 }, + { "ibm850", "IBM Code Page 850 (Multilingual Latin I)", Table, (void*)tabcp850 }, + { "ibm852", "IBM Code Page 852 (Latin II)", Table, (void*)tabcp852 }, + { "ibm855", "IBM Code Page 855 (Cyrillic)", Table, (void*)tabcp855 }, + { "ibm857", "IBM Code Page 857 (Turkish)", Table, (void*)tabcp857 }, + { "ibm858", "IBM Code Page 858 (Multilingual Latin I+Euro)", Table, (void*)tabcp858 }, + { "ibm862", "IBM Code Page 862 (Hebrew)", Table, (void*)tabcp862 }, + { "ibm866", "IBM Code Page 866 (Russian)", Table, (void*)tabcp866 }, + { "ibm874", "IBM Code Page 874 (Thai)", Table, (void*)tabcp874 }, + { "iso-2022-jp", "alias for jis-kanji (MIME)", From|Func, 0, (Fnptr)jisjis_in }, + { "iso-2022-jp", "alias for jis-kanji (MIME)", Func, 0, (Fnptr)jisjis_out }, + { "iso-8859-1", "alias for 8859-1 (MIME)", Table, (void *)tab8859_1 }, + { "iso-8859-2", "alias for 8859-2 (MIME)", Table, (void *)tab8859_2 }, + { "iso-8859-3", "alias for 8859-3 (MIME)", Table, (void *)tab8859_3 }, + { "iso-8859-4", "alias for 8859-4 (MIME)", Table, (void *)tab8859_4 }, + { "iso-8859-5", "alias for 8859-5 (MIME)", Table, (void *)tab8859_5 }, + { "iso-8859-6", "alias for 8859-6 (MIME)", Table, (void *)tab8859_6 }, + { "iso-8859-7", "alias for 8859-7 (MIME)", Table, (void *)tab8859_7 }, + { "iso-8859-8", "alias for 8859-8 (MIME)", Table, (void *)tab8859_8 }, + { "iso-8859-9", "alias for 8859-9 (MIME)", Table, (void *)tab8859_9 }, + { "iso-8859-10", "alias for 8859-10 (MIME)", Table, (void *)tab8859_10 }, + { "iso-8859-15", "alias for 8859-15 (MIME)", Table, (void *)tab8859_15 }, { "jis", "guesses at the JIS encoding", From|Func, 0, (Fnptr)jis_in }, { "jis-kanji", "ISO 2022-JP (Japanese)", From|Func, 0, (Fnptr)jisjis_in }, { "jis-kanji", "ISO 2022-JP (Japanese)", Func, 0, (Fnptr)jisjis_out }, { "koi8", "KOI-8 (GOST 19769-74)", Table, (void *)tabkoi8 }, - { "latin1", "ISO 8859-1", Table, (void *)tab8859_1 }, + { "koi8-r", "alias for koi8 (MIME)", Table, (void *)tabkoi8 }, + { "latin1", "alias for 8859-1", Table, (void *)tab8859_1 }, { "macrom", "Macintosh Standard Roman character set", Table, (void *)tabmacroman }, - { "microsoft", "Windows (CP 1252)", Table, (void *)tabcp1252 }, - { "msdos", "IBM PC (CP 437)", Table, (void *)tabcp437 }, - { "msdos2", "IBM PC (CP 437 with graphics in C0)", Table, (void *)tabmsdos2 }, + { "microsoft", "alias for windows1252", Table, (void *)tabcp1252 }, { "ms-kanji", "Microsoft, or Shift-JIS", From|Func, 0, (Fnptr)msjis_in }, { "ms-kanji", "Microsoft, or Shift-JIS", Func, 0, (Fnptr)msjis_out }, + { "msdos", "IBM PC (alias for ibm437)", Table, (void *)tabcp437 }, + { "msdos2", "IBM PC (ibm437 with graphics in C0)", Table, (void *)tabmsdos2 }, { "next", "NEXTSTEP character set", Table, (void *)tabnextstep }, { "ov", "Osnovnoj Variant", Table, (void *)tabov }, - { "ps2", "IBM PS/2: (CP 850)", Table, (void *)tabcp850 }, + { "ps2", "IBM PS/2: (alias for ibm850)", Table, (void *)tabcp850 }, { "sf1", "ISO-646: Finnish/Swedish SF-1 variant", Table, (void *)tabsf1 }, { "sf2", "ISO-646: Finnish/Swedish SF-2 variant (recommended)", Table, (void *)tabsf2 }, - { "tis", "Thai+ASCII (TIS 620-1986)", Table, (void *)tabtis620 }, + { "tis-620", "Thai+ASCII (TIS 620-1986)", Table, (void *)tabtis620 }, + { "tune", "TUNE (Tamil)", From|Func, 0, (Fnptr)tune_in }, + { "tune", "TUNE (Tamil)", Func, 0, (Fnptr)tune_out }, { "ucode", "Russian U-code", Table, (void *)tabucode }, { "ujis", "EUC-JX: JIS 0208", From|Func, 0, (Fnptr)ujis_in }, { "ujis", "EUC-JX: JIS 0208", Func, 0, (Fnptr)ujis_out }, { "unicode", "Unicode 1.1", From|Func, 0, (Fnptr)unicode_in }, { "unicode", "Unicode 1.1", Func, 0, (Fnptr)unicode_out }, - { "utf1", "UTF-1 (ISO 10646 Annex A)", From|Func, 0, (Fnptr)isoutf_in }, - { "utf1", "UTF-1 (ISO 10646 Annex A)", Func, 0, (Fnptr)isoutf_out }, + { "unicode-be", "Unicode 1.1 big-endian", From|Func, 0, (Fnptr)unicode_in_be }, + { "unicode-be", "Unicode 1.1 big-endian", Func, 0, (Fnptr)unicode_out_be }, + { "unicode-le", "Unicode 1.1 little-endian", From|Func, 0, (Fnptr)unicode_in_le }, + { "unicode-le", "Unicode 1.1 little-endian", Func, 0, (Fnptr)unicode_out_le }, + { "us-ascii", "alias for ascii (MIME)", Table, (void *)tabascii }, { "utf", "FSS-UTF a.k.a. UTF-8", From|Func, 0, (Fnptr)utf_in }, { "utf", "FSS-UTF a.k.a. UTF-8", Func, 0, (Fnptr)utf_out }, - { "utf-l2", "from", From|Func, 0, (Fnptr)utf_in }, - { "utf-l2", "to", Func, 0, (Fnptr)utf_out }, + { "utf1", "UTF-1 (ISO 10646 Annex A)", From|Func, 0, (Fnptr)isoutf_in }, + { "utf1", "UTF-1 (ISO 10646 Annex A)", Func, 0, (Fnptr)isoutf_out }, + { "utf-8", "alias for utf (MIME)", From|Func, 0, (Fnptr)utf_in }, + { "utf-8", "alias for utf (MIME)", Func, 0, (Fnptr)utf_out }, + { "utf-16", "alias for unicode (MIME)", From|Func, 0, (Fnptr)unicode_in }, + { "utf-16", "alias for unicode (MIME)", Func, 0, (Fnptr)unicode_out }, + { "utf-16be", "alias for unicode-be (MIME)", From|Func, 0, (Fnptr)unicode_in_be }, + { "utf-16be", "alias for unicode-be (MIME)", Func, 0, (Fnptr)unicode_out_be }, + { "utf-16le", "alias for unicode-le (MIME)", From|Func, 0, (Fnptr)unicode_in_le }, + { "utf-16le", "alias for unicode-le (MIME)", Func, 0, (Fnptr)unicode_out_le }, { "viet1", "Vietnamese VSCII-1 (1993)", Table, (void *)tabviet1 }, { "viet2", "Vietnamese VSCII-2 (1993)", Table, (void *)tabviet2 }, - { "viscii", "Vietnamese VISCII 1.1 (1992)", Table, (void *)tabviscii }, + { "vscii", "Vietnamese VISCII 1.1 (1992)", Table, (void *)tabviscii }, + { "windows-1250", "Windows Code Page 1250 (Central Europe)", Table, (void *)tabcp1250 }, + { "windows-1251", "Windows Code Page 1251 (Cyrillic)", Table, (void *)tabcp1251 }, + { "windows-1252", "Windows Code Page 1252 (Latin I)", Table, (void *)tabcp1252 }, + { "windows-1253", "Windows Code Page 1253 (Greek)", Table, (void *)tabcp1253 }, + { "windows-1254", "Windows Code Page 1254 (Turkish)", Table, (void *)tabcp1254 }, + { "windows-1255", "Windows Code Page 1255 (Hebrew)", Table, (void *)tabcp1255 }, + { "windows-1256", "Windows Code Page 1256 (Arabic)", Table, (void *)tabcp1256 }, + { "windows-1257", "Windows Code Page 1257 (Baltic)", Table, (void *)tabcp1257 }, + { "windows-1258", "Windows Code Page 1258 (Vietnam)", Table, (void *)tabcp1258 }, { 0 } }; diff --git a/src/cmd/tcs/utf.c b/src/cmd/tcs/utf.c index 9aad892b..f87a310b 100644 --- a/src/cmd/tcs/utf.c +++ b/src/cmd/tcs/utf.c @@ -45,15 +45,15 @@ utf_in(int fd, long *notused, struct convert *out) tot = 0; while((n = read(fd, buf+tot, N-tot)) >= 0){ tot += n; - for(i=j=0; i<tot; ){ + for(i=j=0; i<tot-UTFmax || (n==0 && i<tot); ){ c = our_mbtowc(&l, buf+i, tot-i); - if(c == -2) - break; if(c == -1){ if(squawk) EPR "%s: bad UTF sequence near byte %ld in input\n", argv0, ninput+i); - if(clean) + if(clean){ + i++; continue; + } nerrors++; l = Runeerror; c = 1; @@ -69,6 +69,7 @@ utf_in(int fd, long *notused, struct convert *out) if(n == 0) break; } + OUT(out, runes, 0); } void @@ -100,11 +101,13 @@ isoutf_in(int fd, long *notused, struct convert *out) if(!fullisorune(buf+i, tot-i)) break; c = isochartorune(&runes[j], buf+i); - if(runes[j] == Runeerror){ + if(runes[j] == Runeerror && c == 1){ if(squawk) EPR "%s: bad UTF sequence near byte %ld in input\n", argv0, ninput+i); - if(clean) + if(clean){ + i++; continue; + } nerrors++; } j++; @@ -118,6 +121,7 @@ isoutf_in(int fd, long *notused, struct convert *out) if(n == 0) break; } + OUT(out, runes, 0); } void @@ -393,19 +397,19 @@ our_mbtowc(unsigned long *p, char *s, unsigned n) return 0; /* no shift states */ if(n < 1) - goto badlen; + goto bad; us = (uchar*)s; c0 = us[0]; if(c0 >= T3) { if(n < 3) - goto badlen; + goto bad; c1 = us[1] ^ Tx; c2 = us[2] ^ Tx; if((c1|c2) & T2) goto bad; if(c0 >= T5) { if(n < 5) - goto badlen; + goto bad; c3 = us[3] ^ Tx; c4 = us[4] ^ Tx; if((c3|c4) & T2) @@ -413,7 +417,7 @@ our_mbtowc(unsigned long *p, char *s, unsigned n) if(c0 >= T6) { /* 6 bytes */ if(n < 6) - goto badlen; + goto bad; c5 = us[5] ^ Tx; if(c5 & T2) goto bad; @@ -437,7 +441,7 @@ our_mbtowc(unsigned long *p, char *s, unsigned n) if(c0 >= T4) { /* 4 bytes */ if(n < 4) - goto badlen; + goto bad; c3 = us[3] ^ Tx; if(c3 & T2) goto bad; @@ -460,7 +464,7 @@ our_mbtowc(unsigned long *p, char *s, unsigned n) if(c0 >= T2) { /* 2 bytes */ if(n < 2) - goto badlen; + goto bad; c1 = us[1] ^ Tx; if(c1 & T2) goto bad; @@ -480,6 +484,4 @@ our_mbtowc(unsigned long *p, char *s, unsigned n) bad: errno = EILSEQ; return -1; -badlen: - return -2; } |