aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--src/cmd/tcs/conv.h2
-rw-r--r--src/cmd/tcs/conv_big5.c1
-rw-r--r--src/cmd/tcs/conv_gb.c1
-rw-r--r--src/cmd/tcs/conv_jis.c1
-rw-r--r--src/cmd/tcs/conv_ksc.c1
-rw-r--r--src/cmd/tcs/html.c128
-rw-r--r--src/cmd/tcs/mkfile6
-rw-r--r--src/cmd/tcs/tcs.c212
-rw-r--r--src/cmd/tcs/utf.c30
9 files changed, 320 insertions, 62 deletions
diff --git a/src/cmd/tcs/conv.h b/src/cmd/tcs/conv.h
index fc35a105..5e18a065 100644
--- a/src/cmd/tcs/conv.h
+++ b/src/cmd/tcs/conv.h
@@ -13,6 +13,8 @@ void uksc_in(int fd, long *notused, struct convert *out);
void uksc_out(Rune *base, int n, long *notused);
void html_in(int fd, long *notused, struct convert *out);
void html_out(Rune *base, int n, long *notused);
+void tune_in(int fd, long *notused, struct convert *out);
+void tune_out(Rune *base, int n, long *notused);
#define emit(x) *(*r)++ = (x)
#define NRUNE 65536
diff --git a/src/cmd/tcs/conv_big5.c b/src/cmd/tcs/conv_big5.c
index 111bf5c4..496cae66 100644
--- a/src/cmd/tcs/conv_big5.c
+++ b/src/cmd/tcs/conv_big5.c
@@ -110,6 +110,7 @@ big5_in(int fd, long *notused, struct convert *out)
big5proc(-1, &r, nin);
if(r > ob)
OUT(out, ob, r-ob);
+ OUT(out, ob, 0);
}
void
diff --git a/src/cmd/tcs/conv_gb.c b/src/cmd/tcs/conv_gb.c
index 70835257..6838b774 100644
--- a/src/cmd/tcs/conv_gb.c
+++ b/src/cmd/tcs/conv_gb.c
@@ -88,6 +88,7 @@ gb_in(int fd, long *notused, struct convert *out)
gbproc(-1, &r, nin);
if(r > ob)
OUT(out, ob, r-ob);
+ OUT(out, ob, 0);
}
void
diff --git a/src/cmd/tcs/conv_jis.c b/src/cmd/tcs/conv_jis.c
index 18579d70..86275141 100644
--- a/src/cmd/tcs/conv_jis.c
+++ b/src/cmd/tcs/conv_jis.c
@@ -363,6 +363,7 @@ do_in(int fd, void (*procfn)(int, Rune **, long), struct convert *out)
(*procfn)(-1, &r, nin);
if(r > ob)
OUT(out, ob, r-ob);
+ OUT(out, ob, 0);
}
void
diff --git a/src/cmd/tcs/conv_ksc.c b/src/cmd/tcs/conv_ksc.c
index cbc17f5b..293ffad1 100644
--- a/src/cmd/tcs/conv_ksc.c
+++ b/src/cmd/tcs/conv_ksc.c
@@ -109,6 +109,7 @@ uksc_in(int fd, long *notused, struct convert *out)
ukscproc(-1, &r, nin);
if(r > ob)
OUT(out, ob, r-ob);
+ OUT(out, ob, 0);
}
void
diff --git a/src/cmd/tcs/html.c b/src/cmd/tcs/html.c
index 8a27f1c2..89436060 100644
--- a/src/cmd/tcs/html.c
+++ b/src/cmd/tcs/html.c
@@ -19,132 +19,251 @@ static Hchar byname[] =
{"Aacute", 193},
{"Acirc", 194},
{"Agrave", 192},
+ {"Alpha", 913},
{"Aring", 197},
{"Atilde", 195},
{"Auml", 196},
+ {"Beta", 914},
{"Ccedil", 199},
+ {"Chi", 935},
+ {"Dagger", 8225},
+ {"Delta", 916},
{"ETH", 208},
{"Eacute", 201},
{"Ecirc", 202},
{"Egrave", 200},
+ {"Epsilon", 917},
+ {"Eta", 919},
{"Euml", 203},
+ {"Gamma", 915},
{"Iacute", 205},
{"Icirc", 206},
{"Igrave", 204},
+ {"Iota", 921},
{"Iuml", 207},
+ {"Kappa", 922},
+ {"Lambda", 923},
+ {"Mu", 924},
{"Ntilde", 209},
+ {"Nu", 925},
+ {"OElig", 338},
{"Oacute", 211},
{"Ocirc", 212},
{"Ograve", 210},
+ {"Omega", 937},
+ {"Omicron", 927},
{"Oslash", 216},
{"Otilde", 213},
{"Ouml", 214},
+ {"Phi", 934},
+ {"Pi", 928},
+ {"Prime", 8243},
+ {"Psi", 936},
+ {"Rho", 929},
+ {"Scaron", 352},
+ {"Sigma", 931},
{"THORN", 222},
+ {"Tau", 932},
+ {"Theta", 920},
{"Uacute", 218},
{"Ucirc", 219},
{"Ugrave", 217},
+ {"Upsilon", 933},
{"Uuml", 220},
+ {"Xi", 926},
{"Yacute", 221},
+ {"Yuml", 376},
+ {"Zeta", 918},
{"aacute", 225},
{"acirc", 226},
{"acute", 180},
{"aelig", 230},
{"agrave", 224},
+ {"alefsym", 8501},
{"alpha", 945},
+ {"amp", 38},
+ {"and", 8743},
+ {"ang", 8736},
{"aring", 229},
+ {"asymp", 8776},
{"atilde", 227},
{"auml", 228},
+ {"bdquo", 8222},
{"beta", 946},
{"brvbar", 166},
+ {"bull", 8226},
+ {"cap", 8745},
{"ccedil", 231},
{"cdots", 8943},
{"cedil", 184},
{"cent", 162},
{"chi", 967},
+ {"circ", 710},
+ {"clubs", 9827},
+ {"cong", 8773},
{"copy", 169},
+ {"crarr", 8629},
+ {"cup", 8746},
{"curren", 164},
+ {"dArr", 8659},
+ {"dagger", 8224},
+ {"darr", 8595},
{"ddots", 8945},
{"deg", 176},
{"delta", 948},
+ {"diams", 9830},
{"divide", 247},
{"eacute", 233},
{"ecirc", 234},
{"egrave", 232},
{"emdash", 8212}, /* non-standard but commonly used */
+ {"empty", 8709},
{"emsp", 8195},
{"endash", 8211}, /* non-standard but commonly used */
{"ensp", 8194},
{"epsilon", 949},
+ {"equiv", 8801},
{"eta", 951},
{"eth", 240},
{"euml", 235},
+ {"euro", 8364},
+ {"exist", 8707},
+ {"fnof", 402},
+ {"forall", 8704},
{"frac12", 189},
{"frac14", 188},
{"frac34", 190},
+ {"frasl", 8260},
{"gamma", 947},
+ {"ge", 8805},
+ {"gt", 62},
+ {"hArr", 8660},
+ {"harr", 8596},
+ {"hearts", 9829},
+ {"hellip", 8230},
{"iacute", 237},
{"icirc", 238},
{"iexcl", 161},
{"igrave", 236},
+ {"image", 8465},
+ {"infin", 8734},
+ {"int", 8747},
{"iota", 953},
{"iquest", 191},
+ {"isin", 8712},
{"iuml", 239},
{"kappa", 954},
+ {"lArr", 8656},
{"lambda", 955},
+ {"lang", 9001},
{"laquo", 171},
- {"ldquo", 8220},
+ {"larr", 8592},
+ {"lceil", 8968},
{"ldots", 8230},
+ {"ldquo", 8220},
+ {"le", 8804},
+ {"lfloor", 8970},
+ {"lowast", 8727},
+ {"loz", 9674},
+ {"lrm", 8206},
+ {"lsaquo", 8249},
{"lsquo", 8216},
+ {"lt", 60},
{"macr", 175},
{"mdash", 8212},
{"micro", 181},
{"middot", 183},
+ {"minus", 8722},
{"mu", 956},
+ {"nabla", 8711},
{"nbsp", 160},
{"ndash", 8211},
+ {"ne", 8800},
+ {"ni", 8715},
{"not", 172},
+ {"notin", 8713},
+ {"nsub", 8836},
{"ntilde", 241},
{"nu", 957},
{"oacute", 243},
{"ocirc", 244},
+ {"oelig", 339},
{"ograve", 242},
+ {"oline", 8254},
{"omega", 969},
{"omicron", 959},
+ {"oplus", 8853},
+ {"or", 8744},
{"ordf", 170},
{"ordm", 186},
{"oslash", 248},
{"otilde", 245},
+ {"otimes", 8855},
{"ouml", 246},
{"para", 182},
+ {"part", 8706},
+ {"permil", 8240},
+ {"perp", 8869},
{"phi", 966},
{"pi", 960},
+ {"piv", 982},
{"plusmn", 177},
{"pound", 163},
+ {"prime", 8242},
+ {"prod", 8719},
+ {"prop", 8733},
{"psi", 968},
{"quad", 8193},
+ {"quot", 34},
+ {"rArr", 8658},
+ {"radic", 8730},
+ {"rang", 9002},
{"raquo", 187},
+ {"rarr", 8594},
+ {"rceil", 8969},
{"rdquo", 8221},
+ {"real", 8476},
{"reg", 174},
+ {"rfloor", 8971},
{"rho", 961},
+ {"rlm", 8207},
+ {"rsaquo", 8250},
{"rsquo", 8217},
+ {"sbquo", 8218},
+ {"scaron", 353},
+ {"sdot", 8901},
{"sect", 167},
{"shy", 173},
{"sigma", 963},
+ {"sigmaf", 962},
+ {"sim", 8764},
{"sp", 8194},
+ {"spades", 9824},
+ {"sub", 8834},
+ {"sube", 8838},
+ {"sum", 8721},
+ {"sup", 8835},
{"sup1", 185},
{"sup2", 178},
{"sup3", 179},
+ {"supe", 8839},
{"szlig", 223},
{"tau", 964},
+ {"there4", 8756},
{"theta", 952},
+ {"thetasym", 977},
{"thinsp", 8201},
{"thorn", 254},
+ {"tilde", 732},
{"times", 215},
{"trade", 8482},
+ {"uArr", 8657},
{"uacute", 250},
+ {"uarr", 8593},
{"ucirc", 251},
{"ugrave", 249},
{"uml", 168},
+ {"upsih", 978},
{"upsilon", 965},
{"uuml", 252},
{"varepsilon", 8712},
@@ -154,11 +273,14 @@ static Hchar byname[] =
{"vdots", 8942},
{"vsigma", 962},
{"vtheta", 977},
+ {"weierp", 8472},
{"xi", 958},
{"yacute", 253},
{"yen", 165},
{"yuml", 255},
- {"zeta", 950}
+ {"zeta", 950},
+ {"zwj", 8205},
+ {"zwnj", 8204}
};
static Hchar byrune[nelem(byname)];
@@ -302,6 +424,7 @@ html_in(int fd, long *x, struct convert *out)
}
if(r > rbuf)
OUT(out, rbuf, r-rbuf);
+ OUT(out, rbuf, 0);
}
/*
@@ -314,6 +437,7 @@ html_out(Rune *r, int n, long *x)
Biobuf b;
Rune *er;
+ USED(x);
html_init();
Binit(&b, 1, OWRITE);
er = r+n;
diff --git a/src/cmd/tcs/mkfile b/src/cmd/tcs/mkfile
index fc1ce5ab..1b86ab94 100644
--- a/src/cmd/tcs/mkfile
+++ b/src/cmd/tcs/mkfile
@@ -11,7 +11,8 @@ OFILES=tcs.$O\
kuten208.$O\
gb.$O\
ksc.$O\
- big5.$O
+ big5.$O\
+ tune.$O\
<$PLAN9/src/mkone
CFLAGS= -DPLAN9 $CFLAGS
@@ -23,6 +24,9 @@ tcs.$O big5.$O: big5.h
tcs.$O gb.$O: gb.h
tcs.$O: cyrillic.h
tcs.$O: conv.h
+tcs.$O: 8859.h
+tcs.$O: ms.h
+tcs.$O: misc.h
conv%.$O: conv.h
conv_ksc.$O: ksc.h
diff --git a/src/cmd/tcs/tcs.c b/src/cmd/tcs/tcs.c
index bb2f61f7..d7d18e41 100644
--- a/src/cmd/tcs/tcs.c
+++ b/src/cmd/tcs/tcs.c
@@ -54,7 +54,7 @@ main(int argc, char **argv)
clean = 1;
break;
case 'f':
- from = ARGF();
+ from = EARGF(usage());
break;
case 'l':
listem = 1;
@@ -63,7 +63,7 @@ main(int argc, char **argv)
squawk = 0;
break;
case 't':
- to = ARGF();
+ to = EARGF(usage());
break;
case 'v':
verbose = 1;
@@ -160,7 +160,7 @@ conv(char *name, int from)
struct convert *c;
for(c = convert; c->name; c++){
- if(strcmp(c->name, name) != 0)
+ if(cistrcmp(c->name, name) != 0)
continue;
if(c->flags&Table)
return(c);
@@ -208,23 +208,79 @@ unicode_in(int fd, long *notused, struct convert *out)
}
while((n = read(fd, (char *)buf, 2*N)) > 0){
ninput += n;
+ if(swabme)
+ swab2((char *)buf, n);
if(n&1){
if(squawk)
EPR "%s: odd byte count in %s\n", argv0, file);
nerrors++;
if(clean)
n--;
- else {
- n++;
- buf[n/2] = Runeerror;
- if(swabme) /* swab so later swab undoes it */
- swab2((char *)&buf[n/2], 2);
- }
+ else
+ buf[n++/2] = Runeerror;
+ }
+ OUT(out, buf, n/2);
+ }
+}
+
+void
+unicode_in_be(int fd, long *notused, struct convert *out)
+{
+ int i, n;
+ Rune buf[N], r;
+ uchar *p;
+
+ USED(notused);
+ while((n = read(fd, (char *)buf, 2*N)) > 0){
+ ninput += n;
+ p = (uchar*)buf;
+ for(i=0; i<n/2; i++){
+ r = *p++<<8;
+ r |= *p++;
+ buf[i] = r;
+ }
+ if(n&1){
+ if(squawk)
+ EPR "%s: odd byte count in %s\n", argv0, file);
+ nerrors++;
+ if(clean)
+ n--;
+ else
+ buf[n++/2] = Runeerror;
}
- if(swabme)
- swab2((char *)buf, n);
OUT(out, buf, n/2);
}
+ OUT(out, buf, 0);
+}
+
+void
+unicode_in_le(int fd, long *notused, struct convert *out)
+{
+ int i, n;
+ Rune buf[N], r;
+ uchar *p;
+
+ USED(notused);
+ while((n = read(fd, (char *)buf, 2*N)) > 0){
+ ninput += n;
+ p = (uchar*)buf;
+ for(i=0; i<n/2; i++){
+ r = *p++;
+ r |= *p++<<8;
+ buf[i] = r;
+ }
+ if(n&1){
+ if(squawk)
+ EPR "%s: odd byte count in %s\n", argv0, file);
+ nerrors++;
+ if(clean)
+ n--;
+ else
+ buf[n++/2] = Runeerror;
+ }
+ OUT(out, buf, n/2);
+ }
+ OUT(out, buf, 0);
}
void
@@ -245,6 +301,44 @@ unicode_out(Rune *base, int n, long *notused)
}
void
+unicode_out_be(Rune *base, int n, long *notused)
+{
+ int i;
+ uchar *p;
+ Rune r;
+
+ USED(notused);
+ p = (uchar*)base;
+ for(i=0; i<n; i++){
+ r = base[i];
+ *p++ = r>>8;
+ *p++ = r;
+ }
+ nrunes += n;
+ noutput += 2*n;
+ write(1, (char *)base, 2*n);
+}
+
+void
+unicode_out_le(Rune *base, int n, long *notused)
+{
+ int i;
+ uchar *p;
+ Rune r;
+
+ USED(notused);
+ p = (uchar*)base;
+ for(i=0; i<n; i++){
+ r = base[i];
+ *p++ = r;
+ *p++ = r>>8;
+ }
+ nrunes += n;
+ noutput += 2*n;
+ write(1, (char *)base, 2*n);
+}
+
+void
intable(int fd, long *table, struct convert *out)
{
uchar buf[N];
@@ -270,6 +364,7 @@ intable(int fd, long *table, struct convert *out)
}
OUT(out, runes, r-runes);
}
+ OUT(out, runes, 0);
if(n < 0){
#ifdef PLAN9
EPR "%s: input read: %r\n", argv0);
@@ -403,64 +498,91 @@ struct convert convert[] =
{ "av", "Alternativnyj Variant", Table, (void *)tabav },
{ "big5", "Big 5 (HKU)", From|Func, 0, (Fnptr)big5_in },
{ "big5", "Big 5 (HKU)", Func, 0, (Fnptr)big5_out },
- { "cp437", "Code Page 437 (US)", Table, (void*)tabcp437 },
- { "cp720", "Code Page 720 (Arabic)", Table, (void*)tabcp720 },
- { "cp737", "Code Page 737 (Greek)", Table, (void*)tabcp737 },
- { "cp775", "Code Page 775 (Baltic)", Table, (void*)tabcp775 },
- { "cp850", "Code Page 850 (Multilingual Latin I)", Table, (void*)tabcp850 },
- { "cp852", "Code Page 852 (Latin II)", Table, (void*)tabcp852 },
- { "cp855", "Code Page 855 (Cyrillic)", Table, (void*)tabcp855 },
- { "cp857", "Code Page 857 (Turkish)", Table, (void*)tabcp857 },
- { "cp858", "Code Page 858 (Multilingual Latin I+Euro)", Table, (void*)tabcp858 },
- { "cp862", "Code Page 862 (Hebrew)", Table, (void*)tabcp862 },
- { "cp866", "Code Page 866 (Russian)", Table, (void*)tabcp866 },
- { "cp874", "Code Page 874 (Thai)", Table, (void*)tabcp874 },
- { "cp1250", "Code Page 1250 (Central Europe)", Table, (void *)tabcp1250 },
- { "cp1251", "Code Page 1251 (Cyrillic)", Table, (void *)tabcp1251 },
- { "cp1252", "Code Page 1252 (Latin I)", Table, (void *)tabcp1252 },
- { "cp1253", "Code Page 1253 (Greek)", Table, (void *)tabcp1253 },
- { "cp1254", "Code Page 1254 (Turkish)", Table, (void *)tabcp1254 },
- { "cp1255", "Code Page 1255 (Hebrew)", Table, (void *)tabcp1255 },
- { "cp1256", "Code Page 1256 (Arabic)", Table, (void *)tabcp1256 },
- { "cp1257", "Code Page 1257 (Baltic)", Table, (void *)tabcp1257 },
- { "cp1258", "Code Page 1258 (Vietnam)", Table, (void *)tabcp1258 },
{ "ebcdic", "EBCDIC", Table, (void *)tabebcdic }, /* 6f is recommended bad map */
{ "euc-k", "Korean EUC: ASCII+KS C 5601 1987", From|Func, 0, (Fnptr)uksc_in },
{ "euc-k", "Korean EUC: ASCII+KS C 5601 1987", Func, 0, (Fnptr)uksc_out },
- { "gb", "GB2312-80 (Chinese)", From|Func, 0, (Fnptr)gb_in },
- { "gb", "GB2312-80 (Chinese)", Func, 0, (Fnptr)gb_out },
+ { "gb2312", "GB2312-80 (Chinese)", From|Func, 0, (Fnptr)gb_in },
+ { "gb2312", "GB2312-80 (Chinese)", Func, 0, (Fnptr)gb_out },
{ "html", "HTML", From|Func, 0, (Fnptr)html_in },
{ "html", "HTML", Func, 0, (Fnptr)html_out },
+ { "ibm437", "IBM Code Page 437 (US)", Table, (void*)tabcp437 },
+ { "ibm720", "IBM Code Page 720 (Arabic)", Table, (void*)tabcp720 },
+ { "ibm737", "IBM Code Page 737 (Greek)", Table, (void*)tabcp737 },
+ { "ibm775", "IBM Code Page 775 (Baltic)", Table, (void*)tabcp775 },
+ { "ibm850", "IBM Code Page 850 (Multilingual Latin I)", Table, (void*)tabcp850 },
+ { "ibm852", "IBM Code Page 852 (Latin II)", Table, (void*)tabcp852 },
+ { "ibm855", "IBM Code Page 855 (Cyrillic)", Table, (void*)tabcp855 },
+ { "ibm857", "IBM Code Page 857 (Turkish)", Table, (void*)tabcp857 },
+ { "ibm858", "IBM Code Page 858 (Multilingual Latin I+Euro)", Table, (void*)tabcp858 },
+ { "ibm862", "IBM Code Page 862 (Hebrew)", Table, (void*)tabcp862 },
+ { "ibm866", "IBM Code Page 866 (Russian)", Table, (void*)tabcp866 },
+ { "ibm874", "IBM Code Page 874 (Thai)", Table, (void*)tabcp874 },
+ { "iso-2022-jp", "alias for jis-kanji (MIME)", From|Func, 0, (Fnptr)jisjis_in },
+ { "iso-2022-jp", "alias for jis-kanji (MIME)", Func, 0, (Fnptr)jisjis_out },
+ { "iso-8859-1", "alias for 8859-1 (MIME)", Table, (void *)tab8859_1 },
+ { "iso-8859-2", "alias for 8859-2 (MIME)", Table, (void *)tab8859_2 },
+ { "iso-8859-3", "alias for 8859-3 (MIME)", Table, (void *)tab8859_3 },
+ { "iso-8859-4", "alias for 8859-4 (MIME)", Table, (void *)tab8859_4 },
+ { "iso-8859-5", "alias for 8859-5 (MIME)", Table, (void *)tab8859_5 },
+ { "iso-8859-6", "alias for 8859-6 (MIME)", Table, (void *)tab8859_6 },
+ { "iso-8859-7", "alias for 8859-7 (MIME)", Table, (void *)tab8859_7 },
+ { "iso-8859-8", "alias for 8859-8 (MIME)", Table, (void *)tab8859_8 },
+ { "iso-8859-9", "alias for 8859-9 (MIME)", Table, (void *)tab8859_9 },
+ { "iso-8859-10", "alias for 8859-10 (MIME)", Table, (void *)tab8859_10 },
+ { "iso-8859-15", "alias for 8859-15 (MIME)", Table, (void *)tab8859_15 },
{ "jis", "guesses at the JIS encoding", From|Func, 0, (Fnptr)jis_in },
{ "jis-kanji", "ISO 2022-JP (Japanese)", From|Func, 0, (Fnptr)jisjis_in },
{ "jis-kanji", "ISO 2022-JP (Japanese)", Func, 0, (Fnptr)jisjis_out },
{ "koi8", "KOI-8 (GOST 19769-74)", Table, (void *)tabkoi8 },
- { "latin1", "ISO 8859-1", Table, (void *)tab8859_1 },
+ { "koi8-r", "alias for koi8 (MIME)", Table, (void *)tabkoi8 },
+ { "latin1", "alias for 8859-1", Table, (void *)tab8859_1 },
{ "macrom", "Macintosh Standard Roman character set", Table, (void *)tabmacroman },
- { "microsoft", "Windows (CP 1252)", Table, (void *)tabcp1252 },
- { "msdos", "IBM PC (CP 437)", Table, (void *)tabcp437 },
- { "msdos2", "IBM PC (CP 437 with graphics in C0)", Table, (void *)tabmsdos2 },
+ { "microsoft", "alias for windows1252", Table, (void *)tabcp1252 },
{ "ms-kanji", "Microsoft, or Shift-JIS", From|Func, 0, (Fnptr)msjis_in },
{ "ms-kanji", "Microsoft, or Shift-JIS", Func, 0, (Fnptr)msjis_out },
+ { "msdos", "IBM PC (alias for ibm437)", Table, (void *)tabcp437 },
+ { "msdos2", "IBM PC (ibm437 with graphics in C0)", Table, (void *)tabmsdos2 },
{ "next", "NEXTSTEP character set", Table, (void *)tabnextstep },
{ "ov", "Osnovnoj Variant", Table, (void *)tabov },
- { "ps2", "IBM PS/2: (CP 850)", Table, (void *)tabcp850 },
+ { "ps2", "IBM PS/2: (alias for ibm850)", Table, (void *)tabcp850 },
{ "sf1", "ISO-646: Finnish/Swedish SF-1 variant", Table, (void *)tabsf1 },
{ "sf2", "ISO-646: Finnish/Swedish SF-2 variant (recommended)", Table, (void *)tabsf2 },
- { "tis", "Thai+ASCII (TIS 620-1986)", Table, (void *)tabtis620 },
+ { "tis-620", "Thai+ASCII (TIS 620-1986)", Table, (void *)tabtis620 },
+ { "tune", "TUNE (Tamil)", From|Func, 0, (Fnptr)tune_in },
+ { "tune", "TUNE (Tamil)", Func, 0, (Fnptr)tune_out },
{ "ucode", "Russian U-code", Table, (void *)tabucode },
{ "ujis", "EUC-JX: JIS 0208", From|Func, 0, (Fnptr)ujis_in },
{ "ujis", "EUC-JX: JIS 0208", Func, 0, (Fnptr)ujis_out },
{ "unicode", "Unicode 1.1", From|Func, 0, (Fnptr)unicode_in },
{ "unicode", "Unicode 1.1", Func, 0, (Fnptr)unicode_out },
- { "utf1", "UTF-1 (ISO 10646 Annex A)", From|Func, 0, (Fnptr)isoutf_in },
- { "utf1", "UTF-1 (ISO 10646 Annex A)", Func, 0, (Fnptr)isoutf_out },
+ { "unicode-be", "Unicode 1.1 big-endian", From|Func, 0, (Fnptr)unicode_in_be },
+ { "unicode-be", "Unicode 1.1 big-endian", Func, 0, (Fnptr)unicode_out_be },
+ { "unicode-le", "Unicode 1.1 little-endian", From|Func, 0, (Fnptr)unicode_in_le },
+ { "unicode-le", "Unicode 1.1 little-endian", Func, 0, (Fnptr)unicode_out_le },
+ { "us-ascii", "alias for ascii (MIME)", Table, (void *)tabascii },
{ "utf", "FSS-UTF a.k.a. UTF-8", From|Func, 0, (Fnptr)utf_in },
{ "utf", "FSS-UTF a.k.a. UTF-8", Func, 0, (Fnptr)utf_out },
- { "utf-l2", "from", From|Func, 0, (Fnptr)utf_in },
- { "utf-l2", "to", Func, 0, (Fnptr)utf_out },
+ { "utf1", "UTF-1 (ISO 10646 Annex A)", From|Func, 0, (Fnptr)isoutf_in },
+ { "utf1", "UTF-1 (ISO 10646 Annex A)", Func, 0, (Fnptr)isoutf_out },
+ { "utf-8", "alias for utf (MIME)", From|Func, 0, (Fnptr)utf_in },
+ { "utf-8", "alias for utf (MIME)", Func, 0, (Fnptr)utf_out },
+ { "utf-16", "alias for unicode (MIME)", From|Func, 0, (Fnptr)unicode_in },
+ { "utf-16", "alias for unicode (MIME)", Func, 0, (Fnptr)unicode_out },
+ { "utf-16be", "alias for unicode-be (MIME)", From|Func, 0, (Fnptr)unicode_in_be },
+ { "utf-16be", "alias for unicode-be (MIME)", Func, 0, (Fnptr)unicode_out_be },
+ { "utf-16le", "alias for unicode-le (MIME)", From|Func, 0, (Fnptr)unicode_in_le },
+ { "utf-16le", "alias for unicode-le (MIME)", Func, 0, (Fnptr)unicode_out_le },
{ "viet1", "Vietnamese VSCII-1 (1993)", Table, (void *)tabviet1 },
{ "viet2", "Vietnamese VSCII-2 (1993)", Table, (void *)tabviet2 },
- { "viscii", "Vietnamese VISCII 1.1 (1992)", Table, (void *)tabviscii },
+ { "vscii", "Vietnamese VISCII 1.1 (1992)", Table, (void *)tabviscii },
+ { "windows-1250", "Windows Code Page 1250 (Central Europe)", Table, (void *)tabcp1250 },
+ { "windows-1251", "Windows Code Page 1251 (Cyrillic)", Table, (void *)tabcp1251 },
+ { "windows-1252", "Windows Code Page 1252 (Latin I)", Table, (void *)tabcp1252 },
+ { "windows-1253", "Windows Code Page 1253 (Greek)", Table, (void *)tabcp1253 },
+ { "windows-1254", "Windows Code Page 1254 (Turkish)", Table, (void *)tabcp1254 },
+ { "windows-1255", "Windows Code Page 1255 (Hebrew)", Table, (void *)tabcp1255 },
+ { "windows-1256", "Windows Code Page 1256 (Arabic)", Table, (void *)tabcp1256 },
+ { "windows-1257", "Windows Code Page 1257 (Baltic)", Table, (void *)tabcp1257 },
+ { "windows-1258", "Windows Code Page 1258 (Vietnam)", Table, (void *)tabcp1258 },
{ 0 }
};
diff --git a/src/cmd/tcs/utf.c b/src/cmd/tcs/utf.c
index 9aad892b..f87a310b 100644
--- a/src/cmd/tcs/utf.c
+++ b/src/cmd/tcs/utf.c
@@ -45,15 +45,15 @@ utf_in(int fd, long *notused, struct convert *out)
tot = 0;
while((n = read(fd, buf+tot, N-tot)) >= 0){
tot += n;
- for(i=j=0; i<tot; ){
+ for(i=j=0; i<tot-UTFmax || (n==0 && i<tot); ){
c = our_mbtowc(&l, buf+i, tot-i);
- if(c == -2)
- break;
if(c == -1){
if(squawk)
EPR "%s: bad UTF sequence near byte %ld in input\n", argv0, ninput+i);
- if(clean)
+ if(clean){
+ i++;
continue;
+ }
nerrors++;
l = Runeerror;
c = 1;
@@ -69,6 +69,7 @@ utf_in(int fd, long *notused, struct convert *out)
if(n == 0)
break;
}
+ OUT(out, runes, 0);
}
void
@@ -100,11 +101,13 @@ isoutf_in(int fd, long *notused, struct convert *out)
if(!fullisorune(buf+i, tot-i))
break;
c = isochartorune(&runes[j], buf+i);
- if(runes[j] == Runeerror){
+ if(runes[j] == Runeerror && c == 1){
if(squawk)
EPR "%s: bad UTF sequence near byte %ld in input\n", argv0, ninput+i);
- if(clean)
+ if(clean){
+ i++;
continue;
+ }
nerrors++;
}
j++;
@@ -118,6 +121,7 @@ isoutf_in(int fd, long *notused, struct convert *out)
if(n == 0)
break;
}
+ OUT(out, runes, 0);
}
void
@@ -393,19 +397,19 @@ our_mbtowc(unsigned long *p, char *s, unsigned n)
return 0; /* no shift states */
if(n < 1)
- goto badlen;
+ goto bad;
us = (uchar*)s;
c0 = us[0];
if(c0 >= T3) {
if(n < 3)
- goto badlen;
+ goto bad;
c1 = us[1] ^ Tx;
c2 = us[2] ^ Tx;
if((c1|c2) & T2)
goto bad;
if(c0 >= T5) {
if(n < 5)
- goto badlen;
+ goto bad;
c3 = us[3] ^ Tx;
c4 = us[4] ^ Tx;
if((c3|c4) & T2)
@@ -413,7 +417,7 @@ our_mbtowc(unsigned long *p, char *s, unsigned n)
if(c0 >= T6) {
/* 6 bytes */
if(n < 6)
- goto badlen;
+ goto bad;
c5 = us[5] ^ Tx;
if(c5 & T2)
goto bad;
@@ -437,7 +441,7 @@ our_mbtowc(unsigned long *p, char *s, unsigned n)
if(c0 >= T4) {
/* 4 bytes */
if(n < 4)
- goto badlen;
+ goto bad;
c3 = us[3] ^ Tx;
if(c3 & T2)
goto bad;
@@ -460,7 +464,7 @@ our_mbtowc(unsigned long *p, char *s, unsigned n)
if(c0 >= T2) {
/* 2 bytes */
if(n < 2)
- goto badlen;
+ goto bad;
c1 = us[1] ^ Tx;
if(c1 & T2)
goto bad;
@@ -480,6 +484,4 @@ our_mbtowc(unsigned long *p, char *s, unsigned n)
bad:
errno = EILSEQ;
return -1;
-badlen:
- return -2;
}