aboutsummaryrefslogtreecommitdiff
path: root/src/cmd/tcs/utf.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/cmd/tcs/utf.c')
-rw-r--r-- src/cmd/tcs/utf.c 581
1 files changed, 581 insertions, 0 deletions
diff --git a/src/cmd/tcs/utf.c b/src/cmd/tcs/utf.c
new file mode 100644
index 00000000..418c9e1c
--- /dev/null
+++ b/src/cmd/tcs/utf.c
@@ -0,0 +1,581 @@
+#ifdef PLAN9
+#include <u.h>
+#include <libc.h>
+#include <bio.h>
+#else
+#include <sys/types.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <errno.h>
+#include "plan9.h"
+#endif
+#include "hdr.h"
+
+/*
+ The our_* routines are implementations of the corresponding C library
+ routines (wctomb, mbtowc). For a while I tried to actually name them
+ wctomb etc., but stopped after I found a system that made wchar_t an
+ unsigned char.
+*/
+
+#ifdef PLAN9
+long getrune(Biobuf *); /* next UTF-decoded rune from the stream, or negative (callers treat -1 as end of input, -2 as a bad sequence) */
+long getisorune(Biobuf *); /* same contract as getrune, for the ISO-UTF encoding */
+#else
+long getrune(FILE *); /* next UTF-decoded rune from the stream, or negative (callers treat -1 as end of input, -2 as a bad sequence) */
+long getisorune(FILE *); /* same contract as getrune, for the ISO-UTF encoding */
+#endif
+int our_wctomb(char *s, unsigned long wc); /* encode wc into s; returns bytes stored (0 if s is null) */
+int our_mbtowc(unsigned long *p, char *s, unsigned n); /* decode from n bytes at s; returns length, -1 bad sequence, -2 need more bytes */
+int runetoisoutf(char *str, Rune *rune); /* encode *rune as ISO-UTF into str; returns bytes stored */
+int fullisorune(char *str, int n); /* nonzero when the n bytes at str hold a complete ISO-UTF sequence */
+int isochartorune(Rune *rune, char *str); /* decode one ISO-UTF sequence; returns bytes consumed */
+
+/*
+ * utf_in: read UTF text from fd and pass it on as runes.
+ *
+ * Runes are collected in the global runes[] buffer and handed to the
+ * output converter with OUT() each time N of them have accumulated,
+ * and once more at end of input for any remainder.
+ *
+ * A bad sequence (getrune() == -2) is reported when squawk is set.
+ * When clean is set it is then dropped; otherwise nerrors is bumped
+ * and Runeerror is emitted in its place.
+ *
+ * fd      descriptor to read from
+ * notused ignored (part of the common converter signature)
+ * out     output converter that receives the runes
+ */
+void
+utf_in(int fd, long *notused, struct convert *out)
+{
+#ifdef PLAN9
+	Biobuf bin;
+#else
+	FILE *in;
+#endif
+	Rune *next;
+	long code;
+
+	USED(notused);
+#ifdef PLAN9
+	if(Binit(&bin, fd, OREAD) < 0){
+		EPR "%s: input setup error: %r\n", argv0);
+		EXIT(1, "input error");
+	}
+#else
+	in = fdopen(fd, "r");
+	if(in == NULL){
+		EPR "%s: input setup error: %s\n", argv0, strerror(errno));
+		EXIT(1, "input error");
+	}
+#endif
+	next = runes;
+	for(;;){
+#ifdef PLAN9
+		code = getrune(&bin);
+#else
+		code = getrune(in);
+#endif
+		if(code == -1)
+			break;		/* end of input */
+		if(code == -2){
+			if(squawk)
+				EPR "%s: bad UTF sequence near byte %ld in input\n", argv0, ninput);
+			if(clean)
+				continue;	/* silently drop the bad sequence */
+			nerrors++;
+			code = Runeerror;
+		}
+		*next++ = code;
+		if(next >= &runes[N]){
+			/* buffer full: flush and start over */
+			OUT(out, runes, next-runes);
+			next = runes;
+		}
+	}
+	if(next > runes)
+		OUT(out, runes, next-runes);
+}
+
+/*
+ * utf_out: encode n runes starting at base as UTF into obuf and
+ * write the result to file descriptor 1.
+ *
+ * Adds n to nrunes and the number of bytes produced to noutput.
+ * The result of write() is not examined.
+ * NOTE(review): assumes obuf has room for n encoded runes -- confirm
+ * against its definition.
+ */
+void
+utf_out(Rune *base, int n, long *notused)
+{
+	char *dst;
+	int i;
+
+	USED(notused);
+	nrunes += n;
+	dst = obuf;
+	for(i = 0; i < n; i++)
+		dst += our_wctomb(dst, base[i]);
+	noutput += dst-obuf;
+	write(1, obuf, dst-obuf);
+}
+
+/*
+ * isoutf_in: read ISO-UTF text from fd and pass it on as runes.
+ *
+ * Runes are collected in the global runes[] buffer and handed to the
+ * output converter with OUT() each time N of them have accumulated,
+ * and once more at end of input for any remainder.
+ *
+ * A bad sequence (getisorune() == -2) is reported when squawk is set.
+ * When clean is set it is then dropped; otherwise nerrors is bumped
+ * and Runeerror is emitted in its place.
+ *
+ * fd      descriptor to read from
+ * notused ignored (part of the common converter signature)
+ * out     output converter that receives the runes
+ */
+void
+isoutf_in(int fd, long *notused, struct convert *out)
+{
+#ifdef PLAN9
+	Biobuf bin;
+#else
+	FILE *in;
+#endif
+	Rune *next;
+	long code;
+
+	USED(notused);
+#ifdef PLAN9
+	if(Binit(&bin, fd, OREAD) < 0){
+		EPR "%s: input setup error: %r\n", argv0);
+		EXIT(1, "input error");
+	}
+#else
+	in = fdopen(fd, "r");
+	if(in == NULL){
+		EPR "%s: input setup error: %s\n", argv0, strerror(errno));
+		EXIT(1, "input error");
+	}
+#endif
+	next = runes;
+	for(;;){
+#ifdef PLAN9
+		code = getisorune(&bin);
+#else
+		code = getisorune(in);
+#endif
+		if(code == -1)
+			break;		/* end of input */
+		if(code == -2){
+			if(squawk)
+				EPR "%s: bad UTF sequence near byte %ld in input\n", argv0, ninput);
+			if(clean)
+				continue;	/* silently drop the bad sequence */
+			nerrors++;
+			code = Runeerror;
+		}
+		*next++ = code;
+		if(next >= &runes[N]){
+			/* buffer full: flush and start over */
+			OUT(out, runes, next-runes);
+			next = runes;
+		}
+	}
+	if(next > runes)
+		OUT(out, runes, next-runes);
+}
+
+/*
+ * isoutf_out: encode n runes starting at base as ISO-UTF into obuf
+ * and write the result to file descriptor 1.
+ *
+ * Adds n to nrunes and the number of bytes produced to noutput.
+ * The result of write() is not examined.
+ * NOTE(review): assumes obuf has room for n encoded runes -- confirm
+ * against its definition.
+ */
+void
+isoutf_out(Rune *base, int n, long *notused)
+{
+	char *dst;
+	int i;
+
+	USED(notused);
+	nrunes += n;
+	dst = obuf;
+	for(i = 0; i < n; i++)
+		dst += runetoisoutf(dst, &base[i]);
+	noutput += dst-obuf;
+	write(1, obuf, dst-obuf);
+}
+
+/*
+ * getrune: read one UTF-encoded character from the input stream.
+ *
+ * Bytes are read one at a time (each one counted in ninput) and
+ * offered to our_mbtowc() until it either decodes a character or
+ * rejects the sequence.
+ *
+ * Returns the decoded value (>= 0), -1 at end of input, or -2 for a
+ * bad sequence.  Bytes of an incomplete sequence at end of input are
+ * discarded.
+ */
+long
+#ifndef PLAN9
+getrune(FILE *fp)
+#else /* PLAN9 */
+getrune(Biobuf *bp)
+#endif /* PLAN9 */
+{
+	int c, i;
+	/*
+	 * our_mbtowc() keeps asking for more input until it has seen up
+	 * to 6 bytes, so the buffer must hold at least that many even
+	 * when UTFmax is smaller.
+	 */
+	char str[UTFmax > 6 ? UTFmax : 6];
+	unsigned long l;
+	int n;
+
+	for(i = 0;;){
+#ifndef PLAN9
+		c = getc(fp);
+#else /* PLAN9 */
+		c = Bgetc(bp);
+#endif /* PLAN9 */
+		/*
+		 * callers only recognise -1 as end of input, so do not
+		 * pass through whatever negative value the read returned
+		 */
+		if(c < 0)
+			return(-1);
+		ninput++;
+		str[i++] = c;
+		n = our_mbtowc(&l, str, i);
+		if(n == -1)
+			return(-2);
+		if(n > 0){
+			/*
+			 * a value that does not fit in a non-negative long
+			 * would be mistaken for one of the -1/-2 codes
+			 */
+			if(l > (~0UL >> 1))
+				return(-2);
+			return(l);
+		}
+		/* never write past the buffer, whatever the decoder says */
+		if(i >= (int)sizeof str)
+			return(-2);
+	}
+}
+
+/*
+ * getisorune: read one ISO-UTF encoded character from the input
+ * stream.
+ *
+ * Bytes are read one at a time (each one counted in ninput) until
+ * fullisorune() reports a complete sequence, which is then decoded
+ * with isochartorune().
+ *
+ * Returns the decoded rune, the negative result of the read at end
+ * of input, or -2 when decoding produced Runeerror.
+ */
+long
+#ifndef PLAN9
+getisorune(FILE *fp)
+#else /* PLAN9 */
+getisorune(Biobuf *bp)
+#endif /* PLAN9 */
+{
+	int ch, len;
+	Rune decoded;
+	char buf[UTFmax];	/* MB_LEN_MAX really */
+
+	len = 0;
+	do{
+#ifndef PLAN9
+		ch = getc(fp);
+#else /* PLAN9 */
+		ch = Bgetc(bp);
+#endif /* PLAN9 */
+		if(ch < 0)
+			return(ch);
+		ninput++;
+		buf[len++] = ch;
+	}while(!fullisorune(buf, len));
+
+	isochartorune(&decoded, buf);
+	if(decoded == Runeerror)
+		return -2;
+	return(decoded);
+}
+
+enum /* boundaries of the ISO-UTF encoding: CharN are lead-byte limits, RuneN the matching rune limits */
+{
+ Char1 = Runeself, Rune1 = Runeself, /* lead bytes below Char1 are the rune itself; runes below Rune1 encode as one byte */
+ Char21 = 0xA1, Rune21 = 0x0100, /* lead bytes below Char21 are followed by the rune value as a second byte; runes below Rune21 use that form */
+ Char22 = 0xF6, Rune22 = 0x4016, /* lead bytes Char21..Char22-1 start two-byte sequences for runes Rune21..Rune22-1 */
+ Char3 = 0xFC, Rune3 = 0x10000, /* really 0x38E2E */ /* lead bytes Char22..Char3-1 start three-byte sequences; decoded values >= Rune3 are rejected */
+ Esc = 0xBE, Bad = Runeerror /* Esc: radix of the trail-byte digits (U[] values >= Esc are invalid); Bad: rune stored on a decoding error */
+};
+
+/*
+ * Trail-byte translation tables, filled in by mktable():
+ *	U[byte]  gives the digit value of a trail byte,
+ *	T[digit] gives the trail byte for a digit value.
+ * Each table is the inverse of the other.
+ */
+static uchar U[256];
+static uchar T[256];
+
+/*
+ * mktable: fill in U and T.
+ *
+ *	bytes 0x21-0x7E map to digits 0x00-0x5D
+ *	bytes 0xA0-0xFF map to digits 0x5E-0xBD
+ *	bytes 0x00-0x20 map to digits 0xBE-0xDE
+ *	bytes 0x7F-0x9F map to digits 0xDF-0xFF
+ *
+ * Afterwards both U[0] and T[0] are nonzero, which is what the
+ * callers test to decide whether the tables still need building.
+ */
+static
+void
+mktable(void)
+{
+	int byte, digit;
+
+	for(byte = 0; byte < 256; byte++){
+		if(byte < 0x21)
+			digit = byte + (0xBE - 0x00);
+		else if(byte < 0x7F)
+			digit = byte - (0x21 - 0x00);
+		else if(byte < 0xA0)
+			digit = byte + (0xDF - 0x7F);
+		else
+			digit = byte - (0xA0 - 0x5E);
+		U[byte] = digit;
+		T[digit] = byte;
+	}
+}
+
+/*
+ * isochartorune: decode one ISO-UTF sequence starting at str.
+ *
+ * Stores the decoded rune in *rune and returns the number of bytes
+ * it occupies (1, 2 or 3).  On a decoding error Bad is stored and 1
+ * is returned.  The second and third bytes are only examined when
+ * the lead byte calls for them, so the caller must supply a complete
+ * sequence (see fullisorune()).
+ */
+int
+isochartorune(Rune *rune, char *str)
+{
+	uchar *b;
+	int lead, d1, d2;
+	long v;
+
+	if(U[0] == 0)
+		mktable();
+
+	b = (uchar*)str;
+	lead = b[0];
+
+	/* one byte: the byte is the rune */
+	if(lead < Char1){
+		*rune = lead;
+		return 1;
+	}
+
+	/* two bytes: the second byte is the rune value itself */
+	d1 = b[1];
+	if(lead < Char21){
+		if(d1 < Rune1 || d1 >= Rune21)
+			goto bad;
+		*rune = d1;
+		return 2;
+	}
+
+	/* two bytes: lead selects a row, trail byte is a base-Esc digit */
+	d1 = U[d1];
+	if(d1 >= Esc)
+		goto bad;
+	if(lead < Char22){
+		*rune = (lead-Char21)*Esc + d1 + Rune21;
+		return 2;
+	}
+
+	/* three bytes: lead plus two base-Esc digits */
+	d2 = U[b[2]];
+	if(d2 >= Esc)
+		goto bad;
+	if(lead >= Char3)
+		goto bad;
+	v = (lead-Char22)*Esc*Esc + d1*Esc + d2 + Rune22;
+	if(v >= Rune3)
+		goto bad;
+	*rune = v;
+	return 3;
+
+bad:
+	*rune = Bad;
+	return 1;
+}
+
+/*
+ * runetoisoutf: encode *rune as an ISO-UTF sequence in str.
+ *
+ * Returns the number of bytes stored (1, 2 or 3); str must have room
+ * for three bytes.  No terminating NUL is written.
+ */
+int
+runetoisoutf(char *str, Rune *rune)
+{
+	long v, hi, mid, lo;
+
+	if(T[0] == 0)
+		mktable();
+
+	v = *rune;
+
+	/* one byte: the rune is the byte */
+	if(v < Rune1){
+		str[0] = v;
+		return 1;
+	}
+
+	/* two bytes: Char1 followed by the rune value */
+	if(v < Rune21){
+		str[0] = (uchar)Char1;
+		str[1] = v;
+		return 2;
+	}
+
+	/* two bytes: lead byte from Char21 up, one base-Esc digit */
+	if(v < Rune22){
+		v -= Rune21;
+		hi = v/Esc;
+		lo = v%Esc;
+		str[0] = hi + Char21;
+		str[1] = T[lo];
+		return 2;
+	}
+
+	/* three bytes: lead byte from Char22 up, two base-Esc digits */
+	v -= Rune22;
+	hi = v/(Esc*Esc);
+	mid = v/Esc%Esc;
+	lo = v%Esc;
+	str[0] = hi + Char22;
+	str[1] = T[mid];
+	str[2] = T[lo];
+	return 3;
+}
+
+/*
+ * fullisorune: report whether the n bytes at str contain a complete
+ * ISO-UTF sequence.
+ *
+ * Returns 1 when they do: one byte for a lead below Char1, two bytes
+ * for a lead below Char22, three bytes otherwise.  Returns 0 when
+ * more bytes are needed (including n <= 0).
+ */
+int
+fullisorune(char *str, int n)
+{
+	int lead;
+
+	if(n <= 0)
+		return 0;
+	lead = *(uchar*)str;
+	if(lead < Char1)
+		return 1;
+	if(n == 1)
+		return 0;
+	if(lead < Char22)
+		return 1;
+	return n > 2;
+}
+
+#ifdef PLAN9
+int errno;
+#endif
+
+enum /* bit layout of the UTF encoding used by our_wctomb() and our_mbtowc() */
+{
+ T1 = 0x00, /* 0xxxxxxx: tag of a one-byte sequence */
+ Tx = 0x80, /* 10xxxxxx: tag of a continuation byte */
+ T2 = 0xC0, /* 110xxxxx: lead byte of a 2-byte sequence */
+ T3 = 0xE0, /* 1110xxxx: lead byte of a 3-byte sequence */
+ T4 = 0xF0, /* 11110xxx: lead byte of a 4-byte sequence */
+ T5 = 0xF8, /* 111110xx: lead byte of a 5-byte sequence */
+ T6 = 0xFC, /* 111111xx: lead byte of a 6-byte sequence */
+
+ Bit1 = 7, /* BitN: payload bits carried by the lead byte of an N-byte sequence */
+ Bitx = 6, /* payload bits carried by each continuation byte */
+ Bit2 = 5,
+ Bit3 = 4,
+ Bit4 = 3,
+ Bit5 = 2,
+ Bit6 = 2, /* NOTE(review): gives lead bytes 0xFC-0xFF two payload bits (32 bits in all); encoder and decoder both use it -- confirm this is intended */
+
+ Mask1 = (1<<Bit1)-1, /* MaskN: mask selecting the payload bits of the corresponding byte */
+ Maskx = (1<<Bitx)-1,
+ Mask2 = (1<<Bit2)-1,
+ Mask3 = (1<<Bit3)-1,
+ Mask4 = (1<<Bit4)-1,
+ Mask5 = (1<<Bit5)-1,
+ Mask6 = (1<<Bit6)-1,
+
+ Wchar1 = (1UL<<Bit1)-1, /* WcharN: largest value that fits in an N-byte sequence */
+ Wchar2 = (1UL<<(Bit2+Bitx))-1,
+ Wchar3 = (1UL<<(Bit3+2*Bitx))-1,
+ Wchar4 = (1UL<<(Bit4+3*Bitx))-1,
+ Wchar5 = (1UL<<(Bit5+4*Bitx))-1
+
+#ifndef EILSEQ
+ , /* we hate ansi c's comma rules */
+ EILSEQ = 123 /* fallback value for systems whose headers do not define EILSEQ */
+#endif /* EILSEQ */
+};
+
+/*
+ * our_wctomb: encode wc as a UTF byte sequence in s.
+ *
+ * Returns the number of bytes stored (1 to 6), or 0 when s is a null
+ * pointer (there are no shift states).  s must have room for the
+ * longest sequence; no terminating NUL is written.
+ */
+int
+our_wctomb(char *s, unsigned long wc)
+{
+	int len, i;
+	unsigned long lead;
+
+	if(s == 0)
+		return 0;		/* no shift states */
+
+	/* 1 byte */
+	if(wc <= Wchar1){
+		s[0] = T1 | wc;
+		return 1;
+	}
+
+	/* pick the shortest sequence that can hold the value */
+	if(wc <= Wchar2){
+		len = 2;
+		lead = T2;
+	}else if(wc <= Wchar3){
+		len = 3;
+		lead = T3;
+	}else if(wc <= Wchar4){
+		len = 4;
+		lead = T4;
+	}else if(wc <= Wchar5){
+		len = 5;
+		lead = T5;
+	}else{
+		len = 6;
+		lead = T6;
+	}
+
+	/* continuation bytes, last one first, Bitx bits apiece */
+	for(i = len-1; i > 0; i--){
+		s[i] = Tx | (wc & Maskx);
+		wc >>= Bitx;
+	}
+
+	/* whatever is left goes in the lead byte */
+	if(len == 6)
+		wc &= Mask6;
+	s[0] = lead | wc;
+	return len;
+}
+
+/*
+ * our_mbtowc: decode one UTF sequence from the first n bytes at s.
+ *
+ * Returns
+ *	the length of the sequence (1 to 6), with the value in *p;
+ *	0 when s is a null pointer (there are no shift states);
+ *	-1, with errno set to EILSEQ, for an invalid sequence: a bad
+ *	   lead or continuation byte, or a value that would have fit
+ *	   in a shorter sequence;
+ *	-2 when n is too small to decide -- call again with more bytes.
+ *
+ * The value is accumulated in an unsigned long.  Doing the shifts in
+ * int overflows for the 6-byte form with lead bytes 0xFE and 0xFF,
+ * which our_wctomb() produces for values of 0x80000000 and above.
+ */
+int
+our_mbtowc(unsigned long *p, char *s, unsigned n)
+{
+	uchar *us;
+	int c0, c1, c2, c3, c4, c5;
+	unsigned long wc;
+
+	if(s == 0)
+		return 0;		/* no shift states */
+
+	if(n < 1)
+		goto badlen;
+	us = (uchar*)s;
+	c0 = us[0];
+	if(c0 >= T3) {
+		if(n < 3)
+			goto badlen;
+		/* after ^ Tx a valid continuation byte has its top two bits clear */
+		c1 = us[1] ^ Tx;
+		c2 = us[2] ^ Tx;
+		if((c1|c2) & T2)
+			goto bad;
+		if(c0 >= T5) {
+			if(n < 5)
+				goto badlen;
+			c3 = us[3] ^ Tx;
+			c4 = us[4] ^ Tx;
+			if((c3|c4) & T2)
+				goto bad;
+			if(c0 >= T6) {
+				/* 6 bytes */
+				if(n < 6)
+					goto badlen;
+				c5 = us[5] ^ Tx;
+				if(c5 & T2)
+					goto bad;
+				wc = c0 & Mask6;
+				wc = (wc << Bitx) | c1;
+				wc = (wc << Bitx) | c2;
+				wc = (wc << Bitx) | c3;
+				wc = (wc << Bitx) | c4;
+				wc = (wc << Bitx) | c5;
+				if(wc <= Wchar5)
+					goto bad;	/* overlong */
+				*p = wc;
+				return 6;
+			}
+			/* 5 bytes */
+			wc = c0 & Mask5;
+			wc = (wc << Bitx) | c1;
+			wc = (wc << Bitx) | c2;
+			wc = (wc << Bitx) | c3;
+			wc = (wc << Bitx) | c4;
+			if(wc <= Wchar4)
+				goto bad;	/* overlong */
+			*p = wc;
+			return 5;
+		}
+		if(c0 >= T4) {
+			/* 4 bytes */
+			if(n < 4)
+				goto badlen;
+			c3 = us[3] ^ Tx;
+			if(c3 & T2)
+				goto bad;
+			wc = c0 & Mask4;
+			wc = (wc << Bitx) | c1;
+			wc = (wc << Bitx) | c2;
+			wc = (wc << Bitx) | c3;
+			if(wc <= Wchar3)
+				goto bad;	/* overlong */
+			*p = wc;
+			return 4;
+		}
+		/* 3 bytes */
+		wc = c0 & Mask3;
+		wc = (wc << Bitx) | c1;
+		wc = (wc << Bitx) | c2;
+		if(wc <= Wchar2)
+			goto bad;	/* overlong */
+		*p = wc;
+		return 3;
+	}
+	if(c0 >= T2) {
+		/* 2 bytes */
+		if(n < 2)
+			goto badlen;
+		c1 = us[1] ^ Tx;
+		if(c1 & T2)
+			goto bad;
+		wc = c0 & Mask2;
+		wc = (wc << Bitx) | c1;
+		if(wc <= Wchar1)
+			goto bad;	/* overlong */
+		*p = wc;
+		return 2;
+	}
+	/* 1 byte; a continuation byte cannot start a sequence */
+	if(c0 >= Tx)
+		goto bad;
+	*p = c0;
+	return 1;
+
+bad:
+	errno = EILSEQ;
+	return -1;
+badlen:
+	return -2;
+}