diff options
Diffstat (limited to 'src/cmd/tcs/utf.c')
-rw-r--r-- | src/cmd/tcs/utf.c | 581 |
1 files changed, 581 insertions, 0 deletions
diff --git a/src/cmd/tcs/utf.c b/src/cmd/tcs/utf.c new file mode 100644 index 00000000..418c9e1c --- /dev/null +++ b/src/cmd/tcs/utf.c @@ -0,0 +1,581 @@ +#ifdef PLAN9 +#include <u.h> +#include <libc.h> +#include <bio.h> +#else +#include <sys/types.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> +#include <errno.h> +#include "plan9.h" +#endif +#include "hdr.h" + +/* + the our_* routines are implementations for the corresponding library + routines. for a while, i tried to actually name them wctomb etc + but stopped that after i found a system which made wchar_t an + unsigned char. +*/ + +#ifdef PLAN9 +long getrune(Biobuf *); +long getisorune(Biobuf *); +#else +long getrune(FILE *); +long getisorune(FILE *); +#endif +int our_wctomb(char *s, unsigned long wc); +int our_mbtowc(unsigned long *p, char *s, unsigned n); +int runetoisoutf(char *str, Rune *rune); +int fullisorune(char *str, int n); +int isochartorune(Rune *rune, char *str); + +void +utf_in(int fd, long *notused, struct convert *out) +{ +#ifndef PLAN9 + FILE *fp; +#else /* PLAN9 */ + Biobuf b; +#endif /* PLAN9 */ + Rune *r; + long l; + + USED(notused); +#ifndef PLAN9 + if((fp = fdopen(fd, "r")) == NULL){ + EPR "%s: input setup error: %s\n", argv0, strerror(errno)); +#else /* PLAN9 */ + if(Binit(&b, fd, OREAD) < 0){ + EPR "%s: input setup error: %r\n", argv0); +#endif /* PLAN9 */ + EXIT(1, "input error"); + } + r = runes; + for(;;) +#ifndef PLAN9 + switch(l = getrune(fp)) +#else /* PLAN9 */ + switch(l = getrune(&b)) +#endif /* PLAN9 */ + { + case -1: + goto done; + case -2: + if(squawk) + EPR "%s: bad UTF sequence near byte %ld in input\n", argv0, ninput); + if(clean) + continue; + nerrors++; + l = Runeerror; + default: + *r++ = l; + if(r >= &runes[N]){ + OUT(out, runes, r-runes); + r = runes; + } + } +done: + if(r > runes) + OUT(out, runes, r-runes); +} + +void +utf_out(Rune *base, int n, long *notused) +{ + char *p; + Rune *r; + + USED(notused); + nrunes += n; + for(r = base, p = obuf; n-- > 0; r++){ + p += our_wctomb(p, *r); + } + noutput += p-obuf; + write(1, obuf, p-obuf); +} + +void +isoutf_in(int fd, long *notused, struct convert *out) +{ +#ifndef PLAN9 + FILE *fp; +#else /* PLAN9 */ + Biobuf b; +#endif /* PLAN9 */ + Rune *r; + long l; + + USED(notused); +#ifndef PLAN9 + if((fp = fdopen(fd, "r")) == 0){ + EPR "%s: input setup error: %s\n", argv0, strerror(errno)); +#else /* PLAN9 */ + if(Binit(&b, fd, OREAD) < 0){ + EPR "%s: input setup error: %r\n", argv0); +#endif /* PLAN9 */ + EXIT(1, "input error"); + } + r = runes; + for(;;) +#ifndef PLAN9 + switch(l = getisorune(fp)) +#else /* PLAN9 */ + switch(l = getisorune(&b)) +#endif /* PLAN9 */ + { + case -1: + goto done; + case -2: + if(squawk) + EPR "%s: bad UTF sequence near byte %ld in input\n", argv0, ninput); + if(clean) + continue; + nerrors++; + l = Runeerror; + default: + *r++ = l; + if(r >= &runes[N]){ + OUT(out, runes, r-runes); + r = runes; + } + } +done: + if(r > runes) + OUT(out, runes, r-runes); +} + +void +isoutf_out(Rune *base, int n, long *notused) +{ + char *p; + Rune *r; + + USED(notused); + nrunes += n; + for(r = base, p = obuf; n-- > 0; r++) + p += runetoisoutf(p, r); + noutput += p-obuf; + write(1, obuf, p-obuf); +} + +long +#ifndef PLAN9 +getrune(FILE *fp) +#else /* PLAN9 */ +getrune(Biobuf *bp) +#endif /* PLAN9 */ +{ + int c, i; + char str[UTFmax]; /* MB_LEN_MAX really */ + unsigned long l; + int n; + + for(i = 0;;){ +#ifndef PLAN9 + c = getc(fp); +#else /* PLAN9 */ + c = Bgetc(bp); +#endif /* PLAN9 */ + if(c < 0) + return(c); + ninput++; + str[i++] = c; + n = our_mbtowc(&l, str, i); + if(n == -1) + return(-2); + if(n > 0) + return(l); + } +} + +long +#ifndef PLAN9 +getisorune(FILE *fp) +#else /* PLAN9 */ +getisorune(Biobuf *bp) +#endif /* PLAN9 */ +{ + int c, i; + Rune rune; + char str[UTFmax]; /* MB_LEN_MAX really */ + + for(i = 0;;){ +#ifndef PLAN9 + c = getc(fp); +#else /* PLAN9 */ + c = Bgetc(bp); +#endif /* PLAN9 */ + if(c < 0) + return(c); + ninput++; + str[i++] = c; + if(fullisorune(str, i)) + break; + } + isochartorune(&rune, str); + if(rune == Runeerror) + return -2; + return(rune); +} + +enum +{ + Char1 = Runeself, Rune1 = Runeself, + Char21 = 0xA1, Rune21 = 0x0100, + Char22 = 0xF6, Rune22 = 0x4016, + Char3 = 0xFC, Rune3 = 0x10000, /* really 0x38E2E */ + Esc = 0xBE, Bad = Runeerror +}; + +static uchar U[256]; +static uchar T[256]; + +static +void +mktable(void) +{ + int i, u; + + for(i=0; i<256; i++) { + u = i + (0x5E - 0xA0); + if(i < 0xA0) + u = i + (0xDF - 0x7F); + if(i < 0x7F) + u = i + (0x00 - 0x21); + if(i < 0x21) + u = i + (0xBE - 0x00); + U[i] = u; + T[u] = i; + } +} + +int +isochartorune(Rune *rune, char *str) +{ + int c, c1, c2; + long l; + + if(U[0] == 0) + mktable(); + + /* + * one character sequence + * 00000-0009F => 00-9F + */ + c = *(uchar*)str; + if(c < Char1) { + *rune = c; + return 1; + } + + /* + * two character sequence + * 000A0-000FF => A0; A0-FF + */ + c1 = *(uchar*)(str+1); + if(c < Char21) { + if(c1 >= Rune1 && c1 < Rune21) { + *rune = c1; + return 2; + } + goto bad; + } + + /* + * two character sequence + * 00100-04015 => A1-F5; 21-7E/A0-FF + */ + c1 = U[c1]; + if(c1 >= Esc) + goto bad; + if(c < Char22) { + *rune = (c-Char21)*Esc + c1 + Rune21; + return 2; + } + + /* + * three character sequence + * 04016-38E2D => A6-FB; 21-7E/A0-FF + */ + c2 = U[*(uchar*)(str+2)]; + if(c2 >= Esc) + goto bad; + if(c < Char3) { + l = (c-Char22)*Esc*Esc + c1*Esc + c2 + Rune22; + if(l >= Rune3) + goto bad; + *rune = l; + return 3; + } + + /* + * bad decoding + */ +bad: + *rune = Bad; + return 1; +} + +int +runetoisoutf(char *str, Rune *rune) +{ + long c; + + if(T[0] == 0) + mktable(); + + /* + * one character sequence + * 00000-0009F => 00-9F + */ + c = *rune; + if(c < Rune1) { + str[0] = c; + return 1; + } + + /* + * two character sequence + * 000A0-000FF => A0; A0-FF + */ + if(c < Rune21) { + str[0] = (uchar)Char1; + str[1] = c; + return 2; + } + + /* + * two character sequence + * 00100-04015 => A1-F5; 21-7E/A0-FF + */ + if(c < Rune22) { + c -= Rune21; + str[0] = c/Esc + Char21; + str[1] = T[c%Esc]; + return 2; + } + + /* + * three character sequence + * 04016-38E2D => A6-FB; 21-7E/A0-FF + */ + c -= Rune22; + str[0] = c/(Esc*Esc) + Char22; + str[1] = T[c/Esc%Esc]; + str[2] = T[c%Esc]; + return 3; +} + +int +fullisorune(char *str, int n) +{ + int c; + + if(n > 0) { + c = *(uchar*)str; + if(c < Char1) + return 1; + if(n > 1) + if(c < Char22 || n > 2) + return 1; + } + return 0; +} + +#ifdef PLAN9 +int errno; +#endif + +enum +{ + T1 = 0x00, + Tx = 0x80, + T2 = 0xC0, + T3 = 0xE0, + T4 = 0xF0, + T5 = 0xF8, + T6 = 0xFC, + + Bit1 = 7, + Bitx = 6, + Bit2 = 5, + Bit3 = 4, + Bit4 = 3, + Bit5 = 2, + Bit6 = 2, + + Mask1 = (1<<Bit1)-1, + Maskx = (1<<Bitx)-1, + Mask2 = (1<<Bit2)-1, + Mask3 = (1<<Bit3)-1, + Mask4 = (1<<Bit4)-1, + Mask5 = (1<<Bit5)-1, + Mask6 = (1<<Bit6)-1, + + Wchar1 = (1UL<<Bit1)-1, + Wchar2 = (1UL<<(Bit2+Bitx))-1, + Wchar3 = (1UL<<(Bit3+2*Bitx))-1, + Wchar4 = (1UL<<(Bit4+3*Bitx))-1, + Wchar5 = (1UL<<(Bit5+4*Bitx))-1 + +#ifndef EILSEQ + , /* we hate ansi c's comma rules */ + EILSEQ = 123 +#endif /* PLAN9 */ +}; + +int +our_wctomb(char *s, unsigned long wc) +{ + if(s == 0) + return 0; /* no shift states */ + if(wc & ~Wchar2) { + if(wc & ~Wchar4) { + if(wc & ~Wchar5) { + /* 6 bytes */ + s[0] = T6 | ((wc >> 5*Bitx) & Mask6); + s[1] = Tx | ((wc >> 4*Bitx) & Maskx); + s[2] = Tx | ((wc >> 3*Bitx) & Maskx); + s[3] = Tx | ((wc >> 2*Bitx) & Maskx); + s[4] = Tx | ((wc >> 1*Bitx) & Maskx); + s[5] = Tx | (wc & Maskx); + return 6; + } + /* 5 bytes */ + s[0] = T5 | (wc >> 4*Bitx); + s[1] = Tx | ((wc >> 3*Bitx) & Maskx); + s[2] = Tx | ((wc >> 2*Bitx) & Maskx); + s[3] = Tx | ((wc >> 1*Bitx) & Maskx); + s[4] = Tx | (wc & Maskx); + return 5; + } + if(wc & ~Wchar3) { + /* 4 bytes */ + s[0] = T4 | (wc >> 3*Bitx); + s[1] = Tx | ((wc >> 2*Bitx) & Maskx); + s[2] = Tx | ((wc >> 1*Bitx) & Maskx); + s[3] = Tx | (wc & Maskx); + return 4; + } + /* 3 bytes */ + s[0] = T3 | (wc >> 2*Bitx); + s[1] = Tx | ((wc >> 1*Bitx) & Maskx); + s[2] = Tx | (wc & Maskx); + return 3; + } + if(wc & ~Wchar1) { + /* 2 bytes */ + s[0] = T2 | (wc >> 1*Bitx); + s[1] = Tx | (wc & Maskx); + return 2; + } + /* 1 byte */ + s[0] = T1 | wc; + return 1; +} + +int +our_mbtowc(unsigned long *p, char *s, unsigned n) +{ + uchar *us; + int c0, c1, c2, c3, c4, c5; + unsigned long wc; + + if(s == 0) + return 0; /* no shift states */ + + if(n < 1) + goto badlen; + us = (uchar*)s; + c0 = us[0]; + if(c0 >= T3) { + if(n < 3) + goto badlen; + c1 = us[1] ^ Tx; + c2 = us[2] ^ Tx; + if((c1|c2) & T2) + goto bad; + if(c0 >= T5) { + if(n < 5) + goto badlen; + c3 = us[3] ^ Tx; + c4 = us[4] ^ Tx; + if((c3|c4) & T2) + goto bad; + if(c0 >= T6) { + /* 6 bytes */ + if(n < 6) + goto badlen; + c5 = us[5] ^ Tx; + if(c5 & T2) + goto bad; + wc = ((((((((((c0 & Mask6) << Bitx) | + c1) << Bitx) | c2) << Bitx) | + c3) << Bitx) | c4) << Bitx) | c5; + if(wc <= Wchar5) + goto bad; + *p = wc; + return 6; + } + /* 5 bytes */ + wc = ((((((((c0 & Mask5) << Bitx) | + c1) << Bitx) | c2) << Bitx) | + c3) << Bitx) | c4; + if(wc <= Wchar4) + goto bad; + *p = wc; + return 5; + } + if(c0 >= T4) { + /* 4 bytes */ + if(n < 4) + goto badlen; + c3 = us[3] ^ Tx; + if(c3 & T2) + goto bad; + wc = ((((((c0 & Mask4) << Bitx) | + c1) << Bitx) | c2) << Bitx) | + c3; + if(wc <= Wchar3) + goto bad; + *p = wc; + return 4; + } + /* 3 bytes */ + wc = ((((c0 & Mask3) << Bitx) | + c1) << Bitx) | c2; + if(wc <= Wchar2) + goto bad; + *p = wc; + return 3; + } + if(c0 >= T2) { + /* 2 bytes */ + if(n < 2) + goto badlen; + c1 = us[1] ^ Tx; + if(c1 & T2) + goto bad; + wc = ((c0 & Mask2) << Bitx) | + c1; + if(wc <= Wchar1) + goto bad; + *p = wc; + return 2; + } + /* 1 byte */ + if(c0 >= Tx) + goto bad; + *p = c0; + return 1; + +bad: + errno = EILSEQ; + return -1; +badlen: + return -2; +} |