diff options
Diffstat (limited to 'src/cmd/tcs/conv_jis.c')
-rw-r--r-- | src/cmd/tcs/conv_jis.c | 535 |
1 files changed, 535 insertions, 0 deletions
diff --git a/src/cmd/tcs/conv_jis.c b/src/cmd/tcs/conv_jis.c new file mode 100644 index 00000000..18579d70 --- /dev/null +++ b/src/cmd/tcs/conv_jis.c @@ -0,0 +1,535 @@ +#ifdef PLAN9 +#include <u.h> +#include <libc.h> +#include <bio.h> +#else +#include <stdio.h> +#include <unistd.h> +#include "plan9.h" +#endif +#include "hdr.h" +#include "conv.h" +#include "kuten208.h" +#include "jis.h" + +/* + a state machine for interpreting all sorts of encodings +*/ +static void +alljis(int c, Rune **r, long input_loc) +{ + static enum { state0, state1, state2, state3, state4 } state = state0; + static int set8 = 0; + static int japan646 = 0; + static int lastc; + int n; + long l; + +again: + switch(state) + { + case state0: /* idle state */ + if(c == ESC){ state = state1; return; } + if(c < 0) return; + if(!set8 && (c < 128)){ + if(japan646){ + switch(c) + { + case '\\': emit(0xA5); return; /* yen */ + case '~': emit(0xAF); return; /* spacing macron */ + default: emit(c); return; + } + } else { + emit(c); + return; + } + } + if(c < 0x21){ /* guard against bogus characters in JIS mode */ + if(squawk) + EPR "%s: non-JIS character %02x in %s near byte %ld\n", argv0, c, file, input_loc); + emit(c); + return; + } + lastc = c; state = state4; return; + + case state1: /* seen an escape */ + if(c == '$'){ state = state2; return; } + if(c == '('){ state = state3; return; } + emit(ESC); state = state0; goto again; + + case state2: /* may be shifting into JIS */ + if((c == '@') || (c == 'B')){ + set8 = 1; state = state0; return; + } + emit(ESC); emit('$'); state = state0; goto again; + + case state3: /* may be shifting out of JIS */ + if((c == 'J') || (c == 'H') || (c == 'B')){ + japan646 = (c == 'J'); + set8 = 0; state = state0; return; + } + emit(ESC); emit('('); state = state0; goto again; + + case state4: /* two part char */ + if(c < 0){ + if(squawk) + EPR "%s: unexpected EOF in %s\n", argv0, file); + c = 0x21 | (lastc&0x80); + } + if(CANS2J(lastc, c)){ /* ms dos sjis */ + int hi = lastc, lo = c; + S2J(hi, lo); /* convert to 208 */ + n = hi*100 + lo - 3232; /* convert to kuten208 */ + } else + n = (lastc&0x7F)*100 + (c&0x7f) - 3232; /* kuten208 */ + if((n >= KUTEN208MAX) || ((l = tabkuten208[n]) == -1)){ + nerrors++; + if(squawk) + EPR "%s: unknown kuten208 %d (from 0x%x,0x%x) near byte %ld in %s\n", argv0, n, lastc, c, input_loc, file); + if(!clean) + emit(BADMAP); + } else { + if(l < 0){ + l = -l; + if(squawk) + EPR "%s: ambiguous kuten208 %d (mapped to 0x%lx) near byte %ld in %s\n", argv0, n, l, input_loc, file); + } + emit(l); + } + state = state0; + } +} + +/* + a state machine for interpreting ms-kanji == shift-jis. +*/ +static void +ms(int c, Rune **r, long input_loc) +{ + static enum { state0, state1, state2, state3, state4 } state = state0; + static int set8 = 0; + static int japan646 = 0; + static int lastc; + int n; + long l; + +again: + switch(state) + { + case state0: /* idle state */ + if(c == ESC){ state = state1; return; } + if(c < 0) return; + if(!set8 && (c < 128)){ + if(japan646){ + switch(c) + { + case '\\': emit(0xA5); return; /* yen */ + case '~': emit(0xAF); return; /* spacing macron */ + default: emit(c); return; + } + } else { + emit(c); + return; + } + } + lastc = c; state = state4; return; + + case state1: /* seen an escape */ + if(c == '$'){ state = state2; return; } + if(c == '('){ state = state3; return; } + emit(ESC); state = state0; goto again; + + case state2: /* may be shifting into JIS */ + if((c == '@') || (c == 'B')){ + set8 = 1; state = state0; return; + } + emit(ESC); emit('$'); state = state0; goto again; + + case state3: /* may be shifting out of JIS */ + if((c == 'J') || (c == 'H') || (c == 'B')){ + japan646 = (c == 'J'); + set8 = 0; state = state0; return; + } + emit(ESC); emit('('); state = state0; goto again; + + case state4: /* two part char */ + if(c < 0){ + if(squawk) + EPR "%s: unexpected EOF in %s\n", argv0, file); + c = 0x21 | (lastc&0x80); + } + if(CANS2J(lastc, c)){ /* ms dos sjis */ + int hi = lastc, lo = c; + S2J(hi, lo); /* convert to 208 */ + n = hi*100 + lo - 3232; /* convert to kuten208 */ + } else { + nerrors++; + if(squawk) + EPR "%s: illegal byte pair (0x%x,0x%x) near byte %ld in %s\n", argv0, lastc, c, input_loc, file); + if(!clean) + emit(BADMAP); + state = state0; + goto again; + } + if((n >= KUTEN208MAX) || ((l = tabkuten208[n]) == -1)){ + nerrors++; + if(squawk) + EPR "%s: unknown kuten208 %d (from 0x%x,0x%x) near byte %ld in %s\n", argv0, n, lastc, c, input_loc, file); + if(!clean) + emit(BADMAP); + } else { + if(l < 0){ + l = -l; + if(squawk) + EPR "%s: ambiguous kuten208 %d (mapped to 0x%lx) near byte %ld in %s\n", argv0, n, l, input_loc, file); + } + emit(l); + } + state = state0; + } +} + +/* + a state machine for interpreting ujis == EUC +*/ +static void +ujis(int c, Rune **r, long input_loc) +{ + static enum { state0, state1 } state = state0; + static int lastc; + int n; + long l; + + switch(state) + { + case state0: /* idle state */ + if(c < 0) return; + if(c < 128){ + emit(c); + return; + } + if(c == 0x8e){ /* codeset 2 */ + nerrors++; + if(squawk) + EPR "%s: unknown codeset 2 near byte %ld in %s\n", argv0, input_loc, file); + if(!clean) + emit(BADMAP); + return; + } + if(c == 0x8f){ /* codeset 3 */ + nerrors++; + if(squawk) + EPR "%s: unknown codeset 3 near byte %ld in %s\n", argv0, input_loc, file); + if(!clean) + emit(BADMAP); + return; + } + lastc = c; + state = state1; + return; + + case state1: /* two part char */ + if(c < 0){ + if(squawk) + EPR "%s: unexpected EOF in %s\n", argv0, file); + c = 0xA1; + } + n = (lastc&0x7F)*100 + (c&0x7F) - 3232; /* kuten208 */ + if((n >= KUTEN208MAX) || ((l = tabkuten208[n]) == -1)){ + nerrors++; + if(squawk) + EPR "%s: unknown kuten208 %d (from 0x%x,0x%x) near byte %ld in %s\n", argv0, n, lastc, c, input_loc, file); + if(!clean) + emit(BADMAP); + } else { + if(l < 0){ + l = -l; + if(squawk) + EPR "%s: ambiguous kuten208 %d (mapped to 0x%lx) near byte %ld in %s\n", argv0, n, l, input_loc, file); + } + emit(l); + } + state = state0; + } +} + +/* + a state machine for interpreting jis-kanji == 2022-JP +*/ +static void +jis(int c, Rune **r, long input_loc) +{ + static enum { state0, state1, state2, state3, state4 } state = state0; + static int set8 = 0; + static int japan646 = 0; + static int lastc; + int n; + long l; + +again: + switch(state) + { + case state0: /* idle state */ + if(c == ESC){ state = state1; return; } + if(c < 0) return; + if(!set8 && (c < 128)){ + if(japan646){ + switch(c) + { + case '\\': emit(0xA5); return; /* yen */ + case '~': emit(0xAF); return; /* spacing macron */ + default: emit(c); return; + } + } else { + emit(c); + return; + } + } + lastc = c; state = state4; return; + + case state1: /* seen an escape */ + if(c == '$'){ state = state2; return; } + if(c == '('){ state = state3; return; } + emit(ESC); state = state0; goto again; + + case state2: /* may be shifting into JIS */ + if((c == '@') || (c == 'B')){ + set8 = 1; state = state0; return; + } + emit(ESC); emit('$'); state = state0; goto again; + + case state3: /* may be shifting out of JIS */ + if((c == 'J') || (c == 'H') || (c == 'B')){ + japan646 = (c == 'J'); + set8 = 0; state = state0; return; + } + emit(ESC); emit('('); state = state0; goto again; + + case state4: /* two part char */ + if(c < 0){ + if(squawk) + EPR "%s: unexpected EOF in %s\n", argv0, file); + c = 0x21 | (lastc&0x80); + } + if((lastc&0x80) != (c&0x80)){ /* guard against latin1 in jis */ + emit(lastc); + state = state0; + goto again; + } + n = (lastc&0x7F)*100 + (c&0x7f) - 3232; /* kuten208 */ + if((n >= KUTEN208MAX) || ((l = tabkuten208[n]) == -1)){ + nerrors++; + if(squawk) + EPR "%s: unknown kuten208 %d (from 0x%x,0x%x) near byte %ld in %s\n", argv0, n, lastc, c, input_loc, file); + if(!clean) + emit(BADMAP); + } else { + if(l < 0){ + l = -l; + if(squawk) + EPR "%s: ambiguous kuten208 %d (mapped to 0x%lx) near byte %ld in %s\n", argv0, n, l, input_loc, file); + } + emit(l); + } + state = state0; + } +} + +static void +do_in(int fd, void (*procfn)(int, Rune **, long), struct convert *out) +{ + Rune ob[N]; + Rune *r, *re; + uchar ibuf[N]; + int n, i; + long nin; + + r = ob; + re = ob+N-3; + nin = 0; + while((n = read(fd, ibuf, sizeof ibuf)) > 0){ + for(i = 0; i < n; i++){ + (*procfn)(ibuf[i], &r, nin++); + if(r >= re){ + OUT(out, ob, r-ob); + r = ob; + } + } + if(r > ob){ + OUT(out, ob, r-ob); + r = ob; + } + } + (*procfn)(-1, &r, nin); + if(r > ob) + OUT(out, ob, r-ob); +} + +void +jis_in(int fd, long *notused, struct convert *out) +{ + USED(notused); + do_in(fd, alljis, out); +} + +void +ujis_in(int fd, long *notused, struct convert *out) +{ + USED(notused); + do_in(fd, ujis, out); +} + +void +msjis_in(int fd, long *notused, struct convert *out) +{ + USED(notused); + do_in(fd, ms, out); +} + +void +jisjis_in(int fd, long *notused, struct convert *out) +{ + USED(notused); + do_in(fd, jis, out); +} + +static int first = 1; + +static void +tab_init(void) +{ + int i; + long l; + + first = 0; + for(i = 0; i < NRUNE; i++) + tab[i] = -1; + for(i = 0; i < KUTEN208MAX; i++) + if((l = tabkuten208[i]) != -1){ + if(l < 0) + tab[-l] = i; + else + tab[l] = i; + } +} + + +/* jis-kanji, or ISO 2022-JP */ +void +jisjis_out(Rune *base, int n, long *notused) +{ + char *p; + int i; + Rune r; + static enum { ascii, japan646, jp2022 } state = ascii; + + USED(notused); + if(first) + tab_init(); + nrunes += n; + p = obuf; + for(i = 0; i < n; i++){ + r = base[i]; + if(r < 128){ + if(state == jp2022){ + *p++ = ESC; *p++ = '('; *p++ = 'B'; + state = ascii; + } + *p++ = r; + } else { + if(tab[r] != -1){ + if(state != jp2022){ + *p++ = ESC; *p++ = '$'; *p++ = 'B'; + state = jp2022; + } + *p++ = tab[r]/100 + ' '; + *p++ = tab[r]%100 + ' '; + continue; + } + if(squawk) + EPR "%s: rune 0x%x not in output cs\n", argv0, r); + nerrors++; + if(clean) + continue; + *p++ = BYTEBADMAP; + } + } + noutput += p-obuf; + if(p > obuf) + write(1, obuf, p-obuf); +} + +/* ms-kanji, or Shift-JIS */ +void +msjis_out(Rune *base, int n, long *notused) +{ + char *p; + int i, hi, lo; + Rune r; + + USED(notused); + if(first) + tab_init(); + nrunes += n; + p = obuf; + for(i = 0; i < n; i++){ + r = base[i]; + if(r < 128) + *p++ = r; + else { + if(tab[r] != -1){ + hi = tab[r]/100 + ' '; + lo = tab[r]%100 + ' '; + J2S(hi, lo); + *p++ = hi; + *p++ = lo; + continue; + } + if(squawk) + EPR "%s: rune 0x%x not in output cs\n", argv0, r); + nerrors++; + if(clean) + continue; + *p++ = BYTEBADMAP; + } + } + noutput += p-obuf; + if(p > obuf) + write(1, obuf, p-obuf); +} + +/* ujis, or EUC */ +void +ujis_out(Rune *base, int n, long *notused) +{ + char *p; + int i; + Rune r; + + USED(notused); + if(first) + tab_init(); + nrunes += n; + p = obuf; + for(i = 0; i < n; i++){ + r = base[i]; + if(r < 128) + *p++ = r; + else { + if(tab[r] != -1){ + *p++ = 0x80 | (tab[r]/100 + ' '); + *p++ = 0x80 | (tab[r]%100 + ' '); + continue; + } + if(squawk) + EPR "%s: rune 0x%x not in output cs\n", argv0, r); + nerrors++; + if(clean) + continue; + *p++ = BYTEBADMAP; + } + } + noutput += p-obuf; + if(p > obuf) + write(1, obuf, p-obuf); +} |