| #ifdef PLAN9 |
| #include <u.h> |
| #include <libc.h> |
| #include <bio.h> |
| #else |
| #include <stdio.h> |
| #include <unistd.h> |
| #include "plan9.h" |
| #endif |
| #include "hdr.h" |
| #include "conv.h" |
| #include "kuten208.h" |
| #include "jis.h" |
| |
| /* |
| a state machine for interpreting all sorts of encodings |
| */ |
| static void |
| alljis(int c, Rune **r, long input_loc) |
| { |
| static enum { state0, state1, state2, state3, state4 } state = state0; |
| static int set8 = 0; |
| static int japan646 = 0; |
| static int lastc; |
| int n; |
| long l; |
| |
| again: |
| switch(state) |
| { |
| case state0: /* idle state */ |
| if(c == ESC){ state = state1; return; } |
| if(c < 0) return; |
| if(!set8 && (c < 128)){ |
| if(japan646){ |
| switch(c) |
| { |
| case '\\': emit(0xA5); return; /* yen */ |
| case '~': emit(0xAF); return; /* spacing macron */ |
| default: emit(c); return; |
| } |
| } else { |
| emit(c); |
| return; |
| } |
| } |
| if(c < 0x21){ /* guard against bogus characters in JIS mode */ |
| if(squawk) |
| EPR "%s: non-JIS character %02x in %s near byte %ld\n", argv0, c, file, input_loc); |
| emit(c); |
| return; |
| } |
| lastc = c; state = state4; return; |
| |
| case state1: /* seen an escape */ |
| if(c == '$'){ state = state2; return; } |
| if(c == '('){ state = state3; return; } |
| emit(ESC); state = state0; goto again; |
| |
| case state2: /* may be shifting into JIS */ |
| if((c == '@') || (c == 'B')){ |
| set8 = 1; state = state0; return; |
| } |
| emit(ESC); emit('$'); state = state0; goto again; |
| |
| case state3: /* may be shifting out of JIS */ |
| if((c == 'J') || (c == 'H') || (c == 'B')){ |
| japan646 = (c == 'J'); |
| set8 = 0; state = state0; return; |
| } |
| emit(ESC); emit('('); state = state0; goto again; |
| |
| case state4: /* two part char */ |
| if(c < 0){ |
| if(squawk) |
| EPR "%s: unexpected EOF in %s\n", argv0, file); |
| c = 0x21 | (lastc&0x80); |
| } |
| if(CANS2J(lastc, c)){ /* ms dos sjis */ |
| int hi = lastc, lo = c; |
| S2J(hi, lo); /* convert to 208 */ |
| n = hi*100 + lo - 3232; /* convert to kuten208 */ |
| } else |
| n = (lastc&0x7F)*100 + (c&0x7f) - 3232; /* kuten208 */ |
| if((n >= KUTEN208MAX) || ((l = tabkuten208[n]) == -1)){ |
| nerrors++; |
| if(squawk) |
| EPR "%s: unknown kuten208 %d (from 0x%x,0x%x) near byte %ld in %s\n", argv0, n, lastc, c, input_loc, file); |
| if(!clean) |
| emit(BADMAP); |
| } else { |
| if(l < 0){ |
| l = -l; |
| if(squawk) |
| EPR "%s: ambiguous kuten208 %d (mapped to 0x%lx) near byte %ld in %s\n", argv0, n, l, input_loc, file); |
| } |
| emit(l); |
| } |
| state = state0; |
| } |
| } |
| |
| /* |
| a state machine for interpreting ms-kanji == shift-jis. |
| */ |
| static void |
| ms(int c, Rune **r, long input_loc) |
| { |
| static enum { state0, state1, state2, state3, state4 } state = state0; |
| static int set8 = 0; |
| static int japan646 = 0; |
| static int lastc; |
| int n; |
| long l; |
| |
| again: |
| switch(state) |
| { |
| case state0: /* idle state */ |
| if(c == ESC){ state = state1; return; } |
| if(c < 0) return; |
| if(!set8 && (c < 128)){ |
| if(japan646){ |
| switch(c) |
| { |
| case '\\': emit(0xA5); return; /* yen */ |
| case '~': emit(0xAF); return; /* spacing macron */ |
| default: emit(c); return; |
| } |
| } else { |
| emit(c); |
| return; |
| } |
| } |
| lastc = c; state = state4; return; |
| |
| case state1: /* seen an escape */ |
| if(c == '$'){ state = state2; return; } |
| if(c == '('){ state = state3; return; } |
| emit(ESC); state = state0; goto again; |
| |
| case state2: /* may be shifting into JIS */ |
| if((c == '@') || (c == 'B')){ |
| set8 = 1; state = state0; return; |
| } |
| emit(ESC); emit('$'); state = state0; goto again; |
| |
| case state3: /* may be shifting out of JIS */ |
| if((c == 'J') || (c == 'H') || (c == 'B')){ |
| japan646 = (c == 'J'); |
| set8 = 0; state = state0; return; |
| } |
| emit(ESC); emit('('); state = state0; goto again; |
| |
| case state4: /* two part char */ |
| if(c < 0){ |
| if(squawk) |
| EPR "%s: unexpected EOF in %s\n", argv0, file); |
| c = 0x21 | (lastc&0x80); |
| } |
| if(CANS2J(lastc, c)){ /* ms dos sjis */ |
| int hi = lastc, lo = c; |
| S2J(hi, lo); /* convert to 208 */ |
| n = hi*100 + lo - 3232; /* convert to kuten208 */ |
| } else { |
| nerrors++; |
| if(squawk) |
| EPR "%s: illegal byte pair (0x%x,0x%x) near byte %ld in %s\n", argv0, lastc, c, input_loc, file); |
| if(!clean) |
| emit(BADMAP); |
| state = state0; |
| goto again; |
| } |
| if((n >= KUTEN208MAX) || ((l = tabkuten208[n]) == -1)){ |
| nerrors++; |
| if(squawk) |
| EPR "%s: unknown kuten208 %d (from 0x%x,0x%x) near byte %ld in %s\n", argv0, n, lastc, c, input_loc, file); |
| if(!clean) |
| emit(BADMAP); |
| } else { |
| if(l < 0){ |
| l = -l; |
| if(squawk) |
| EPR "%s: ambiguous kuten208 %d (mapped to 0x%lx) near byte %ld in %s\n", argv0, n, l, input_loc, file); |
| } |
| emit(l); |
| } |
| state = state0; |
| } |
| } |
| |
| /* |
| a state machine for interpreting ujis == EUC |
| */ |
| static void |
| ujis(int c, Rune **r, long input_loc) |
| { |
| static enum { state0, state1 } state = state0; |
| static int lastc; |
| int n; |
| long l; |
| |
| switch(state) |
| { |
| case state0: /* idle state */ |
| if(c < 0) return; |
| if(c < 128){ |
| emit(c); |
| return; |
| } |
| if(c == 0x8e){ /* codeset 2 */ |
| nerrors++; |
| if(squawk) |
| EPR "%s: unknown codeset 2 near byte %ld in %s\n", argv0, input_loc, file); |
| if(!clean) |
| emit(BADMAP); |
| return; |
| } |
| if(c == 0x8f){ /* codeset 3 */ |
| nerrors++; |
| if(squawk) |
| EPR "%s: unknown codeset 3 near byte %ld in %s\n", argv0, input_loc, file); |
| if(!clean) |
| emit(BADMAP); |
| return; |
| } |
| lastc = c; |
| state = state1; |
| return; |
| |
| case state1: /* two part char */ |
| if(c < 0){ |
| if(squawk) |
| EPR "%s: unexpected EOF in %s\n", argv0, file); |
| c = 0xA1; |
| } |
| n = (lastc&0x7F)*100 + (c&0x7F) - 3232; /* kuten208 */ |
| if((n >= KUTEN208MAX) || ((l = tabkuten208[n]) == -1)){ |
| nerrors++; |
| if(squawk) |
| EPR "%s: unknown kuten208 %d (from 0x%x,0x%x) near byte %ld in %s\n", argv0, n, lastc, c, input_loc, file); |
| if(!clean) |
| emit(BADMAP); |
| } else { |
| if(l < 0){ |
| l = -l; |
| if(squawk) |
| EPR "%s: ambiguous kuten208 %d (mapped to 0x%lx) near byte %ld in %s\n", argv0, n, l, input_loc, file); |
| } |
| emit(l); |
| } |
| state = state0; |
| } |
| } |
| |
| /* |
| a state machine for interpreting jis-kanji == 2022-JP |
| */ |
| static void |
| jis(int c, Rune **r, long input_loc) |
| { |
| static enum { state0, state1, state2, state3, state4 } state = state0; |
| static int set8 = 0; |
| static int japan646 = 0; |
| static int lastc; |
| int n; |
| long l; |
| |
| again: |
| switch(state) |
| { |
| case state0: /* idle state */ |
| if(c == ESC){ state = state1; return; } |
| if(c < 0) return; |
| if(!set8 && (c < 128)){ |
| if(japan646){ |
| switch(c) |
| { |
| case '\\': emit(0xA5); return; /* yen */ |
| case '~': emit(0xAF); return; /* spacing macron */ |
| default: emit(c); return; |
| } |
| } else { |
| emit(c); |
| return; |
| } |
| } |
| lastc = c; state = state4; return; |
| |
| case state1: /* seen an escape */ |
| if(c == '$'){ state = state2; return; } |
| if(c == '('){ state = state3; return; } |
| emit(ESC); state = state0; goto again; |
| |
| case state2: /* may be shifting into JIS */ |
| if((c == '@') || (c == 'B')){ |
| set8 = 1; state = state0; return; |
| } |
| emit(ESC); emit('$'); state = state0; goto again; |
| |
| case state3: /* may be shifting out of JIS */ |
| if((c == 'J') || (c == 'H') || (c == 'B')){ |
| japan646 = (c == 'J'); |
| set8 = 0; state = state0; return; |
| } |
| emit(ESC); emit('('); state = state0; goto again; |
| |
| case state4: /* two part char */ |
| if(c < 0){ |
| if(squawk) |
| EPR "%s: unexpected EOF in %s\n", argv0, file); |
| c = 0x21 | (lastc&0x80); |
| } |
| if((lastc&0x80) != (c&0x80)){ /* guard against latin1 in jis */ |
| emit(lastc); |
| state = state0; |
| goto again; |
| } |
| n = (lastc&0x7F)*100 + (c&0x7f) - 3232; /* kuten208 */ |
| if((n >= KUTEN208MAX) || ((l = tabkuten208[n]) == -1)){ |
| nerrors++; |
| if(squawk) |
| EPR "%s: unknown kuten208 %d (from 0x%x,0x%x) near byte %ld in %s\n", argv0, n, lastc, c, input_loc, file); |
| if(!clean) |
| emit(BADMAP); |
| } else { |
| if(l < 0){ |
| l = -l; |
| if(squawk) |
| EPR "%s: ambiguous kuten208 %d (mapped to 0x%lx) near byte %ld in %s\n", argv0, n, l, input_loc, file); |
| } |
| emit(l); |
| } |
| state = state0; |
| } |
| } |
| |
| static void |
| do_in(int fd, void (*procfn)(int, Rune **, long), struct convert *out) |
| { |
| Rune ob[N]; |
| Rune *r, *re; |
| uchar ibuf[N]; |
| int n, i; |
| long nin; |
| |
| r = ob; |
| re = ob+N-3; |
| nin = 0; |
| while((n = read(fd, ibuf, sizeof ibuf)) > 0){ |
| for(i = 0; i < n; i++){ |
| (*procfn)(ibuf[i], &r, nin++); |
| if(r >= re){ |
| OUT(out, ob, r-ob); |
| r = ob; |
| } |
| } |
| if(r > ob){ |
| OUT(out, ob, r-ob); |
| r = ob; |
| } |
| } |
| (*procfn)(-1, &r, nin); |
| if(r > ob) |
| OUT(out, ob, r-ob); |
| OUT(out, ob, 0); |
| } |
| |
| void |
| jis_in(int fd, long *notused, struct convert *out) |
| { |
| USED(notused); |
| do_in(fd, alljis, out); |
| } |
| |
| void |
| ujis_in(int fd, long *notused, struct convert *out) |
| { |
| USED(notused); |
| do_in(fd, ujis, out); |
| } |
| |
| void |
| msjis_in(int fd, long *notused, struct convert *out) |
| { |
| USED(notused); |
| do_in(fd, ms, out); |
| } |
| |
| void |
| jisjis_in(int fd, long *notused, struct convert *out) |
| { |
| USED(notused); |
| do_in(fd, jis, out); |
| } |
| |
| static int first = 1; |
| |
| static void |
| tab_init(void) |
| { |
| int i; |
| long l; |
| |
| first = 0; |
| for(i = 0; i < NRUNE; i++) |
| tab[i] = -1; |
| for(i = 0; i < KUTEN208MAX; i++) |
| if((l = tabkuten208[i]) != -1){ |
| if(l < 0) |
| tab[-l] = i; |
| else |
| tab[l] = i; |
| } |
| } |
| |
| |
| /* jis-kanji, or ISO 2022-JP */ |
| void |
| jisjis_out(Rune *base, int n, long *notused) |
| { |
| char *p; |
| int i; |
| Rune r; |
| static enum { ascii, japan646, jp2022 } state = ascii; |
| |
| USED(notused); |
| if(first) |
| tab_init(); |
| nrunes += n; |
| p = obuf; |
| for(i = 0; i < n; i++){ |
| r = base[i]; |
| if(r < 128){ |
| if(state == jp2022){ |
| *p++ = ESC; *p++ = '('; *p++ = 'B'; |
| state = ascii; |
| } |
| *p++ = r; |
| } else { |
| if(tab[r] != -1){ |
| if(state != jp2022){ |
| *p++ = ESC; *p++ = '$'; *p++ = 'B'; |
| state = jp2022; |
| } |
| *p++ = tab[r]/100 + ' '; |
| *p++ = tab[r]%100 + ' '; |
| continue; |
| } |
| if(squawk) |
| EPR "%s: rune 0x%x not in output cs\n", argv0, r); |
| nerrors++; |
| if(clean) |
| continue; |
| *p++ = BYTEBADMAP; |
| } |
| } |
| noutput += p-obuf; |
| if(p > obuf) |
| write(1, obuf, p-obuf); |
| } |
| |
| /* ms-kanji, or Shift-JIS */ |
| void |
| msjis_out(Rune *base, int n, long *notused) |
| { |
| char *p; |
| int i, hi, lo; |
| Rune r; |
| |
| USED(notused); |
| if(first) |
| tab_init(); |
| nrunes += n; |
| p = obuf; |
| for(i = 0; i < n; i++){ |
| r = base[i]; |
| if(r < 128) |
| *p++ = r; |
| else { |
| if(tab[r] != -1){ |
| hi = tab[r]/100 + ' '; |
| lo = tab[r]%100 + ' '; |
| J2S(hi, lo); |
| *p++ = hi; |
| *p++ = lo; |
| continue; |
| } |
| if(squawk) |
| EPR "%s: rune 0x%x not in output cs\n", argv0, r); |
| nerrors++; |
| if(clean) |
| continue; |
| *p++ = BYTEBADMAP; |
| } |
| } |
| noutput += p-obuf; |
| if(p > obuf) |
| write(1, obuf, p-obuf); |
| } |
| |
| /* ujis, or EUC */ |
| void |
| ujis_out(Rune *base, int n, long *notused) |
| { |
| char *p; |
| int i; |
| Rune r; |
| |
| USED(notused); |
| if(first) |
| tab_init(); |
| nrunes += n; |
| p = obuf; |
| for(i = 0; i < n; i++){ |
| r = base[i]; |
| if(r < 128) |
| *p++ = r; |
| else { |
| if(tab[r] != -1){ |
| *p++ = 0x80 | (tab[r]/100 + ' '); |
| *p++ = 0x80 | (tab[r]%100 + ' '); |
| continue; |
| } |
| if(squawk) |
| EPR "%s: rune 0x%x not in output cs\n", argv0, r); |
| nerrors++; |
| if(clean) |
| continue; |
| *p++ = BYTEBADMAP; |
| } |
| } |
| noutput += p-obuf; |
| if(p > obuf) |
| write(1, obuf, p-obuf); |
| } |