| #ifdef PLAN9 |
| #include <u.h> |
| #include <libc.h> |
| #include <bio.h> |
| #ifdef PLAN9PORT |
| #include <errno.h> |
| #else |
| extern int errno; |
| #endif |
| #else |
| #include <sys/types.h> |
| #include <stdio.h> |
| #include <stdlib.h> |
| #include <string.h> |
| #include <unistd.h> |
| #include <errno.h> |
| #include "plan9.h" |
| #endif |
| #include "hdr.h" |
| #ifndef EILSEQ |
| #define EILSEQ 9998 |
| #endif |
| |
| /* |
| the our_* routines are implementations for the corresponding library |
| routines. for a while, i tried to actually name them wctomb etc |
| but stopped that after i found a system which made wchar_t an |
| unsigned char. |
| */ |
| |
| int our_wctomb(char *s, unsigned long wc); /* encode wc at s; returns bytes written (0 if s is 0) */ |
| int our_mbtowc(unsigned long *p, char *s, unsigned n); /* decode at most n bytes of s into *p; returns bytes used, 0 if s is 0, -1 on bad input */ |
| int runetoisoutf(char *str, Rune *rune); /* wrapper around runetochar */ |
| int fullisorune(char *str, int n); /* wrapper around fullrune */ |
| int isochartorune(Rune *rune, char *str); /* wrapper around chartorune */ |
| |
| void |
| utf_in(int fd, long *notused, struct convert *out) |
| { |
| char buf[N]; |
| int i, j, c, n, tot; |
| ulong l; |
| |
| USED(notused); |
| tot = 0; |
| while((n = read(fd, buf+tot, N-tot)) >= 0){ |
| tot += n; |
| for(i=j=0; i<=tot-UTFmax || (i<tot && (n==0 || fullrune(buf+i, tot-i))); ){ |
| c = our_mbtowc(&l, buf+i, tot-i); |
| if(c == -1){ |
| if(squawk) |
| EPR "%s: bad UTF sequence near byte %ld in input\n", argv0, ninput+i); |
| if(clean){ |
| i++; |
| continue; |
| } |
| nerrors++; |
| l = Runeerror; |
| c = 1; |
| } |
| runes[j++] = l; |
| i += c; |
| } |
| OUT(out, runes, j); |
| tot -= i; |
| ninput += i; |
| if(tot) |
| memmove(buf, buf+i, tot); |
| if(n == 0) |
| break; |
| } |
| OUT(out, runes, 0); |
| } |
| |
| void |
| utf_out(Rune *base, int n, long *notused) |
| { |
| char *p; |
| Rune *r; |
| |
| USED(notused); |
| nrunes += n; |
| for(r = base, p = obuf; n-- > 0; r++){ |
| p += our_wctomb(p, *r); |
| } |
| noutput += p-obuf; |
| write(1, obuf, p-obuf); |
| } |
| |
| void |
| isoutf_in(int fd, long *notused, struct convert *out) |
| { |
| char buf[N]; |
| int i, j, c, n, tot; |
| |
| USED(notused); |
| tot = 0; |
| while((n = read(fd, buf+tot, N-tot)) >= 0){ |
| tot += n; |
| for(i=j=0; i<tot; ){ |
| if(!fullisorune(buf+i, tot-i)) |
| break; |
| c = isochartorune(&runes[j], buf+i); |
| if(runes[j] == Runeerror && c == 1){ |
| if(squawk) |
| EPR "%s: bad UTF sequence near byte %ld in input\n", argv0, ninput+i); |
| if(clean){ |
| i++; |
| continue; |
| } |
| nerrors++; |
| } |
| j++; |
| i += c; |
| } |
| OUT(out, runes, j); |
| tot -= i; |
| ninput += i; |
| if(tot) |
| memmove(buf, buf+i, tot); |
| if(n == 0) |
| break; |
| } |
| OUT(out, runes, 0); |
| } |
| |
| void |
| isoutf_out(Rune *base, int n, long *notused) |
| { |
| char *p; |
| Rune *r; |
| |
| USED(notused); |
| nrunes += n; |
| for(r = base, p = obuf; n-- > 0; r++) |
| p += runetoisoutf(p, r); |
| noutput += p-obuf; |
| write(1, obuf, p-obuf); |
| } |
| |
| |
| int |
| isochartorune(Rune *rune, char *str) |
| { |
| return chartorune(rune, str); |
| } |
| |
| int |
| runetoisoutf(char *str, Rune *rune) |
| { |
| return runetochar(str, rune); |
| } |
| |
/*
 * Ask fullrune about the first n bytes of str and return its answer
 * unchanged.
 */
int
fullisorune(char *str, int n)
{
	int complete;

	complete = fullrune(str, n);
	return complete;
}
| |
/*
 * Bit-layout constants for the 1- to 6-byte encoding implemented by
 * our_wctomb and our_mbtowc.
 *	Tn	tag (high) bits of the lead byte of an n-byte sequence
 *	Tx	tag bits of a continuation byte
 *	Bitn	number of payload bits in the lead byte of an n-byte sequence
 *	Bitx	number of payload bits in a continuation byte
 *	Maskn	mask selecting those payload bits
 *	Wcharn	largest value that fits in an n-byte sequence
 * Bit6 is 2, so every lead byte from 0xFC to 0xFF starts a six-byte
 * sequence and that form carries 2+5*6 = 32 bits.
 */
enum
{
	T1	= 0x00,
	Tx	= 0x80,
	T2	= 0xC0,
	T3	= 0xE0,
	T4	= 0xF0,
	T5	= 0xF8,
	T6	= 0xFC,

	Bit1	= 7,
	Bitx	= 6,
	Bit2	= 5,
	Bit3	= 4,
	Bit4	= 3,
	Bit5	= 2,
	Bit6	= 2,

	Mask1	= (1<<Bit1)-1,
	Maskx	= (1<<Bitx)-1,
	Mask2	= (1<<Bit2)-1,
	Mask3	= (1<<Bit3)-1,
	Mask4	= (1<<Bit4)-1,
	Mask5	= (1<<Bit5)-1,
	Mask6	= (1<<Bit6)-1,

	Wchar1	= (1UL<<Bit1)-1,
	Wchar2	= (1UL<<(Bit2+Bitx))-1,
	Wchar3	= (1UL<<(Bit3+2*Bitx))-1,
	Wchar4	= (1UL<<(Bit4+3*Bitx))-1,
	Wchar5	= (1UL<<(Bit5+4*Bitx))-1
};

/*
 * Encode wc at s using the shortest sequence that can hold it and
 * return the number of bytes written (1 to 6).  Returns 0 without
 * writing when s is 0 (no shift states).  Only the low 32 bits of wc
 * are representable; higher bits are dropped in the six-byte form.
 * The caller must provide room for up to 6 bytes.
 */
int
our_wctomb(char *s, unsigned long wc)
{
	int len, lead, i;

	if(s == 0)
		return 0;		/* no shift states */

	if(wc <= Wchar1){
		len = 1;
		lead = T1;
	}else if(wc <= Wchar2){
		len = 2;
		lead = T2;
	}else if(wc <= Wchar3){
		len = 3;
		lead = T3;
	}else if(wc <= Wchar4){
		len = 4;
		lead = T4;
	}else if(wc <= Wchar5){
		len = 5;
		lead = T5;
	}else{
		len = 6;
		lead = T6;
	}

	/* continuation bytes, least significant six bits last */
	for(i = len-1; i > 0; i--){
		s[i] = (char)(Tx | (wc & Maskx));
		wc >>= Bitx;
	}
	/* only the six-byte form can have bits left over above the lead payload */
	if(len == 6)
		wc &= Mask6;
	s[0] = (char)(lead | wc);
	return len;
}

/*
 * Decode one sequence from the first n bytes of s into *p and return
 * the number of bytes it occupies (1 to 6).  Returns 0 when s is 0 (no
 * shift states).  Returns -1 with errno set to EILSEQ, leaving *p
 * untouched, when n is too small for the sequence, a continuation byte
 * is malformed, the lead byte is itself a continuation byte, or the
 * value could have been encoded in fewer bytes.
 *
 * The value is accumulated in an unsigned long: a six-byte sequence can
 * carry 32 bits, which does not fit in a signed int.
 */
int
our_mbtowc(unsigned long *p, char *s, unsigned n)
{
	const unsigned char *us;
	unsigned len, i;
	unsigned long c, wc, min;

	if(s == 0)
		return 0;		/* no shift states */

	if(n < 1)
		goto bad;
	us = (const unsigned char*)s;
	c = us[0];
	if(c < Tx){
		/* 1 byte */
		*p = c;
		return 1;
	}
	if(c < T2)
		goto bad;		/* continuation byte in lead position */

	/* sequence length, lead payload, and largest value of the next shorter form */
	if(c < T3){
		len = 2;
		wc = c & Mask2;
		min = Wchar1;
	}else if(c < T4){
		len = 3;
		wc = c & Mask3;
		min = Wchar2;
	}else if(c < T5){
		len = 4;
		wc = c & Mask4;
		min = Wchar3;
	}else if(c < T6){
		len = 5;
		wc = c & Mask5;
		min = Wchar4;
	}else{
		len = 6;
		wc = c & Mask6;
		min = Wchar5;
	}
	if(n < len)
		goto bad;

	for(i = 1; i < len; i++){
		/* a valid continuation byte is 10xxxxxx; flipping Tx leaves 00xxxxxx */
		c = us[i] ^ Tx;
		if(c & T2)
			goto bad;
		wc = (wc << Bitx) | c;
	}
	if(wc <= min)
		goto bad;		/* overlong encoding */
	*p = wc;
	return (int)len;

bad:
	errno = EILSEQ;
	return -1;
}