|  | #ifdef PLAN9 | 
|  | #include	<u.h> | 
|  | #include	<libc.h> | 
|  | #include	<bio.h> | 
|  | #ifdef PLAN9PORT | 
|  | #include	<errno.h> | 
|  | #else | 
|  | extern int errno; | 
|  | #endif | 
|  | #else | 
|  | #include	<sys/types.h> | 
|  | #include	<stdio.h> | 
|  | #include	<stdlib.h> | 
|  | #include	<string.h> | 
|  | #include	<unistd.h> | 
|  | #include	<errno.h> | 
|  | #include	"plan9.h" | 
|  | #endif | 
|  | #include	"hdr.h" | 
|  | #ifndef EILSEQ | 
|  | #define EILSEQ 9998 | 
|  | #endif | 
|  |  | 
|  | /* | 
|  | the our_* routines are implementations for the corresponding library | 
|  | routines. for a while, i tried to actually name them wctomb etc | 
|  | but stopped that after i found a system which made wchar_t an | 
|  | unsigned char. | 
|  | */ | 
|  |  | 
|  | int our_wctomb(char *s, unsigned long wc); | 
|  | int our_mbtowc(unsigned long *p, char *s, unsigned n); | 
|  | int runetoisoutf(char *str, Rune *rune); | 
|  | int fullisorune(char *str, int n); | 
|  | int isochartorune(Rune *rune, char *str); | 
|  |  | 
|  | void | 
|  | utf_in(int fd, long *notused, struct convert *out) | 
|  | { | 
|  | char buf[N]; | 
|  | int i, j, c, n, tot; | 
|  | ulong l; | 
|  |  | 
|  | USED(notused); | 
|  | tot = 0; | 
|  | while((n = read(fd, buf+tot, N-tot)) >= 0){ | 
|  | tot += n; | 
|  | for(i=j=0; i<=tot-UTFmax || (i<tot && (n==0 || fullrune(buf+i, tot-i))); ){ | 
|  | c = our_mbtowc(&l, buf+i, tot-i); | 
|  | if(c == -1){ | 
|  | if(squawk) | 
|  | EPR "%s: bad UTF sequence near byte %ld in input\n", argv0, ninput+i); | 
|  | if(clean){ | 
|  | i++; | 
|  | continue; | 
|  | } | 
|  | nerrors++; | 
|  | l = Runeerror; | 
|  | c = 1; | 
|  | } | 
|  | runes[j++] = l; | 
|  | i += c; | 
|  | } | 
|  | OUT(out, runes, j); | 
|  | tot -= i; | 
|  | ninput += i; | 
|  | if(tot) | 
|  | memmove(buf, buf+i, tot); | 
|  | if(n == 0) | 
|  | break; | 
|  | } | 
|  | OUT(out, runes, 0); | 
|  | } | 
|  |  | 
|  | void | 
|  | utf_out(Rune *base, int n, long *notused) | 
|  | { | 
|  | char *p; | 
|  | Rune *r; | 
|  |  | 
|  | USED(notused); | 
|  | nrunes += n; | 
|  | for(r = base, p = obuf; n-- > 0; r++){ | 
|  | p += our_wctomb(p, *r); | 
|  | } | 
|  | noutput += p-obuf; | 
|  | write(1, obuf, p-obuf); | 
|  | } | 
|  |  | 
|  | void | 
|  | isoutf_in(int fd, long *notused, struct convert *out) | 
|  | { | 
|  | char buf[N]; | 
|  | int i, j, c, n, tot; | 
|  |  | 
|  | USED(notused); | 
|  | tot = 0; | 
|  | while((n = read(fd, buf+tot, N-tot)) >= 0){ | 
|  | tot += n; | 
|  | for(i=j=0; i<tot; ){ | 
|  | if(!fullisorune(buf+i, tot-i)) | 
|  | break; | 
|  | c = isochartorune(&runes[j], buf+i); | 
|  | if(runes[j] == Runeerror && c == 1){ | 
|  | if(squawk) | 
|  | EPR "%s: bad UTF sequence near byte %ld in input\n", argv0, ninput+i); | 
|  | if(clean){ | 
|  | i++; | 
|  | continue; | 
|  | } | 
|  | nerrors++; | 
|  | } | 
|  | j++; | 
|  | i += c; | 
|  | } | 
|  | OUT(out, runes, j); | 
|  | tot -= i; | 
|  | ninput += i; | 
|  | if(tot) | 
|  | memmove(buf, buf+i, tot); | 
|  | if(n == 0) | 
|  | break; | 
|  | } | 
|  | OUT(out, runes, 0); | 
|  | } | 
|  |  | 
|  | void | 
|  | isoutf_out(Rune *base, int n, long *notused) | 
|  | { | 
|  | char *p; | 
|  | Rune *r; | 
|  |  | 
|  | USED(notused); | 
|  | nrunes += n; | 
|  | for(r = base, p = obuf; n-- > 0; r++) | 
|  | p += runetoisoutf(p, r); | 
|  | noutput += p-obuf; | 
|  | write(1, obuf, p-obuf); | 
|  | } | 
|  |  | 
|  |  | 
|  | int | 
|  | isochartorune(Rune *rune, char *str) | 
|  | { | 
|  | return chartorune(rune, str); | 
|  | } | 
|  |  | 
|  | int | 
|  | runetoisoutf(char *str, Rune *rune) | 
|  | { | 
|  | return runetochar(str, rune); | 
|  | } | 
|  |  | 
/*
 * fullisorune: report, via fullrune, whether the n bytes at str hold
 * a complete rune; the result of fullrune is returned unchanged.
 */
int
fullisorune(char *str, int n)
{
	int complete;

	complete = fullrune(str, n);
	return complete;
}
|  |  | 
/*
 * Constants for the multibyte encoding handled by our_wctomb and
 * our_mbtowc, grouped by sequence length:
 *	T<k>	tag bits of the lead byte of a k-byte sequence
 *	Bit<k>	number of payload bits in that lead byte
 *	Mask<k>	mask selecting those payload bits
 *	Wchar<k> largest value that fits in a k-byte sequence
 * The x variants describe continuation bytes.
 */
enum
{
	/* continuation byte: 10xxxxxx */
	Tx	= 0x80,
	Bitx	= 6,
	Maskx	= (1<<Bitx)-1,

	/* 1 byte: 0xxxxxxx */
	T1	= 0x00,
	Bit1	= 7,
	Mask1	= (1<<Bit1)-1,
	Wchar1	= (1UL<<Bit1)-1,

	/* 2 bytes: 110xxxxx */
	T2	= 0xC0,
	Bit2	= 5,
	Mask2	= (1<<Bit2)-1,
	Wchar2	= (1UL<<(Bit2+Bitx))-1,

	/* 3 bytes: 1110xxxx */
	T3	= 0xE0,
	Bit3	= 4,
	Mask3	= (1<<Bit3)-1,
	Wchar3	= (1UL<<(Bit3+2*Bitx))-1,

	/* 4 bytes: 11110xxx */
	T4	= 0xF0,
	Bit4	= 3,
	Mask4	= (1<<Bit4)-1,
	Wchar4	= (1UL<<(Bit4+3*Bitx))-1,

	/* 5 bytes: 111110xx */
	T5	= 0xF8,
	Bit5	= 2,
	Mask5	= (1<<Bit5)-1,
	Wchar5	= (1UL<<(Bit5+4*Bitx))-1,

	/* 6 bytes: 111111xx; two payload bits, 2+5*6 = 32 bits in all */
	T6	= 0xFC,
	Bit6	= 2,
	Mask6	= (1<<Bit6)-1
};
|  |  | 
|  | int | 
|  | our_wctomb(char *s, unsigned long wc) | 
|  | { | 
|  | if(s == 0) | 
|  | return 0;		/* no shift states */ | 
|  | if(wc & ~Wchar2) { | 
|  | if(wc & ~Wchar4) { | 
|  | if(wc & ~Wchar5) { | 
|  | /* 6 bytes */ | 
|  | s[0] = T6 | ((wc >> 5*Bitx) & Mask6); | 
|  | s[1] = Tx | ((wc >> 4*Bitx) & Maskx); | 
|  | s[2] = Tx | ((wc >> 3*Bitx) & Maskx); | 
|  | s[3] = Tx | ((wc >> 2*Bitx) & Maskx); | 
|  | s[4] = Tx | ((wc >> 1*Bitx) & Maskx); | 
|  | s[5] = Tx |  (wc & Maskx); | 
|  | return 6; | 
|  | } | 
|  | /* 5 bytes */ | 
|  | s[0] = T5 |  (wc >> 4*Bitx); | 
|  | s[1] = Tx | ((wc >> 3*Bitx) & Maskx); | 
|  | s[2] = Tx | ((wc >> 2*Bitx) & Maskx); | 
|  | s[3] = Tx | ((wc >> 1*Bitx) & Maskx); | 
|  | s[4] = Tx |  (wc & Maskx); | 
|  | return 5; | 
|  | } | 
|  | if(wc & ~Wchar3) { | 
|  | /* 4 bytes */ | 
|  | s[0] = T4 |  (wc >> 3*Bitx); | 
|  | s[1] = Tx | ((wc >> 2*Bitx) & Maskx); | 
|  | s[2] = Tx | ((wc >> 1*Bitx) & Maskx); | 
|  | s[3] = Tx |  (wc & Maskx); | 
|  | return 4; | 
|  | } | 
|  | /* 3 bytes */ | 
|  | s[0] = T3 |  (wc >> 2*Bitx); | 
|  | s[1] = Tx | ((wc >> 1*Bitx) & Maskx); | 
|  | s[2] = Tx |  (wc & Maskx); | 
|  | return 3; | 
|  | } | 
|  | if(wc & ~Wchar1) { | 
|  | /* 2 bytes */ | 
|  | s[0] = T2 | (wc >> 1*Bitx); | 
|  | s[1] = Tx | (wc & Maskx); | 
|  | return 2; | 
|  | } | 
|  | /* 1 byte */ | 
|  | s[0] = T1 | wc; | 
|  | return 1; | 
|  | } | 
|  |  | 
|  | int | 
|  | our_mbtowc(unsigned long *p, char *s, unsigned n) | 
|  | { | 
|  | uchar *us; | 
|  | int c0, c1, c2, c3, c4, c5; | 
|  | unsigned long wc; | 
|  |  | 
|  | if(s == 0) | 
|  | return 0;		/* no shift states */ | 
|  |  | 
|  | if(n < 1) | 
|  | goto bad; | 
|  | us = (uchar*)s; | 
|  | c0 = us[0]; | 
|  | if(c0 >= T3) { | 
|  | if(n < 3) | 
|  | goto bad; | 
|  | c1 = us[1] ^ Tx; | 
|  | c2 = us[2] ^ Tx; | 
|  | if((c1|c2) & T2) | 
|  | goto bad; | 
|  | if(c0 >= T5) { | 
|  | if(n < 5) | 
|  | goto bad; | 
|  | c3 = us[3] ^ Tx; | 
|  | c4 = us[4] ^ Tx; | 
|  | if((c3|c4) & T2) | 
|  | goto bad; | 
|  | if(c0 >= T6) { | 
|  | /* 6 bytes */ | 
|  | if(n < 6) | 
|  | goto bad; | 
|  | c5 = us[5] ^ Tx; | 
|  | if(c5 & T2) | 
|  | goto bad; | 
|  | wc = ((((((((((c0 & Mask6) << Bitx) | | 
|  | c1) << Bitx) | c2) << Bitx) | | 
|  | c3) << Bitx) | c4) << Bitx) | c5; | 
|  | if(wc <= Wchar5) | 
|  | goto bad; | 
|  | *p = wc; | 
|  | return 6; | 
|  | } | 
|  | /* 5 bytes */ | 
|  | wc = ((((((((c0 & Mask5) << Bitx) | | 
|  | c1) << Bitx) | c2) << Bitx) | | 
|  | c3) << Bitx) | c4; | 
|  | if(wc <= Wchar4) | 
|  | goto bad; | 
|  | *p = wc; | 
|  | return 5; | 
|  | } | 
|  | if(c0 >= T4) { | 
|  | /* 4 bytes */ | 
|  | if(n < 4) | 
|  | goto bad; | 
|  | c3 = us[3] ^ Tx; | 
|  | if(c3 & T2) | 
|  | goto bad; | 
|  | wc = ((((((c0 & Mask4) << Bitx) | | 
|  | c1) << Bitx) | c2) << Bitx) | | 
|  | c3; | 
|  | if(wc <= Wchar3) | 
|  | goto bad; | 
|  | *p = wc; | 
|  | return 4; | 
|  | } | 
|  | /* 3 bytes */ | 
|  | wc = ((((c0 & Mask3) << Bitx) | | 
|  | c1) << Bitx) | c2; | 
|  | if(wc <= Wchar2) | 
|  | goto bad; | 
|  | *p = wc; | 
|  | return 3; | 
|  | } | 
|  | if(c0 >= T2) { | 
|  | /* 2 bytes */ | 
|  | if(n < 2) | 
|  | goto bad; | 
|  | c1 = us[1] ^ Tx; | 
|  | if(c1 & T2) | 
|  | goto bad; | 
|  | wc = ((c0 & Mask2) << Bitx) | | 
|  | c1; | 
|  | if(wc <= Wchar1) | 
|  | goto bad; | 
|  | *p = wc; | 
|  | return 2; | 
|  | } | 
|  | /* 1 byte */ | 
|  | if(c0 >= Tx) | 
|  | goto bad; | 
|  | *p = c0; | 
|  | return 1; | 
|  |  | 
|  | bad: | 
|  | errno = EILSEQ; | 
|  | return -1; | 
|  | } |