blob: 3970e596a99fe6046c695c69f680d651e158aa94 [file] [log] [blame]
#ifdef PLAN9
#include <u.h>
#include <libc.h>
#include <bio.h>
#ifdef PLAN9PORT
#include <errno.h>
#else
extern int errno;
#endif
#else
#include <sys/types.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <errno.h>
#include "plan9.h"
#endif
#include "hdr.h"
#ifndef EILSEQ
#define EILSEQ 9998
#endif
/*
the our_* routines are implementations for the corresponding library
routines. for a while, i tried to actually name them wctomb etc
but stopped that after i found a system which made wchar_t an
unsigned char.
*/
/* encode wc at s; returns the number of bytes stored (1-6), or 0 when s is null */
int our_wctomb(char *s, unsigned long wc);
/* decode one sequence of at most n bytes at s into *p; returns the bytes used, 0 when s is null, or -1 with errno set to EILSEQ */
int our_mbtowc(unsigned long *p, char *s, unsigned n);
/* the three routines below are defined in this file as direct wrappers around runetochar, fullrune and chartorune */
int runetoisoutf(char *str, Rune *rune);
int fullisorune(char *str, int n);
int isochartorune(Rune *rune, char *str);
void
utf_in(int fd, long *notused, struct convert *out)
{
char buf[N];
int i, j, c, n, tot;
ulong l;
USED(notused);
tot = 0;
while((n = read(fd, buf+tot, N-tot)) >= 0){
tot += n;
for(i=j=0; i<=tot-UTFmax || (i<tot && (n==0 || fullrune(buf+i, tot-i))); ){
c = our_mbtowc(&l, buf+i, tot-i);
if(c == -1){
if(squawk)
EPR "%s: bad UTF sequence near byte %ld in input\n", argv0, ninput+i);
if(clean){
i++;
continue;
}
nerrors++;
l = Runeerror;
c = 1;
}
runes[j++] = l;
i += c;
}
OUT(out, runes, j);
tot -= i;
ninput += i;
if(tot)
memmove(buf, buf+i, tot);
if(n == 0)
break;
}
OUT(out, runes, 0);
}
/*
 * Encode the n runes at base into obuf with our_wctomb and write the
 * result to file descriptor 1.  nrunes and noutput are advanced by the
 * number of runes taken and bytes produced.  notused is ignored.
 *
 * NOTE(review): our_wctomb can store up to 6 bytes per rune; obuf is
 * declared elsewhere, so confirm it is sized for that.
 */
void
utf_out(Rune *base, int n, long *notused)
{
	char *dst;
	int k;

	USED(notused);
	nrunes += n;
	dst = obuf;
	for(k = 0; k < n; k++)
		dst += our_wctomb(dst, base[k]);
	noutput += dst-obuf;
	write(1, obuf, dst-obuf);
}
void
isoutf_in(int fd, long *notused, struct convert *out)
{
char buf[N];
int i, j, c, n, tot;
USED(notused);
tot = 0;
while((n = read(fd, buf+tot, N-tot)) >= 0){
tot += n;
for(i=j=0; i<tot; ){
if(!fullisorune(buf+i, tot-i))
break;
c = isochartorune(&runes[j], buf+i);
if(runes[j] == Runeerror && c == 1){
if(squawk)
EPR "%s: bad UTF sequence near byte %ld in input\n", argv0, ninput+i);
if(clean){
i++;
continue;
}
nerrors++;
}
j++;
i += c;
}
OUT(out, runes, j);
tot -= i;
ninput += i;
if(tot)
memmove(buf, buf+i, tot);
if(n == 0)
break;
}
OUT(out, runes, 0);
}
/*
 * Encode the n runes at base into obuf with runetoisoutf and write the
 * result to file descriptor 1.  nrunes and noutput are advanced by the
 * number of runes taken and bytes produced.  notused is ignored.
 */
void
isoutf_out(Rune *base, int n, long *notused)
{
	char *dst;
	int k;

	USED(notused);
	nrunes += n;
	dst = obuf;
	for(k = 0; k < n; k++)
		dst += runetoisoutf(dst, &base[k]);
	noutput += dst-obuf;
	write(1, obuf, dst-obuf);
}
/*
 * Decode the sequence at str into *rune.  Forwards straight to
 * chartorune and hands back its return value unchanged.
 */
int
isochartorune(Rune *rune, char *str)
{
	int used;

	used = chartorune(rune, str);
	return used;
}
/*
 * Encode *rune at str.  Forwards straight to runetochar and hands
 * back its return value unchanged.
 */
int
runetoisoutf(char *str, Rune *rune)
{
	int stored;

	stored = runetochar(str, rune);
	return stored;
}
/*
 * Ask whether the n bytes at str hold a whole rune.  Forwards straight
 * to fullrune and hands back its return value unchanged.
 */
int
fullisorune(char *str, int n)
{
	int whole;

	whole = fullrune(str, n);
	return whole;
}
/*
 * Bit-layout constants used by our_wctomb and our_mbtowc.
 *	Tn	tag bits of the lead byte of an n-byte sequence
 *		(Tx is the tag of every following byte)
 *	Bitn	number of payload bits held by that byte
 *	Maskn	mask selecting those payload bits
 *	Wcharn	largest value that fits in an n-byte sequence
 */
enum
{
T1 = 0x00,	/* 0xxxxxxx */
Tx = 0x80,	/* 10xxxxxx */
T2 = 0xC0,	/* 110xxxxx */
T3 = 0xE0,	/* 1110xxxx */
T4 = 0xF0,	/* 11110xxx */
T5 = 0xF8,	/* 111110xx */
T6 = 0xFC,	/* 111111xx, see Bit6 */
Bit1 = 7,
Bitx = 6,
Bit2 = 5,
Bit3 = 4,
Bit4 = 3,
Bit5 = 2,
/*
 * Same as Bit5: the 6-byte lead byte carries two payload bits, so a
 * 6-byte sequence holds 2 + 5*Bitx = 32 bits and the lead byte ranges
 * over 0xFC-0xFF.
 * NOTE(review): presumably chosen so any 32-bit value can be encoded; confirm.
 */
Bit6 = 2,
Mask1 = (1<<Bit1)-1,	/* 0x7F */
Maskx = (1<<Bitx)-1,	/* 0x3F */
Mask2 = (1<<Bit2)-1,	/* 0x1F */
Mask3 = (1<<Bit3)-1,	/* 0x0F */
Mask4 = (1<<Bit4)-1,	/* 0x07 */
Mask5 = (1<<Bit5)-1,	/* 0x03 */
Mask6 = (1<<Bit6)-1,	/* 0x03 */
Wchar1 = (1UL<<Bit1)-1,	/* 0x7F */
Wchar2 = (1UL<<(Bit2+Bitx))-1,	/* 0x7FF */
Wchar3 = (1UL<<(Bit3+2*Bitx))-1,	/* 0xFFFF */
Wchar4 = (1UL<<(Bit4+3*Bitx))-1,	/* 0x1FFFFF */
Wchar5 = (1UL<<(Bit5+4*Bitx))-1	/* 0x3FFFFFF */
};
/*
 * Store the multibyte encoding of wc at s.
 *
 * s	destination; must have room for up to 6 bytes
 * wc	value to encode
 *
 * Returns the number of bytes stored (1-6), or 0 when s is null.
 * The sequence length is the smallest n for which wc has no bits
 * above Wcharn; anything above Wchar5 takes 6 bytes.
 */
int
our_wctomb(char *s, unsigned long wc)
{
	int len, k;
	unsigned long lead;

	if(s == 0)
		return 0;		/* no shift states */

	/* choose the sequence length and the matching lead-byte tag */
	if(!(wc & ~Wchar1)){
		len = 1;
		lead = T1;
	}else if(!(wc & ~Wchar2)){
		len = 2;
		lead = T2;
	}else if(!(wc & ~Wchar3)){
		len = 3;
		lead = T3;
	}else if(!(wc & ~Wchar4)){
		len = 4;
		lead = T4;
	}else if(!(wc & ~Wchar5)){
		len = 5;
		lead = T5;
	}else{
		len = 6;
		lead = T6;
	}

	/* trailing bytes, last one first, Bitx payload bits apiece */
	for(k = len-1; k > 0; k--){
		s[k] = Tx | (wc & Maskx);
		wc >>= Bitx;
	}

	/* only the 6-byte form can have bits left beyond the lead byte's payload */
	if(len == 6)
		wc &= Mask6;
	s[0] = lead | wc;
	return len;
}
/*
 * Decode one multibyte sequence from s into *p.
 *
 * p	receives the decoded value on success
 * s	input bytes
 * n	number of bytes available at s
 *
 * Returns the number of bytes consumed (1-6), or 0 when s is null.
 * Returns -1 with errno set to EILSEQ when n is too small for the
 * sequence announced by the lead byte, when a following byte lacks the
 * Tx tag, when the lead byte is itself a bare continuation byte, or
 * when the value would have fitted in a shorter sequence.
 *
 * The value is accumulated in an unsigned long: a 6-byte sequence
 * carries 32 bits, which overflows a 32-bit int when the lead byte is
 * 0xFE or 0xFF.
 */
int
our_mbtowc(unsigned long *p, char *s, unsigned n)
{
	uchar *us;
	int c0, c1, c2, c3, c4, c5;
	unsigned long wc;

	if(s == 0)
		return 0;		/* no shift states */

	if(n < 1)
		goto bad;
	us = (uchar*)s;
	c0 = us[0];
	if(c0 >= T3) {
		if(n < 3)
			goto bad;
		/*
		 * x ^ Tx strips the 10 tag from a proper continuation byte,
		 * leaving its 6 payload bits; any bit left in the top two
		 * positions means the byte was not a continuation byte
		 */
		c1 = us[1] ^ Tx;
		c2 = us[2] ^ Tx;
		if((c1|c2) & T2)
			goto bad;
		if(c0 >= T5) {
			if(n < 5)
				goto bad;
			c3 = us[3] ^ Tx;
			c4 = us[4] ^ Tx;
			if((c3|c4) & T2)
				goto bad;
			if(c0 >= T6) {
				/* 6 bytes */
				if(n < 6)
					goto bad;
				c5 = us[5] ^ Tx;
				if(c5 & T2)
					goto bad;
				wc = (unsigned long)(c0 & Mask6);
				wc = (wc << Bitx) | c1;
				wc = (wc << Bitx) | c2;
				wc = (wc << Bitx) | c3;
				wc = (wc << Bitx) | c4;
				wc = (wc << Bitx) | c5;
				if(wc <= Wchar5)
					goto bad;	/* fits in 5 bytes or fewer */
				*p = wc;
				return 6;
			}
			/* 5 bytes */
			wc = (unsigned long)(c0 & Mask5);
			wc = (wc << Bitx) | c1;
			wc = (wc << Bitx) | c2;
			wc = (wc << Bitx) | c3;
			wc = (wc << Bitx) | c4;
			if(wc <= Wchar4)
				goto bad;	/* fits in 4 bytes or fewer */
			*p = wc;
			return 5;
		}
		if(c0 >= T4) {
			/* 4 bytes */
			if(n < 4)
				goto bad;
			c3 = us[3] ^ Tx;
			if(c3 & T2)
				goto bad;
			wc = (unsigned long)(c0 & Mask4);
			wc = (wc << Bitx) | c1;
			wc = (wc << Bitx) | c2;
			wc = (wc << Bitx) | c3;
			if(wc <= Wchar3)
				goto bad;	/* fits in 3 bytes or fewer */
			*p = wc;
			return 4;
		}
		/* 3 bytes */
		wc = (unsigned long)(c0 & Mask3);
		wc = (wc << Bitx) | c1;
		wc = (wc << Bitx) | c2;
		if(wc <= Wchar2)
			goto bad;	/* fits in 2 bytes or fewer */
		*p = wc;
		return 3;
	}
	if(c0 >= T2) {
		/* 2 bytes */
		if(n < 2)
			goto bad;
		c1 = us[1] ^ Tx;
		if(c1 & T2)
			goto bad;
		wc = (unsigned long)(c0 & Mask2);
		wc = (wc << Bitx) | c1;
		if(wc <= Wchar1)
			goto bad;	/* fits in 1 byte */
		*p = wc;
		return 2;
	}
	/* 1 byte */
	if(c0 >= Tx)
		goto bad;		/* bare continuation byte */
	*p = c0;
	return 1;

bad:
	errno = EILSEQ;
	return -1;
}