blob: 12127b584b90b69376b19178fcfee326d0e0a5f1 [file] [log] [blame]
#include <u.h>
#include <libc.h>
#include <draw.h>
#include <ctype.h>
#include <html.h>
#include "impl.h"
/* Read cursor over the raw input bytes handed to the lexer. */
/* The struct only borrows data: newtokensource() stores the caller's pointer without copying. */
typedef struct TokenSource TokenSource;
struct TokenSource
{
int i; /* index of next byte to use */
uchar* data; /* all the data */
int edata; /* data[0:edata] is valid */
int chset; /* one of US_Ascii, ISO_8859_1, UTF_8 or Unicode (asserted by newtokensource) */
int mtype; /* TextHtml or TextPlain */
};
/* Negative sentinel character codes. */
/* NOTE(review): the code visible here never uses these names; getchar() returns a literal -1. */
enum {
EOF = -2,
EOB = -1
};
/* True if c can be part of a tag, attribute or entity name: */
/* a letter, a digit, '-' or '.', with a code below 256. */
/* c is expanded several times, so pass an expression without side effects. */
#define ISNAMCHAR(c) ((c)<256 && (isalpha(c) || isdigit(c) || (c) == '-' || (c) == '.'))
/* Element counts of the on-stack scratch buffers (small: entity names; big: everything else). */
#define SMALLBUFSIZE 240
#define BIGBUFSIZE 2000
/* HTML 4.0 tag names. */
/* Keep sorted, and in correspondence with enum in iparse.h. */
/* lexinit() converts _tagnames into the Rune table tagnames and */
/* builds tagtable from it, passing Numtags as the table size. */
Rune **tagnames;
char *_tagnames[] = {
" ",
"!",
"a",
"abbr",
"acronym",
"address",
"applet",
"area",
"b",
"base",
"basefont",
"bdo",
"big",
"blink",
"blockquote",
"body",
"bq",
"br",
"button",
"caption",
"center",
"cite",
"code",
"col",
"colgroup",
"dd",
"del",
"dfn",
"dir",
"div",
"dl",
"dt",
"em",
"fieldset",
"font",
"form",
"frame",
"frameset",
"h1",
"h2",
"h3",
"h4",
"h5",
"h6",
"head",
"hr",
"html",
"i",
"iframe",
"img",
"input",
"ins",
"isindex",
"kbd",
"label",
"legend",
"li",
"link",
"map",
"menu",
"meta",
"nobr",
"noframes",
"noscript",
"object",
"ol",
"optgroup",
"option",
"p",
"param",
"pre",
"q",
"s",
"samp",
"script",
"select",
"small",
"span",
"strike",
"strong",
"style",
"sub",
"sup",
"table",
"tbody",
"td",
"textarea",
"tfoot",
"th",
"thead",
"title",
"tr",
"tt",
"u",
"ul",
"var"
};
/* HTML 4.0 attribute names. */
/* Keep sorted, and in correspondence with enum in i.h. */
/* lexinit() converts _attrnames into the Rune table attrnames and */
/* builds attrtable from it, passing Numattrs as the table size. */
/* Tconv() indexes attrnames by an Attr's attid when printing a token. */
Rune **attrnames;
char* _attrnames[] = {
"abbr",
"accept-charset",
"access-key",
"action",
"align",
"alink",
"alt",
"archive",
"axis",
"background",
"bgcolor",
"border",
"cellpadding",
"cellspacing",
"char",
"charoff",
"charset",
"checked",
"cite",
"class",
"classid",
"clear",
"code",
"codebase",
"codetype",
"color",
"cols",
"colspan",
"compact",
"content",
"coords",
"data",
"datetime",
"declare",
"defer",
"dir",
"disabled",
"enctype",
"face",
"for",
"frame",
"frameborder",
"headers",
"height",
"href",
"hreflang",
"hspace",
"http-equiv",
"id",
"ismap",
"label",
"lang",
"link",
"longdesc",
"marginheight",
"marginwidth",
"maxlength",
"media",
"method",
"multiple",
"name",
"nohref",
"noresize",
"noshade",
"nowrap",
"object",
"onblur",
"onchange",
"onclick",
"ondblclick",
"onfocus",
"onkeypress",
"onkeyup",
"onload",
"onmousedown",
"onmousemove",
"onmouseout",
"onmouseover",
"onmouseup",
"onreset",
"onselect",
"onsubmit",
"onunload",
"profile",
"prompt",
"readonly",
"rel",
"rev",
"rows",
"rowspan",
"rules",
"scheme",
"scope",
"scrolling",
"selected",
"shape",
"size",
"span",
"src",
"standby",
"start",
"style",
"summary",
"tabindex",
"target",
"text",
"title",
"type",
"usemap",
"valign",
"value",
"valuetype",
"version",
"vlink",
"vspace",
"width"
};
/* Character entity to unicode character number map. */
/* Keep sorted by name (byte order: upper-case names come before lower-case ones). */
/* lexinit() converts _chartab into chartab, which ampersand() searches with _lookup(). */
StringInt *chartab;
AsciiInt _chartab[] = {
{"AElig", 198},
{"Aacute", 193},
{"Acirc", 194},
{"Agrave", 192},
{"Aring", 197},
{"Atilde", 195},
{"Auml", 196},
{"Ccedil", 199},
{"ETH", 208},
{"Eacute", 201},
{"Ecirc", 202},
{"Egrave", 200},
{"Euml", 203},
{"Iacute", 205},
{"Icirc", 206},
{"Igrave", 204},
{"Iuml", 207},
{"Ntilde", 209},
{"Oacute", 211},
{"Ocirc", 212},
{"Ograve", 210},
{"Oslash", 216},
{"Otilde", 213},
{"Ouml", 214},
{"THORN", 222},
{"Uacute", 218},
{"Ucirc", 219},
{"Ugrave", 217},
{"Uuml", 220},
{"Yacute", 221},
{"aacute", 225},
{"acirc", 226},
{"acute", 180},
{"aelig", 230},
{"agrave", 224},
{"alpha", 945},
{"amp", 38},
{"aring", 229},
{"atilde", 227},
{"auml", 228},
{"beta", 946},
{"brvbar", 166},
{"ccedil", 231},
{"cdots", 8943},
{"cedil", 184},
{"cent", 162},
{"chi", 967},
{"copy", 169},
{"curren", 164},
{"ddots", 8945},
{"deg", 176},
{"delta", 948},
{"divide", 247},
{"eacute", 233},
{"ecirc", 234},
{"egrave", 232},
{"emdash", 8212}, /* non-standard but commonly used */
{"emsp", 8195},
{"endash", 8211}, /* non-standard but commonly used */
{"ensp", 8194},
{"epsilon", 949},
{"eta", 951},
{"eth", 240},
{"euml", 235},
{"frac12", 189},
{"frac14", 188},
{"frac34", 190},
{"gamma", 947},
{"gt", 62},
{"iacute", 237},
{"icirc", 238},
{"iexcl", 161},
{"igrave", 236},
{"iota", 953},
{"iquest", 191},
{"iuml", 239},
{"kappa", 954},
{"lambda", 955},
{"laquo", 171},
{"ldquo", 8220},
{"ldots", 8230},
{"lsquo", 8216},
{"lt", 60},
{"macr", 175},
{"mdash", 8212},
{"micro", 181},
{"middot", 183},
{"mu", 956},
{"nbsp", 160},
{"ndash", 8211},
{"not", 172},
{"ntilde", 241},
{"nu", 957},
{"oacute", 243},
{"ocirc", 244},
{"ograve", 242},
{"omega", 969},
{"omicron", 959},
{"ordf", 170},
{"ordm", 186},
{"oslash", 248},
{"otilde", 245},
{"ouml", 246},
{"para", 182},
{"phi", 966},
{"pi", 960},
{"plusmn", 177},
{"pound", 163},
{"psi", 968},
{"quad", 8193},
{"quot", 34},
{"raquo", 187},
{"rdquo", 8221},
{"reg", 174},
{"rho", 961},
{"rsquo", 8217},
{"sect", 167},
{"shy", 173},
{"sigma", 963},
{"sp", 8194},
{"sup1", 185},
{"sup2", 178},
{"sup3", 179},
{"szlig", 223},
{"tau", 964},
{"theta", 952},
{"thinsp", 8201},
{"thorn", 254},
{"times", 215},
{"trade", 8482},
{"uacute", 250},
{"ucirc", 251},
{"ugrave", 249},
{"uml", 168},
{"upsilon", 965},
{"uuml", 252},
{"varepsilon", 8712},
{"varphi", 981},
{"varpi", 982},
{"varrho", 1009},
{"vdots", 8942},
{"vsigma", 962},
{"vtheta", 977},
{"xi", 958},
{"yacute", 253},
{"yen", 165},
{"yuml", 255},
{"zeta", 950}
};
/* Number of entries in _chartab; passed to _lookup() as the size of chartab. */
#define NCHARTAB (sizeof(_chartab)/sizeof(_chartab[0]))
/* Characters Winstart..Winend are those that Windows */
/* uses interpolated into the Latin1 set. */
/* They aren't supposed to appear in HTML, but they do.... */
/* winchars[c - Winstart] is the character substituted for code c, both for */
/* ISO_8859_1 input bytes (getchar) and for numeric character references (ampersand). */
/* The table has one entry for each code in the inclusive range Winstart..Winend. */
enum {
Winstart = 127,
Winend = 159
};
static int winchars[]= { 8226, /* 8226 is a bullet */
8226, 8226, 8218, 402, 8222, 8230, 8224, 8225,
710, 8240, 352, 8249, 338, 8226, 8226, 8226,
8226, 8216, 8217, 8220, 8221, 8226, 8211, 8212,
732, 8482, 353, 8250, 339, 8226, 8226, 376};
/* Name-to-index lookup tables, built once by lexinit(). */
static StringInt* tagtable; /* initialized from tagnames */
static StringInt* attrtable; /* initialized from attrnames */
/* Forward declarations of the file-local lexer routines. */
static void lexinit(void);
static int getplaindata(TokenSource* ts, Token* a, int* pai);
static int getdata(TokenSource* ts, int firstc, int starti, Token* a, int* pai);
static int getscriptdata(TokenSource* ts, int firstc, int starti, Token* a, int* pai);
static int gettag(TokenSource* ts, int starti, Token* a, int* pai);
static Rune* buftostr(Rune* s, Rune* buf, int j);
static int comment(TokenSource* ts);
static int findstr(TokenSource* ts, Rune* s);
static int ampersand(TokenSource* ts);
/*static int lowerc(int c); */
static int getchar(TokenSource* ts);
static void ungetchar(TokenSource* ts, int c);
static void backup(TokenSource* ts, int savei);
/*static void freeinsidetoken(Token* t); */
static void freeattrs(Attr* ahead);
static Attr* newattr(int attid, Rune* value, Attr* link);
static int Tconv(Fmt* f);
/* Debug level: nonzero traces lexer progress on fd 2; above 1 also prints each token. */
int dbglex = 0;
/* Set to 1 by lexinit(); _gettoks() checks it so the tables are built only once. */
static int lexinited = 0;
/* One-time setup, run by _gettoks() on first use: register the %T */
/* print verb, convert the static name tables to Rune form, build the */
/* tag and attribute lookup tables, and mark the lexer initialized. */
static void
lexinit(void)
{
	fmtinstall('T', Tconv);

	/* Rune versions of the char* tables */
	tagnames = _cvtstringtab(_tagnames, nelem(_tagnames));
	attrnames = _cvtstringtab(_attrnames, nelem(_attrnames));
	chartab = _cvtstringinttab(_chartab, nelem(_chartab));

	/* lookup tables derived from the Rune name tables */
	tagtable = _makestrinttab(tagnames, Numtags);
	attrtable = _makestrinttab(attrnames, Numattrs);

	lexinited = 1;
}
/* Allocate a TokenSource positioned at the start of data[0:edata]. */
/* chset must be one of the four supported character sets (asserted). */
/* The data pointer is stored, not copied. */
static TokenSource*
newtokensource(uchar* data, int edata, int chset, int mtype)
{
	TokenSource* src;

	assert(chset == US_Ascii || chset == ISO_8859_1 ||
		chset == UTF_8 || chset == Unicode);

	src = (TokenSource*)emalloc(sizeof(TokenSource));
	src->data = data;
	src->edata = edata;
	src->i = 0;
	src->mtype = mtype;
	src->chset = chset;
	return src;
}
enum {
	ToksChunk = 500		/* the token array grows by this many entries at a time */
};

/* Call this to get the tokens. */
/* Lexes data[0:datalen], interpreted in character set chset, as HTML */
/* (mtype == TextHtml) or as plain text (any other mtype). */
/* The number of returned tokens is returned in *plen. */
/* Returns the token array, or nil when no token was produced. */
Token*
_gettoks(uchar* data, int datalen, int chset, int mtype, int* plen)
{
	TokenSource* ts;
	Token* a;
	int alen;
	int ai;
	int starti;
	int c;
	int tag;

	if(!lexinited)
		lexinit();
	ts = newtokensource(data, datalen, chset, mtype);
	alen = ToksChunk;
	a = (Token*)emalloc(alen * sizeof(Token));
	ai = 0;
	if(dbglex)
		fprint(2, "_gettoks starts, ts.i=%d, ts.edata=%d\n", ts->i, ts->edata);
	if(ts->mtype == TextHtml){
		for(;;){
			/* one pass can store two tokens (a <script> tag and then */
			/* its data), so keep at least two free slots */
			if(ai+1 >= alen){
				a = (Token*)erealloc(a, (alen+ToksChunk)*sizeof(Token));
				alen += ToksChunk;
			}
			starti = ts->i;
			c = getchar(ts);
			if(c < 0)
				break;
			if(c == '<'){
				tag = gettag(ts, starti, a, &ai);
				if(tag == Tscript){
					/* special rules for getting Data after.... */
					starti = ts->i;
					c = getchar(ts);
					tag = getscriptdata(ts, c, starti, a, &ai);
				}
			}
			else
				tag = getdata(ts, c, starti, a, &ai);
			if(tag == -1)
				break;
			/* ai > 0: never format a slot in front of the array */
			else if(dbglex > 1 && tag != Comment && ai > 0)
				fprint(2, "lex: got token %T\n", &a[ai-1]);
		}
	}
	else {
		/* plain text (non-html) tokens */
		for(;;){
			if(ai == alen){
				a = (Token*)erealloc(a, (alen+ToksChunk)*sizeof(Token));
				alen += ToksChunk;
			}
			tag = getplaindata(ts, a, &ai);
			if(tag == -1)
				break;
			/* getplaindata already advanced ai; the new token is the previous slot */
			if(dbglex > 1)
				fprint(2, "lex: got token %T\n", &a[ai-1]);
		}
	}
	if(dbglex)
		fprint(2, "lex: returning %d tokens\n", ai);
	*plen = ai;
	/* the token source is private to this call; the tokens do not point into it */
	free(ts);
	if(ai == 0){
		free(a);
		return nil;
	}
	return a;
}
/* For case where source isn't HTML. */
/* Just make data tokens, one per line (or partial line, */
/* at end of buffer), ignoring non-whitespace control */
/* characters and dumping \r's. */
/* A \r followed by \n is dropped; a lone \r is treated as \n. */
/* If find non-empty token, fill in a[*pai], bump *pai, and return Data. */
/* Otherwise return -1; */
static int
getplaindata(TokenSource* ts, Token* a, int* pai)
{
	Rune* s;
	int j;
	int starti;
	int c;
	Token* tok;
	Rune buf[BIGBUFSIZE];

	s = nil;
	j = 0;
	starti = ts->i;
	for(c = getchar(ts); c >= 0; c = getchar(ts)){
		if(c < ' '){
			if(isspace(c)){
				if(c == '\r'){
					/* ignore it unless no following '\n', */
					/* in which case treat it like '\n' */
					c = getchar(ts);
					if(c != '\n'){
						if(c >= 0)
							ungetchar(ts, c);
						c = '\n';
					}
				}
			}
			else
				c = 0;
		}
		if(c != 0){
			buf[j++] = c;
			/* flush while one element is still free for buftostr's terminator; */
			/* the limit must be counted in Runes (nelem), not bytes (sizeof) */
			if(j == nelem(buf)-1){
				s = buftostr(s, buf, j);
				j = 0;
			}
		}
		if(c == '\n')
			break;
	}
	s = buftostr(s, buf, j);
	if(s == nil)
		return -1;
	tok = &a[(*pai)++];
	tok->tag = Data;
	tok->text = s;
	tok->attr = nil;
	tok->starti = starti;
	return Data;
}
/* Return concatenation of s and buf[0:j] */
/* buf must have room for j+1 Runes: a terminator is stored at buf[j]. */
/* When s is nil the result is whatever _Strndup(buf, j) returns. */
/* Otherwise s itself is grown and returned, so the caller must replace */
/* its pointer with the return value (every caller in this file does); */
/* growing in place means the old allocation is not left behind. */
static Rune*
buftostr(Rune* s, Rune* buf, int j)
{
	int n;

	buf[j] = 0;
	if(s == nil)
		return _Strndup(buf, j);
	if(j > 0){
		n = runestrlen(s);
		s = (Rune*)erealloc(s, (n+j+1)*sizeof(Rune));
		/* copy the j Runes plus the terminator just stored at buf[j] */
		memmove(s+n, buf, (j+1)*sizeof(Rune));
	}
	return s;
}
/* Collect character data starting with firstc (already read by the */
/* caller) up to the next '<' or the end of the buffer. */
/* Entity references (&amp; etc.) are translated via ampersand(). */
/* Non-whitespace control characters are dropped; \r\n and a lone \r */
/* both become a single \n. */
/* A terminating '<' is pushed back for the caller. */
/* If the result is non-empty, fill in a[*pai], bump *pai, and return Data. */
/* Otherwise return -1. */
static int
getdata(TokenSource* ts, int firstc, int starti, Token* a, int* pai)
{
	Rune chunk[BIGBUFSIZE];
	Rune* text;
	Token* t;
	int n;
	int ch;
	int nextch;

	text = nil;
	n = 0;
	for(ch = firstc; ch >= 0; ch = getchar(ts)){
		if(ch == '<'){
			/* start of a tag: leave it for the caller */
			ungetchar(ts, ch);
			break;
		}
		if(ch == '&'){
			ch = ampersand(ts);
			if(ch < 0)
				break;
		}
		else if(ch < ' '){
			if(!isspace(ch)){
				if(warn)
					fprint(2, "warning: non-whitespace control character %d ignored\n", ch);
				ch = 0;
			}
			else if(ch == '\r'){
				/* swallow a following '\n'; either way emit one '\n' */
				nextch = getchar(ts);
				if(nextch >= 0 && nextch != '\n')
					ungetchar(ts, nextch);
				ch = '\n';
			}
		}
		if(ch == 0)
			continue;
		chunk[n++] = ch;
		if(n == BIGBUFSIZE-1){
			/* chunk is full (one slot kept for the terminator): flush it */
			text = buftostr(text, chunk, n);
			n = 0;
		}
	}
	text = buftostr(text, chunk, n);
	if(text == nil)
		return -1;

	t = &a[*pai];
	*pai += 1;
	t->starti = starti;
	t->attr = nil;
	t->text = text;
	t->tag = Data;
	return Data;
}
/* The rules for lexing scripts are different (ugh). */
/* Gather up everything until see a </SCRIPT>. */
/* firstc is the first character after the <script> tag (already read), */
/* starti the index where it began. */
/* Each '<' inside the script is probed with gettag(); the token gettag */
/* builds is only used to recognize </script> and is then discarded. */
/* On success (found </script>, or ran to the end of the data) a Data */
/* token goes into a[*pai], *pai is bumped, and Data is returned with */
/* the source positioned at the '<' of </script>. */
/* Otherwise the source is backed up to starti and -1 is returned. */
static int
getscriptdata(TokenSource* ts, int firstc, int starti, Token* a, int* pai)
{
	Rune* s;
	int j;
	int tstarti;
	int savei;
	int saveai;
	int c;
	int tag;
	int done;
	Token* tok;
	Rune buf[BIGBUFSIZE];

	s = nil;
	j = 0;
	tstarti = starti;
	c = firstc;
	done = 0;
	while(c >= 0){
		if(c == '<'){
			/* other browsers ignore stuff to end of line after <! */
			savei = ts->i;
			c = getchar(ts);
			if(c == '!'){
				while(c >= 0 && c != '\n' && c != '\r')
					c = getchar(ts);
				if(c == '\r')
					c = getchar(ts);
				if(c == '\n')
					c = getchar(ts);
			}
			else if(c >= 0){
				backup(ts, savei);
				saveai = *pai;
				tag = gettag(ts, tstarti, a, pai);
				if(tag == -1)
					break;
				/* The probe token is never kept: restore the token count */
				/* to what it was (whether or not gettag advanced it) and */
				/* release anything gettag attached to the slot. */
				*pai = saveai;
				tok = &a[saveai];
				free(tok->text);
				tok->text = nil;
				freeattrs(tok->attr);
				tok->attr = nil;
				backup(ts, tstarti);
				if(tag == Tscript + RBRA){
					done = 1;
					break;
				}
				/* here tag was not </SCRIPT>, so take as regular data */
				c = getchar(ts);
			}
		}
		if(c < 0)
			break;
		if(c != 0){
			buf[j++] = c;
			if(j == BIGBUFSIZE-1){
				s = buftostr(s, buf, j);
				j = 0;
			}
		}
		tstarti = ts->i;
		c = getchar(ts);
	}
	if(done || ts->i == ts->edata){
		s = buftostr(s, buf, j);
		tok = &a[(*pai)++];
		tok->tag = Data;
		tok->text = s;
		tok->attr = nil;
		tok->starti = starti;
		return Data;
	}
	/* giving up: drop the text gathered so far */
	free(s);
	backup(ts, starti);
	return -1;
}
/* We've just seen a '<'. Gather up stuff to closing '>'. */
/* If it's a tag, look up the name, gather the attributes, and return */
/* the appropriate token (tag number, plus RBRA for an end tag; an */
/* unknown name leaves the tag as Notfound and keeps the name in text). */
/* Else it's either just plain data or some kind of ignorable stuff: */
/* return Data or Comment as appropriate. */
/* If the buffer ends before the tag is complete, the '<' itself becomes */
/* a one-character Data token and lexing resumes just after it. */
/* If it's not a Comment, put it in a[*pai] and bump *pai. */
static int
gettag(TokenSource* ts, int starti, Token* a, int* pai)
{
	int rbra;
	int ans;
	Attr* al;
	int nexti;
	int c;
	int ti;
	int afnd;
	int attid;
	int quote;
	Rune* val;
	int nv;
	int i;
	int tag;
	Token* tok;
	Rune buf[BIGBUFSIZE];

	rbra = 0;
	/* al and val are set here so eob_done can release them from any point */
	al = nil;
	val = nil;
	nexti = ts->i;
	tok = &a[*pai];
	tok->tag = Notfound;
	tok->text = nil;
	tok->attr = nil;
	tok->starti = starti;
	c = getchar(ts);
	if(c == '/'){
		rbra = RBRA;
		c = getchar(ts);
	}
	if(c < 0)
		goto eob_done;
	if(c >= 256 || !isalpha(c)){
		/* not a tag */
		if(c == '!'){
			ans = comment(ts);
			if(ans != -1)
				return ans;
			goto eob_done;
		}
		else {
			backup(ts, nexti);
			tok->tag = Data;
			tok->text = _Strdup(L(Llt));
			(*pai)++;
			return Data;
		}
	}
	/* c starts a tagname */
	buf[0] = c;
	i = 1;
	for(;;){
		c = getchar(ts);
		if(c < 0)
			goto eob_done;
		if(!ISNAMCHAR(c))
			break;
		/* if name is bigger than buf it won't be found anyway... */
		if(i < BIGBUFSIZE)
			buf[i++] = c;
	}
	if(_lookup(tagtable, Numtags, buf, i, &tag))
		tok->tag = tag + rbra;
	else
		tok->text = _Strndup(buf, i);	/* for warning print, in build */
	/* attribute gathering loop */
	for(;;){
		/* look for "ws name" or "ws name ws = ws val"  (ws=whitespace) */
		/* skip whitespace */
attrloop_continue:
		while(c < 256 && isspace(c)){
			c = getchar(ts);
			if(c < 0)
				goto eob_done;
		}
		if(c == '>')
			goto attrloop_done;
		if(c == '<'){
			if(warn)
				fprint(2, "warning: unclosed tag\n");
			ungetchar(ts, c);
			goto attrloop_done;
		}
		if(c >= 256 || !isalpha(c)){
			if(warn)
				fprint(2, "warning: expected attribute name\n");
			/* skip to next attribute name */
			for(;;){
				c = getchar(ts);
				if(c < 0)
					goto eob_done;
				if(c < 256 && isalpha(c))
					goto attrloop_continue;
				if(c == '<'){
					if(warn)
						fprint(2, "warning: unclosed tag\n");
					ungetchar(ts, '<');
					goto attrloop_done;
				}
				if(c == '>')
					goto attrloop_done;
			}
		}
		/* gather attribute name */
		buf[0] = c;
		i = 1;
		for(;;){
			c = getchar(ts);
			if(c < 0)
				goto eob_done;
			if(!ISNAMCHAR(c))
				break;
			if(i < BIGBUFSIZE-1)
				buf[i++] = c;
		}
		afnd = _lookup(attrtable, Numattrs, buf, i, &attid);
		if(warn && !afnd){
			buf[i] = 0;
			fprint(2, "warning: unknown attribute name %S\n", buf);
		}
		/* skip whitespace */
		while(c < 256 && isspace(c)){
			c = getchar(ts);
			if(c < 0)
				goto eob_done;
		}
		if(c != '='){
			/* attribute without a value */
			if(afnd)
				al = newattr(attid, nil, al);
			goto attrloop_continue;
		}
		/* c is '=' here;  skip whitespace */
		for(;;){
			c = getchar(ts);
			if(c < 0)
				goto eob_done;
			if(c >= 256 || !isspace(c))
				break;
		}
		quote = 0;
		if(c == '\'' || c == '"'){
			quote = c;
			c = getchar(ts);
			if(c < 0)
				goto eob_done;
		}
		val = nil;
		nv = 0;
		for(;;){
valloop_continue:
			if(c < 0)
				goto eob_done;
			if(c == '>'){
				if(quote){
					/* c might be part of string (though not good style) */
					/* but if line ends before close quote, assume */
					/* there was an unmatched quote */
					ti = ts->i;
					for(;;){
						c = getchar(ts);
						if(c < 0)
							goto eob_done;
						if(c == quote){
							/* a close quote follows on this line: keep the '>' */
							backup(ts, ti);
							buf[nv++] = '>';
							if(nv == BIGBUFSIZE-1){
								val = buftostr(val, buf, nv);
								nv = 0;
							}
							c = getchar(ts);
							goto valloop_continue;
						}
						if(c == '\n'){
							if(warn)
								fprint(2, "warning: apparent unmatched quote\n");
							backup(ts, ti);
							c = '>';
							goto valloop_done;
						}
					}
				}
				else
					goto valloop_done;
			}
			if(quote){
				if(c == quote){
					c = getchar(ts);
					if(c < 0)
						goto eob_done;
					goto valloop_done;
				}
				if(c == '\r'){
					c = getchar(ts);
					goto valloop_continue;
				}
				if(c == '\t' || c == '\n')
					c = ' ';
			}
			else {
				if(c < 256 && isspace(c))
					goto valloop_done;
			}
			if(c == '&'){
				c = ampersand(ts);
				if(c == -1)
					goto eob_done;
			}
			buf[nv++] = c;
			if(nv == BIGBUFSIZE-1){
				val = buftostr(val, buf, nv);
				nv = 0;
			}
			c = getchar(ts);
		}
valloop_done:
		if(afnd){
			val = buftostr(val, buf, nv);
			al = newattr(attid, val, al);
		}
		else
			free(val);	/* value of an unknown attribute is not kept */
		/* the string now belongs to the Attr (or is gone) */
		val = nil;
	}
attrloop_done:
	tok->attr = al;
	(*pai)++;
	return tok->tag;
eob_done:
	if(warn)
		fprint(2, "warning: incomplete tag at end of page\n");
	/* discard everything gathered for the unfinished tag */
	free(val);
	freeattrs(al);
	free(tok->text);
	backup(ts, nexti);
	tok->tag = Data;
	tok->text = _Strdup(L(Llt));
	/* the '<' is a real Data token: count it, as the non-tag case above does */
	(*pai)++;
	return Data;
}
/* We've just read a '<!', so this may be a comment or other ignored */
/* section, or it may be just a literal string if there is no close */
/* before end of file (other browsers do that). */
/* The accepted practice seems to be (note: contrary to SGML spec!): */
/* If see <!--, look for --> to close, or if none, > to close. */
/* If see <!(not --), look for > to close. */
/* */
/* If we see ignorable stuff, return Comment, with the source */
/* positioned just after the closing string. */
/* Else return -1 (no close was found before the end of the data). */
static int
comment(TokenSource* ts)
{
	int mark;
	int c;

	mark = ts->i;
	c = getchar(ts);
	if(c == '-'){
		c = getchar(ts);
		if(c == '-'){
			if(findstr(ts, L(Larrow)))
				return Comment;
			/* no "-->": rescan from just after the "<!" for a plain '>' */
			backup(ts, mark);
		}
	}
	if(c == '>')
		return Comment;
	if(c >= 0 && findstr(ts, L(Lgt)))
		return Comment;
	return -1;
}
/* Scan forward in the token source for the string s. */
/* Return 1 if it is found, leaving the source just past the match. */
/* Return 0 if the data runs out first (caller should back up). */
static int
findstr(TokenSource* ts, Rune* s)
{
	int first;
	int len;
	int resume;
	int matched;
	int c;

	first = s[0];
	len = runestrlen(s);
	while((c = getchar(ts)) >= 0){
		if(c != first)
			continue;
		if(len == 1)
			return 1;
		/* first character matched: try the rest, remembering where to resume */
		resume = ts->i;
		matched = 1;
		while(matched < len){
			c = getchar(ts);
			if(c < 0)
				return 0;
			if(c != s[matched])
				break;
			matched++;
		}
		if(matched == len)
			return 1;
		backup(ts, resume);
	}
	return 0;
}
/* Value of hexadecimal digit c (0..15), or -1 if c is not one. */
static int
xdigit(int c)
{
	int v;

	v = -1;
	if(c >= '0' && c <= '9')
		v = c - '0';
	else if(c >= 'a' && c <= 'f')
		v = 10 + (c - 'a');
	else if(c >= 'A' && c <= 'F')
		v = 10 + (c - 'A');
	return v;
}
/* We've just read an '&'; look for an entity reference */
/* name, and if found, return translated char. */
/* Numeric references may be decimal (&#65;) or hexadecimal with */
/* either case of x (&#x41; &#X41;); values above Maxcode yield Runeerror */
/* and values in Winstart..Winend are mapped through winchars. */
/* A ';', '\n' or '\r' ending the reference is consumed; any other */
/* terminating character is pushed back. */
/* if there is a complete entity name but it isn't known, */
/* try prefixes (gets around some buggy HTML out there). */
/* If nothing is recognized, or the data ends inside the reference, */
/* back up to just past the '&' and return '&'. */
static int
ampersand(TokenSource* ts)
{
	enum { Maxcode = 0x10FFFF };	/* largest Unicode code point */
	int savei;
	int c;
	int fnd;
	int ans;
	int v;
	int i;
	int k;
	Rune buf[SMALLBUFSIZE];

	savei = ts->i;
	c = getchar(ts);
	fnd = 0;
	ans = -1;
	if(c == '#'){
		c = getchar(ts);
		v = 0;
		/* Once v passes Maxcode, stop accumulating (but keep consuming */
		/* digits) so that a long digit run cannot overflow the int. */
		if(c == 'x' || c == 'X'){
			c = getchar(ts);
			while((i=xdigit(c)) != -1){
				if(v <= Maxcode)
					v = v*16 + i;
				c = getchar(ts);
			}
		}else{
			while('0' <= c && c <= '9'){
				if(v <= Maxcode)
					v = v*10 + c - '0';
				c = getchar(ts);
			}
		}
		if(c >= 0){
			if(!(c == ';' || c == '\n' || c == '\r'))
				ungetchar(ts, c);
			c = v;
			if(c > Maxcode)
				c = Runeerror;
			if(c >= Winstart && c <= Winend){
				c = winchars[c - Winstart];
			}
			ans = c;
			fnd = 1;
		}
	}
	else if(c < 256 && isalpha(c)){
		buf[0] = c;
		k = 1;
		for(;;){
			c = getchar(ts);
			if(c < 0)
				break;
			if(ISNAMCHAR(c)){
				if(k < SMALLBUFSIZE-1)
					buf[k++] = c;
			}
			else {
				if(!(c == ';' || c == '\n' || c == '\r'))
					ungetchar(ts, c);
				break;
			}
		}
		if(c >= 0){
			fnd = _lookup(chartab, NCHARTAB, buf, k, &ans);
			if(!fnd){
				/* Try prefixes of s */
				/* the terminator was consumed above; put it back first */
				if(c == ';' || c == '\n' || c == '\r')
					ungetchar(ts, c);
				i = k;
				while(--k > 0){
					fnd = _lookup(chartab, NCHARTAB, buf, k, &ans);
					if(fnd){
						/* push back the name characters beyond the prefix */
						while(i > k){
							i--;
							ungetchar(ts, buf[i]);
						}
						break;
					}
				}
			}
		}
	}
	if(!fnd){
		backup(ts, savei);
		ans = '&';
	}
	return ans;
}
/* Get next char, obeying ts.chset. */
/* Returns -1 if no complete character left before current end of data; */
/* for UTF_8 and Unicode a truncated final character also marks all the */
/* remaining data as used. */
/* ISO_8859_1 bytes in Winstart..Winend are mapped through winchars. */
/* Unicode input is read as two bytes per character, most significant first. */
static int
getchar(TokenSource* ts)
{
	uchar* buf;
	int c;
	int n;
	Rune r;

	if(ts->i >= ts->edata)
		return -1;
	buf = ts->data;
	c = buf[ts->i];
	switch(ts->chset){
	case ISO_8859_1:
		if(c >= Winstart && c <= Winend)
			c = winchars[c - Winstart];
		ts->i++;
		break;
	case US_Ascii:
		if(c > 127){
			if(warn)
				fprint(2, "non-ascii char (%x) when US-ASCII specified\n", c);
		}
		ts->i++;
		break;
	case UTF_8:
		/* decode only after fullrune says the whole sequence is inside */
		/* the data, so chartorune never reads past edata */
		if(fullrune((char*)(buf+ts->i), ts->edata-ts->i)){
			n = chartorune(&r, (char*)(buf+ts->i));
			if(warn && c == 0x80)
				fprint(2, "warning: invalid utf-8 sequence (starts with %x)\n", ts->data[ts->i]);
			ts->i += n;
			c = r;
		}
		else {
			/* not enough bytes in buf to complete utf-8 char */
			ts->i = ts->edata;	/* mark "all used" */
			c = -1;
		}
		break;
	case Unicode:
		if(ts->i < ts->edata - 1){
			/*standards say most-significant byte first */
			c = (c << 8)|(buf[ts->i + 1]);
			ts->i += 2;
		}
		else {
			ts->i = ts->edata;	/* mark "all used" */
			c = -1;
		}
		break;
	}
	return c;
}
/* Undo the last getchar: c must be the character it returned. */
/* The read index moves back by the number of bytes this code charges */
/* for c in the source's character set: 2 for Unicode, the UTF-8 */
/* encoded length of c for UTF_8 when c >= 128, and 1 otherwise. */
static void
ungetchar(TokenSource* ts, int c)
{
	char enc[UTFmax];
	Rune r;
	int width;

	if(ts->chset == Unicode)
		width = 2;
	else if(ts->chset == UTF_8 && c >= 128){
		/* encode c only to learn how many bytes it takes */
		r = c;
		width = runetochar(enc, &r);
	}
	else
		width = 1;
	ts->i -= width;
}
/* Move the read index of ts back (or forward) to savei, */
/* tracing the move on fd 2 when dbglex is set. */
static void
backup(TokenSource* ts, int savei)
{
	int from;

	from = ts->i;
	ts->i = savei;
	if(dbglex)
		fprint(2, "lex: backup; i=%d, savei=%d\n", from, savei);
}
/* Search token t for an attribute with id attid. */
/* Found: return 1 and, if pans is non-nil, store the value in *pans. */
/* Not found: return 0 and, if pans is non-nil, store nil in *pans. */
/* If xfer is true the string's ownership moves to the caller (the */
/* Attr's value is nil'd out); otherwise the caller must duplicate */
/* the answer if it needs to keep it. */
/* pans may be nil when only the presence of the attribute matters. */
int
_tokaval(Token* t, int attid, Rune** pans, int xfer)
{
	Attr* p;

	for(p = t->attr; p != nil; p = p->next){
		if(p->attid != attid)
			continue;
		if(pans != nil)
			*pans = p->value;
		if(xfer)
			p->value = nil;
		return 1;
	}
	if(pans != nil)
		*pans = nil;
	return 0;
}
/* Formatter for the %T verb (installed by lexinit): prints a Token*. */
/* A nil token prints as <null>; a Data token as its quoted text; any */
/* other token as <name attr=value ...>, with a leading '/' for end */
/* tags and '?'-style name for Notfound. When dbglex > 1 the token's */
/* start index is printed first in brackets. */
static int
Tconv(Fmt *f)
{
	char out[BIGBUFSIZE];
	Token* tok;
	Attr* at;
	Rune* name;
	char* slash;
	int n;
	int tag;

	tok = va_arg(f->args, Token*);
	if(tok == nil){
		sprint(out, "<null>");
		return fmtstrcpy(f, out);
	}

	n = 0;
	if(dbglex > 1)
		n = snprint(out, sizeof(out), "[%d]", tok->starti);
	tag = tok->tag;
	if(tag == Data)
		n += snprint(out+n, sizeof(out)-n-1, "'%S'", tok->text);
	else {
		/* end tags are stored as tag+RBRA */
		slash = "";
		if(tag >= RBRA){
			slash = "/";
			tag -= RBRA;
		}
		name = tagnames[tag];
		if(tag == Notfound)
			name = L(Lquestion);
		n += snprint(out+n, sizeof(out)-n-1, "<%s%S", slash, name);
		at = tok->attr;
		while(at != nil){
			n += snprint(out+n, sizeof(out)-n-1, " %S", attrnames[at->attid]);
			if(at->value != nil)
				n += snprint(out+n, sizeof(out)-n-1, "=%S", at->value);
			at = at->next;
		}
		n += snprint(out+n, sizeof(out)-n-1, ">");
	}
	out[n] = 0;
	return fmtstrcpy(f, out);
}
/* Allocate an Attr for attribute attid with the given value and put */
/* it in front of the list link; returns the new list head. */
/* Attrs own their constituent strings, but build may eventually */
/* transfer some values to its items and nil them out in the Attr. */
static Attr*
newattr(int attid, Rune* value, Attr* link)
{
	Attr* node;

	node = (Attr*)emalloc(sizeof(Attr));
	node->next = link;
	node->value = value;
	node->attid = attid;
	return node;
}
/* Release every Attr on the list starting at ahead (linked through */
/* the next field), together with each Attr's value string. */
static void
freeattrs(Attr* ahead)
{
	Attr* dead;

	while(ahead != nil){
		dead = ahead;
		/* step past the node before it is released */
		ahead = ahead->next;
		free(dead->value);
		free(dead);
	}
}
/* Free an array of Tokens, including the text and attribute list of */
/* each of the first n entries. */
/* The allocation may have room for more than n tokens, but only n */
/* of them are initialized. */
/* If the caller has transferred ownership of constituent strings */
/* or attributes, it must have nil'd out the pointers in the Tokens. */
/* A nil tarray is ignored. */
void
_freetokens(Token* tarray, int n)
{
	int k;

	if(tarray != nil){
		for(k = 0; k < n; k++){
			free(tarray[k].text);
			freeattrs(tarray[k].attr);
		}
		free(tarray);
	}
}