/* src/libhtml/lex.c - plan9 - Git at Google */

 #include <u.h>
 #include <libc.h>
 #include <draw.h>
 #include <ctype.h>
 #include <html.h>
 #include "impl.h"

 /* Read cursor over the raw input bytes handed to _gettoks. */
 /* getchar decodes from data[i:edata] according to chset and advances i. */
 typedef struct TokenSource TokenSource;
 struct TokenSource
 {
 	int			i;		/* index of next byte to use */
 	uchar*		data;		/* all the data */
 	int			edata;	/* data[0:edata] is valid */
 	int			chset;	/* one of US_Ascii, etc. */
 	int			mtype;	/* TextHtml or TextPlain */
 };

 /* Negative sentinel values. */
 /* NOTE(review): neither name is referenced in the visible part of this file; */
 /* getchar reports end of data with a literal -1. */
 enum {
 	EOF = -2,
 	EOB = -1
 };

 /* True if c may appear inside a tag, attribute or entity name: */
 /* a character below 256 that is a letter, a digit, '-' or '.'. */
 /* The argument is evaluated several times, so pass a plain variable. */
 #define ISNAMCHAR(c)	((c)<256 && (isalpha(c) || isdigit(c) || (c) == '-' || (c) == '.'))

 /* Sizes (in elements) of the on-stack scratch buffers: */
 /* SMALLBUFSIZE for entity names, BIGBUFSIZE for text, tag names and attribute values. */
 #define SMALLBUFSIZE 240
 #define BIGBUFSIZE 2000

 /* HTML 4.0 tag names. */
 /* Keep sorted, and in correspondence with enum in iparse.h. */
 /* lexinit converts _tagnames into the Rune version, tagnames, and builds */
 /* tagtable from it; gettag looks names up in tagtable and Tconv prints */
 /* tagnames[tag]. */
 Rune **tagnames;
 char *_tagnames[] = {
 	" ",
 	"!",
 	"a",
 	"abbr",
 	"acronym",
 	"address",
 	"applet",
 	"area",
 	"b",
 	"base",
 	"basefont",
 	"bdo",
 	"big",
 	"blink",
 	"blockquote",
 	"body",
 	"bq",
 	"br",
 	"button",
 	"caption",
 	"center",
 	"cite",
 	"code",
 	"col",
 	"colgroup",
 	"dd",
 	"del",
 	"dfn",
 	"dir",
 	"div",
 	"dl",
 	"dt",
 	"em",
 	"fieldset",
 	"font",
 	"form",
 	"frame",
 	"frameset",
 	"h1",
 	"h2",
 	"h3",
 	"h4",
 	"h5",
 	"h6",
 	"head",
 	"hr",
 	"html",
 	"i",
 	"iframe",
 	"img",
 	"input",
 	"ins",
 	"isindex",
 	"kbd",
 	"label",
 	"legend",
 	"li",
 	"link",
 	"map",
 	"menu",
 	"meta",
 	"nobr",
 	"noframes",
 	"noscript",
 	"object",
 	"ol",
 	"optgroup",
 	"option",
 	"p",
 	"param",
 	"pre",
 	"q",
 	"s",
 	"samp",
 	"script",
 	"select",
 	"small",
 	"span",
 	"strike",
 	"strong",
 	"style",
 	"sub",
 	"sup",
 	"table",
 	"tbody",
 	"td",
 	"textarea",
 	"tfoot",
 	"th",
 	"thead",
 	"title",
 	"tr",
 	"tt",
 	"u",
 	"ul",
 	"var"
 };

 /* HTML 4.0 attribute names. */
 /* Keep sorted, and in correspondence with enum in i.h. */
 /* lexinit converts _attrnames into the Rune version, attrnames, and builds */
 /* attrtable from it; gettag looks names up in attrtable and Tconv prints */
 /* attrnames[attid]. */
 Rune **attrnames;
 char* _attrnames[] = {
 	"abbr",
 	"accept-charset",
 	"access-key",
 	"action",
 	"align",
 	"alink",
 	"alt",
 	"archive",
 	"axis",
 	"background",
 	"bgcolor",
 	"border",
 	"cellpadding",
 	"cellspacing",
 	"char",
 	"charoff",
 	"charset",
 	"checked",
 	"cite",
 	"class",
 	"classid",
 	"clear",
 	"code",
 	"codebase",
 	"codetype",
 	"color",
 	"cols",
 	"colspan",
 	"compact",
 	"content",
 	"coords",
 	"data",
 	"datetime",
 	"declare",
 	"defer",
 	"dir",
 	"disabled",
 	"enctype",
 	"face",
 	"for",
 	"frame",
 	"frameborder",
 	"headers",
 	"height",
 	"href",
 	"hreflang",
 	"hspace",
 	"http-equiv",
 	"id",
 	"ismap",
 	"label",
 	"lang",
 	"link",
 	"longdesc",
 	"marginheight",
 	"marginwidth",
 	"maxlength",
 	"media",
 	"method",
 	"multiple",
 	"name",
 	"nohref",
 	"noresize",
 	"noshade",
 	"nowrap",
 	"object",
 	"onblur",
 	"onchange",
 	"onclick",
 	"ondblclick",
 	"onfocus",
 	"onkeypress",
 	"onkeyup",
 	"onload",
 	"onmousedown",
 	"onmousemove",
 	"onmouseout",
 	"onmouseover",
 	"onmouseup",
 	"onreset",
 	"onselect",
 	"onsubmit",
 	"onunload",
 	"profile",
 	"prompt",
 	"readonly",
 	"rel",
 	"rev",
 	"rows",
 	"rowspan",
 	"rules",
 	"scheme",
 	"scope",
 	"scrolling",
 	"selected",
 	"shape",
 	"size",
 	"span",
 	"src",
 	"standby",
 	"start",
 	"style",
 	"summary",
 	"tabindex",
 	"target",
 	"text",
 	"title",
 	"type",
 	"usemap",
 	"valign",
 	"value",
 	"valuetype",
 	"version",
 	"vlink",
 	"vspace",
 	"width"
 };


 /* Character entity to unicode character number map. */
 /* Keep sorted by name. */
 /* (The order used below is byte order, so upper-case names come first.) */
 /* lexinit converts _chartab into chartab; ampersand looks entity names */
 /* (and prefixes of them) up in chartab with _lookup. */
 StringInt *chartab;
 AsciiInt _chartab[] = {
 	{"AElig", 198},
 	{"Aacute", 193},
 	{"Acirc", 194},
 	{"Agrave", 192},
 	{"Aring", 197},
 	{"Atilde", 195},
 	{"Auml", 196},
 	{"Ccedil", 199},
 	{"ETH", 208},
 	{"Eacute", 201},
 	{"Ecirc", 202},
 	{"Egrave", 200},
 	{"Euml", 203},
 	{"Iacute", 205},
 	{"Icirc", 206},
 	{"Igrave", 204},
 	{"Iuml", 207},
 	{"Ntilde", 209},
 	{"Oacute", 211},
 	{"Ocirc", 212},
 	{"Ograve", 210},
 	{"Oslash", 216},
 	{"Otilde", 213},
 	{"Ouml", 214},
 	{"THORN", 222},
 	{"Uacute", 218},
 	{"Ucirc", 219},
 	{"Ugrave", 217},
 	{"Uuml", 220},
 	{"Yacute", 221},
 	{"aacute", 225},
 	{"acirc", 226},
 	{"acute", 180},
 	{"aelig", 230},
 	{"agrave", 224},
 	{"alpha", 945},
 	{"amp", 38},
 	{"aring", 229},
 	{"atilde", 227},
 	{"auml", 228},
 	{"beta", 946},
 	{"brvbar", 166},
 	{"ccedil", 231},
 	{"cdots", 8943},
 	{"cedil", 184},
 	{"cent", 162},
 	{"chi", 967},
 	{"copy", 169},
 	{"curren", 164},
 	{"ddots", 8945},
 	{"deg", 176},
 	{"delta", 948},
 	{"divide", 247},
 	{"eacute", 233},
 	{"ecirc", 234},
 	{"egrave", 232},
 	{"emdash", 8212},	/* non-standard but commonly used */
 	{"emsp", 8195},
 	{"endash", 8211},	/* non-standard but commonly used */
 	{"ensp", 8194},
 	{"epsilon", 949},
 	{"eta", 951},
 	{"eth", 240},
 	{"euml", 235},
 	{"frac12", 189},
 	{"frac14", 188},
 	{"frac34", 190},
 	{"gamma", 947},
 	{"gt", 62},
 	{"iacute", 237},
 	{"icirc", 238},
 	{"iexcl", 161},
 	{"igrave", 236},
 	{"iota", 953},
 	{"iquest", 191},
 	{"iuml", 239},
 	{"kappa", 954},
 	{"lambda", 955},
 	{"laquo", 171},
 	{"ldquo", 8220},
 	{"ldots", 8230},
 	{"lsquo", 8216},
 	{"lt", 60},
 	{"macr", 175},
 	{"mdash", 8212},
 	{"micro", 181},
 	{"middot", 183},
 	{"mu", 956},
 	{"nbsp", 160},
 	{"ndash", 8211},
 	{"not", 172},
 	{"ntilde", 241},
 	{"nu", 957},
 	{"oacute", 243},
 	{"ocirc", 244},
 	{"ograve", 242},
 	{"omega", 969},
 	{"omicron", 959},
 	{"ordf", 170},
 	{"ordm", 186},
 	{"oslash", 248},
 	{"otilde", 245},
 	{"ouml", 246},
 	{"para", 182},
 	{"phi", 966},
 	{"pi", 960},
 	{"plusmn", 177},
 	{"pound", 163},
 	{"psi", 968},
 	{"quad", 8193},
 	{"quot", 34},
 	{"raquo", 187},
 	{"rdquo", 8221},
 	{"reg", 174},
 	{"rho", 961},
 	{"rsquo", 8217},
 	{"sect", 167},
 	{"shy", 173},
 	{"sigma", 963},
 	{"sp", 8194},
 	{"sup1", 185},
 	{"sup2", 178},
 	{"sup3", 179},
 	{"szlig", 223},
 	{"tau", 964},
 	{"theta", 952},
 	{"thinsp", 8201},
 	{"thorn", 254},
 	{"times", 215},
 	{"trade", 8482},
 	{"uacute", 250},
 	{"ucirc", 251},
 	{"ugrave", 249},
 	{"uml", 168},
 	{"upsilon", 965},
 	{"uuml", 252},
 	{"varepsilon", 8712},
 	{"varphi", 981},
 	{"varpi", 982},
 	{"varrho", 1009},
 	{"vdots", 8942},
 	{"vsigma", 962},
 	{"vtheta", 977},
 	{"xi", 958},
 	{"yacute", 253},
 	{"yen", 165},
 	{"yuml", 255},
 	{"zeta", 950}
 };
 /* Number of entries in _chartab (and so in chartab). */
 #define NCHARTAB (sizeof(_chartab)/sizeof(_chartab[0]))

 /* Characters Winstart..Winend are those that Windows */
 /* uses interpolated into the Latin1 set. */
 /* They aren't supposed to appear in HTML, but they do.... */
 enum {
 	Winstart = 127,
 	Winend = 159
 };

 /* Replacement character numbers, indexed by c - Winstart */
 /* (one entry for each value in Winstart..Winend). */
 /* Used by getchar for ISO_8859_1 input and by ampersand for numeric references. */
 static int	winchars[]= { 8226,	/* 8226 is a bullet */
 	8226, 8226, 8218, 402, 8222, 8230, 8224, 8225,
 	710, 8240, 352, 8249, 338, 8226, 8226, 8226,
 	8226, 8216, 8217, 8220, 8221, 8226, 8211, 8212,
 	732, 8482, 353, 8250, 339, 8226, 8226, 376};

 /* Name -> index lookup tables, built once by lexinit. */
 static StringInt*	tagtable;		/* initialized from tagnames */
 static StringInt*	attrtable;		/* initialized from attrnames */

 /* Forward declarations of the file-local helpers defined below. */
 static void		lexinit(void);
 static int		getplaindata(TokenSource* ts, Token* a, int* pai);
 static int		getdata(TokenSource* ts, int firstc, int starti, Token* a, int* pai);
 static int		getscriptdata(TokenSource* ts, int firstc, int starti, Token* a, int* pai);
 static int		gettag(TokenSource* ts, int starti, Token* a, int* pai);
 static Rune*		buftostr(Rune* s, Rune* buf, int j);
 static int		comment(TokenSource* ts);
 static int		findstr(TokenSource* ts, Rune* s);
 static int		ampersand(TokenSource* ts);
 /*static int		lowerc(int c); */
 static int		getchar(TokenSource* ts);
 static void		ungetchar(TokenSource* ts, int c);
 static void		backup(TokenSource* ts, int savei);
 /*static void		freeinsidetoken(Token* t); */
 static void		freeattrs(Attr* ahead);
 static Attr*		newattr(int attid, Rune* value, Attr* link);
 static int		Tconv(Fmt* f);

 /* dbglex: nonzero prints lexer progress on fd 2; values above 1 also print each token. */
 int	dbglex = 0;
 /* Set by lexinit so that _gettoks runs it only once. */
 static int lexinited = 0;

 /* One-time setup, run by _gettoks on first use: build the Rune versions of */
 /* the name tables and their lookup tables, and install the %T format verb. */
 static void
 lexinit(void)
 {
 	/* tags */
 	tagnames = _cvtstringtab(_tagnames, nelem(_tagnames));
 	tagtable = _makestrinttab(tagnames, Numtags);

 	/* attributes */
 	attrnames = _cvtstringtab(_attrnames, nelem(_attrnames));
 	attrtable = _makestrinttab(attrnames, Numattrs);

 	/* character entities */
 	chartab = _cvtstringinttab(_chartab, nelem(_chartab));

 	/* %T prints a Token (see Tconv) */
 	fmtinstall('T', Tconv);

 	lexinited = 1;
 }

 /* Allocate a TokenSource positioned at the start of data[0:edata]. */
 /* chset must be one of the four character sets getchar understands. */
 static TokenSource*
 newtokensource(uchar* data, int edata, int chset, int mtype)
 {
 	TokenSource*	src;

 	assert(chset == US_Ascii || chset == ISO_8859_1 ||
 			chset == UTF_8 || chset == Unicode);
 	src = (TokenSource*)emalloc(sizeof(TokenSource));
 	src->data = data;
 	src->edata = edata;
 	src->i = 0;
 	src->mtype = mtype;
 	src->chset = chset;
 	return src;
 }

 /* The token array in _gettoks starts with, and grows by, this many Tokens. */
 enum {
 	ToksChunk = 500
 };

 /* Call this to get the tokens. */
 /*  The number of returned tokens is returned in *plen. */
 /* data[0:datalen] is lexed as HTML when mtype is TextHtml, otherwise as */
 /* plain text, decoding characters according to chset. */
 /* Returns an emalloc'd array of *plen Tokens (release with _freetokens), */
 /* or nil when no tokens were produced. */
 Token*
 _gettoks(uchar* data, int datalen, int chset, int mtype, int* plen)
 {
 	TokenSource*	ts;
 	Token*		a;
 	int	alen;
 	int	ai;
 	int	starti;
 	int	c;
 	int	tag;

 	if(!lexinited)
 		lexinit();
 	ts = newtokensource(data, datalen, chset, mtype);
 	alen = ToksChunk;
 	a = (Token*)emalloc(alen * sizeof(Token));
 	ai = 0;
 	if(dbglex)
 		fprint(2, "_gettoks starts, ts.i=%d, ts.edata=%d\n", ts->i, ts->edata);
 	if(ts->mtype == TextHtml){
 		for(;;){
 			/* one pass may fill two slots: a <script> tag, then its Data */
 			if(ai+2 > alen){
 				a = (Token*)erealloc(a, (alen+ToksChunk)*sizeof(Token));
 				alen += ToksChunk;
 			}
 			starti = ts->i;
 			c = getchar(ts);
 			if(c < 0)
 				break;
 			if(c == '<'){
 				tag = gettag(ts, starti, a, &ai);
 				if(tag == Tscript){
 					/* special rules for getting Data after.... */
 					starti = ts->i;
 					c = getchar(ts);
 					tag = getscriptdata(ts, c, starti, a, &ai);
 				}
 			}
 			else
 				tag = getdata(ts, c, starti, a, &ai);
 			if(tag == -1){
 				/* -1 only means "no token this time" (e.g. a run of */
 				/* ignored control characters); stop only when the */
 				/* input is really exhausted. */
 				if(ts->i >= ts->edata)
 					break;
 				continue;
 			}
 			/* ai > 0: never look at a[-1] if nothing has been stored yet */
 			if(dbglex > 1 && tag != Comment && ai > 0)
 				fprint(2, "lex: got token %T\n", &a[ai-1]);
 		}
 	}
 	else {
 		/* plain text (non-html) tokens */
 		for(;;){
 			if(ai == alen){
 				a = (Token*)erealloc(a, (alen+ToksChunk)*sizeof(Token));
 				alen += ToksChunk;
 			}
 			tag = getplaindata(ts, a, &ai);
 			if(tag == -1)
 				break;
 			/* getplaindata has already bumped ai, so the new token is a[ai-1] */
 			if(dbglex > 1)
 				fprint(2, "lex: got token %T\n", &a[ai-1]);
 		}
 	}
 	if(dbglex)
 		fprint(2, "lex: returning %d tokens\n", ai);
 	*plen = ai;
 	/* the token source is private to this call */
 	free(ts);
 	if(ai == 0){
 		/* nothing to hand back, so don't leak the empty array */
 		free(a);
 		return nil;
 	}
 	return a;
 }

 /* For case where source isn't HTML. */
 /* Just make data tokens, one per line (or partial line, */
 /* at end of buffer), ignoring non-whitespace control */
 /* characters and dumping \r's. */
 /* If find non-empty token, fill in a[*pai], bump *pai, and return Data. */
 /* Otherwise return -1; */
 static int
 getplaindata(TokenSource* ts, Token* a, int* pai)
 {
 	Rune*	s;
 	int	j;
 	int	starti;
 	int	c;
 	Token*	tok;
 	Rune	buf[BIGBUFSIZE];

 	s = nil;
 	j = 0;
 	starti = ts->i;
 	for(c = getchar(ts); c >= 0; c = getchar(ts)){
 		if(c < ' '){
 			if(isspace(c)){
 				if(c == '\r'){
 					/* ignore it unless no following '\n', */
 					/* in which case treat it like '\n' */
 					c = getchar(ts);
 					if(c != '\n'){
 						if(c >= 0)
 							ungetchar(ts, c);
 						c = '\n';
 					}
 				}
 			}
 			else
 				c = 0;
 		}
 		if(c != 0){
 			buf[j++] = c;
 			/* flush one element early: buftostr stores a terminator at buf[j]. */
 			/* The limit is the element count, not sizeof(buf), which is in bytes. */
 			if(j == BIGBUFSIZE-1){
 				s = buftostr(s, buf, j);
 				j = 0;
 			}
 		}
 		if(c == '\n')
 			break;
 	}
 	s = buftostr(s, buf, j);
 	if(s == nil)
 		return -1;
 	tok = &a[(*pai)++];
 	tok->tag = Data;
 	tok->text = s;
 	tok->attr = nil;
 	tok->starti = starti;
 	return Data;
 }

 /* Return concatenation of s and buf[0:j] */
 /* buf must have room for a terminator at buf[j]. */
 /* s may be nil.  Otherwise s is consumed: it is freed here and the caller */
 /* must use the returned string in its place. */
 static Rune*
 buftostr(Rune* s, Rune* buf, int j)
 {
 	Rune*	ans;

 	buf[j] = 0;
 	if(s == nil)
 		return _Strndup(buf, j);
 	/* _Strdup2 is taken to return a fresh allocation without freeing s, */
 	/* so the old string has to be released here or it leaks. */
 	ans = _Strdup2(s, buf);
 	free(s);
 	return ans;
 }

 /* Collect character data, starting with the already-read firstc, up to */
 /* (but not including) the next '<' or the end of the buffer. */
 /* Entity references (&amp; etc.) are translated, a \r or \r\n becomes a */
 /* single \n, and non-whitespace control characters are dropped. */
 /* If the result is non-empty, fill in a[*pai], bump *pai, and return Data. */
 /* Otherwise return -1. */
 static int
 getdata(TokenSource* ts, int firstc, int starti, Token* a, int* pai)
 {
 	Rune	buf[BIGBUFSIZE];
 	Rune*	text;
 	Token*	t;
 	int	n;
 	int	ch;
 	int	nextc;

 	text = nil;
 	n = 0;
 	for(ch = firstc; ch >= 0; ch = getchar(ts)){
 		if(ch == '<'){
 			/* leave the tag start for the caller */
 			ungetchar(ts, ch);
 			break;
 		}
 		if(ch == '&'){
 			ch = ampersand(ts);
 			if(ch < 0)
 				break;
 		}
 		else if(ch == '\r'){
 			/* swallow a following '\n'; either way the result is one '\n' */
 			nextc = getchar(ts);
 			if(nextc != '\n' && nextc >= 0)
 				ungetchar(ts, nextc);
 			ch = '\n';
 		}
 		else if(ch < ' ' && !isspace(ch)){
 			if(warn)
 				fprint(2, "warning: non-whitespace control character %d ignored\n", ch);
 			ch = 0;
 		}
 		if(ch == 0)
 			continue;
 		buf[n++] = ch;
 		if(n == BIGBUFSIZE-1){
 			/* scratch buffer full: move it into the growing string */
 			text = buftostr(text, buf, n);
 			n = 0;
 		}
 	}
 	text = buftostr(text, buf, n);
 	if(text == nil)
 		return -1;
 	t = &a[*pai];
 	*pai += 1;
 	t->tag = Data;
 	t->text = text;
 	t->attr = nil;
 	t->starti = starti;
 	return Data;
 }

 /* The rules for lexing scripts are different (ugh). */
 /* Gather up everything until see a </SCRIPT>. */
 /* firstc is the first character after the <SCRIPT> tag and starti its index. */
 /* On success the script text becomes one Data token in a[*pai] (which must */
 /* be a valid slot), *pai is bumped, the source is left at the start of the */
 /* closing tag (or at end of data), and Data is returned. */
 /* Otherwise the source is restored to starti and -1 is returned. */
 static int
 getscriptdata(TokenSource* ts, int firstc, int starti, Token* a, int* pai)
 {
 	Rune*	s;
 	int	j;
 	int	tstarti;
 	int	savei;
 	int	saveai;
 	int	c;
 	int	tag;
 	int	done;
 	Token*	tok;
 	Rune	buf[BIGBUFSIZE];

 	s = nil;
 	j = 0;
 	tstarti = starti;
 	c = firstc;
 	done = 0;
 	while(c >= 0){
 		if(c == '<'){
 			/* other browsers ignore stuff to end of line after <! */
 			savei = ts->i;
 			c = getchar(ts);
 			if(c == '!'){
 				while(c >= 0 && c != '\n' && c != '\r')
 					c = getchar(ts);
 				if(c == '\r')
 					c = getchar(ts);
 				if(c == '\n')
 					c = getchar(ts);
 			}
 			else if(c >= 0){
 				backup(ts, savei);
 				saveai = *pai;
 				tag = gettag(ts, tstarti, a, pai);
 				if(tag == -1)
 					break;
 				/* gettag is used only to recognize the tag.  It always */
 				/* initializes a[saveai], whether or not it bumps *pai, */
 				/* so release what it built there and restore the count */
 				/* instead of blindly decrementing it. */
 				tok = &a[saveai];
 				free(tok->text);
 				freeattrs(tok->attr);
 				tok->text = nil;
 				tok->attr = nil;
 				*pai = saveai;
 				backup(ts, tstarti);
 				if(tag == Tscript + RBRA){
 					done = 1;
 					break;
 				}
 				/* here tag was not </SCRIPT>, so take as regular data */
 				c = getchar(ts);
 			}
 		}
 		if(c < 0)
 			break;
 		if(c != 0){
 			buf[j++] = c;
 			if(j == BIGBUFSIZE-1){
 				s = buftostr(s, buf, j);
 				j = 0;
 			}
 		}
 		tstarti = ts->i;
 		c = getchar(ts);
 	}
 	if(done || ts->i == ts->edata){
 		s = buftostr(s, buf, j);
 		tok = &a[(*pai)++];
 		tok->tag = Data;
 		tok->text = s;
 		tok->attr = nil;
 		tok->starti = starti;
 		return Data;
 	}
 	/* no token is produced, so nothing else owns the text gathered so far */
 	free(s);
 	backup(ts, starti);
 	return -1;
 }

 /* We've just seen a '<'.  Gather up stuff to closing '>'. */
 /* If it's a tag, look up the name, gather the attributes, and return */
 /* the appropriate token (the tag index, plus RBRA for an end tag; */
 /* unknown tag names leave the tag as Notfound and keep the name in text). */
 /* Else it's either just plain data or some kind of ignorable stuff: */
 /* return Data or Comment as appropriate. */
 /* If the buffer ends before the tag is complete, back up to just past */
 /* the '<' and make the '<' itself a Data token. */
 /* If it's not a Comment, put it in a[*pai] and bump *pai. */
 /* a[*pai] must be a valid slot; it is overwritten in every case. */
 static int
 gettag(TokenSource* ts, int starti, Token* a, int* pai)
 {
 	int	rbra;
 	int	ans;
 	Attr*	al;
 	int	nexti;
 	int	c;
 	int	ti;
 	int	afnd;
 	int	attid;
 	int	quote;
 	Rune*	val;
 	int	nv;
 	int	i;
 	int	tag;
 	Token*	tok;
 	Rune	buf[BIGBUFSIZE];

 	rbra = 0;
 	/* al and val are released at eob_done, which can be reached from anywhere */
 	al = nil;
 	val = nil;
 	nexti = ts->i;
 	tok = &a[*pai];
 	tok->tag = Notfound;
 	tok->text = nil;
 	tok->attr = nil;
 	tok->starti = starti;
 	c = getchar(ts);
 	if(c == '/'){
 		rbra = RBRA;
 		c = getchar(ts);
 	}
 	if(c < 0)
 		goto eob_done;
 	if(c >= 256 || !isalpha(c)){
 		/* not a tag */
 		if(c == '!'){
 			ans = comment(ts);
 			if(ans != -1)
 				return ans;
 			goto eob_done;
 		}
 		else {
 			backup(ts, nexti);
 			tok->tag = Data;
 			tok->text = _Strdup(L(Llt));
 			(*pai)++;
 			return Data;
 		}
 	}
 	/* c starts a tagname */
 	buf[0] = c;
 	i = 1;
 	for(;;){
 		c = getchar(ts);
 		if(c < 0)
 			goto eob_done;
 		if(!ISNAMCHAR(c))
 			break;
 		/* if name is bigger than buf it won't be found anyway... */
 		if(i < BIGBUFSIZE)
 			buf[i++] = c;
 	}
 	if(_lookup(tagtable, Numtags, buf, i, &tag))
 		tok->tag = tag + rbra;
 	else
 		tok->text = _Strndup(buf, i);	/* for warning print, in build */

 	/* attribute gathering loop */
 	for(;;){
 		/* look for "ws name" or "ws name ws = ws val"  (ws=whitespace) */
 		/* skip whitespace */
 attrloop_continue:
 		while(c < 256 && isspace(c)){
 			c = getchar(ts);
 			if(c < 0)
 				goto eob_done;
 		}
 		if(c == '>')
 			goto attrloop_done;
 		if(c == '<'){
 			if(warn)
 				fprint(2, "warning: unclosed tag\n");
 			ungetchar(ts, c);
 			goto attrloop_done;
 		}
 		if(c >= 256 || !isalpha(c)){
 			if(warn)
 				fprint(2, "warning: expected attribute name\n");
 			/* skip to next attribute name */
 			for(;;){
 				c = getchar(ts);
 				if(c < 0)
 					goto eob_done;
 				if(c < 256 && isalpha(c))
 					goto attrloop_continue;
 				if(c == '<'){
 					if(warn)
 						fprint(2, "warning: unclosed tag\n");
 					ungetchar(ts, '<');
 					goto attrloop_done;
 				}
 				if(c == '>')
 					goto attrloop_done;
 			}
 		}
 		/* gather attribute name */
 		buf[0] = c;
 		i = 1;
 		for(;;){
 			c = getchar(ts);
 			if(c < 0)
 				goto eob_done;
 			if(!ISNAMCHAR(c))
 				break;
 			if(i < BIGBUFSIZE-1)
 				buf[i++] = c;
 		}
 		afnd = _lookup(attrtable, Numattrs, buf, i, &attid);
 		if(warn && !afnd){
 			buf[i] = 0;
 			fprint(2, "warning: unknown attribute name %S\n", buf);
 		}
 		/* skip whitespace */
 		while(c < 256 && isspace(c)){
 			c = getchar(ts);
 			if(c < 0)
 				goto eob_done;
 		}
 		if(c != '='){
 			if(afnd)
 				al = newattr(attid, nil, al);
 			goto attrloop_continue;
 		}
 		/*# c is '=' here;  skip whitespace */
 		for(;;){
 			c = getchar(ts);
 			if(c < 0)
 				goto eob_done;
 			if(c >= 256 || !isspace(c))
 				break;
 		}
 		quote = 0;
 		if(c == '\'' || c == '"'){
 			quote = c;
 			c = getchar(ts);
 			if(c < 0)
 				goto eob_done;
 		}
 		val = nil;
 		nv = 0;
 		for(;;){
 valloop_continue:
 			if(c < 0)
 				goto eob_done;
 			if(c == '>'){
 				if(quote){
 					/* c might be part of string (though not good style) */
 					/* but if line ends before close quote, assume */
 					/* there was an unmatched quote */
 					ti = ts->i;
 					for(;;){
 						c = getchar(ts);
 						if(c < 0)
 							goto eob_done;
 						if(c == quote){
 							backup(ts, ti);
 							buf[nv++] = '>';
 							if(nv == BIGBUFSIZE-1){
 								val = buftostr(val, buf, nv);
 								nv = 0;
 							}
 							c = getchar(ts);
 							goto valloop_continue;
 						}
 						if(c == '\n'){
 							if(warn)
 								fprint(2, "warning: apparent unmatched quote\n");
 							backup(ts, ti);
 							c = '>';
 							goto valloop_done;
 						}
 					}
 				}
 				else
 					goto valloop_done;
 			}
 			if(quote){
 				if(c == quote){
 					c = getchar(ts);
 					if(c < 0)
 						goto eob_done;
 					goto valloop_done;
 				}
 				if(c == '\r'){
 					c = getchar(ts);
 					goto valloop_continue;
 				}
 				if(c == '\t' || c == '\n')
 					c = ' ';
 			}
 			else {
 				if(c < 256 && isspace(c))
 					goto valloop_done;
 			}
 			if(c == '&'){
 				c = ampersand(ts);
 				if(c == -1)
 					goto eob_done;
 			}
 			buf[nv++] = c;
 			if(nv == BIGBUFSIZE-1){
 				val = buftostr(val, buf, nv);
 				nv = 0;
 			}
 			c = getchar(ts);
 		}
 valloop_done:
 		if(afnd){
 			val = buftostr(val, buf, nv);
 			al = newattr(attid, val, al);
 		}
 		else
 			free(val);	/* value of an unknown attribute is not kept */
 		/* the value is now owned by al (or gone); don't free it again later */
 		val = nil;
 	}

 attrloop_done:
 	tok->attr = al;
 	(*pai)++;
 	return tok->tag;

 eob_done:
 	if(warn)
 		fprint(2, "warning: incomplete tag at end of page\n");
 	backup(ts, nexti);
 	/* throw away whatever was gathered for the incomplete tag */
 	free(val);
 	freeattrs(al);
 	free(tok->text);
 	tok->tag = Data;
 	tok->text = _Strdup(L(Llt));
 	/* count the token, as every other non-Comment return does; */
 	/* otherwise the '<' is lost and callers' bookkeeping goes wrong */
 	(*pai)++;
 	return Data;
 }

 /* Called just after a '<!' has been read. */
 /* This may be a comment or other ignored section, or it may be just */
 /* literal text if nothing closes it before the end of the data */
 /* (other browsers do that). */
 /* The accepted practice seems to be (note: contrary to SGML spec!): */
 /* If see <!--, look for --> to close, or if none, > to close. */
 /* If see <!(not --), look for > to close. */
 /* */
 /* Returns Comment, with the source positioned after the close, when */
 /* ignorable stuff was found; otherwise returns -1 and the caller */
 /* decides how to back up. */
 static int
 comment(TokenSource* ts)
 {
 	int	resume;
 	int	c;

 	resume = ts->i;
 	c = getchar(ts);
 	if(c == '-'){
 		c = getchar(ts);
 		if(c == '-'){
 			if(findstr(ts, L(Larrow)))
 				return Comment;
 			/* no -->: rescan from just after the '<!' for a plain '>' */
 			backup(ts, resume);
 		}
 	}
 	if(c == '>')
 		return Comment;
 	if(c >= 0 && findstr(ts, L(Lgt)))
 		return Comment;
 	return -1;
 }

 /* Scan forward in the token source for the string s. */
 /* Returns 1 with the source positioned just after the match, */
 /* or 0 if the data runs out first (caller should back up). */
 static int
 findstr(TokenSource* ts, Rune* s)
 {
 	int	first;
 	int	len;
 	int	resume;
 	int	k;
 	int	c;

 	first = s[0];
 	len = runestrlen(s);
 	while((c = getchar(ts)) >= 0){
 		if(c != first)
 			continue;
 		if(len == 1)
 			return 1;
 		/* try to match the rest; on a mismatch continue right after this first char */
 		resume = ts->i;
 		k = 1;
 		while(k < len){
 			c = getchar(ts);
 			if(c < 0)
 				return 0;
 			if(c != s[k])
 				break;
 			k++;
 		}
 		if(k == len)
 			return 1;
 		backup(ts, resume);
 	}
 	return 0;
 }

 /* Value (0-15) of the hexadecimal digit c, or -1 if c is not one. */
 static int
 xdigit(int c)
 {
 	int	v;

 	v = -1;
 	if(c >= 'a' && c <= 'f')
 		v = 10 + (c - 'a');
 	else if(c >= 'A' && c <= 'F')
 		v = 10 + (c - 'A');
 	else if(c >= '0' && c <= '9')
 		v = c - '0';
 	return v;
 }

 /* We've just read an '&'; look for an entity reference */
 /* name, and if found, return translated char. */
 /* Numeric references are &#ddd (decimal) or &#xhhh / &#Xhhh (hex); */
 /* values in Winstart..Winend are mapped through winchars. */
 /* if there is a complete entity name but it isn't known, */
 /* try prefixes (gets around some buggy HTML out there), */
 /* and if that fails, back up to just past the '&' and return '&'. */
 /* The same happens when the reference has no digits or the data ends */
 /* before the reference is complete. */
 /* A terminating ';', '\n' or '\r' is consumed along with a found entity. */
 static int
 ampersand(TokenSource* ts)
 {
 	int	savei;
 	int	c;
 	int	fnd;
 	int	ans;
 	int	v;
 	int	i;
 	int	k;
 	int	havedigit;
 	Rune	buf[SMALLBUFSIZE];

 	savei = ts->i;
 	c = getchar(ts);
 	fnd = 0;
 	ans = -1;
 	if(c == '#'){
 		c = getchar(ts);
 		v = 0;
 		havedigit = 0;
 		/* v stops growing once it is past 0x10FFFF, so a long run of */
 		/* digits cannot overflow the int */
 		if(c == 'x' || c == 'X'){
 			c = getchar(ts);
 			while((i=xdigit(c)) != -1){
 				if(v <= 0x10FFFF)
 					v = v*16 + i;
 				havedigit = 1;
 				c = getchar(ts);
 			}
 		}else{
 			while('0' <= c && c <= '9'){
 				if(v <= 0x10FFFF)
 					v = v*10 + c - '0';
 				havedigit = 1;
 				c = getchar(ts);
 			}
 		}
 		/* "&#" with no digits is not a reference: fall through to the '&' case */
 		if(c >= 0 && havedigit){
 			if(!(c == ';' || c == '\n' || c == '\r'))
 				ungetchar(ts, c);
 			c = v;
 			if(c >= Winstart && c <= Winend){
 				c = winchars[c - Winstart];
 			}
 			ans = c;
 			fnd = 1;
 		}
 	}
 	else if(c < 256 && isalpha(c)){
 		buf[0] = c;
 		k = 1;
 		for(;;){
 			c = getchar(ts);
 			if(c < 0)
 				break;
 			if(ISNAMCHAR(c)){
 				if(k < SMALLBUFSIZE-1)
 					buf[k++] = c;
 			}
 			else {
 				if(!(c == ';' || c == '\n' || c == '\r'))
 					ungetchar(ts, c);
 				break;
 			}
 		}
 		if(c >= 0){
 			fnd = _lookup(chartab, NCHARTAB, buf, k, &ans);
 			if(!fnd){
 				/* Try prefixes of s */
 				/* the terminator was consumed above; put it back first */
 				if(c == ';' || c == '\n' || c == '\r')
 					ungetchar(ts, c);
 				i = k;
 				while(--k > 0){
 					fnd = _lookup(chartab, NCHARTAB, buf, k, &ans);
 					if(fnd){
 						/* give back the characters after the matched prefix */
 						while(i > k){
 							i--;
 							ungetchar(ts, buf[i]);
 						}
 						break;
 					}
 				}
 			}
 		}
 	}
 	if(!fnd){
 		backup(ts, savei);
 		ans = '&';
 	}
 	return ans;
 }

 /* Get next char, obeying ts.chset. */
 /* Returns -1 if no complete character left before current end of data. */
 /* ISO_8859_1: one byte, with Winstart..Winend mapped through winchars. */
 /* US_Ascii: one byte (a warning is printed for bytes above 127). */
 /* UTF_8: one rune, decoded with chartorune. */
 /* Unicode: two bytes, most-significant byte first. */
 static int
 getchar(TokenSource* ts)
 {
 	uchar*	buf;
 	int	c;
 	int	n;
 	Rune	r;

 	if(ts->i >= ts->edata)
 		return -1;
 	buf = ts->data;
 	c = buf[ts->i];
 	switch(ts->chset){
 	case ISO_8859_1:
 		if(c >= Winstart && c <= Winend)
 			c = winchars[c - Winstart];
 		ts->i++;
 		break;
 	case US_Ascii:
 		if(c > 127){
 			if(warn)
 				fprint(2, "non-ascii char (%x) when US-ASCII specified\n", c);
 		}
 		ts->i++;
 		break;
 	case UTF_8:
 		/* decode only once a whole rune is known to be present, */
 		/* so chartorune never reads past data[edata] */
 		if(fullrune((char*)(buf+ts->i), ts->edata-ts->i)){
 			n = chartorune(&r, (char*)(buf+ts->i));
 			if(warn && c == 0x80)
 				fprint(2, "warning: invalid utf-8 sequence (starts with %x)\n", ts->data[ts->i]);
 			ts->i += n;
 			c = r;
 		}
 		else {
 			/* not enough bytes in buf to complete utf-8 char */
 			ts->i = ts->edata;	/* mark "all used" */
 			c = -1;
 		}
 		break;
 	case Unicode:
 		if(ts->i < ts->edata - 1){
 			/*standards say most-significant byte first */
 			c = (c << 8)|(buf[ts->i + 1]);
 			ts->i += 2;
 		}
 		else {
 			ts->i = ts->edata;	/* mark "all used" */
 			c = -1;
 		}
 		break;
 	default:
 		/* newtokensource asserts a known chset; if one slips through, */
 		/* still consume the byte so callers cannot loop forever */
 		ts->i++;
 		break;
 	}
 	return c;
 }

 /* Undo the last getchar, which must have returned c: step the index back */
 /* by the number of bytes c occupies in the source's character set, so the */
 /* next getchar returns c again. */
 static void
 ungetchar(TokenSource* ts, int c)
 {
 	int	nbytes;
 	Rune	r;
 	char	enc[UTFmax];

 	if(ts->chset == Unicode)
 		nbytes = 2;
 	else if(ts->chset == UTF_8 && c >= 128){
 		/* re-encode just to learn the length */
 		r = c;
 		nbytes = runetochar(enc, &r);
 	}
 	else
 		nbytes = 1;
 	ts->i -= nbytes;
 }

 /* Rewind (or reposition) ts to a byte index saved earlier from ts->i. */
 static void
 backup(TokenSource* ts, int savei)
 {
 	if(dbglex != 0){
 		fprint(2, "lex: backup; i=%d, savei=%d\n", ts->i, savei);
 	}
 	ts->i = savei;
 }


 /* Search token t's attribute list for attribute attid. */
 /* Returns 1 if present, storing its value in *pans; returns 0 (and */
 /* stores nil) if not.  pans may be nil when only presence matters. */
 /* If xfer is true the caller takes ownership of the value string and */
 /* it is nil'd out in the attribute; otherwise the caller must */
 /* duplicate the answer if it needs to keep it. */
 int
 _tokaval(Token* t, int attid, Rune** pans, int xfer)
 {
 	Attr*	at;

 	for(at = t->attr; at != nil; at = at->next){
 		if(at->attid != attid)
 			continue;
 		if(pans != nil)
 			*pans = at->value;
 		if(xfer)
 			at->value = nil;
 		return 1;
 	}
 	if(pans != nil)
 		*pans = nil;
 	return 0;
 }

 /* Formatter for the %T verb (installed by lexinit): prints a Token*. */
 /* Data tokens print as 'text'; tags print as <name attr=value ...>, */
 /* with a leading / for end tags and ? for unknown tags. */
 /* When dbglex > 1 the token's starting index is prefixed as [n]. */
 static int
 Tconv(Fmt *f)
 {
 	char	out[BIGBUFSIZE];
 	Token*	tok;
 	Attr*	at;
 	Rune*	name;
 	char*	slash;
 	int	n;
 	int	tag;

 	tok = va_arg(f->args, Token*);
 	if(tok == nil)
 		return fmtstrcpy(f, "<null>");

 	n = 0;
 	if(dbglex > 1)
 		n = snprint(out, sizeof(out), "[%d]", tok->starti);
 	tag = tok->tag;
 	if(tag == Data)
 		n += snprint(out+n, sizeof(out)-n-1, "'%S'", tok->text);
 	else {
 		if(tag >= RBRA){
 			tag -= RBRA;
 			slash = "/";
 		}
 		else
 			slash = "";
 		if(tag == Notfound)
 			name = L(Lquestion);
 		else
 			name = tagnames[tag];
 		n += snprint(out+n, sizeof(out)-n-1, "<%s%S", slash, name);
 		at = tok->attr;
 		while(at != nil){
 			n += snprint(out+n, sizeof(out)-n-1, " %S", attrnames[at->attid]);
 			if(at->value != nil)
 				n += snprint(out+n, sizeof(out)-n-1, "=%S", at->value);
 			at = at->next;
 		}
 		n += snprint(out+n, sizeof(out)-n-1, ">");
 	}
 	out[n] = 0;
 	return fmtstrcpy(f, out);
 }

 /* Allocate an Attr for attribute attid and push it on the front of link. */
 /* The Attr takes ownership of value (freeattrs frees it), though build */
 /* may later move some values into its items and nil them out here. */
 static Attr*
 newattr(int attid, Rune* value, Attr* link)
 {
 	Attr*	at;

 	at = (Attr*)emalloc(sizeof(Attr));
 	at->next = link;
 	at->value = value;
 	at->attid = attid;
 	return at;
 }

 /* Free every Attr on the list starting at ahead (linked through next), */
 /* together with the value string each one still holds. */
 static void
 freeattrs(Attr* ahead)
 {
 	Attr*	cur;
 	Attr*	following;

 	for(cur = ahead; cur != nil; cur = following){
 		/* remember the link before the node goes away */
 		following = cur->next;
 		free(cur->value);
 		free(cur);
 	}
 }

 /* Release a token array obtained from _gettoks. */
 /* Only the first n entries are initialized (the allocation may be */
 /* larger), so only those have their text and attributes freed. */
 /* If the caller has taken over any strings or attribute lists it */
 /* must have nil'd out the corresponding pointers in the Tokens. */
 void
 _freetokens(Token* tarray, int n)
 {
 	Token*	t;
 	Token*	end;

 	if(tarray == nil)
 		return;
 	end = tarray + n;
 	for(t = tarray; t < end; t++){
 		free(t->text);
 		freeattrs(t->attr);
 	}
 	free(tarray);
 }