Blame - src/libhtml/lex.c - plan9

blob: 9a47dde8c6d67afcde5d9fcd1b26909ace0f5aed [file] [log] [blame]

wkj	7cf289c	2004-04-06 19:06:52 +0000	[diff] [blame]	1	#include <u.h>
				2	#include <libc.h>
				3	#include <draw.h>
				4	#include <ctype.h>
				5	#include <html.h>
				6	#include "impl.h"
				7
				8	typedef struct TokenSource TokenSource;
				9	struct TokenSource
				10	{
				11	int i; // index of next byte to use
				12	uchar* data; // all the data
				13	int edata; // data[0:edata] is valid
				14	int chset; // one of US_Ascii, etc.
				15	int mtype; // TextHtml or TextPlain
				16	};
				17
				18	enum {
				19	EOF = -2,
				20	EOB = -1
				21	};
				22
				23	#define ISNAMCHAR(c) ((c)<256 && (isalpha(c) \|\| isdigit(c) \|\| (c) == '-' \|\| (c) == '.'))
				24
				25	#define SMALLBUFSIZE 240
				26	#define BIGBUFSIZE 2000
				27
				28	// HTML 4.0 tag names.
				29	// Keep sorted, and in correspondence with enum in iparse.h.
				30	Rune **tagnames;
				31	char *_tagnames[] = {
				32	" ",
				33	"!",
				34	"a",
				35	"abbr",
				36	"acronym",
				37	"address",
				38	"applet",
				39	"area",
				40	"b",
				41	"base",
				42	"basefont",
				43	"bdo",
				44	"big",
				45	"blink",
				46	"blockquote",
				47	"body",
				48	"bq",
				49	"br",
				50	"button",
				51	"caption",
				52	"center",
				53	"cite",
				54	"code",
				55	"col",
				56	"colgroup",
				57	"dd",
				58	"del",
				59	"dfn",
				60	"dir",
				61	"div",
				62	"dl",
				63	"dt",
				64	"em",
				65	"fieldset",
				66	"font",
				67	"form",
				68	"frame",
				69	"frameset",
				70	"h1",
				71	"h2",
				72	"h3",
				73	"h4",
				74	"h5",
				75	"h6",
				76	"head",
				77	"hr",
				78	"html",
				79	"i",
				80	"iframe",
				81	"img",
				82	"input",
				83	"ins",
				84	"isindex",
				85	"kbd",
				86	"label",
				87	"legend",
				88	"li",
				89	"link",
				90	"map",
				91	"menu",
				92	"meta",
				93	"nobr",
				94	"noframes",
				95	"noscript",
				96	"object",
				97	"ol",
				98	"optgroup",
				99	"option",
				100	"p",
				101	"param",
				102	"pre",
				103	"q",
				104	"s",
				105	"samp",
				106	"script",
				107	"select",
				108	"small",
				109	"span",
				110	"strike",
				111	"strong",
				112	"style",
				113	"sub",
				114	"sup",
				115	"table",
				116	"tbody",
				117	"td",
				118	"textarea",
				119	"tfoot",
				120	"th",
				121	"thead",
				122	"title",
				123	"tr",
				124	"tt",
				125	"u",
				126	"ul",
				127	"var"
				128	};
				129
				130	// HTML 4.0 attribute names.
				131	// Keep sorted, and in correspondence with enum in i.h.
				132	Rune **attrnames;
				133	char* _attrnames[] = {
				134	"abbr",
				135	"accept-charset",
				136	"access-key",
				137	"action",
				138	"align",
				139	"alink",
				140	"alt",
				141	"archive",
				142	"axis",
				143	"background",
				144	"bgcolor",
				145	"border",
				146	"cellpadding",
				147	"cellspacing",
				148	"char",
				149	"charoff",
				150	"charset",
				151	"checked",
				152	"cite",
				153	"class",
				154	"classid",
				155	"clear",
				156	"code",
				157	"codebase",
				158	"codetype",
				159	"color",
				160	"cols",
				161	"colspan",
				162	"compact",
				163	"content",
				164	"coords",
				165	"data",
				166	"datetime",
				167	"declare",
				168	"defer",
				169	"dir",
				170	"disabled",
				171	"enctype",
				172	"face",
				173	"for",
				174	"frame",
				175	"frameborder",
				176	"headers",
				177	"height",
				178	"href",
				179	"hreflang",
				180	"hspace",
				181	"http-equiv",
				182	"id",
				183	"ismap",
				184	"label",
				185	"lang",
				186	"link",
				187	"longdesc",
				188	"marginheight",
				189	"marginwidth",
				190	"maxlength",
				191	"media",
				192	"method",
				193	"multiple",
				194	"name",
				195	"nohref",
				196	"noresize",
				197	"noshade",
				198	"nowrap",
				199	"object",
				200	"onblur",
				201	"onchange",
				202	"onclick",
				203	"ondblclick",
				204	"onfocus",
				205	"onkeypress",
				206	"onkeyup",
				207	"onload",
				208	"onmousedown",
				209	"onmousemove",
				210	"onmouseout",
				211	"onmouseover",
				212	"onmouseup",
				213	"onreset",
				214	"onselect",
				215	"onsubmit",
				216	"onunload",
				217	"profile",
				218	"prompt",
				219	"readonly",
				220	"rel",
				221	"rev",
				222	"rows",
				223	"rowspan",
				224	"rules",
				225	"scheme",
				226	"scope",
				227	"scrolling",
				228	"selected",
				229	"shape",
				230	"size",
				231	"span",
				232	"src",
				233	"standby",
				234	"start",
				235	"style",
				236	"summary",
				237	"tabindex",
				238	"target",
				239	"text",
				240	"title",
				241	"type",
				242	"usemap",
				243	"valign",
				244	"value",
				245	"valuetype",
				246	"version",
				247	"vlink",
				248	"vspace",
				249	"width"
				250	};
				251
				252
				253	// Character entity to unicode character number map.
				254	// Keep sorted by name.
				255	StringInt *chartab;
				256	AsciiInt _chartab[142] = {
				257	{"AElig", 198},
				258	{"Aacute", 193},
				259	{"Acirc", 194},
				260	{"Agrave", 192},
				261	{"Aring", 197},
				262	{"Atilde", 195},
				263	{"Auml", 196},
				264	{"Ccedil", 199},
				265	{"ETH", 208},
				266	{"Eacute", 201},
				267	{"Ecirc", 202},
				268	{"Egrave", 200},
				269	{"Euml", 203},
				270	{"Iacute", 205},
				271	{"Icirc", 206},
				272	{"Igrave", 204},
				273	{"Iuml", 207},
				274	{"Ntilde", 209},
				275	{"Oacute", 211},
				276	{"Ocirc", 212},
				277	{"Ograve", 210},
				278	{"Oslash", 216},
				279	{"Otilde", 213},
				280	{"Ouml", 214},
				281	{"THORN", 222},
				282	{"Uacute", 218},
				283	{"Ucirc", 219},
				284	{"Ugrave", 217},
				285	{"Uuml", 220},
				286	{"Yacute", 221},
				287	{"aacute", 225},
				288	{"acirc", 226},
				289	{"acute", 180},
				290	{"aelig", 230},
				291	{"agrave", 224},
				292	{"alpha", 945},
				293	{"amp", 38},
				294	{"aring", 229},
				295	{"atilde", 227},
				296	{"auml", 228},
				297	{"beta", 946},
				298	{"brvbar", 166},
				299	{"ccedil", 231},
				300	{"cdots", 8943},
				301	{"cedil", 184},
				302	{"cent", 162},
				303	{"chi", 967},
				304	{"copy", 169},
				305	{"curren", 164},
				306	{"ddots", 8945},
				307	{"deg", 176},
				308	{"delta", 948},
				309	{"divide", 247},
				310	{"eacute", 233},
				311	{"ecirc", 234},
				312	{"egrave", 232},
				313	{"emdash", 8212},
				314	{"emsp", 8195},
				315	{"endash", 8211},
				316	{"ensp", 8194},
				317	{"epsilon", 949},
				318	{"eta", 951},
				319	{"eth", 240},
				320	{"euml", 235},
				321	{"frac12", 189},
				322	{"frac14", 188},
				323	{"frac34", 190},
				324	{"gamma", 947},
				325	{"gt", 62},
				326	{"iacute", 237},
				327	{"icirc", 238},
				328	{"iexcl", 161},
				329	{"igrave", 236},
				330	{"iota", 953},
				331	{"iquest", 191},
				332	{"iuml", 239},
				333	{"kappa", 954},
				334	{"lambda", 955},
				335	{"laquo", 171},
				336	{"ldots", 8230},
				337	{"lt", 60},
				338	{"macr", 175},
				339	{"micro", 181},
				340	{"middot", 183},
				341	{"mu", 956},
				342	{"nbsp", 160},
				343	{"not", 172},
				344	{"ntilde", 241},
				345	{"nu", 957},
				346	{"oacute", 243},
				347	{"ocirc", 244},
				348	{"ograve", 242},
				349	{"omega", 969},
				350	{"omicron", 959},
				351	{"ordf", 170},
				352	{"ordm", 186},
				353	{"oslash", 248},
				354	{"otilde", 245},
				355	{"ouml", 246},
				356	{"para", 182},
				357	{"phi", 966},
				358	{"pi", 960},
				359	{"plusmn", 177},
				360	{"pound", 163},
				361	{"psi", 968},
				362	{"quad", 8193},
				363	{"quot", 34},
				364	{"raquo", 187},
				365	{"reg", 174},
				366	{"rho", 961},
				367	{"sect", 167},
				368	{"shy", 173},
				369	{"sigma", 963},
				370	{"sp", 8194},
				371	{"sup1", 185},
				372	{"sup2", 178},
				373	{"sup3", 179},
				374	{"szlig", 223},
				375	{"tau", 964},
				376	{"theta", 952},
				377	{"thinsp", 8201},
				378	{"thorn", 254},
				379	{"times", 215},
				380	{"trade", 8482},
				381	{"uacute", 250},
				382	{"ucirc", 251},
				383	{"ugrave", 249},
				384	{"uml", 168},
				385	{"upsilon", 965},
				386	{"uuml", 252},
				387	{"varepsilon", 8712},
				388	{"varphi", 981},
				389	{"varpi", 982},
				390	{"varrho", 1009},
				391	{"vdots", 8942},
				392	{"vsigma", 962},
				393	{"vtheta", 977},
				394	{"xi", 958},
				395	{"yacute", 253},
				396	{"yen", 165},
				397	{"yuml", 255},
				398	{"zeta", 950}
				399	};
				400	#define NCHARTAB (sizeof(chartab)/sizeof(chartab[0]))
				401
				402	// Characters Winstart..Winend are those that Windows
				403	// uses interpolated into the Latin1 set.
				404	// They aren't supposed to appear in HTML, but they do....
				405	enum {
				406	Winstart = 127,
				407	Winend = 159
				408	};
				409
				410	static int winchars[]= { 8226, // 8226 is a bullet
				411	8226, 8226, 8218, 402, 8222, 8230, 8224, 8225,
				412	710, 8240, 352, 8249, 338, 8226, 8226, 8226,
				413	8226, 8216, 8217, 8220, 8221, 8226, 8211, 8212,
				414	732, 8482, 353, 8250, 339, 8226, 8226, 376};
				415
				416	static StringInt* tagtable; // initialized from tagnames
				417	static StringInt* attrtable; // initialized from attrnames
				418
				419	static void lexinit();
				420	static int getplaindata(TokenSource* ts, Token* a, int* pai);
				421	static int getdata(TokenSource* ts, int firstc, int starti, Token* a, int* pai);
				422	static int getscriptdata(TokenSource* ts, int firstc, int starti, Token* a, int* pai);
				423	static int gettag(TokenSource* ts, int starti, Token* a, int* pai);
				424	static Rune* buftostr(Rune* s, Rune* buf, int j);
				425	static int comment(TokenSource* ts);
				426	static int findstr(TokenSource* ts, Rune* s);
				427	static int ampersand(TokenSource* ts);
				428	//static int lowerc(int c);
				429	static int getchar(TokenSource* ts);
				430	static void ungetchar(TokenSource* ts, int c);
				431	static void backup(TokenSource* ts, int savei);
				432	//static void freeinsidetoken(Token* t);
				433	static void freeattrs(Attr* ahead);
				434	static Attr* newattr(int attid, Rune* value, Attr* link);
				435	static int Tconv(Fmt* f);
				436
				437	int dbglex = 0;
				438	static int lexinited = 0;
				439
				440	static void
				441	lexinit(void)
				442	{
rsc	7e19561	2005-01-04 22:20:21 +0000	[diff] [blame]	443	chartab = _cvtstringinttab(_chartab, nelem(_chartab));
				444	tagnames = _cvtstringtab(_tagnames, nelem(_tagnames));
wkj	7cf289c	2004-04-06 19:06:52 +0000	[diff] [blame]	445	tagtable = _makestrinttab(tagnames, Numtags);
rsc	7e19561	2005-01-04 22:20:21 +0000	[diff] [blame]	446	attrnames = _cvtstringtab(_attrnames, nelem(_attrnames));
wkj	7cf289c	2004-04-06 19:06:52 +0000	[diff] [blame]	447	attrtable = _makestrinttab(attrnames, Numattrs);
				448	fmtinstall('T', Tconv);
				449	lexinited = 1;
				450	}
				451
				452	static TokenSource*
				453	newtokensource(uchar* data, int edata, int chset, int mtype)
				454	{
				455	TokenSource* ans;
				456
				457	assert(chset == US_Ascii \|\| chset == ISO_8859_1 \|\|
				458	chset == UTF_8 \|\| chset == Unicode);
				459	ans = (TokenSource*)emalloc(sizeof(TokenSource));
				460	ans->i = 0;
				461	ans->data = data;
				462	ans->edata = edata;
				463	ans->chset = chset;
				464	ans->mtype = mtype;
				465	return ans;
				466	}
				467
				468	enum {
				469	ToksChunk = 500
				470	};
				471
				472	// Call this to get the tokens.
				473	// The number of returned tokens is returned in *plen.
				474	Token*
				475	_gettoks(uchar* data, int datalen, int chset, int mtype, int* plen)
				476	{
				477	TokenSource* ts;
				478	Token* a;
				479	int alen;
				480	int ai;
				481	int starti;
				482	int c;
				483	int tag;
				484
				485	if(!lexinited)
				486	lexinit();
				487	ts = newtokensource(data, datalen, chset, mtype);
				488	alen = ToksChunk;
				489	a = (Token)emalloc(alen sizeof(Token));
				490	ai = 0;
				491	if(dbglex)
				492	fprint(2, "_gettoks starts, ts.i=%d, ts.edata=%d\n", ts->i, ts->edata);
				493	if(ts->mtype == TextHtml) {
				494	for(;;) {
				495	if(ai == alen) {
				496	a = (Token)erealloc(a, (alen+ToksChunk)sizeof(Token));
				497	alen += ToksChunk;
				498	}
				499	starti = ts->i;
				500	c = getchar(ts);
				501	if(c < 0)
				502	break;
				503	if(c == '<') {
				504	tag = gettag(ts, starti, a, &ai);
				505	if(tag == Tscript) {
				506	// special rules for getting Data after....
				507	starti = ts->i;
				508	c = getchar(ts);
				509	tag = getscriptdata(ts, c, starti, a, &ai);
				510	}
				511	}
				512	else
				513	tag = getdata(ts, c, starti, a, &ai);
				514	if(tag == -1)
				515	break;
				516	else if(dbglex > 1 && tag != Comment)
				517	fprint(2, "lex: got token %T\n", &a[ai-1]);
				518	}
				519	}
				520	else {
				521	// plain text (non-html) tokens
				522	for(;;) {
				523	if(ai == alen) {
				524	a = (Token)erealloc(a, (alen+ToksChunk)sizeof(Token));
				525	alen += ToksChunk;
				526	}
				527	tag = getplaindata(ts, a, &ai);
				528	if(tag == -1)
				529	break;
				530	if(dbglex > 1)
				531	fprint(2, "lex: got token %T\n", &a[ai]);
				532	}
				533	}
				534	if(dbglex)
				535	fprint(2, "lex: returning %d tokens\n", ai);
				536	*plen = ai;
				537	if(ai == 0)
				538	return nil;
				539	return a;
				540	}
				541
				542	// For case where source isn't HTML.
				543	// Just make data tokens, one per line (or partial line,
				544	// at end of buffer), ignoring non-whitespace control
				545	// characters and dumping \r's.
				546	// If find non-empty token, fill in a[pai], bump pai, and return Data.
				547	// Otherwise return -1;
				548	static int
				549	getplaindata(TokenSource* ts, Token* a, int* pai)
				550	{
				551	Rune* s;
				552	int j;
				553	int starti;
				554	int c;
				555	Token* tok;
				556	Rune buf[BIGBUFSIZE];
				557
				558	s = nil;
				559	j = 0;
				560	starti = ts->i;
				561	for(c = getchar(ts); c >= 0; c = getchar(ts)) {
				562	if(c < ' ') {
				563	if(isspace(c)) {
				564	if(c == '\r') {
				565	// ignore it unless no following '\n',
				566	// in which case treat it like '\n'
				567	c = getchar(ts);
				568	if(c != '\n') {
				569	if(c >= 0)
				570	ungetchar(ts, c);
				571	c = '\n';
				572	}
				573	}
				574	}
				575	else
				576	c = 0;
				577	}
				578	if(c != 0) {
				579	buf[j++] = c;
				580	if(j == sizeof(buf)-1) {
				581	s = buftostr(s, buf, j);
				582	j = 0;
				583	}
				584	}
				585	if(c == '\n')
				586	break;
				587	}
				588	s = buftostr(s, buf, j);
				589	if(s == nil)
				590	return -1;
				591	tok = &a[(*pai)++];
				592	tok->tag = Data;
				593	tok->text = s;
				594	tok->attr = nil;
				595	tok->starti = starti;
				596	return Data;
				597	}
				598
				599	// Return concatenation of s and buf[0:j]
				600	static Rune*
				601	buftostr(Rune* s, Rune* buf, int j)
				602	{
				603	buf[j] = 0;
				604	if(s == nil)
				605	s = _Strndup(buf, j);
				606	else
				607	s = _Strdup2(s, buf);
				608	return s;
				609	}
				610
				611	// Gather data up to next start-of-tag or end-of-buffer.
				612	// Translate entity references (&).
				613	// Ignore non-whitespace control characters and get rid of \r's.
				614	// If find non-empty token, fill in a[pai], bump pai, and return Data.
				615	// Otherwise return -1;
				616	static int
				617	getdata(TokenSource* ts, int firstc, int starti, Token* a, int* pai)
				618	{
				619	Rune* s;
				620	int j;
				621	int c;
				622	Token* tok;
				623	Rune buf[BIGBUFSIZE];
				624
				625	s = nil;
				626	j = 0;
				627	c = firstc;
				628	while(c >= 0) {
				629	if(c == '&') {
				630	c = ampersand(ts);
				631	if(c < 0)
				632	break;
				633	}
				634	else if(c < ' ') {
				635	if(isspace(c)) {
				636	if(c == '\r') {
				637	// ignore it unless no following '\n',
				638	// in which case treat it like '\n'
				639	c = getchar(ts);
				640	if(c != '\n') {
				641	if(c >= 0)
				642	ungetchar(ts, c);
				643	c = '\n';
				644	}
				645	}
				646	}
				647	else {
				648	if(warn)
				649	fprint(2, "warning: non-whitespace control character %d ignored\n", c);
				650	c = 0;
				651	}
				652	}
				653	else if(c == '<') {
				654	ungetchar(ts, c);
				655	break;
				656	}
				657	if(c != 0) {
				658	buf[j++] = c;
				659	if(j == BIGBUFSIZE-1) {
				660	s = buftostr(s, buf, j);
				661	j = 0;
				662	}
				663	}
				664	c = getchar(ts);
				665	}
				666	s = buftostr(s, buf, j);
				667	if(s == nil)
				668	return -1;
				669	tok = &a[(*pai)++];
				670	tok->tag = Data;
				671	tok->text = s;
				672	tok->attr = nil;
				673	tok->starti = starti;
				674	return Data;
				675	}
				676
				677	// The rules for lexing scripts are different (ugh).
				678	// Gather up everything until see a </SCRIPT>.
				679	static int
				680	getscriptdata(TokenSource* ts, int firstc, int starti, Token* a, int* pai)
				681	{
				682	Rune* s;
				683	int j;
				684	int tstarti;
				685	int savei;
				686	int c;
				687	int tag;
				688	int done;
				689	Token* tok;
				690	Rune buf[BIGBUFSIZE];
				691
				692	s = nil;
				693	j = 0;
				694	tstarti = starti;
				695	c = firstc;
				696	done = 0;
				697	while(c >= 0) {
				698	if(c == '<') {
				699	// other browsers ignore stuff to end of line after <!
				700	savei = ts->i;
				701	c = getchar(ts);
				702	if(c == '!') {
				703	while(c >= 0 && c != '\n' && c != '\r')
				704	c = getchar(ts);
				705	if(c == '\r')
				706	c = getchar(ts);
				707	if(c == '\n')
				708	c = getchar(ts);
				709	}
				710	else if(c >= 0) {
				711	backup(ts, savei);
				712	tag = gettag(ts, tstarti, a, pai);
				713	if(tag == -1)
				714	break;
				715	if(tag != Comment)
				716	(*pai)--;
				717	backup(ts, tstarti);
				718	if(tag == Tscript + RBRA) {
				719	done = 1;
				720	break;
				721	}
				722	// here tag was not </SCRIPT>, so take as regular data
				723	c = getchar(ts);
				724	}
				725	}
				726	if(c < 0)
				727	break;
				728	if(c != 0) {
				729	buf[j++] = c;
				730	if(j == BIGBUFSIZE-1) {
				731	s = buftostr(s, buf, j);
				732	j = 0;
				733	}
				734	}
				735	tstarti = ts->i;
				736	c = getchar(ts);
				737	}
				738	if(done \|\| ts->i == ts->edata) {
				739	s = buftostr(s, buf, j);
				740	tok = &a[(*pai)++];
				741	tok->tag = Data;
				742	tok->text = s;
				743	tok->attr = nil;
				744	tok->starti = starti;
				745	return Data;
				746	}
				747	backup(ts, starti);
				748	return -1;
				749	}
				750
				751	// We've just seen a '<'. Gather up stuff to closing '>' (if buffer
				752	// ends before then, return -1).
				753	// If it's a tag, look up the name, gather the attributes, and return
				754	// the appropriate token.
				755	// Else it's either just plain data or some kind of ignorable stuff:
				756	// return Data or Comment as appropriate.
				757	// If it's not a Comment, put it in a[pai] and bump pai.
				758	static int
				759	gettag(TokenSource* ts, int starti, Token* a, int* pai)
				760	{
				761	int rbra;
				762	int ans;
				763	Attr* al;
				764	int nexti;
				765	int c;
				766	int ti;
				767	int afnd;
				768	int attid;
				769	int quote;
				770	Rune* val;
				771	int nv;
				772	int i;
				773	int tag;
				774	Token* tok;
				775	Rune buf[BIGBUFSIZE];
				776
				777	rbra = 0;
				778	nexti = ts->i;
				779	tok = &a[*pai];
				780	tok->tag = Notfound;
				781	tok->text = nil;
				782	tok->attr = nil;
				783	tok->starti = starti;
				784	c = getchar(ts);
				785	if(c == '/') {
				786	rbra = RBRA;
				787	c = getchar(ts);
				788	}
				789	if(c < 0)
				790	goto eob_done;
				791	if(c >= 256 \|\| !isalpha(c)) {
				792	// not a tag
				793	if(c == '!') {
				794	ans = comment(ts);
				795	if(ans != -1)
				796	return ans;
				797	goto eob_done;
				798	}
				799	else {
				800	backup(ts, nexti);
				801	tok->tag = Data;
				802	tok->text = _Strdup(L(Llt));
				803	(*pai)++;
				804	return Data;
				805	}
				806	}
				807	// c starts a tagname
				808	buf[0] = c;
				809	i = 1;
				810	while(1) {
				811	c = getchar(ts);
				812	if(c < 0)
				813	goto eob_done;
				814	if(!ISNAMCHAR(c))
				815	break;
				816	// if name is bigger than buf it won't be found anyway...
				817	if(i < BIGBUFSIZE)
				818	buf[i++] = c;
				819	}
				820	if(_lookup(tagtable, Numtags, buf, i, &tag))
				821	tok->tag = tag + rbra;
				822	else
				823	tok->text = _Strndup(buf, i); // for warning print, in build
				824
				825	// attribute gathering loop
				826	al = nil;
				827	while(1) {
				828	// look for "ws name" or "ws name ws = ws val" (ws=whitespace)
				829	// skip whitespace
				830	attrloop_continue:
				831	while(c < 256 && isspace(c)) {
				832	c = getchar(ts);
				833	if(c < 0)
				834	goto eob_done;
				835	}
				836	if(c == '>')
				837	goto attrloop_done;
				838	if(c == '<') {
				839	if(warn)
				840	fprint(2, "warning: unclosed tag\n");
				841	ungetchar(ts, c);
				842	goto attrloop_done;
				843	}
				844	if(c >= 256 \|\| !isalpha(c)) {
				845	if(warn)
				846	fprint(2, "warning: expected attribute name\n");
				847	// skipt to next attribute name
				848	while(1) {
				849	c = getchar(ts);
				850	if(c < 0)
				851	goto eob_done;
				852	if(c < 256 && isalpha(c))
				853	goto attrloop_continue;
				854	if(c == '<') {
				855	if(warn)
				856	fprint(2, "warning: unclosed tag\n");
				857	ungetchar(ts, 60);
				858	goto attrloop_done;
				859	}
				860	if(c == '>')
				861	goto attrloop_done;
				862	}
				863	}
				864	// gather attribute name
				865	buf[0] = c;
				866	i = 1;
				867	while(1) {
				868	c = getchar(ts);
				869	if(c < 0)
				870	goto eob_done;
				871	if(!ISNAMCHAR(c))
				872	break;
				873	if(i < BIGBUFSIZE-1)
				874	buf[i++] = c;
				875	}
				876	afnd = _lookup(attrtable, Numattrs, buf, i, &attid);
				877	if(warn && !afnd) {
				878	buf[i] = 0;
				879	fprint(2, "warning: unknown attribute name %S\n", buf);
				880	}
				881	// skip whitespace
				882	while(c < 256 && isspace(c)) {
				883	c = getchar(ts);
				884	if(c < 0)
				885	goto eob_done;
				886	}
				887	if(c != '=') {
				888	if(afnd)
				889	al = newattr(attid, nil, al);
				890	goto attrloop_continue;
				891	}
				892	//# c is '=' here; skip whitespace
				893	while(1) {
				894	c = getchar(ts);
				895	if(c < 0)
				896	goto eob_done;
				897	if(c >= 256 \|\| !isspace(c))
				898	break;
				899	}
				900	quote = 0;
				901	if(c == '\'' \|\| c == '"') {
				902	quote = c;
				903	c = getchar(ts);
				904	if(c < 0)
				905	goto eob_done;
				906	}
				907	val = nil;
				908	nv = 0;
				909	while(1) {
				910	valloop_continue:
				911	if(c < 0)
				912	goto eob_done;
				913	if(c == '>') {
				914	if(quote) {
				915	// c might be part of string (though not good style)
				916	// but if line ends before close quote, assume
				917	// there was an unmatched quote
				918	ti = ts->i;
				919	while(1) {
				920	c = getchar(ts);
				921	if(c < 0)
				922	goto eob_done;
				923	if(c == quote) {
				924	backup(ts, ti);
				925	buf[nv++] = '>';
				926	if(nv == BIGBUFSIZE-1) {
				927	val = buftostr(val, buf, nv);
				928	nv = 0;
				929	}
				930	c = getchar(ts);
				931	goto valloop_continue;
				932	}
				933	if(c == '\n') {
				934	if(warn)
				935	fprint(2, "warning: apparent unmatched quote\n");
				936	backup(ts, ti);
				937	c = '>';
				938	goto valloop_done;
				939	}
				940	}
				941	}
				942	else
				943	goto valloop_done;
				944	}
				945	if(quote) {
				946	if(c == quote) {
				947	c = getchar(ts);
				948	if(c < 0)
				949	goto eob_done;
				950	goto valloop_done;
				951	}
				952	if(c == '\r') {
				953	c = getchar(ts);
				954	goto valloop_continue;
				955	}
				956	if(c == '\t' \|\| c == '\n')
				957	c = ' ';
				958	}
				959	else {
				960	if(c < 256 && isspace(c))
				961	goto valloop_done;
				962	}
				963	if(c == '&') {
				964	c = ampersand(ts);
				965	if(c == -1)
				966	goto eob_done;
				967	}
				968	buf[nv++] = c;
				969	if(nv == BIGBUFSIZE-1) {
				970	val = buftostr(val, buf, nv);
				971	nv = 0;
				972	}
				973	c = getchar(ts);
				974	}
				975	valloop_done:
				976	if(afnd) {
				977	val = buftostr(val, buf, nv);
				978	al = newattr(attid, val, al);
				979	}
				980	}
				981
				982	attrloop_done:
				983	tok->attr = al;
				984	(*pai)++;
				985	return tok->tag;
				986
				987	eob_done:
				988	if(warn)
				989	fprint(2, "warning: incomplete tag at end of page\n");
				990	backup(ts, nexti);
				991	tok->tag = Data;
				992	tok->text = _Strdup(L(Llt));
				993	return Data;
				994	}
				995
				996	// We've just read a '<!' at position starti,
				997	// so this may be a comment or other ignored section, or it may
				998	// be just a literal string if there is no close before end of file
				999	// (other browsers do that).
				1000	// The accepted practice seems to be (note: contrary to SGML spec!):
				1001	// If see <!--, look for --> to close, or if none, > to close.
				1002	// If see <!(not --), look for > to close.
				1003	// If no close before end of file, leave original characters in as literal data.
				1004	//
				1005	// If we see ignorable stuff, return Comment.
				1006	// Else return nil (caller should back up and try again when more data arrives,
				1007	// unless at end of file, in which case caller should just make '<' a data token).
				1008	static int
				1009	comment(TokenSource* ts)
				1010	{
				1011	int nexti;
				1012	int havecomment;
				1013	int c;
				1014
				1015	nexti = ts->i;
				1016	havecomment = 0;
				1017	c = getchar(ts);
				1018	if(c == '-') {
				1019	c = getchar(ts);
				1020	if(c == '-') {
				1021	if(findstr(ts, L(Larrow)))
				1022	havecomment = 1;
				1023	else
				1024	backup(ts, nexti);
				1025	}
				1026	}
				1027	if(!havecomment) {
				1028	if(c == '>')
				1029	havecomment = 1;
				1030	else if(c >= 0) {
				1031	if(findstr(ts, L(Lgt)))
				1032	havecomment = 1;
				1033	}
				1034	}
				1035	if(havecomment)
				1036	return Comment;
				1037	return -1;
				1038	}
				1039
				1040	// Look for string s in token source.
				1041	// If found, return 1, with buffer at next char after s,
				1042	// else return 0 (caller should back up).
				1043	static int
				1044	findstr(TokenSource* ts, Rune* s)
				1045	{
				1046	int c0;
				1047	int n;
				1048	int nexti;
				1049	int i;
				1050	int c;
				1051
				1052	c0 = s[0];
				1053	n = runestrlen(s);
				1054	while(1) {
				1055	c = getchar(ts);
				1056	if(c < 0)
				1057	break;
				1058	if(c == c0) {
				1059	if(n == 1)
				1060	return 1;
				1061	nexti = ts->i;
				1062	for(i = 1; i < n; i++) {
				1063	c = getchar(ts);
				1064	if(c < 0)
				1065	goto mainloop_done;
				1066	if(c != s[i])
				1067	break;
				1068	}
				1069	if(i == n)
				1070	return 1;
				1071	backup(ts, nexti);
				1072	}
				1073	}
				1074	mainloop_done:
				1075	return 0;
				1076	}
				1077
				1078	// We've just read an '&'; look for an entity reference
				1079	// name, and if found, return translated char.
				1080	// if there is a complete entity name but it isn't known,
				1081	// try prefixes (gets around some buggy HTML out there),
				1082	// and if that fails, back up to just past the '&' and return '&'.
				1083	// If the entity can't be completed in the current buffer, back up
				1084	// to the '&' and return -1.
				1085	static int
				1086	ampersand(TokenSource* ts)
				1087	{
				1088	int savei;
				1089	int c;
				1090	int fnd;
				1091	int ans;
				1092	int v;
				1093	int i;
				1094	int k;
				1095	Rune buf[SMALLBUFSIZE];
				1096
				1097	savei = ts->i;
				1098	c = getchar(ts);
				1099	fnd = 0;
				1100	ans = -1;
				1101	if(c == '#') {
				1102	c = getchar(ts);
				1103	v = 0;
				1104	while(c >= 0) {
				1105	if(!(c < 256 && isdigit(c)))
				1106	break;
				1107	v = v*10 + c - 48;
				1108	c = getchar(ts);
				1109	}
				1110	if(c >= 0) {
				1111	if(!(c == ';' \|\| c == '\n' \|\| c == '\r'))
				1112	ungetchar(ts, c);
				1113	c = v;
				1114	if(c == 160)
				1115	c = 160;
				1116	if(c >= Winstart && c <= Winend) {
				1117	c = winchars[c - Winstart];
				1118	}
				1119	ans = c;
				1120	fnd = 1;
				1121	}
				1122	}
				1123	else if(c < 256 && isalpha(c)) {
				1124	buf[0] = c;
				1125	k = 1;
				1126	while(1) {
				1127	c = getchar(ts);
				1128	if(c < 0)
				1129	break;
				1130	if(ISNAMCHAR(c)) {
				1131	if(k < SMALLBUFSIZE-1)
				1132	buf[k++] = c;
				1133	}
				1134	else {
				1135	if(!(c == ';' \|\| c == '\n' \|\| c == '\r'))
				1136	ungetchar(ts, c);
				1137	break;
				1138	}
				1139	}
				1140	if(c >= 0) {
				1141	fnd = _lookup(chartab, NCHARTAB, buf, k, &ans);
				1142	if(!fnd) {
				1143	// Try prefixes of s
				1144	if(c == ';' \|\| c == '\n' \|\| c == '\r')
				1145	ungetchar(ts, c);
				1146	i = k;
				1147	while(--k > 0) {
				1148	fnd = _lookup(chartab, NCHARTAB, buf, k, &ans);
				1149	if(fnd) {
				1150	while(i > k) {
				1151	i--;
				1152	ungetchar(ts, buf[i]);
				1153	}
				1154	break;
				1155	}
				1156	}
				1157	}
				1158	}
				1159	}
				1160	if(!fnd) {
				1161	backup(ts, savei);
				1162	ans = '&';
				1163	}
				1164	return ans;
				1165	}
				1166
				1167	// Get next char, obeying ts.chset.
				1168	// Returns -1 if no complete character left before current end of data.
				1169	static int
				1170	getchar(TokenSource* ts)
				1171	{
				1172	uchar* buf;
				1173	int c;
				1174	int n;
				1175	int ok;
				1176	Rune r;
				1177
				1178	if(ts->i >= ts->edata)
				1179	return -1;
				1180	buf = ts->data;
				1181	c = buf[ts->i];
				1182	switch(ts->chset) {
				1183	case ISO_8859_1:
				1184	if(c >= Winstart && c <= Winend)
				1185	c = winchars[c - Winstart];
				1186	ts->i++;
				1187	break;
				1188	case US_Ascii:
				1189	if(c > 127) {
				1190	if(warn)
				1191	fprint(2, "non-ascii char (%x) when US-ASCII specified\n", c);
				1192	}
				1193	ts->i++;
				1194	break;
				1195	case UTF_8:
				1196	ok = fullrune((char*)(buf+ts->i), ts->edata-ts->i);
				1197	n = chartorune(&r, (char*)(buf+ts->i));
				1198	if(ok) {
				1199	if(warn && c == 0x80)
				1200	fprint(2, "warning: invalid utf-8 sequence (starts with %x)\n", ts->data[ts->i]);
				1201	ts->i += n;
				1202	c = r;
				1203	}
				1204	else {
				1205	// not enough bytes in buf to complete utf-8 char
				1206	ts->i = ts->edata; // mark "all used"
				1207	c = -1;
				1208	}
				1209	break;
				1210	case Unicode:
				1211	if(ts->i < ts->edata - 1) {
				1212	//standards say most-significant byte first
				1213	c = (c << 8)\|(buf[ts->i + 1]);
				1214	ts->i += 2;
				1215	}
				1216	else {
				1217	ts->i = ts->edata; // mark "all used"
				1218	c = -1;
				1219	}
				1220	break;
				1221	}
				1222	return c;
				1223	}
				1224
				1225	// Assuming c was the last character returned by getchar, set
				1226	// things up so that next getchar will get that same character
				1227	// followed by the current 'next character', etc.
				1228	static void
				1229	ungetchar(TokenSource* ts, int c)
				1230	{
				1231	int n;
				1232	Rune r;
				1233	char a[UTFmax];
				1234
				1235	n = 1;
				1236	switch(ts->chset) {
				1237	case UTF_8:
				1238	if(c >= 128) {
				1239	r = c;
				1240	n = runetochar(a, &r);
				1241	}
				1242	break;
				1243	case Unicode:
				1244	n = 2;
				1245	break;
				1246	}
				1247	ts->i -= n;
				1248	}
				1249
				1250	// Restore ts so that it is at the state where the index was savei.
				1251	static void
				1252	backup(TokenSource* ts, int savei)
				1253	{
				1254	if(dbglex)
				1255	fprint(2, "lex: backup; i=%d, savei=%d\n", ts->i, savei);
				1256	ts->i = savei;
				1257	}
				1258
				1259
				1260	// Look for value associated with attribute attid in token t.
				1261	// If there is one, return 1 and put the value in *pans,
				1262	// else return 0.
				1263	// If xfer is true, transfer ownership of the string to the caller
				1264	// (nil it out here); otherwise, caller must duplicate the answer
				1265	// if it needs to save it.
				1266	// OK to have pans==0, in which case this is just looking
				1267	// to see if token is present.
				1268	int
				1269	_tokaval(Token* t, int attid, Rune** pans, int xfer)
				1270	{
				1271	Attr* attr;
				1272
				1273	attr = t->attr;
				1274	while(attr != nil) {
				1275	if(attr->attid == attid) {
				1276	if(pans != nil)
				1277	*pans = attr->value;
				1278	if(xfer)
				1279	attr->value = nil;
				1280	return 1;
				1281	}
				1282	attr = attr->next;
				1283	}
				1284	if(pans != nil)
				1285	*pans = nil;
				1286	return 0;
				1287	}
				1288
				1289	static int
				1290	Tconv(Fmt *f)
				1291	{
				1292	Token* t;
				1293	int i;
				1294	int tag;
				1295	char* srbra;
				1296	Rune* aname;
				1297	Rune* tname;
				1298	Attr* a;
				1299	char buf[BIGBUFSIZE];
				1300
				1301	t = va_arg(f->args, Token*);
				1302	if(t == nil)
				1303	sprint(buf, "<null>");
				1304	else {
				1305	i = 0;
				1306	if(dbglex > 1)
				1307	i = snprint(buf, sizeof(buf), "[%d]", t->starti);
				1308	tag = t->tag;
				1309	if(tag == Data) {
				1310	i += snprint(buf+i, sizeof(buf)-i-1, "'%S'", t->text);
				1311	}
				1312	else {
				1313	srbra = "";
				1314	if(tag >= RBRA) {
				1315	tag -= RBRA;
				1316	srbra = "/";
				1317	}
				1318	tname = tagnames[tag];
				1319	if(tag == Notfound)
				1320	tname = L(Lquestion);
				1321	i += snprint(buf+i, sizeof(buf)-i-1, "<%s%S", srbra, tname);
				1322	for(a = t->attr; a != nil; a = a->next) {
				1323	aname = attrnames[a->attid];
				1324	i += snprint(buf+i, sizeof(buf)-i-1, " %S", aname);
				1325	if(a->value != nil)
				1326	i += snprint(buf+i, sizeof(buf)-i-1, "=%S", a->value);
				1327	}
				1328	i += snprint(buf+i, sizeof(buf)-i-1, ">");
				1329	}
				1330	buf[i] = 0;
				1331	}
				1332	return fmtstrcpy(f, buf);
				1333	}
				1334
				1335	// Attrs own their constituent strings, but build may eventually
				1336	// transfer some values to its items and nil them out in the Attr.
				1337	static Attr*
				1338	newattr(int attid, Rune* value, Attr* link)
				1339	{
				1340	Attr* ans;
				1341
				1342	ans = (Attr*)emalloc(sizeof(Attr));
				1343	ans->attid = attid;
				1344	ans->value = value;
				1345	ans->next = link;
				1346	return ans;
				1347	}
				1348
				1349	// Free list of Attrs linked through next field
				1350	static void
				1351	freeattrs(Attr* ahead)
				1352	{
				1353	Attr* a;
				1354	Attr* nexta;
				1355
				1356	a = ahead;
				1357	while(a != nil) {
				1358	nexta = a->next;
				1359	free(a->value);
				1360	free(a);
				1361	a = nexta;
				1362	}
				1363	}
				1364
				1365	// Free array of Tokens.
				1366	// Allocated space might have room for more than n tokens,
				1367	// but only n of them are initialized.
				1368	// If caller has transferred ownership of constitutent strings
				1369	// or attributes, it must have nil'd out the pointers in the Tokens.
				1370	void
				1371	_freetokens(Token* tarray, int n)
				1372	{
				1373	int i;
				1374	Token* t;
				1375
				1376	if(tarray == nil)
				1377	return;
				1378	for(i = 0; i < n; i++) {
				1379	t = &tarray[i];
				1380	free(t->text);
				1381	freeattrs(t->attr);
				1382	}
				1383	free(tarray);
				1384	}