blob: 9a47dde8c6d67afcde5d9fcd1b26909ace0f5aed [file] [log] [blame]
wkj7cf289c2004-04-06 19:06:52 +00001#include <u.h>
2#include <libc.h>
3#include <draw.h>
4#include <ctype.h>
5#include <html.h>
6#include "impl.h"
7
8typedef struct TokenSource TokenSource;
9struct TokenSource
10{
11 int i; // index of next byte to use
12 uchar* data; // all the data
13 int edata; // data[0:edata] is valid
14 int chset; // one of US_Ascii, etc.
15 int mtype; // TextHtml or TextPlain
16};
17
// Negative sentinel codes; neither can be a valid character value.
enum {
	EOB = -1,
	EOF = -2
};
22
// True when c is a letter, a digit, '-' or '.', and its code is below 256.
// The argument is evaluated several times, so it must be free of
// side effects.
#define ISNAMCHAR(c) ((c)<256 && (isalpha(c) || isdigit(c) || (c) == '-' || (c) == '.'))

// Element counts of the scratch buffers used while lexing.
#define SMALLBUFSIZE 240
#define BIGBUFSIZE 2000
27
28// HTML 4.0 tag names.
29// Keep sorted, and in correspondence with enum in iparse.h.
30Rune **tagnames;
31char *_tagnames[] = {
32 " ",
33 "!",
34 "a",
35 "abbr",
36 "acronym",
37 "address",
38 "applet",
39 "area",
40 "b",
41 "base",
42 "basefont",
43 "bdo",
44 "big",
45 "blink",
46 "blockquote",
47 "body",
48 "bq",
49 "br",
50 "button",
51 "caption",
52 "center",
53 "cite",
54 "code",
55 "col",
56 "colgroup",
57 "dd",
58 "del",
59 "dfn",
60 "dir",
61 "div",
62 "dl",
63 "dt",
64 "em",
65 "fieldset",
66 "font",
67 "form",
68 "frame",
69 "frameset",
70 "h1",
71 "h2",
72 "h3",
73 "h4",
74 "h5",
75 "h6",
76 "head",
77 "hr",
78 "html",
79 "i",
80 "iframe",
81 "img",
82 "input",
83 "ins",
84 "isindex",
85 "kbd",
86 "label",
87 "legend",
88 "li",
89 "link",
90 "map",
91 "menu",
92 "meta",
93 "nobr",
94 "noframes",
95 "noscript",
96 "object",
97 "ol",
98 "optgroup",
99 "option",
100 "p",
101 "param",
102 "pre",
103 "q",
104 "s",
105 "samp",
106 "script",
107 "select",
108 "small",
109 "span",
110 "strike",
111 "strong",
112 "style",
113 "sub",
114 "sup",
115 "table",
116 "tbody",
117 "td",
118 "textarea",
119 "tfoot",
120 "th",
121 "thead",
122 "title",
123 "tr",
124 "tt",
125 "u",
126 "ul",
127 "var"
128};
129
130// HTML 4.0 attribute names.
131// Keep sorted, and in correspondence with enum in i.h.
132Rune **attrnames;
133char* _attrnames[] = {
134 "abbr",
135 "accept-charset",
136 "access-key",
137 "action",
138 "align",
139 "alink",
140 "alt",
141 "archive",
142 "axis",
143 "background",
144 "bgcolor",
145 "border",
146 "cellpadding",
147 "cellspacing",
148 "char",
149 "charoff",
150 "charset",
151 "checked",
152 "cite",
153 "class",
154 "classid",
155 "clear",
156 "code",
157 "codebase",
158 "codetype",
159 "color",
160 "cols",
161 "colspan",
162 "compact",
163 "content",
164 "coords",
165 "data",
166 "datetime",
167 "declare",
168 "defer",
169 "dir",
170 "disabled",
171 "enctype",
172 "face",
173 "for",
174 "frame",
175 "frameborder",
176 "headers",
177 "height",
178 "href",
179 "hreflang",
180 "hspace",
181 "http-equiv",
182 "id",
183 "ismap",
184 "label",
185 "lang",
186 "link",
187 "longdesc",
188 "marginheight",
189 "marginwidth",
190 "maxlength",
191 "media",
192 "method",
193 "multiple",
194 "name",
195 "nohref",
196 "noresize",
197 "noshade",
198 "nowrap",
199 "object",
200 "onblur",
201 "onchange",
202 "onclick",
203 "ondblclick",
204 "onfocus",
205 "onkeypress",
206 "onkeyup",
207 "onload",
208 "onmousedown",
209 "onmousemove",
210 "onmouseout",
211 "onmouseover",
212 "onmouseup",
213 "onreset",
214 "onselect",
215 "onsubmit",
216 "onunload",
217 "profile",
218 "prompt",
219 "readonly",
220 "rel",
221 "rev",
222 "rows",
223 "rowspan",
224 "rules",
225 "scheme",
226 "scope",
227 "scrolling",
228 "selected",
229 "shape",
230 "size",
231 "span",
232 "src",
233 "standby",
234 "start",
235 "style",
236 "summary",
237 "tabindex",
238 "target",
239 "text",
240 "title",
241 "type",
242 "usemap",
243 "valign",
244 "value",
245 "valuetype",
246 "version",
247 "vlink",
248 "vspace",
249 "width"
250};
251
252
253// Character entity to unicode character number map.
254// Keep sorted by name.
255StringInt *chartab;
256AsciiInt _chartab[142] = {
257 {"AElig", 198},
258 {"Aacute", 193},
259 {"Acirc", 194},
260 {"Agrave", 192},
261 {"Aring", 197},
262 {"Atilde", 195},
263 {"Auml", 196},
264 {"Ccedil", 199},
265 {"ETH", 208},
266 {"Eacute", 201},
267 {"Ecirc", 202},
268 {"Egrave", 200},
269 {"Euml", 203},
270 {"Iacute", 205},
271 {"Icirc", 206},
272 {"Igrave", 204},
273 {"Iuml", 207},
274 {"Ntilde", 209},
275 {"Oacute", 211},
276 {"Ocirc", 212},
277 {"Ograve", 210},
278 {"Oslash", 216},
279 {"Otilde", 213},
280 {"Ouml", 214},
281 {"THORN", 222},
282 {"Uacute", 218},
283 {"Ucirc", 219},
284 {"Ugrave", 217},
285 {"Uuml", 220},
286 {"Yacute", 221},
287 {"aacute", 225},
288 {"acirc", 226},
289 {"acute", 180},
290 {"aelig", 230},
291 {"agrave", 224},
292 {"alpha", 945},
293 {"amp", 38},
294 {"aring", 229},
295 {"atilde", 227},
296 {"auml", 228},
297 {"beta", 946},
298 {"brvbar", 166},
299 {"ccedil", 231},
300 {"cdots", 8943},
301 {"cedil", 184},
302 {"cent", 162},
303 {"chi", 967},
304 {"copy", 169},
305 {"curren", 164},
306 {"ddots", 8945},
307 {"deg", 176},
308 {"delta", 948},
309 {"divide", 247},
310 {"eacute", 233},
311 {"ecirc", 234},
312 {"egrave", 232},
313 {"emdash", 8212},
314 {"emsp", 8195},
315 {"endash", 8211},
316 {"ensp", 8194},
317 {"epsilon", 949},
318 {"eta", 951},
319 {"eth", 240},
320 {"euml", 235},
321 {"frac12", 189},
322 {"frac14", 188},
323 {"frac34", 190},
324 {"gamma", 947},
325 {"gt", 62},
326 {"iacute", 237},
327 {"icirc", 238},
328 {"iexcl", 161},
329 {"igrave", 236},
330 {"iota", 953},
331 {"iquest", 191},
332 {"iuml", 239},
333 {"kappa", 954},
334 {"lambda", 955},
335 {"laquo", 171},
336 {"ldots", 8230},
337 {"lt", 60},
338 {"macr", 175},
339 {"micro", 181},
340 {"middot", 183},
341 {"mu", 956},
342 {"nbsp", 160},
343 {"not", 172},
344 {"ntilde", 241},
345 {"nu", 957},
346 {"oacute", 243},
347 {"ocirc", 244},
348 {"ograve", 242},
349 {"omega", 969},
350 {"omicron", 959},
351 {"ordf", 170},
352 {"ordm", 186},
353 {"oslash", 248},
354 {"otilde", 245},
355 {"ouml", 246},
356 {"para", 182},
357 {"phi", 966},
358 {"pi", 960},
359 {"plusmn", 177},
360 {"pound", 163},
361 {"psi", 968},
362 {"quad", 8193},
363 {"quot", 34},
364 {"raquo", 187},
365 {"reg", 174},
366 {"rho", 961},
367 {"sect", 167},
368 {"shy", 173},
369 {"sigma", 963},
370 {"sp", 8194},
371 {"sup1", 185},
372 {"sup2", 178},
373 {"sup3", 179},
374 {"szlig", 223},
375 {"tau", 964},
376 {"theta", 952},
377 {"thinsp", 8201},
378 {"thorn", 254},
379 {"times", 215},
380 {"trade", 8482},
381 {"uacute", 250},
382 {"ucirc", 251},
383 {"ugrave", 249},
384 {"uml", 168},
385 {"upsilon", 965},
386 {"uuml", 252},
387 {"varepsilon", 8712},
388 {"varphi", 981},
389 {"varpi", 982},
390 {"varrho", 1009},
391 {"vdots", 8942},
392 {"vsigma", 962},
393 {"vtheta", 977},
394 {"xi", 958},
395 {"yacute", 253},
396 {"yen", 165},
397 {"yuml", 255},
398 {"zeta", 950}
399};
// Number of entries in the entity table.  It has to be measured on the
// _chartab array: chartab is only a pointer, so sizeof(chartab) is the
// size of a pointer, not of the table.
#define NCHARTAB (sizeof(_chartab)/sizeof(_chartab[0]))
401
// Codes Winstart..Winend are the ones where Windows interpolates
// its own characters into the Latin1 set.
// They aren't supposed to appear in HTML, but they do....
enum {
	Winstart = 127,
	Winend = 159
};

// Replacement code point for each code in Winstart..Winend,
// indexed by (code - Winstart).  8226 is a bullet.
static int winchars[] = {
	8226,						// 127
	8226, 8226, 8218, 402, 8222, 8230, 8224, 8225,	// 128..135
	710, 8240, 352, 8249, 338, 8226, 8226, 8226,	// 136..143
	8226, 8216, 8217, 8220, 8221, 8226, 8211, 8212,	// 144..151
	732, 8482, 353, 8250, 339, 8226, 8226, 376	// 152..159
};
415
416static StringInt* tagtable; // initialized from tagnames
417static StringInt* attrtable; // initialized from attrnames
418
419static void lexinit();
420static int getplaindata(TokenSource* ts, Token* a, int* pai);
421static int getdata(TokenSource* ts, int firstc, int starti, Token* a, int* pai);
422static int getscriptdata(TokenSource* ts, int firstc, int starti, Token* a, int* pai);
423static int gettag(TokenSource* ts, int starti, Token* a, int* pai);
424static Rune* buftostr(Rune* s, Rune* buf, int j);
425static int comment(TokenSource* ts);
426static int findstr(TokenSource* ts, Rune* s);
427static int ampersand(TokenSource* ts);
428//static int lowerc(int c);
429static int getchar(TokenSource* ts);
430static void ungetchar(TokenSource* ts, int c);
431static void backup(TokenSource* ts, int savei);
432//static void freeinsidetoken(Token* t);
433static void freeattrs(Attr* ahead);
434static Attr* newattr(int attid, Rune* value, Attr* link);
435static int Tconv(Fmt* f);
436
437int dbglex = 0;
438static int lexinited = 0;
439
440static void
441lexinit(void)
442{
rsc7e195612005-01-04 22:20:21 +0000443 chartab = _cvtstringinttab(_chartab, nelem(_chartab));
444 tagnames = _cvtstringtab(_tagnames, nelem(_tagnames));
wkj7cf289c2004-04-06 19:06:52 +0000445 tagtable = _makestrinttab(tagnames, Numtags);
rsc7e195612005-01-04 22:20:21 +0000446 attrnames = _cvtstringtab(_attrnames, nelem(_attrnames));
wkj7cf289c2004-04-06 19:06:52 +0000447 attrtable = _makestrinttab(attrnames, Numattrs);
448 fmtinstall('T', Tconv);
449 lexinited = 1;
450}
451
452static TokenSource*
453newtokensource(uchar* data, int edata, int chset, int mtype)
454{
455 TokenSource* ans;
456
457 assert(chset == US_Ascii || chset == ISO_8859_1 ||
458 chset == UTF_8 || chset == Unicode);
459 ans = (TokenSource*)emalloc(sizeof(TokenSource));
460 ans->i = 0;
461 ans->data = data;
462 ans->edata = edata;
463 ans->chset = chset;
464 ans->mtype = mtype;
465 return ans;
466}
467
468enum {
469 ToksChunk = 500
470};
471
472// Call this to get the tokens.
473// The number of returned tokens is returned in *plen.
474Token*
475_gettoks(uchar* data, int datalen, int chset, int mtype, int* plen)
476{
477 TokenSource* ts;
478 Token* a;
479 int alen;
480 int ai;
481 int starti;
482 int c;
483 int tag;
484
485 if(!lexinited)
486 lexinit();
487 ts = newtokensource(data, datalen, chset, mtype);
488 alen = ToksChunk;
489 a = (Token*)emalloc(alen * sizeof(Token));
490 ai = 0;
491 if(dbglex)
492 fprint(2, "_gettoks starts, ts.i=%d, ts.edata=%d\n", ts->i, ts->edata);
493 if(ts->mtype == TextHtml) {
494 for(;;) {
495 if(ai == alen) {
496 a = (Token*)erealloc(a, (alen+ToksChunk)*sizeof(Token));
497 alen += ToksChunk;
498 }
499 starti = ts->i;
500 c = getchar(ts);
501 if(c < 0)
502 break;
503 if(c == '<') {
504 tag = gettag(ts, starti, a, &ai);
505 if(tag == Tscript) {
506 // special rules for getting Data after....
507 starti = ts->i;
508 c = getchar(ts);
509 tag = getscriptdata(ts, c, starti, a, &ai);
510 }
511 }
512 else
513 tag = getdata(ts, c, starti, a, &ai);
514 if(tag == -1)
515 break;
516 else if(dbglex > 1 && tag != Comment)
517 fprint(2, "lex: got token %T\n", &a[ai-1]);
518 }
519 }
520 else {
521 // plain text (non-html) tokens
522 for(;;) {
523 if(ai == alen) {
524 a = (Token*)erealloc(a, (alen+ToksChunk)*sizeof(Token));
525 alen += ToksChunk;
526 }
527 tag = getplaindata(ts, a, &ai);
528 if(tag == -1)
529 break;
530 if(dbglex > 1)
531 fprint(2, "lex: got token %T\n", &a[ai]);
532 }
533 }
534 if(dbglex)
535 fprint(2, "lex: returning %d tokens\n", ai);
536 *plen = ai;
537 if(ai == 0)
538 return nil;
539 return a;
540}
541
542// For case where source isn't HTML.
543// Just make data tokens, one per line (or partial line,
544// at end of buffer), ignoring non-whitespace control
545// characters and dumping \r's.
546// If find non-empty token, fill in a[*pai], bump *pai, and return Data.
547// Otherwise return -1;
548static int
549getplaindata(TokenSource* ts, Token* a, int* pai)
550{
551 Rune* s;
552 int j;
553 int starti;
554 int c;
555 Token* tok;
556 Rune buf[BIGBUFSIZE];
557
558 s = nil;
559 j = 0;
560 starti = ts->i;
561 for(c = getchar(ts); c >= 0; c = getchar(ts)) {
562 if(c < ' ') {
563 if(isspace(c)) {
564 if(c == '\r') {
565 // ignore it unless no following '\n',
566 // in which case treat it like '\n'
567 c = getchar(ts);
568 if(c != '\n') {
569 if(c >= 0)
570 ungetchar(ts, c);
571 c = '\n';
572 }
573 }
574 }
575 else
576 c = 0;
577 }
578 if(c != 0) {
579 buf[j++] = c;
580 if(j == sizeof(buf)-1) {
581 s = buftostr(s, buf, j);
582 j = 0;
583 }
584 }
585 if(c == '\n')
586 break;
587 }
588 s = buftostr(s, buf, j);
589 if(s == nil)
590 return -1;
591 tok = &a[(*pai)++];
592 tok->tag = Data;
593 tok->text = s;
594 tok->attr = nil;
595 tok->starti = starti;
596 return Data;
597}
598
599// Return concatenation of s and buf[0:j]
600static Rune*
601buftostr(Rune* s, Rune* buf, int j)
602{
603 buf[j] = 0;
604 if(s == nil)
605 s = _Strndup(buf, j);
606 else
607 s = _Strdup2(s, buf);
608 return s;
609}
610
611// Gather data up to next start-of-tag or end-of-buffer.
612// Translate entity references (&amp;).
613// Ignore non-whitespace control characters and get rid of \r's.
614// If find non-empty token, fill in a[*pai], bump *pai, and return Data.
615// Otherwise return -1;
616static int
617getdata(TokenSource* ts, int firstc, int starti, Token* a, int* pai)
618{
619 Rune* s;
620 int j;
621 int c;
622 Token* tok;
623 Rune buf[BIGBUFSIZE];
624
625 s = nil;
626 j = 0;
627 c = firstc;
628 while(c >= 0) {
629 if(c == '&') {
630 c = ampersand(ts);
631 if(c < 0)
632 break;
633 }
634 else if(c < ' ') {
635 if(isspace(c)) {
636 if(c == '\r') {
637 // ignore it unless no following '\n',
638 // in which case treat it like '\n'
639 c = getchar(ts);
640 if(c != '\n') {
641 if(c >= 0)
642 ungetchar(ts, c);
643 c = '\n';
644 }
645 }
646 }
647 else {
648 if(warn)
649 fprint(2, "warning: non-whitespace control character %d ignored\n", c);
650 c = 0;
651 }
652 }
653 else if(c == '<') {
654 ungetchar(ts, c);
655 break;
656 }
657 if(c != 0) {
658 buf[j++] = c;
659 if(j == BIGBUFSIZE-1) {
660 s = buftostr(s, buf, j);
661 j = 0;
662 }
663 }
664 c = getchar(ts);
665 }
666 s = buftostr(s, buf, j);
667 if(s == nil)
668 return -1;
669 tok = &a[(*pai)++];
670 tok->tag = Data;
671 tok->text = s;
672 tok->attr = nil;
673 tok->starti = starti;
674 return Data;
675}
676
677// The rules for lexing scripts are different (ugh).
678// Gather up everything until see a </SCRIPT>.
679static int
680getscriptdata(TokenSource* ts, int firstc, int starti, Token* a, int* pai)
681{
682 Rune* s;
683 int j;
684 int tstarti;
685 int savei;
686 int c;
687 int tag;
688 int done;
689 Token* tok;
690 Rune buf[BIGBUFSIZE];
691
692 s = nil;
693 j = 0;
694 tstarti = starti;
695 c = firstc;
696 done = 0;
697 while(c >= 0) {
698 if(c == '<') {
699 // other browsers ignore stuff to end of line after <!
700 savei = ts->i;
701 c = getchar(ts);
702 if(c == '!') {
703 while(c >= 0 && c != '\n' && c != '\r')
704 c = getchar(ts);
705 if(c == '\r')
706 c = getchar(ts);
707 if(c == '\n')
708 c = getchar(ts);
709 }
710 else if(c >= 0) {
711 backup(ts, savei);
712 tag = gettag(ts, tstarti, a, pai);
713 if(tag == -1)
714 break;
715 if(tag != Comment)
716 (*pai)--;
717 backup(ts, tstarti);
718 if(tag == Tscript + RBRA) {
719 done = 1;
720 break;
721 }
722 // here tag was not </SCRIPT>, so take as regular data
723 c = getchar(ts);
724 }
725 }
726 if(c < 0)
727 break;
728 if(c != 0) {
729 buf[j++] = c;
730 if(j == BIGBUFSIZE-1) {
731 s = buftostr(s, buf, j);
732 j = 0;
733 }
734 }
735 tstarti = ts->i;
736 c = getchar(ts);
737 }
738 if(done || ts->i == ts->edata) {
739 s = buftostr(s, buf, j);
740 tok = &a[(*pai)++];
741 tok->tag = Data;
742 tok->text = s;
743 tok->attr = nil;
744 tok->starti = starti;
745 return Data;
746 }
747 backup(ts, starti);
748 return -1;
749}
750
751// We've just seen a '<'. Gather up stuff to closing '>' (if buffer
752// ends before then, return -1).
753// If it's a tag, look up the name, gather the attributes, and return
754// the appropriate token.
755// Else it's either just plain data or some kind of ignorable stuff:
756// return Data or Comment as appropriate.
757// If it's not a Comment, put it in a[*pai] and bump *pai.
758static int
759gettag(TokenSource* ts, int starti, Token* a, int* pai)
760{
761 int rbra;
762 int ans;
763 Attr* al;
764 int nexti;
765 int c;
766 int ti;
767 int afnd;
768 int attid;
769 int quote;
770 Rune* val;
771 int nv;
772 int i;
773 int tag;
774 Token* tok;
775 Rune buf[BIGBUFSIZE];
776
777 rbra = 0;
778 nexti = ts->i;
779 tok = &a[*pai];
780 tok->tag = Notfound;
781 tok->text = nil;
782 tok->attr = nil;
783 tok->starti = starti;
784 c = getchar(ts);
785 if(c == '/') {
786 rbra = RBRA;
787 c = getchar(ts);
788 }
789 if(c < 0)
790 goto eob_done;
791 if(c >= 256 || !isalpha(c)) {
792 // not a tag
793 if(c == '!') {
794 ans = comment(ts);
795 if(ans != -1)
796 return ans;
797 goto eob_done;
798 }
799 else {
800 backup(ts, nexti);
801 tok->tag = Data;
802 tok->text = _Strdup(L(Llt));
803 (*pai)++;
804 return Data;
805 }
806 }
807 // c starts a tagname
808 buf[0] = c;
809 i = 1;
810 while(1) {
811 c = getchar(ts);
812 if(c < 0)
813 goto eob_done;
814 if(!ISNAMCHAR(c))
815 break;
816 // if name is bigger than buf it won't be found anyway...
817 if(i < BIGBUFSIZE)
818 buf[i++] = c;
819 }
820 if(_lookup(tagtable, Numtags, buf, i, &tag))
821 tok->tag = tag + rbra;
822 else
823 tok->text = _Strndup(buf, i); // for warning print, in build
824
825 // attribute gathering loop
826 al = nil;
827 while(1) {
828 // look for "ws name" or "ws name ws = ws val" (ws=whitespace)
829 // skip whitespace
830attrloop_continue:
831 while(c < 256 && isspace(c)) {
832 c = getchar(ts);
833 if(c < 0)
834 goto eob_done;
835 }
836 if(c == '>')
837 goto attrloop_done;
838 if(c == '<') {
839 if(warn)
840 fprint(2, "warning: unclosed tag\n");
841 ungetchar(ts, c);
842 goto attrloop_done;
843 }
844 if(c >= 256 || !isalpha(c)) {
845 if(warn)
846 fprint(2, "warning: expected attribute name\n");
847 // skipt to next attribute name
848 while(1) {
849 c = getchar(ts);
850 if(c < 0)
851 goto eob_done;
852 if(c < 256 && isalpha(c))
853 goto attrloop_continue;
854 if(c == '<') {
855 if(warn)
856 fprint(2, "warning: unclosed tag\n");
857 ungetchar(ts, 60);
858 goto attrloop_done;
859 }
860 if(c == '>')
861 goto attrloop_done;
862 }
863 }
864 // gather attribute name
865 buf[0] = c;
866 i = 1;
867 while(1) {
868 c = getchar(ts);
869 if(c < 0)
870 goto eob_done;
871 if(!ISNAMCHAR(c))
872 break;
873 if(i < BIGBUFSIZE-1)
874 buf[i++] = c;
875 }
876 afnd = _lookup(attrtable, Numattrs, buf, i, &attid);
877 if(warn && !afnd) {
878 buf[i] = 0;
879 fprint(2, "warning: unknown attribute name %S\n", buf);
880 }
881 // skip whitespace
882 while(c < 256 && isspace(c)) {
883 c = getchar(ts);
884 if(c < 0)
885 goto eob_done;
886 }
887 if(c != '=') {
888 if(afnd)
889 al = newattr(attid, nil, al);
890 goto attrloop_continue;
891 }
892 //# c is '=' here; skip whitespace
893 while(1) {
894 c = getchar(ts);
895 if(c < 0)
896 goto eob_done;
897 if(c >= 256 || !isspace(c))
898 break;
899 }
900 quote = 0;
901 if(c == '\'' || c == '"') {
902 quote = c;
903 c = getchar(ts);
904 if(c < 0)
905 goto eob_done;
906 }
907 val = nil;
908 nv = 0;
909 while(1) {
910valloop_continue:
911 if(c < 0)
912 goto eob_done;
913 if(c == '>') {
914 if(quote) {
915 // c might be part of string (though not good style)
916 // but if line ends before close quote, assume
917 // there was an unmatched quote
918 ti = ts->i;
919 while(1) {
920 c = getchar(ts);
921 if(c < 0)
922 goto eob_done;
923 if(c == quote) {
924 backup(ts, ti);
925 buf[nv++] = '>';
926 if(nv == BIGBUFSIZE-1) {
927 val = buftostr(val, buf, nv);
928 nv = 0;
929 }
930 c = getchar(ts);
931 goto valloop_continue;
932 }
933 if(c == '\n') {
934 if(warn)
935 fprint(2, "warning: apparent unmatched quote\n");
936 backup(ts, ti);
937 c = '>';
938 goto valloop_done;
939 }
940 }
941 }
942 else
943 goto valloop_done;
944 }
945 if(quote) {
946 if(c == quote) {
947 c = getchar(ts);
948 if(c < 0)
949 goto eob_done;
950 goto valloop_done;
951 }
952 if(c == '\r') {
953 c = getchar(ts);
954 goto valloop_continue;
955 }
956 if(c == '\t' || c == '\n')
957 c = ' ';
958 }
959 else {
960 if(c < 256 && isspace(c))
961 goto valloop_done;
962 }
963 if(c == '&') {
964 c = ampersand(ts);
965 if(c == -1)
966 goto eob_done;
967 }
968 buf[nv++] = c;
969 if(nv == BIGBUFSIZE-1) {
970 val = buftostr(val, buf, nv);
971 nv = 0;
972 }
973 c = getchar(ts);
974 }
975valloop_done:
976 if(afnd) {
977 val = buftostr(val, buf, nv);
978 al = newattr(attid, val, al);
979 }
980 }
981
982attrloop_done:
983 tok->attr = al;
984 (*pai)++;
985 return tok->tag;
986
987eob_done:
988 if(warn)
989 fprint(2, "warning: incomplete tag at end of page\n");
990 backup(ts, nexti);
991 tok->tag = Data;
992 tok->text = _Strdup(L(Llt));
993 return Data;
994}
995
996// We've just read a '<!' at position starti,
997// so this may be a comment or other ignored section, or it may
998// be just a literal string if there is no close before end of file
999// (other browsers do that).
1000// The accepted practice seems to be (note: contrary to SGML spec!):
1001// If see <!--, look for --> to close, or if none, > to close.
1002// If see <!(not --), look for > to close.
1003// If no close before end of file, leave original characters in as literal data.
1004//
1005// If we see ignorable stuff, return Comment.
1006// Else return nil (caller should back up and try again when more data arrives,
1007// unless at end of file, in which case caller should just make '<' a data token).
1008static int
1009comment(TokenSource* ts)
1010{
1011 int nexti;
1012 int havecomment;
1013 int c;
1014
1015 nexti = ts->i;
1016 havecomment = 0;
1017 c = getchar(ts);
1018 if(c == '-') {
1019 c = getchar(ts);
1020 if(c == '-') {
1021 if(findstr(ts, L(Larrow)))
1022 havecomment = 1;
1023 else
1024 backup(ts, nexti);
1025 }
1026 }
1027 if(!havecomment) {
1028 if(c == '>')
1029 havecomment = 1;
1030 else if(c >= 0) {
1031 if(findstr(ts, L(Lgt)))
1032 havecomment = 1;
1033 }
1034 }
1035 if(havecomment)
1036 return Comment;
1037 return -1;
1038}
1039
1040// Look for string s in token source.
1041// If found, return 1, with buffer at next char after s,
1042// else return 0 (caller should back up).
1043static int
1044findstr(TokenSource* ts, Rune* s)
1045{
1046 int c0;
1047 int n;
1048 int nexti;
1049 int i;
1050 int c;
1051
1052 c0 = s[0];
1053 n = runestrlen(s);
1054 while(1) {
1055 c = getchar(ts);
1056 if(c < 0)
1057 break;
1058 if(c == c0) {
1059 if(n == 1)
1060 return 1;
1061 nexti = ts->i;
1062 for(i = 1; i < n; i++) {
1063 c = getchar(ts);
1064 if(c < 0)
1065 goto mainloop_done;
1066 if(c != s[i])
1067 break;
1068 }
1069 if(i == n)
1070 return 1;
1071 backup(ts, nexti);
1072 }
1073 }
1074mainloop_done:
1075 return 0;
1076}
1077
1078// We've just read an '&'; look for an entity reference
1079// name, and if found, return translated char.
1080// if there is a complete entity name but it isn't known,
1081// try prefixes (gets around some buggy HTML out there),
1082// and if that fails, back up to just past the '&' and return '&'.
1083// If the entity can't be completed in the current buffer, back up
1084// to the '&' and return -1.
1085static int
1086ampersand(TokenSource* ts)
1087{
1088 int savei;
1089 int c;
1090 int fnd;
1091 int ans;
1092 int v;
1093 int i;
1094 int k;
1095 Rune buf[SMALLBUFSIZE];
1096
1097 savei = ts->i;
1098 c = getchar(ts);
1099 fnd = 0;
1100 ans = -1;
1101 if(c == '#') {
1102 c = getchar(ts);
1103 v = 0;
1104 while(c >= 0) {
1105 if(!(c < 256 && isdigit(c)))
1106 break;
1107 v = v*10 + c - 48;
1108 c = getchar(ts);
1109 }
1110 if(c >= 0) {
1111 if(!(c == ';' || c == '\n' || c == '\r'))
1112 ungetchar(ts, c);
1113 c = v;
1114 if(c == 160)
1115 c = 160;
1116 if(c >= Winstart && c <= Winend) {
1117 c = winchars[c - Winstart];
1118 }
1119 ans = c;
1120 fnd = 1;
1121 }
1122 }
1123 else if(c < 256 && isalpha(c)) {
1124 buf[0] = c;
1125 k = 1;
1126 while(1) {
1127 c = getchar(ts);
1128 if(c < 0)
1129 break;
1130 if(ISNAMCHAR(c)) {
1131 if(k < SMALLBUFSIZE-1)
1132 buf[k++] = c;
1133 }
1134 else {
1135 if(!(c == ';' || c == '\n' || c == '\r'))
1136 ungetchar(ts, c);
1137 break;
1138 }
1139 }
1140 if(c >= 0) {
1141 fnd = _lookup(chartab, NCHARTAB, buf, k, &ans);
1142 if(!fnd) {
1143 // Try prefixes of s
1144 if(c == ';' || c == '\n' || c == '\r')
1145 ungetchar(ts, c);
1146 i = k;
1147 while(--k > 0) {
1148 fnd = _lookup(chartab, NCHARTAB, buf, k, &ans);
1149 if(fnd) {
1150 while(i > k) {
1151 i--;
1152 ungetchar(ts, buf[i]);
1153 }
1154 break;
1155 }
1156 }
1157 }
1158 }
1159 }
1160 if(!fnd) {
1161 backup(ts, savei);
1162 ans = '&';
1163 }
1164 return ans;
1165}
1166
1167// Get next char, obeying ts.chset.
1168// Returns -1 if no complete character left before current end of data.
1169static int
1170getchar(TokenSource* ts)
1171{
1172 uchar* buf;
1173 int c;
1174 int n;
1175 int ok;
1176 Rune r;
1177
1178 if(ts->i >= ts->edata)
1179 return -1;
1180 buf = ts->data;
1181 c = buf[ts->i];
1182 switch(ts->chset) {
1183 case ISO_8859_1:
1184 if(c >= Winstart && c <= Winend)
1185 c = winchars[c - Winstart];
1186 ts->i++;
1187 break;
1188 case US_Ascii:
1189 if(c > 127) {
1190 if(warn)
1191 fprint(2, "non-ascii char (%x) when US-ASCII specified\n", c);
1192 }
1193 ts->i++;
1194 break;
1195 case UTF_8:
1196 ok = fullrune((char*)(buf+ts->i), ts->edata-ts->i);
1197 n = chartorune(&r, (char*)(buf+ts->i));
1198 if(ok) {
1199 if(warn && c == 0x80)
1200 fprint(2, "warning: invalid utf-8 sequence (starts with %x)\n", ts->data[ts->i]);
1201 ts->i += n;
1202 c = r;
1203 }
1204 else {
1205 // not enough bytes in buf to complete utf-8 char
1206 ts->i = ts->edata; // mark "all used"
1207 c = -1;
1208 }
1209 break;
1210 case Unicode:
1211 if(ts->i < ts->edata - 1) {
1212 //standards say most-significant byte first
1213 c = (c << 8)|(buf[ts->i + 1]);
1214 ts->i += 2;
1215 }
1216 else {
1217 ts->i = ts->edata; // mark "all used"
1218 c = -1;
1219 }
1220 break;
1221 }
1222 return c;
1223}
1224
1225// Assuming c was the last character returned by getchar, set
1226// things up so that next getchar will get that same character
1227// followed by the current 'next character', etc.
1228static void
1229ungetchar(TokenSource* ts, int c)
1230{
1231 int n;
1232 Rune r;
1233 char a[UTFmax];
1234
1235 n = 1;
1236 switch(ts->chset) {
1237 case UTF_8:
1238 if(c >= 128) {
1239 r = c;
1240 n = runetochar(a, &r);
1241 }
1242 break;
1243 case Unicode:
1244 n = 2;
1245 break;
1246 }
1247 ts->i -= n;
1248}
1249
1250// Restore ts so that it is at the state where the index was savei.
1251static void
1252backup(TokenSource* ts, int savei)
1253{
1254 if(dbglex)
1255 fprint(2, "lex: backup; i=%d, savei=%d\n", ts->i, savei);
1256 ts->i = savei;
1257}
1258
1259
1260// Look for value associated with attribute attid in token t.
1261// If there is one, return 1 and put the value in *pans,
1262// else return 0.
1263// If xfer is true, transfer ownership of the string to the caller
1264// (nil it out here); otherwise, caller must duplicate the answer
1265// if it needs to save it.
1266// OK to have pans==0, in which case this is just looking
1267// to see if token is present.
1268int
1269_tokaval(Token* t, int attid, Rune** pans, int xfer)
1270{
1271 Attr* attr;
1272
1273 attr = t->attr;
1274 while(attr != nil) {
1275 if(attr->attid == attid) {
1276 if(pans != nil)
1277 *pans = attr->value;
1278 if(xfer)
1279 attr->value = nil;
1280 return 1;
1281 }
1282 attr = attr->next;
1283 }
1284 if(pans != nil)
1285 *pans = nil;
1286 return 0;
1287}
1288
1289static int
1290Tconv(Fmt *f)
1291{
1292 Token* t;
1293 int i;
1294 int tag;
1295 char* srbra;
1296 Rune* aname;
1297 Rune* tname;
1298 Attr* a;
1299 char buf[BIGBUFSIZE];
1300
1301 t = va_arg(f->args, Token*);
1302 if(t == nil)
1303 sprint(buf, "<null>");
1304 else {
1305 i = 0;
1306 if(dbglex > 1)
1307 i = snprint(buf, sizeof(buf), "[%d]", t->starti);
1308 tag = t->tag;
1309 if(tag == Data) {
1310 i += snprint(buf+i, sizeof(buf)-i-1, "'%S'", t->text);
1311 }
1312 else {
1313 srbra = "";
1314 if(tag >= RBRA) {
1315 tag -= RBRA;
1316 srbra = "/";
1317 }
1318 tname = tagnames[tag];
1319 if(tag == Notfound)
1320 tname = L(Lquestion);
1321 i += snprint(buf+i, sizeof(buf)-i-1, "<%s%S", srbra, tname);
1322 for(a = t->attr; a != nil; a = a->next) {
1323 aname = attrnames[a->attid];
1324 i += snprint(buf+i, sizeof(buf)-i-1, " %S", aname);
1325 if(a->value != nil)
1326 i += snprint(buf+i, sizeof(buf)-i-1, "=%S", a->value);
1327 }
1328 i += snprint(buf+i, sizeof(buf)-i-1, ">");
1329 }
1330 buf[i] = 0;
1331 }
1332 return fmtstrcpy(f, buf);
1333}
1334
1335// Attrs own their constituent strings, but build may eventually
1336// transfer some values to its items and nil them out in the Attr.
1337static Attr*
1338newattr(int attid, Rune* value, Attr* link)
1339{
1340 Attr* ans;
1341
1342 ans = (Attr*)emalloc(sizeof(Attr));
1343 ans->attid = attid;
1344 ans->value = value;
1345 ans->next = link;
1346 return ans;
1347}
1348
1349// Free list of Attrs linked through next field
1350static void
1351freeattrs(Attr* ahead)
1352{
1353 Attr* a;
1354 Attr* nexta;
1355
1356 a = ahead;
1357 while(a != nil) {
1358 nexta = a->next;
1359 free(a->value);
1360 free(a);
1361 a = nexta;
1362 }
1363}
1364
1365// Free array of Tokens.
1366// Allocated space might have room for more than n tokens,
1367// but only n of them are initialized.
1368// If caller has transferred ownership of constitutent strings
1369// or attributes, it must have nil'd out the pointers in the Tokens.
1370void
1371_freetokens(Token* tarray, int n)
1372{
1373 int i;
1374 Token* t;
1375
1376 if(tarray == nil)
1377 return;
1378 for(i = 0; i < n; i++) {
1379 t = &tarray[i];
1380 free(t->text);
1381 freeattrs(t->attr);
1382 }
1383 free(tarray);
1384}