| #include <u.h> |
| #include <libc.h> |
| #include <draw.h> |
| #include <ctype.h> |
| #include <html.h> |
| #include "impl.h" |
| |
| typedef struct TokenSource TokenSource; |
| struct TokenSource |
| { |
| int i; /* index of next byte to use */ |
| uchar* data; /* all the data */ |
| int edata; /* data[0:edata] is valid */ |
| int chset; /* one of US_Ascii, etc. */ |
| int mtype; /* TextHtml or TextPlain */ |
| }; |
| |
/* Negative sentinels, so neither can be mistaken for a character value. */
/* NOTE(review): presumably end-of-buffer / end-of-file markers; nothing */
/* in this part of the file reads them. */
enum
{
	EOB = -1,
	EOF = -2
};
| |
/* True when c can continue a tag, attribute or entity name: */
/* a letter, a digit, '-' or '.', restricted to codes below 256. */
/* The argument is evaluated more than once; pass a plain variable. */
#define ISNAMCHAR(c)	((c) < 256 && (isalnum(c) || (c) == '-' || (c) == '.'))

/* Element counts of the on-stack Rune scratch buffers used while lexing. */
#define SMALLBUFSIZE	240
#define BIGBUFSIZE	2000
| |
| /* HTML 4.0 tag names. */ |
| /* Keep sorted, and in correspondence with enum in iparse.h. */ |
| Rune **tagnames; |
| char *_tagnames[] = { |
| " ", |
| "!", |
| "a", |
| "abbr", |
| "acronym", |
| "address", |
| "applet", |
| "area", |
| "b", |
| "base", |
| "basefont", |
| "bdo", |
| "big", |
| "blink", |
| "blockquote", |
| "body", |
| "bq", |
| "br", |
| "button", |
| "caption", |
| "center", |
| "cite", |
| "code", |
| "col", |
| "colgroup", |
| "dd", |
| "del", |
| "dfn", |
| "dir", |
| "div", |
| "dl", |
| "dt", |
| "em", |
| "fieldset", |
| "font", |
| "form", |
| "frame", |
| "frameset", |
| "h1", |
| "h2", |
| "h3", |
| "h4", |
| "h5", |
| "h6", |
| "head", |
| "hr", |
| "html", |
| "i", |
| "iframe", |
| "img", |
| "input", |
| "ins", |
| "isindex", |
| "kbd", |
| "label", |
| "legend", |
| "li", |
| "link", |
| "map", |
| "menu", |
| "meta", |
| "nobr", |
| "noframes", |
| "noscript", |
| "object", |
| "ol", |
| "optgroup", |
| "option", |
| "p", |
| "param", |
| "pre", |
| "q", |
| "s", |
| "samp", |
| "script", |
| "select", |
| "small", |
| "span", |
| "strike", |
| "strong", |
| "style", |
| "sub", |
| "sup", |
| "table", |
| "tbody", |
| "td", |
| "textarea", |
| "tfoot", |
| "th", |
| "thead", |
| "title", |
| "tr", |
| "tt", |
| "u", |
| "ul", |
| "var" |
| }; |
| |
| /* HTML 4.0 attribute names. */ |
| /* Keep sorted, and in correspondence with enum in i.h. */ |
| Rune **attrnames; |
| char* _attrnames[] = { |
| "abbr", |
| "accept-charset", |
| "access-key", |
| "action", |
| "align", |
| "alink", |
| "alt", |
| "archive", |
| "axis", |
| "background", |
| "bgcolor", |
| "border", |
| "cellpadding", |
| "cellspacing", |
| "char", |
| "charoff", |
| "charset", |
| "checked", |
| "cite", |
| "class", |
| "classid", |
| "clear", |
| "code", |
| "codebase", |
| "codetype", |
| "color", |
| "cols", |
| "colspan", |
| "compact", |
| "content", |
| "coords", |
| "data", |
| "datetime", |
| "declare", |
| "defer", |
| "dir", |
| "disabled", |
| "enctype", |
| "face", |
| "for", |
| "frame", |
| "frameborder", |
| "headers", |
| "height", |
| "href", |
| "hreflang", |
| "hspace", |
| "http-equiv", |
| "id", |
| "ismap", |
| "label", |
| "lang", |
| "link", |
| "longdesc", |
| "marginheight", |
| "marginwidth", |
| "maxlength", |
| "media", |
| "method", |
| "multiple", |
| "name", |
| "nohref", |
| "noresize", |
| "noshade", |
| "nowrap", |
| "object", |
| "onblur", |
| "onchange", |
| "onclick", |
| "ondblclick", |
| "onfocus", |
| "onkeypress", |
| "onkeyup", |
| "onload", |
| "onmousedown", |
| "onmousemove", |
| "onmouseout", |
| "onmouseover", |
| "onmouseup", |
| "onreset", |
| "onselect", |
| "onsubmit", |
| "onunload", |
| "profile", |
| "prompt", |
| "readonly", |
| "rel", |
| "rev", |
| "rows", |
| "rowspan", |
| "rules", |
| "scheme", |
| "scope", |
| "scrolling", |
| "selected", |
| "shape", |
| "size", |
| "span", |
| "src", |
| "standby", |
| "start", |
| "style", |
| "summary", |
| "tabindex", |
| "target", |
| "text", |
| "title", |
| "type", |
| "usemap", |
| "valign", |
| "value", |
| "valuetype", |
| "version", |
| "vlink", |
| "vspace", |
| "width" |
| }; |
| |
| |
| /* Character entity to unicode character number map. */ |
| /* Keep sorted by name. */ |
| StringInt *chartab; |
| AsciiInt _chartab[] = { |
| {"AElig", 198}, |
| {"Aacute", 193}, |
| {"Acirc", 194}, |
| {"Agrave", 192}, |
| {"Aring", 197}, |
| {"Atilde", 195}, |
| {"Auml", 196}, |
| {"Ccedil", 199}, |
| {"ETH", 208}, |
| {"Eacute", 201}, |
| {"Ecirc", 202}, |
| {"Egrave", 200}, |
| {"Euml", 203}, |
| {"Iacute", 205}, |
| {"Icirc", 206}, |
| {"Igrave", 204}, |
| {"Iuml", 207}, |
| {"Ntilde", 209}, |
| {"Oacute", 211}, |
| {"Ocirc", 212}, |
| {"Ograve", 210}, |
| {"Oslash", 216}, |
| {"Otilde", 213}, |
| {"Ouml", 214}, |
| {"THORN", 222}, |
| {"Uacute", 218}, |
| {"Ucirc", 219}, |
| {"Ugrave", 217}, |
| {"Uuml", 220}, |
| {"Yacute", 221}, |
| {"aacute", 225}, |
| {"acirc", 226}, |
| {"acute", 180}, |
| {"aelig", 230}, |
| {"agrave", 224}, |
| {"alpha", 945}, |
| {"amp", 38}, |
| {"aring", 229}, |
| {"atilde", 227}, |
| {"auml", 228}, |
| {"beta", 946}, |
| {"brvbar", 166}, |
| {"ccedil", 231}, |
| {"cdots", 8943}, |
| {"cedil", 184}, |
| {"cent", 162}, |
| {"chi", 967}, |
| {"copy", 169}, |
| {"curren", 164}, |
| {"ddots", 8945}, |
| {"deg", 176}, |
| {"delta", 948}, |
| {"divide", 247}, |
| {"eacute", 233}, |
| {"ecirc", 234}, |
| {"egrave", 232}, |
| {"emdash", 8212}, /* non-standard but commonly used */ |
| {"emsp", 8195}, |
| {"endash", 8211}, /* non-standard but commonly used */ |
| {"ensp", 8194}, |
| {"epsilon", 949}, |
| {"eta", 951}, |
| {"eth", 240}, |
| {"euml", 235}, |
| {"frac12", 189}, |
| {"frac14", 188}, |
| {"frac34", 190}, |
| {"gamma", 947}, |
| {"gt", 62}, |
| {"iacute", 237}, |
| {"icirc", 238}, |
| {"iexcl", 161}, |
| {"igrave", 236}, |
| {"iota", 953}, |
| {"iquest", 191}, |
| {"iuml", 239}, |
| {"kappa", 954}, |
| {"lambda", 955}, |
| {"laquo", 171}, |
| {"ldquo", 8220}, |
| {"ldots", 8230}, |
| {"lsquo", 8216}, |
| {"lt", 60}, |
| {"macr", 175}, |
| {"mdash", 8212}, |
| {"micro", 181}, |
| {"middot", 183}, |
| {"mu", 956}, |
| {"nbsp", 160}, |
| {"ndash", 8211}, |
| {"not", 172}, |
| {"ntilde", 241}, |
| {"nu", 957}, |
| {"oacute", 243}, |
| {"ocirc", 244}, |
| {"ograve", 242}, |
| {"omega", 969}, |
| {"omicron", 959}, |
| {"ordf", 170}, |
| {"ordm", 186}, |
| {"oslash", 248}, |
| {"otilde", 245}, |
| {"ouml", 246}, |
| {"para", 182}, |
| {"phi", 966}, |
| {"pi", 960}, |
| {"plusmn", 177}, |
| {"pound", 163}, |
| {"psi", 968}, |
| {"quad", 8193}, |
| {"quot", 34}, |
| {"raquo", 187}, |
| {"rdquo", 8221}, |
| {"reg", 174}, |
| {"rho", 961}, |
| {"rsquo", 8217}, |
| {"sect", 167}, |
| {"shy", 173}, |
| {"sigma", 963}, |
| {"sp", 8194}, |
| {"sup1", 185}, |
| {"sup2", 178}, |
| {"sup3", 179}, |
| {"szlig", 223}, |
| {"tau", 964}, |
| {"theta", 952}, |
| {"thinsp", 8201}, |
| {"thorn", 254}, |
| {"times", 215}, |
| {"trade", 8482}, |
| {"uacute", 250}, |
| {"ucirc", 251}, |
| {"ugrave", 249}, |
| {"uml", 168}, |
| {"upsilon", 965}, |
| {"uuml", 252}, |
| {"varepsilon", 8712}, |
| {"varphi", 981}, |
| {"varpi", 982}, |
| {"varrho", 1009}, |
| {"vdots", 8942}, |
| {"vsigma", 962}, |
| {"vtheta", 977}, |
| {"xi", 958}, |
| {"yacute", 253}, |
| {"yen", 165}, |
| {"yuml", 255}, |
| {"zeta", 950} |
| }; |
| #define NCHARTAB (sizeof(_chartab)/sizeof(_chartab[0])) |
| |
/* Characters Winstart..Winend are those that Windows */
/* uses interpolated into the Latin1 set. */
/* They aren't supposed to appear in HTML, but they do.... */
enum {
	Winstart = 127,
	Winend = 159
};

/* Replacement code point for each byte in Winstart..Winend, indexed by */
/* (byte - Winstart).  Bytes with no assigned character map to 8226, a */
/* bullet.  0x80, 0x8E and 0x9E carry the euro sign, Z-caron and z-caron */
/* that code page 1252 assigns to them (they were bullets before). */
static int winchars[]= {
	8226,							/* 0x7F */
	8364, 8226, 8218, 402, 8222, 8230, 8224, 8225,		/* 0x80-0x87 */
	710, 8240, 352, 8249, 338, 8226, 381, 8226,		/* 0x88-0x8F */
	8226, 8216, 8217, 8220, 8221, 8226, 8211, 8212,		/* 0x90-0x97 */
	732, 8482, 353, 8250, 339, 8226, 382, 376		/* 0x98-0x9F */
};
| |
| static StringInt* tagtable; /* initialized from tagnames */ |
| static StringInt* attrtable; /* initialized from attrnames */ |
| |
| static void lexinit(void); |
| static int getplaindata(TokenSource* ts, Token* a, int* pai); |
| static int getdata(TokenSource* ts, int firstc, int starti, Token* a, int* pai); |
| static int getscriptdata(TokenSource* ts, int firstc, int starti, Token* a, int* pai); |
| static int gettag(TokenSource* ts, int starti, Token* a, int* pai); |
| static Rune* buftostr(Rune* s, Rune* buf, int j); |
| static int comment(TokenSource* ts); |
| static int findstr(TokenSource* ts, Rune* s); |
| static int ampersand(TokenSource* ts); |
| /*static int lowerc(int c); */ |
| static int getchar(TokenSource* ts); |
| static void ungetchar(TokenSource* ts, int c); |
| static void backup(TokenSource* ts, int savei); |
| /*static void freeinsidetoken(Token* t); */ |
| static void freeattrs(Attr* ahead); |
| static Attr* newattr(int attid, Rune* value, Attr* link); |
| static int Tconv(Fmt* f); |
| |
| int dbglex = 0; |
| static int lexinited = 0; |
| |
| static void |
| lexinit(void) |
| { |
| chartab = _cvtstringinttab(_chartab, nelem(_chartab)); |
| tagnames = _cvtstringtab(_tagnames, nelem(_tagnames)); |
| tagtable = _makestrinttab(tagnames, Numtags); |
| attrnames = _cvtstringtab(_attrnames, nelem(_attrnames)); |
| attrtable = _makestrinttab(attrnames, Numattrs); |
| fmtinstall('T', Tconv); |
| lexinited = 1; |
| } |
| |
| static TokenSource* |
| newtokensource(uchar* data, int edata, int chset, int mtype) |
| { |
| TokenSource* ans; |
| |
| assert(chset == US_Ascii || chset == ISO_8859_1 || |
| chset == UTF_8 || chset == Unicode); |
| ans = (TokenSource*)emalloc(sizeof(TokenSource)); |
| ans->i = 0; |
| ans->data = data; |
| ans->edata = edata; |
| ans->chset = chset; |
| ans->mtype = mtype; |
| return ans; |
| } |
| |
/* The token array starts with, and grows by, this many Tokens. */
enum
{
	ToksChunk = 500
};
| |
| /* Call this to get the tokens. */ |
| /* The number of returned tokens is returned in *plen. */ |
| Token* |
| _gettoks(uchar* data, int datalen, int chset, int mtype, int* plen) |
| { |
| TokenSource* ts; |
| Token* a; |
| int alen; |
| int ai; |
| int starti; |
| int c; |
| int tag; |
| |
| if(!lexinited) |
| lexinit(); |
| ts = newtokensource(data, datalen, chset, mtype); |
| alen = ToksChunk; |
| a = (Token*)emalloc(alen * sizeof(Token)); |
| ai = 0; |
| if(dbglex) |
| fprint(2, "_gettoks starts, ts.i=%d, ts.edata=%d\n", ts->i, ts->edata); |
| if(ts->mtype == TextHtml){ |
| for(;;){ |
| if(ai == alen){ |
| a = (Token*)erealloc(a, (alen+ToksChunk)*sizeof(Token)); |
| alen += ToksChunk; |
| } |
| starti = ts->i; |
| c = getchar(ts); |
| if(c < 0) |
| break; |
| if(c == '<'){ |
| tag = gettag(ts, starti, a, &ai); |
| if(tag == Tscript){ |
| /* special rules for getting Data after.... */ |
| starti = ts->i; |
| c = getchar(ts); |
| tag = getscriptdata(ts, c, starti, a, &ai); |
| } |
| } |
| else |
| tag = getdata(ts, c, starti, a, &ai); |
| if(tag == -1) |
| break; |
| else if(dbglex > 1 && tag != Comment) |
| fprint(2, "lex: got token %T\n", &a[ai-1]); |
| } |
| } |
| else { |
| /* plain text (non-html) tokens */ |
| for(;;){ |
| if(ai == alen){ |
| a = (Token*)erealloc(a, (alen+ToksChunk)*sizeof(Token)); |
| alen += ToksChunk; |
| } |
| tag = getplaindata(ts, a, &ai); |
| if(tag == -1) |
| break; |
| if(dbglex > 1) |
| fprint(2, "lex: got token %T\n", &a[ai]); |
| } |
| } |
| if(dbglex) |
| fprint(2, "lex: returning %d tokens\n", ai); |
| *plen = ai; |
| if(ai == 0) |
| return nil; |
| return a; |
| } |
| |
| /* For case where source isn't HTML. */ |
| /* Just make data tokens, one per line (or partial line, */ |
| /* at end of buffer), ignoring non-whitespace control */ |
| /* characters and dumping \r's. */ |
| /* If find non-empty token, fill in a[*pai], bump *pai, and return Data. */ |
| /* Otherwise return -1; */ |
| static int |
| getplaindata(TokenSource* ts, Token* a, int* pai) |
| { |
| Rune* s; |
| int j; |
| int starti; |
| int c; |
| Token* tok; |
| Rune buf[BIGBUFSIZE]; |
| |
| s = nil; |
| j = 0; |
| starti = ts->i; |
| for(c = getchar(ts); c >= 0; c = getchar(ts)){ |
| if(c < ' '){ |
| if(isspace(c)){ |
| if(c == '\r'){ |
| /* ignore it unless no following '\n', */ |
| /* in which case treat it like '\n' */ |
| c = getchar(ts); |
| if(c != '\n'){ |
| if(c >= 0) |
| ungetchar(ts, c); |
| c = '\n'; |
| } |
| } |
| } |
| else |
| c = 0; |
| } |
| if(c != 0){ |
| buf[j++] = c; |
| if(j == sizeof(buf)-1){ |
| s = buftostr(s, buf, j); |
| j = 0; |
| } |
| } |
| if(c == '\n') |
| break; |
| } |
| s = buftostr(s, buf, j); |
| if(s == nil) |
| return -1; |
| tok = &a[(*pai)++]; |
| tok->tag = Data; |
| tok->text = s; |
| tok->attr = nil; |
| tok->starti = starti; |
| return Data; |
| } |
| |
| /* Return concatenation of s and buf[0:j] */ |
| static Rune* |
| buftostr(Rune* s, Rune* buf, int j) |
| { |
| buf[j] = 0; |
| if(s == nil) |
| s = _Strndup(buf, j); |
| else |
| s = _Strdup2(s, buf); |
| return s; |
| } |
| |
| /* Gather data up to next start-of-tag or end-of-buffer. */ |
| /* Translate entity references (&). */ |
| /* Ignore non-whitespace control characters and get rid of \r's. */ |
| /* If find non-empty token, fill in a[*pai], bump *pai, and return Data. */ |
| /* Otherwise return -1; */ |
| static int |
| getdata(TokenSource* ts, int firstc, int starti, Token* a, int* pai) |
| { |
| Rune* s; |
| int j; |
| int c; |
| Token* tok; |
| Rune buf[BIGBUFSIZE]; |
| |
| s = nil; |
| j = 0; |
| c = firstc; |
| while(c >= 0){ |
| if(c == '&'){ |
| c = ampersand(ts); |
| if(c < 0) |
| break; |
| } |
| else if(c < ' '){ |
| if(isspace(c)){ |
| if(c == '\r'){ |
| /* ignore it unless no following '\n', */ |
| /* in which case treat it like '\n' */ |
| c = getchar(ts); |
| if(c != '\n'){ |
| if(c >= 0) |
| ungetchar(ts, c); |
| c = '\n'; |
| } |
| } |
| } |
| else { |
| if(warn) |
| fprint(2, "warning: non-whitespace control character %d ignored\n", c); |
| c = 0; |
| } |
| } |
| else if(c == '<'){ |
| ungetchar(ts, c); |
| break; |
| } |
| if(c != 0){ |
| buf[j++] = c; |
| if(j == BIGBUFSIZE-1){ |
| s = buftostr(s, buf, j); |
| j = 0; |
| } |
| } |
| c = getchar(ts); |
| } |
| s = buftostr(s, buf, j); |
| if(s == nil) |
| return -1; |
| tok = &a[(*pai)++]; |
| tok->tag = Data; |
| tok->text = s; |
| tok->attr = nil; |
| tok->starti = starti; |
| return Data; |
| } |
| |
| /* The rules for lexing scripts are different (ugh). */ |
| /* Gather up everything until see a </SCRIPT>. */ |
| static int |
| getscriptdata(TokenSource* ts, int firstc, int starti, Token* a, int* pai) |
| { |
| Rune* s; |
| int j; |
| int tstarti; |
| int savei; |
| int c; |
| int tag; |
| int done; |
| Token* tok; |
| Rune buf[BIGBUFSIZE]; |
| |
| s = nil; |
| j = 0; |
| tstarti = starti; |
| c = firstc; |
| done = 0; |
| while(c >= 0){ |
| if(c == '<'){ |
| /* other browsers ignore stuff to end of line after <! */ |
| savei = ts->i; |
| c = getchar(ts); |
| if(c == '!'){ |
| while(c >= 0 && c != '\n' && c != '\r') |
| c = getchar(ts); |
| if(c == '\r') |
| c = getchar(ts); |
| if(c == '\n') |
| c = getchar(ts); |
| } |
| else if(c >= 0){ |
| backup(ts, savei); |
| tag = gettag(ts, tstarti, a, pai); |
| if(tag == -1) |
| break; |
| if(tag != Comment) |
| (*pai)--; |
| backup(ts, tstarti); |
| if(tag == Tscript + RBRA){ |
| done = 1; |
| break; |
| } |
| /* here tag was not </SCRIPT>, so take as regular data */ |
| c = getchar(ts); |
| } |
| } |
| if(c < 0) |
| break; |
| if(c != 0){ |
| buf[j++] = c; |
| if(j == BIGBUFSIZE-1){ |
| s = buftostr(s, buf, j); |
| j = 0; |
| } |
| } |
| tstarti = ts->i; |
| c = getchar(ts); |
| } |
| if(done || ts->i == ts->edata){ |
| s = buftostr(s, buf, j); |
| tok = &a[(*pai)++]; |
| tok->tag = Data; |
| tok->text = s; |
| tok->attr = nil; |
| tok->starti = starti; |
| return Data; |
| } |
| backup(ts, starti); |
| return -1; |
| } |
| |
| /* We've just seen a '<'. Gather up stuff to closing '>' (if buffer */ |
| /* ends before then, return -1). */ |
| /* If it's a tag, look up the name, gather the attributes, and return */ |
| /* the appropriate token. */ |
| /* Else it's either just plain data or some kind of ignorable stuff: */ |
| /* return Data or Comment as appropriate. */ |
| /* If it's not a Comment, put it in a[*pai] and bump *pai. */ |
| static int |
| gettag(TokenSource* ts, int starti, Token* a, int* pai) |
| { |
| int rbra; |
| int ans; |
| Attr* al; |
| int nexti; |
| int c; |
| int ti; |
| int afnd; |
| int attid; |
| int quote; |
| Rune* val; |
| int nv; |
| int i; |
| int tag; |
| Token* tok; |
| Rune buf[BIGBUFSIZE]; |
| |
| rbra = 0; |
| nexti = ts->i; |
| tok = &a[*pai]; |
| tok->tag = Notfound; |
| tok->text = nil; |
| tok->attr = nil; |
| tok->starti = starti; |
| c = getchar(ts); |
| if(c == '/'){ |
| rbra = RBRA; |
| c = getchar(ts); |
| } |
| if(c < 0) |
| goto eob_done; |
| if(c >= 256 || !isalpha(c)){ |
| /* not a tag */ |
| if(c == '!'){ |
| ans = comment(ts); |
| if(ans != -1) |
| return ans; |
| goto eob_done; |
| } |
| else { |
| backup(ts, nexti); |
| tok->tag = Data; |
| tok->text = _Strdup(L(Llt)); |
| (*pai)++; |
| return Data; |
| } |
| } |
| /* c starts a tagname */ |
| buf[0] = c; |
| i = 1; |
| for(;;){ |
| c = getchar(ts); |
| if(c < 0) |
| goto eob_done; |
| if(!ISNAMCHAR(c)) |
| break; |
| /* if name is bigger than buf it won't be found anyway... */ |
| if(i < BIGBUFSIZE) |
| buf[i++] = c; |
| } |
| if(_lookup(tagtable, Numtags, buf, i, &tag)) |
| tok->tag = tag + rbra; |
| else |
| tok->text = _Strndup(buf, i); /* for warning print, in build */ |
| |
| /* attribute gathering loop */ |
| al = nil; |
| for(;;){ |
| /* look for "ws name" or "ws name ws = ws val" (ws=whitespace) */ |
| /* skip whitespace */ |
| attrloop_continue: |
| while(c < 256 && isspace(c)){ |
| c = getchar(ts); |
| if(c < 0) |
| goto eob_done; |
| } |
| if(c == '>') |
| goto attrloop_done; |
| if(c == '<'){ |
| if(warn) |
| fprint(2, "warning: unclosed tag\n"); |
| ungetchar(ts, c); |
| goto attrloop_done; |
| } |
| if(c >= 256 || !isalpha(c)){ |
| if(warn) |
| fprint(2, "warning: expected attribute name\n"); |
| /* skipt to next attribute name */ |
| for(;;){ |
| c = getchar(ts); |
| if(c < 0) |
| goto eob_done; |
| if(c < 256 && isalpha(c)) |
| goto attrloop_continue; |
| if(c == '<'){ |
| if(warn) |
| fprint(2, "warning: unclosed tag\n"); |
| ungetchar(ts, 60); |
| goto attrloop_done; |
| } |
| if(c == '>') |
| goto attrloop_done; |
| } |
| } |
| /* gather attribute name */ |
| buf[0] = c; |
| i = 1; |
| for(;;){ |
| c = getchar(ts); |
| if(c < 0) |
| goto eob_done; |
| if(!ISNAMCHAR(c)) |
| break; |
| if(i < BIGBUFSIZE-1) |
| buf[i++] = c; |
| } |
| afnd = _lookup(attrtable, Numattrs, buf, i, &attid); |
| if(warn && !afnd){ |
| buf[i] = 0; |
| fprint(2, "warning: unknown attribute name %S\n", buf); |
| } |
| /* skip whitespace */ |
| while(c < 256 && isspace(c)){ |
| c = getchar(ts); |
| if(c < 0) |
| goto eob_done; |
| } |
| if(c != '='){ |
| if(afnd) |
| al = newattr(attid, nil, al); |
| goto attrloop_continue; |
| } |
| /*# c is '=' here; skip whitespace */ |
| for(;;){ |
| c = getchar(ts); |
| if(c < 0) |
| goto eob_done; |
| if(c >= 256 || !isspace(c)) |
| break; |
| } |
| quote = 0; |
| if(c == '\'' || c == '"'){ |
| quote = c; |
| c = getchar(ts); |
| if(c < 0) |
| goto eob_done; |
| } |
| val = nil; |
| nv = 0; |
| for(;;){ |
| valloop_continue: |
| if(c < 0) |
| goto eob_done; |
| if(c == '>'){ |
| if(quote){ |
| /* c might be part of string (though not good style) */ |
| /* but if line ends before close quote, assume */ |
| /* there was an unmatched quote */ |
| ti = ts->i; |
| for(;;){ |
| c = getchar(ts); |
| if(c < 0) |
| goto eob_done; |
| if(c == quote){ |
| backup(ts, ti); |
| buf[nv++] = '>'; |
| if(nv == BIGBUFSIZE-1){ |
| val = buftostr(val, buf, nv); |
| nv = 0; |
| } |
| c = getchar(ts); |
| goto valloop_continue; |
| } |
| if(c == '\n'){ |
| if(warn) |
| fprint(2, "warning: apparent unmatched quote\n"); |
| backup(ts, ti); |
| c = '>'; |
| goto valloop_done; |
| } |
| } |
| } |
| else |
| goto valloop_done; |
| } |
| if(quote){ |
| if(c == quote){ |
| c = getchar(ts); |
| if(c < 0) |
| goto eob_done; |
| goto valloop_done; |
| } |
| if(c == '\r'){ |
| c = getchar(ts); |
| goto valloop_continue; |
| } |
| if(c == '\t' || c == '\n') |
| c = ' '; |
| } |
| else { |
| if(c < 256 && isspace(c)) |
| goto valloop_done; |
| } |
| if(c == '&'){ |
| c = ampersand(ts); |
| if(c == -1) |
| goto eob_done; |
| } |
| buf[nv++] = c; |
| if(nv == BIGBUFSIZE-1){ |
| val = buftostr(val, buf, nv); |
| nv = 0; |
| } |
| c = getchar(ts); |
| } |
| valloop_done: |
| if(afnd){ |
| val = buftostr(val, buf, nv); |
| al = newattr(attid, val, al); |
| } |
| } |
| |
| attrloop_done: |
| tok->attr = al; |
| (*pai)++; |
| return tok->tag; |
| |
| eob_done: |
| if(warn) |
| fprint(2, "warning: incomplete tag at end of page\n"); |
| backup(ts, nexti); |
| tok->tag = Data; |
| tok->text = _Strdup(L(Llt)); |
| return Data; |
| } |
| |
| /* We've just read a '<!' at position starti, */ |
| /* so this may be a comment or other ignored section, or it may */ |
| /* be just a literal string if there is no close before end of file */ |
| /* (other browsers do that). */ |
| /* The accepted practice seems to be (note: contrary to SGML spec!): */ |
| /* If see <!--, look for --> to close, or if none, > to close. */ |
| /* If see <!(not --), look for > to close. */ |
| /* If no close before end of file, leave original characters in as literal data. */ |
| /* */ |
| /* If we see ignorable stuff, return Comment. */ |
| /* Else return nil (caller should back up and try again when more data arrives, */ |
| /* unless at end of file, in which case caller should just make '<' a data token). */ |
| static int |
| comment(TokenSource* ts) |
| { |
| int nexti; |
| int havecomment; |
| int c; |
| |
| nexti = ts->i; |
| havecomment = 0; |
| c = getchar(ts); |
| if(c == '-'){ |
| c = getchar(ts); |
| if(c == '-'){ |
| if(findstr(ts, L(Larrow))) |
| havecomment = 1; |
| else |
| backup(ts, nexti); |
| } |
| } |
| if(!havecomment){ |
| if(c == '>') |
| havecomment = 1; |
| else if(c >= 0){ |
| if(findstr(ts, L(Lgt))) |
| havecomment = 1; |
| } |
| } |
| if(havecomment) |
| return Comment; |
| return -1; |
| } |
| |
| /* Look for string s in token source. */ |
| /* If found, return 1, with buffer at next char after s, */ |
| /* else return 0 (caller should back up). */ |
| static int |
| findstr(TokenSource* ts, Rune* s) |
| { |
| int c0; |
| int n; |
| int nexti; |
| int i; |
| int c; |
| |
| c0 = s[0]; |
| n = runestrlen(s); |
| for(;;){ |
| c = getchar(ts); |
| if(c < 0) |
| break; |
| if(c == c0){ |
| if(n == 1) |
| return 1; |
| nexti = ts->i; |
| for(i = 1; i < n; i++){ |
| c = getchar(ts); |
| if(c < 0) |
| goto mainloop_done; |
| if(c != s[i]) |
| break; |
| } |
| if(i == n) |
| return 1; |
| backup(ts, nexti); |
| } |
| } |
| mainloop_done: |
| return 0; |
| } |
| |
/* Value (0..15) of the hexadecimal digit c, or -1 if c is not one. */
static int
xdigit(int c)
{
	switch(c){
	case '0': case '1': case '2': case '3': case '4':
	case '5': case '6': case '7': case '8': case '9':
		return c - '0';
	case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
		return 10 + (c - 'a');
	case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
		return 10 + (c - 'A');
	}
	return -1;
}
| |
| /* We've just read an '&'; look for an entity reference */ |
| /* name, and if found, return translated char. */ |
| /* if there is a complete entity name but it isn't known, */ |
| /* try prefixes (gets around some buggy HTML out there), */ |
| /* and if that fails, back up to just past the '&' and return '&'. */ |
| /* If the entity can't be completed in the current buffer, back up */ |
| /* to the '&' and return -1. */ |
| static int |
| ampersand(TokenSource* ts) |
| { |
| int savei; |
| int c; |
| int fnd; |
| int ans; |
| int v; |
| int i; |
| int k; |
| Rune buf[SMALLBUFSIZE]; |
| |
| savei = ts->i; |
| c = getchar(ts); |
| fnd = 0; |
| ans = -1; |
| if(c == '#'){ |
| c = getchar(ts); |
| v = 0; |
| if(c == 'x'){ |
| c = getchar(ts); |
| while((i=xdigit(c)) != -1){ |
| v = v*16 + i; |
| c = getchar(ts); |
| } |
| }else{ |
| while('0' <= c && c <= '9'){ |
| v = v*10 + c - '0'; |
| c = getchar(ts); |
| } |
| } |
| if(c >= 0){ |
| if(!(c == ';' || c == '\n' || c == '\r')) |
| ungetchar(ts, c); |
| c = v; |
| if(c == 160) |
| c = 160; |
| if(c >= Winstart && c <= Winend){ |
| c = winchars[c - Winstart]; |
| } |
| ans = c; |
| fnd = 1; |
| } |
| } |
| else if(c < 256 && isalpha(c)){ |
| buf[0] = c; |
| k = 1; |
| for(;;){ |
| c = getchar(ts); |
| if(c < 0) |
| break; |
| if(ISNAMCHAR(c)){ |
| if(k < SMALLBUFSIZE-1) |
| buf[k++] = c; |
| } |
| else { |
| if(!(c == ';' || c == '\n' || c == '\r')) |
| ungetchar(ts, c); |
| break; |
| } |
| } |
| if(c >= 0){ |
| fnd = _lookup(chartab, NCHARTAB, buf, k, &ans); |
| if(!fnd){ |
| /* Try prefixes of s */ |
| if(c == ';' || c == '\n' || c == '\r') |
| ungetchar(ts, c); |
| i = k; |
| while(--k > 0){ |
| fnd = _lookup(chartab, NCHARTAB, buf, k, &ans); |
| if(fnd){ |
| while(i > k){ |
| i--; |
| ungetchar(ts, buf[i]); |
| } |
| break; |
| } |
| } |
| } |
| } |
| } |
| if(!fnd){ |
| backup(ts, savei); |
| ans = '&'; |
| } |
| return ans; |
| } |
| |
| /* Get next char, obeying ts.chset. */ |
| /* Returns -1 if no complete character left before current end of data. */ |
| static int |
| getchar(TokenSource* ts) |
| { |
| uchar* buf; |
| int c; |
| int n; |
| int ok; |
| Rune r; |
| |
| if(ts->i >= ts->edata) |
| return -1; |
| buf = ts->data; |
| c = buf[ts->i]; |
| switch(ts->chset){ |
| case ISO_8859_1: |
| if(c >= Winstart && c <= Winend) |
| c = winchars[c - Winstart]; |
| ts->i++; |
| break; |
| case US_Ascii: |
| if(c > 127){ |
| if(warn) |
| fprint(2, "non-ascii char (%x) when US-ASCII specified\n", c); |
| } |
| ts->i++; |
| break; |
| case UTF_8: |
| ok = fullrune((char*)(buf+ts->i), ts->edata-ts->i); |
| n = chartorune(&r, (char*)(buf+ts->i)); |
| if(ok){ |
| if(warn && c == 0x80) |
| fprint(2, "warning: invalid utf-8 sequence (starts with %x)\n", ts->data[ts->i]); |
| ts->i += n; |
| c = r; |
| } |
| else { |
| /* not enough bytes in buf to complete utf-8 char */ |
| ts->i = ts->edata; /* mark "all used" */ |
| c = -1; |
| } |
| break; |
| case Unicode: |
| if(ts->i < ts->edata - 1){ |
| /*standards say most-significant byte first */ |
| c = (c << 8)|(buf[ts->i + 1]); |
| ts->i += 2; |
| } |
| else { |
| ts->i = ts->edata; /* mark "all used" */ |
| c = -1; |
| } |
| break; |
| } |
| return c; |
| } |
| |
| /* Assuming c was the last character returned by getchar, set */ |
| /* things up so that next getchar will get that same character */ |
| /* followed by the current 'next character', etc. */ |
| static void |
| ungetchar(TokenSource* ts, int c) |
| { |
| int n; |
| Rune r; |
| char a[UTFmax]; |
| |
| n = 1; |
| switch(ts->chset){ |
| case UTF_8: |
| if(c >= 128){ |
| r = c; |
| n = runetochar(a, &r); |
| } |
| break; |
| case Unicode: |
| n = 2; |
| break; |
| } |
| ts->i -= n; |
| } |
| |
| /* Restore ts so that it is at the state where the index was savei. */ |
| static void |
| backup(TokenSource* ts, int savei) |
| { |
| if(dbglex) |
| fprint(2, "lex: backup; i=%d, savei=%d\n", ts->i, savei); |
| ts->i = savei; |
| } |
| |
| |
| /* Look for value associated with attribute attid in token t. */ |
| /* If there is one, return 1 and put the value in *pans, */ |
| /* else return 0. */ |
| /* If xfer is true, transfer ownership of the string to the caller */ |
| /* (nil it out here); otherwise, caller must duplicate the answer */ |
| /* if it needs to save it. */ |
| /* OK to have pans==0, in which case this is just looking */ |
| /* to see if token is present. */ |
| int |
| _tokaval(Token* t, int attid, Rune** pans, int xfer) |
| { |
| Attr* attr; |
| |
| attr = t->attr; |
| while(attr != nil){ |
| if(attr->attid == attid){ |
| if(pans != nil) |
| *pans = attr->value; |
| if(xfer) |
| attr->value = nil; |
| return 1; |
| } |
| attr = attr->next; |
| } |
| if(pans != nil) |
| *pans = nil; |
| return 0; |
| } |
| |
| static int |
| Tconv(Fmt *f) |
| { |
| Token* t; |
| int i; |
| int tag; |
| char* srbra; |
| Rune* aname; |
| Rune* tname; |
| Attr* a; |
| char buf[BIGBUFSIZE]; |
| |
| t = va_arg(f->args, Token*); |
| if(t == nil) |
| sprint(buf, "<null>"); |
| else { |
| i = 0; |
| if(dbglex > 1) |
| i = snprint(buf, sizeof(buf), "[%d]", t->starti); |
| tag = t->tag; |
| if(tag == Data){ |
| i += snprint(buf+i, sizeof(buf)-i-1, "'%S'", t->text); |
| } |
| else { |
| srbra = ""; |
| if(tag >= RBRA){ |
| tag -= RBRA; |
| srbra = "/"; |
| } |
| tname = tagnames[tag]; |
| if(tag == Notfound) |
| tname = L(Lquestion); |
| i += snprint(buf+i, sizeof(buf)-i-1, "<%s%S", srbra, tname); |
| for(a = t->attr; a != nil; a = a->next){ |
| aname = attrnames[a->attid]; |
| i += snprint(buf+i, sizeof(buf)-i-1, " %S", aname); |
| if(a->value != nil) |
| i += snprint(buf+i, sizeof(buf)-i-1, "=%S", a->value); |
| } |
| i += snprint(buf+i, sizeof(buf)-i-1, ">"); |
| } |
| buf[i] = 0; |
| } |
| return fmtstrcpy(f, buf); |
| } |
| |
| /* Attrs own their constituent strings, but build may eventually */ |
| /* transfer some values to its items and nil them out in the Attr. */ |
| static Attr* |
| newattr(int attid, Rune* value, Attr* link) |
| { |
| Attr* ans; |
| |
| ans = (Attr*)emalloc(sizeof(Attr)); |
| ans->attid = attid; |
| ans->value = value; |
| ans->next = link; |
| return ans; |
| } |
| |
| /* Free list of Attrs linked through next field */ |
| static void |
| freeattrs(Attr* ahead) |
| { |
| Attr* a; |
| Attr* nexta; |
| |
| a = ahead; |
| while(a != nil){ |
| nexta = a->next; |
| free(a->value); |
| free(a); |
| a = nexta; |
| } |
| } |
| |
| /* Free array of Tokens. */ |
| /* Allocated space might have room for more than n tokens, */ |
| /* but only n of them are initialized. */ |
| /* If caller has transferred ownership of constitutent strings */ |
| /* or attributes, it must have nil'd out the pointers in the Tokens. */ |
| void |
| _freetokens(Token* tarray, int n) |
| { |
| int i; |
| Token* t; |
| |
| if(tarray == nil) |
| return; |
| for(i = 0; i < n; i++){ |
| t = &tarray[i]; |
| free(t->text); |
| freeattrs(t->attr); |
| } |
| free(tarray); |
| } |