| #include <u.h> |
| #include <libc.h> |
| #include <bio.h> |
| #include <draw.h> |
| #include <regexp.h> |
| #include <html.h> |
| #include <ctype.h> |
| #include "dat.h" |
| |
| char urlexpr[] = "^(https?|ftp|file|gopher|mailto|news|nntp|telnet|wais|prospero)://([a-zA-Z0-9_@\\-]+([.:][a-zA-Z0-9_@\\-]+)*)"; |
| Reprog *urlprog; |
| |
| int inword = 0; |
| int col = 0; |
| int wordi = 0; |
| |
| char* |
| loadhtml(int fd) |
| { |
| URLwin *u; |
| Bytes *b; |
| int n; |
| char buf[4096]; |
| |
| u = emalloc(sizeof(URLwin)); |
| u->infd = fd; |
| u->outfd = 1; |
| u->url = estrdup(url); |
| u->type = TextHtml; |
| |
| b = emalloc(sizeof(Bytes)); |
| while((n = read(fd, buf, sizeof buf)) > 0) |
| growbytes(b, buf, n); |
| if(b->b == nil) |
| return nil; /* empty file */ |
| rendertext(u, b); |
| freeurlwin(u); |
| return nil; |
| } |
| |
| char* |
| runetobyte(Rune *r, int n) |
| { |
| char *s; |
| |
| if(n == 0) |
| return emalloc(1); |
| s = smprint("%.*S", n, r); |
| if(s == nil) |
| error("malloc failed"); |
| return s; |
| } |
| |
| int |
| closingpunct(int c) |
| { |
| return strchr(".,:;'\")]}>!?", c) != nil; |
| } |
| |
| void |
| emitword(Bytes *b, Rune *r, int nr) |
| { |
| char *s; |
| int space; |
| |
| if(nr == 0) |
| return; |
| s = smprint("%.*S", nr, r); |
| space = (b->n>0) && !isspace(b->b[b->n-1]) && !closingpunct(r[0]); |
| if(col>0 && col+space+nr > width){ |
| growbytes(b, "\n", 1); |
| space = 0; |
| col = 0; |
| } |
| if(space && col>0){ |
| growbytes(b, " ", 1); |
| col++; |
| } |
| growbytes(b, s, strlen(s)); |
| col += nr; |
| free(s); |
| inword = 0; |
| } |
| |
| void |
| renderrunes(Bytes *b, Rune *r) |
| { |
| int i, n; |
| |
| n = runestrlen(r); |
| for(i=0; i<n; i++){ |
| switch(r[i]){ |
| case '\n': |
| if(inword) |
| emitword(b, r+wordi, i-wordi); |
| col = 0; |
| if(b->n == 0) |
| break; /* don't start with blank lines */ |
| if(b->n<2 || b->b[b->n-1]!='\n' || b->b[b->n-2]!='\n') |
| growbytes(b, "\n", 1); |
| break; |
| case ' ': |
| if(inword) |
| emitword(b, r+wordi, i-wordi); |
| break; |
| default: |
| if(!inword) |
| wordi = i; |
| inword = 1; |
| break; |
| } |
| } |
| if(inword) |
| emitword(b, r+wordi, i-wordi); |
| } |
| |
| void |
| renderbytes(Bytes *b, char *fmt, ...) |
| { |
| Rune *r; |
| va_list arg; |
| |
| va_start(arg, fmt); |
| r = runevsmprint(fmt, arg); |
| va_end(arg); |
| renderrunes(b, r); |
| free(r); |
| } |
| |
| char* |
| baseurl(char *url) |
| { |
| char *base, *slash; |
| Resub rs[10]; |
| |
| if(url == nil) |
| return nil; |
| if(urlprog == nil){ |
| urlprog = regcomp(urlexpr); |
| if(urlprog == nil) |
| error("can't compile URL regexp"); |
| } |
| memset(rs, 0, sizeof rs); |
| if(regexec(urlprog, url, rs, nelem(rs)) == 0) |
| return nil; |
| base = estrdup(url); |
| slash = strrchr(base, '/'); |
| if(slash!=nil && slash>=&base[rs[0].e.ep-rs[0].s.sp]) |
| *slash = '\0'; |
| else |
| base[rs[0].e.ep-rs[0].s.sp] = '\0'; |
| return base; |
| } |
| |
| char* |
| fullurl(URLwin *u, Rune *rhref) |
| { |
| char *base, *href, *hrefbase; |
| char *result; |
| |
| if(rhref == nil) |
| return estrdup("NULL URL"); |
| href = runetobyte(rhref, runestrlen(rhref)); |
| hrefbase = baseurl(href); |
| result = nil; |
| if(hrefbase==nil && (base = baseurl(u->url))!=nil){ |
| result = estrdup(base); |
| if(base[strlen(base)-1]!='/' && (href==nil || href[0]!='/')) |
| result = eappend(result, "/", ""); |
| free(base); |
| } |
| if(href){ |
| if(result) |
| result = eappend(result, "", href); |
| else |
| result = estrdup(href); |
| } |
| free(hrefbase); |
| if(result == nil) |
| return estrdup("***unknown***"); |
| return result; |
| } |
| |
| void |
| render(URLwin *u, Bytes *t, Item *items, int curanchor) |
| { |
| Item *il; |
| Itext *it; |
| Ifloat *ifl; |
| Ispacer *is; |
| Itable *ita; |
| Iimage *im; |
| Anchor *a; |
| Table *tab; |
| Tablecell *cell; |
| char *href; |
| |
| inword = 0; |
| col = 0; |
| wordi = 0; |
| |
| for(il=items; il!=nil; il=il->next){ |
| if(il->state & IFbrk) |
| renderbytes(t, "\n"); |
| if(il->state & IFbrksp) |
| renderbytes(t, "\n"); |
| |
| switch(il->tag){ |
| case Itexttag: |
| it = (Itext*)il; |
| renderrunes(t, it->s); |
| break; |
| case Iruletag: |
| if(t->n>0 && t->b[t->n-1]!='\n') |
| renderbytes(t, "\n"); |
| renderbytes(t, "=======\n"); |
| break; |
| case Iimagetag: |
| if(!aflag) |
| break; |
| im = (Iimage*)il; |
| if(im->imsrc){ |
| href = fullurl(u, im->imsrc); |
| renderbytes(t, "[image %s]", href); |
| free(href); |
| } |
| break; |
| case Iformfieldtag: |
| if(aflag) |
| renderbytes(t, "[formfield]"); |
| break; |
| case Itabletag: |
| ita = (Itable*)il; |
| tab = ita->table; |
| for(cell=tab->cells; cell!=nil; cell=cell->next){ |
| render(u, t, cell->content, curanchor); |
| } |
| if(t->n>0 && t->b[t->n-1]!='\n') |
| renderbytes(t, "\n"); |
| break; |
| case Ifloattag: |
| ifl = (Ifloat*)il; |
| render(u, t, ifl->item, curanchor); |
| break; |
| case Ispacertag: |
| is = (Ispacer*)il; |
| if(is->spkind != ISPnull) |
| renderbytes(t, " "); |
| break; |
| default: |
| error("unknown item tag %d\n", il->tag); |
| } |
| if(il->anchorid != 0 && il->anchorid!=curanchor){ |
| for(a=u->docinfo->anchors; a!=nil; a=a->next) |
| if(aflag && a->index == il->anchorid){ |
| href = fullurl(u, a->href); |
| renderbytes(t, "[%s]", href); |
| free(href); |
| break; |
| } |
| curanchor = il->anchorid; |
| } |
| } |
| if(t->n>0 && t->b[t->n-1]!='\n') |
| renderbytes(t, "\n"); |
| } |
| |
| void |
| rerender(URLwin *u) |
| { |
| Bytes *t; |
| |
| t = emalloc(sizeof(Bytes)); |
| |
| render(u, t, u->items, 0); |
| |
| if(t->n) |
| write(u->outfd, (char*)t->b, t->n); |
| free(t->b); |
| free(t); |
| } |
| |
| /* |
| * Somewhat of a hack. Not a full parse, just looks for strings in the beginning |
| * of the document (cistrstr only looks at first somewhat bytes). |
| */ |
| int |
| charset(char *s) |
| { |
| char *meta, *emeta, *charset; |
| |
| if(defcharset == 0) |
| defcharset = ISO_8859_1; |
| meta = cistrstr(s, "<meta"); |
| if(meta == nil) |
| return defcharset; |
| for(emeta=meta; *emeta!='>' && *emeta!='\0'; emeta++) |
| ; |
| charset = cistrstr(s, "charset="); |
| if(charset == nil) |
| return defcharset; |
| charset += 8; |
| if(*charset == '"') |
| charset++; |
| if(cistrncmp(charset, "utf-8", 5) || cistrncmp(charset, "utf8", 4)) |
| return UTF_8; |
| return defcharset; |
| } |
| |
| void |
| rendertext(URLwin *u, Bytes *b) |
| { |
| Rune *rurl; |
| |
| rurl = toStr((uchar*)u->url, strlen(u->url), ISO_8859_1); |
| u->items = parsehtml(b->b, b->n, rurl, u->type, charset((char*)b->b), &u->docinfo); |
| /* free(rurl); */ |
| |
| rerender(u); |
| } |
| |
| |
| void |
| freeurlwin(URLwin *u) |
| { |
| freeitems(u->items); |
| u->items = nil; |
| freedocinfo(u->docinfo); |
| u->docinfo = nil; |
| free(u); |
| } |