Import version of libhtml that might actually work with ANSI C.

commit: 7cf289ca89a7416999ae02330236042b0d37e3db [log] [tgz]
author: wkj <devnull@localhost> Tue Apr 06 19:06:52 2004 +0000
committer: wkj <devnull@localhost> Tue Apr 06 19:06:52 2004 +0000
tree: 796d1363a7a53c72c28b199758ee674f1326a510
parent: 3e3817f7c86658f60715dd93768eaf8285807985 [diff] [blame]
diff --git a/src/libhtml/utils.c b/src/libhtml/utils.c
new file mode 100644
index 0000000..db22bba
--- /dev/null
+++ b/src/libhtml/utils.c

@@ -0,0 +1,591 @@
+#include <u.h>
+#include <libc.h>
+#include <draw.h>
+#include <html.h>
+#include "impl.h"
+
+Rune whitespace[] = { ' ', '\t', '\n', '\r', '\0' };
+Rune notwhitespace[] = { '^', ' ', '\t', '\n', '\r' , '\0'};
+
+// All lists start out like List structure.
+// List itself can be used as list of int.
+int
+_listlen(List* l)
+{
+	int n = 0;
+
+	while(l != nil) {
+		l = l->next;
+		n++;
+	}
+	return n;
+}
+
+// Cons
+List*
+_newlist(int val, List* rest)
+{
+	List* ans;
+
+	ans = (List*)emalloc(sizeof(List));
+	ans->val = val;
+	ans->next = rest;
+	return ans;
+}
+
+// Reverse a list in place
+List*
+_revlist(List* l)
+{
+	List* newl;
+	List* nextl;
+
+	newl = nil;
+	while(l != nil) {
+		nextl = l->next;
+		l->next = newl;
+		newl = l;
+		l = nextl;
+	}
+	return newl;
+}
+
+// The next few routines take a "character class" as argument.
+//    e.g., "a-zA-Z", or "^ \t\n"
+// (ranges indicated by - except in first position;
+//  ^ is first position means "not in" the following class)
+
+// Splitl splits s[0:n] just before first character of class cl.
+// Answers go in (p1, n1) and (p2, n2).
+// If no split, the whole thing goes in the first component.
+// Note: answers contain pointers into original string.
+void
+_splitl(Rune* s, int n, Rune* cl, Rune** p1, int* n1, Rune** p2, int* n2)
+{
+	Rune* p;
+
+	p = _Strnclass(s, cl, n);
+	*p1 = s;
+	if(p == nil) {
+		*n1 = n;
+		*p2 = nil;
+		*n2 = 0;
+	}
+	else {
+		*p2 = p;
+		*n1 = p-s;
+		*n2 = n-*n1;
+	}
+}
+
+// Splitr splits s[0:n] just after last character of class cl.
+// Answers go in (p1, n1) and (p2, n2).
+// If no split, the whole thing goes in the last component.
+// Note: answers contain pointers into original string.
+void
+_splitr(Rune* s, int n, Rune* cl, Rune** p1, int* n1, Rune** p2, int* n2)
+{
+	Rune* p;
+
+	p = _Strnrclass(s, cl, n);
+	if(p == nil) {
+		*p1 = nil;
+		*n1 = 0;
+		*p2 = s;
+		*n2 = n;
+	}
+	else {
+		*p1 = s;
+		*p2 = p+1;
+		*n1 = *p2-s;
+		*n2 = n-*n1;
+	}
+}
+
+// Splitall splits s[0:n] into parts that are separated by characters from class cl.
+// Each part will have nonzero length.
+// At most alen parts are found, and pointers to their starts go into
+// the strarr array, while their lengths go into the lenarr array.
+// The return value is the number of parts found.
+int
+_splitall(Rune* s, int n, Rune* cl, Rune** strarr, int* lenarr, int alen)
+{
+	int i;
+	Rune* p;
+	Rune* q;
+	Rune* slast;
+
+	if(s == nil || n == 0)
+		return 0;
+	i = 0;
+	p = s;
+	slast = s+n;
+	while(p < slast && i < alen) {
+		while(p < slast && _inclass(*p, cl))
+			p++;
+		if(p == slast)
+			break;
+		q = _Strnclass(p, cl, slast-p);
+		if(q == nil)
+			q = slast;
+		assert(q > p && q <= slast);
+		strarr[i] = p;
+		lenarr[i] = q-p;
+		i++;
+		p = q;
+	}
+	return i;
+}
+
+// Find part of s that excludes leading and trailing whitespace,
+// and return that part in *pans (and its length in *panslen).
+void
+_trimwhite(Rune* s, int n, Rune** pans, int* panslen)
+{
+	Rune* p;
+	Rune* q;
+
+	p = nil;
+	if(n > 0) {
+		p = _Strnclass(s, notwhitespace, n);
+		if(p != nil) {
+			q = _Strnrclass(s, notwhitespace, n);
+			assert(q != nil);
+			n = q+1-p;
+		}
+	}
+	*pans = p;
+	*panslen = n;
+}
+
+// _Strclass returns a pointer to the first element of s that is
+// a member of class cl, nil if none.
+Rune*
+_Strclass(Rune* s, Rune* cl)
+{
+	Rune* p;
+
+	for(p = s; *p != 0; p++)
+		if(_inclass(*p, cl))
+			return p;
+	return nil;
+}
+
+// _Strnclass returns a pointer to the first element of s[0:n] that is
+// a member of class cl, nil if none.
+Rune*
+_Strnclass(Rune* s, Rune* cl, int n)
+{
+	Rune* p;
+
+	for(p = s; n-- && *p != 0; p++)
+		if(_inclass(*p, cl))
+			return p;
+	return nil;
+}
+
+// _Strrclass returns a pointer to the last element of s that is
+// a member of class cl, nil if none
+Rune*
+_Strrclass(Rune* s, Rune* cl)
+{
+	Rune* p;
+
+	if(s == nil || *s == 0)
+		return nil;
+	p = s + runestrlen(s) - 1;
+	while(p >= s) {
+		if(_inclass(*p, cl))
+			return p;
+		p--;
+	};
+	return nil;
+}
+
+// _Strnrclass returns a pointer to the last element of s[0:n] that is
+// a member of class cl, nil if none
+Rune*
+_Strnrclass(Rune* s, Rune* cl, int n)
+{
+	Rune* p;
+
+	if(s == nil || *s == 0 || n == 0)
+		return nil;
+	p = s + n - 1;
+	while(p >= s) {
+		if(_inclass(*p, cl))
+			return p;
+		p--;
+	};
+	return nil;
+}
+
+// Is c in the class cl?
+int
+_inclass(Rune c, Rune* cl)
+{
+	int	n;
+	int	ans;
+	int	negate;
+	int	i;
+
+	n = _Strlen(cl);
+	if(n == 0)
+		return 0;
+	ans = 0;
+	negate = 0;
+	if(cl[0] == '^') {
+		negate = 1;
+		cl++;
+		n--;
+	}
+	for(i = 0; i < n; i++) {
+		if(cl[i] == '-' && i > 0 && i < n - 1) {
+			if(c >= cl[i - 1] && c <= cl[i + 1]) {
+				ans = 1;
+				break;
+			}
+			i++;
+		}
+		else if(c == cl[i]) {
+			ans = 1;
+			break;
+		}
+	}
+	if(negate)
+		ans = !ans;
+	return ans;
+}
+
+// Is pre a prefix of s?
+int
+_prefix(Rune* pre, Rune* s)
+{
+	int	ns;
+	int	n;
+	int	k;
+
+	ns = _Strlen(s);
+	n = _Strlen(pre);
+	if(ns < n)
+		return 0;
+	for(k = 0; k < n; k++) {
+		if(pre[k] != s[k])
+			return 0;
+	}
+	return 1;
+}
+
+// Number of runes in (null-terminated) s
+int
+_Strlen(Rune* s)
+{
+	if(s == nil)
+		return 0;
+	return runestrlen(s);
+}
+
+// -1, 0, 1 as s1 is lexicographically less, equal greater than s2
+int
+_Strcmp(Rune *s1, Rune *s2)
+{
+	if(s1 == nil)
+		return (s2 == nil || *s2 == 0) ? 0 : -1;
+	if(s2 == nil)
+		return (*s1 == 0) ? 0 : 1;
+	return runestrcmp(s1, s2);
+}
+
+// Like Strcmp, but use exactly n chars of s1 (assume s1 has at least n chars).
+// Also, do a case-insensitive match, assuming s2
+// has no chars in [A-Z], only their lowercase versions.
+// (This routine is used for in-place keyword lookup, where s2 is in a keyword
+// list and s1 is some substring, possibly mixed-case, in a buffer.)
+int
+_Strncmpci(Rune *s1, int n1, Rune *s2)
+{
+	Rune c1, c2;
+
+	for(;;) {
+		if(n1-- == 0) {
+			if(*s2 == 0)
+				return 0;
+			return -1;
+		}
+		c1 = *s1++;
+		c2 = *s2++;
+		if(c1 >= 'A' && c1 <= 'Z')
+			c1 = c1 - 'A' + 'a';
+		if(c1 != c2) {
+			if(c1 > c2)
+				return 1;
+			return -1;
+		}
+	}
+}
+
+// emalloc and copy
+Rune*
+_Strdup(Rune* s)
+{
+	if(s == nil)
+		return nil;
+	return _Strndup(s, runestrlen(s));
+}
+
+// emalloc and copy n chars of s (assume s is at least that long),
+// and add 0 terminator.
+// Return nil if n==0.
+Rune*
+_Strndup(Rune* s, int n)
+{
+	Rune* ans;
+
+	if(n <= 0)
+		return nil;
+	ans = _newstr(n);
+	memmove(ans, s, n*sizeof(Rune));
+	ans[n] = 0;
+	return ans;
+}
+// emalloc enough room for n Runes, plus 1 null terminator.
+// (Not initialized to anything.)
+Rune*
+_newstr(int n)
+{
+	return (Rune*)emalloc((n+1)*sizeof(Rune));
+}
+
+// emalloc and copy s+t
+Rune*
+_Strdup2(Rune* s, Rune* t)
+{
+	int ns, nt;
+	Rune* ans;
+	Rune* p;
+
+	ns = _Strlen(s);
+	nt = _Strlen(t);
+	if(ns+nt == 0)
+		return nil;
+	ans = _newstr(ns+nt);
+	p = _Stradd(ans, s, ns);
+	p = _Stradd(p, t, nt);
+	*p = 0;
+	return ans;
+}
+
+// Return emalloc'd substring s[start:stop],
+Rune*
+_Strsubstr(Rune* s, int start, int stop)
+{
+	Rune* t;
+
+	if(start == stop)
+		return nil;
+	t = _Strndup(s+start, stop-start);
+	return t;
+}
+
+// Copy n chars to s1 from s2, and return s1+n
+Rune*
+_Stradd(Rune* s1, Rune* s2, int n)
+{
+	if(n == 0)
+		return s1;
+	memmove(s1, s2, n*sizeof(Rune));
+	return s1+n;
+}
+
+// Like strtol, but converting from Rune* string
+
+//#define LONG_MAX	2147483647L
+//#define LONG_MIN	-2147483648L
+
+long
+_Strtol(Rune* nptr, Rune** endptr, int base)
+{
+	Rune* p;
+	long n, nn;
+	int c, ovfl, v, neg, ndig;
+
+	p = nptr;
+	neg = 0;
+	n = 0;
+	ndig = 0;
+	ovfl = 0;
+
+	/*
+	 * White space
+	 */
+	for(;;p++){
+		switch(*p){
+		case ' ':
+		case '\t':
+		case '\n':
+		case '\f':
+		case '\r':
+		case '\v':
+			continue;
+		}
+		break;
+	}
+
+	/*
+	 * Sign
+	 */
+	if(*p=='-' || *p=='+')
+		if(*p++ == '-')
+			neg = 1;
+
+	/*
+	 * Base
+	 */
+	if(base==0){
+		if(*p != '0')
+			base = 10;
+		else{
+			base = 8;
+			if(p[1]=='x' || p[1]=='X'){
+				p += 2;
+				base = 16;
+			}
+		}
+	}else if(base==16 && *p=='0'){
+		if(p[1]=='x' || p[1]=='X')
+			p += 2;
+	}else if(base<0 || 36<base)
+		goto Return;
+
+	/*
+	 * Non-empty sequence of digits
+	 */
+	for(;; p++,ndig++){
+		c = *p;
+		v = base;
+		if('0'<=c && c<='9')
+			v = c - '0';
+		else if('a'<=c && c<='z')
+			v = c - 'a' + 10;
+		else if('A'<=c && c<='Z')
+			v = c - 'A' + 10;
+		if(v >= base)
+			break;
+		nn = n*base + v;
+		if(nn < n)
+			ovfl = 1;
+		n = nn;
+	}
+
+    Return:
+	if(ndig == 0)
+		p = nptr;
+	if(endptr)
+		*endptr = p;
+	if(ovfl){
+		if(neg)
+			return LONG_MIN;
+		return LONG_MAX;
+	}
+	if(neg)
+		return -n;
+	return n;
+}
+
+// Convert buf[0:n], bytes whose character set is chset,
+// into a emalloc'd null-terminated Unicode string.
+Rune*
+toStr(uchar* buf, int n, int chset)
+{
+	int i;
+	int m;
+	Rune ch;
+	Rune* ans;
+
+	switch(chset) {
+	case US_Ascii:
+	case ISO_8859_1:
+		ans = (Rune*)emalloc((n+1)*sizeof(Rune));
+		for(i = 0; i < n; i++)
+			ans[i] = buf[i];
+		ans[n] = 0;
+		break;
+
+	case UTF_8:
+		m = 0;
+		for(i = 0; i < n; ) {
+			i += chartorune(&ch, (char*)(buf+i));
+			m++;
+		}
+		ans = (Rune*)emalloc((m+1)*sizeof(Rune));
+		m = 0;
+		for(i = 0; i < n; ) {
+			i += chartorune(&ch, (char*)(buf+i));
+			ans[m++] = ch;
+		}
+		ans[m] = 0;
+		break;
+
+	default:
+		ans = nil;
+		assert(0);
+	}
+	return ans;
+}
+
+// Convert buf[0:n], Unicode characters,
+// into an emalloc'd null-terminated string in character set chset.
+// Use 0x80 for unconvertable characters.
+uchar*
+fromStr(Rune* buf, int n, int chset)
+{
+	uchar* ans;
+	int i, lim, m;
+	Rune ch;
+	uchar* p;
+	uchar s[UTFmax];
+
+	ans = nil;
+	switch(chset) {
+	case US_Ascii:
+	case ISO_8859_1:
+		ans = (uchar*)emalloc(n+1);
+		lim = (chset==US_Ascii)? 127 : 255;
+		for(i = 0; i < n; i++) {
+			ch = buf[i];
+			if(ch > lim)
+				ch = 0x80;
+			ans[i] = ch;
+		}
+		ans[n] = 0;
+		break;
+
+	case UTF_8:
+		m = 0;
+		for(i = 0; i < n; i++) {
+			m += runetochar((char*)s, &buf[i]);
+		}
+		ans = (uchar*)emalloc(m+1);
+		p = ans;
+		for(i = 0; i < n; i++)
+			p += runetochar((char*)p, &buf[i]);
+		*p = 0;
+		break;
+
+	default:
+		assert(0);
+	}
+	return ans;
+
+}
+
+// Convert n to emalloc'd String.
+Rune*
+_ltoStr(int n)
+{
+	int m;
+	uchar buf[20];
+
+	m = snprint((char*)buf, sizeof(buf), "%d", n);
+	return toStr(buf, m, US_Ascii);
+}
commit	7cf289ca89a7416999ae02330236042b0d37e3db	[log] [tgz]
author	wkj <devnull@localhost>	Tue Apr 06 19:06:52 2004 +0000
committer	wkj <devnull@localhost>	Tue Apr 06 19:06:52 2004 +0000
tree	796d1363a7a53c72c28b199758ee674f1326a510
parent	3e3817f7c86658f60715dd93768eaf8285807985 [diff] [blame]