From 7cf289ca89a7416999ae02330236042b0d37e3db Mon Sep 17 00:00:00 2001 From: wkj Date: Tue, 6 Apr 2004 19:06:52 +0000 Subject: Import version of libhtml that might actually work with ANSI C. --- src/libhtml/utils.c | 591 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 591 insertions(+) create mode 100644 src/libhtml/utils.c (limited to 'src/libhtml/utils.c') diff --git a/src/libhtml/utils.c b/src/libhtml/utils.c new file mode 100644 index 00000000..db22bba7 --- /dev/null +++ b/src/libhtml/utils.c @@ -0,0 +1,591 @@ +#include +#include +#include +#include +#include "impl.h" + +Rune whitespace[] = { ' ', '\t', '\n', '\r', '\0' }; +Rune notwhitespace[] = { '^', ' ', '\t', '\n', '\r' , '\0'}; + +// All lists start out like List structure. +// List itself can be used as list of int. +int +_listlen(List* l) +{ + int n = 0; + + while(l != nil) { + l = l->next; + n++; + } + return n; +} + +// Cons +List* +_newlist(int val, List* rest) +{ + List* ans; + + ans = (List*)emalloc(sizeof(List)); + ans->val = val; + ans->next = rest; + return ans; +} + +// Reverse a list in place +List* +_revlist(List* l) +{ + List* newl; + List* nextl; + + newl = nil; + while(l != nil) { + nextl = l->next; + l->next = newl; + newl = l; + l = nextl; + } + return newl; +} + +// The next few routines take a "character class" as argument. +// e.g., "a-zA-Z", or "^ \t\n" +// (ranges indicated by - except in first position; +// ^ is first position means "not in" the following class) + +// Splitl splits s[0:n] just before first character of class cl. +// Answers go in (p1, n1) and (p2, n2). +// If no split, the whole thing goes in the first component. +// Note: answers contain pointers into original string. +void +_splitl(Rune* s, int n, Rune* cl, Rune** p1, int* n1, Rune** p2, int* n2) +{ + Rune* p; + + p = _Strnclass(s, cl, n); + *p1 = s; + if(p == nil) { + *n1 = n; + *p2 = nil; + *n2 = 0; + } + else { + *p2 = p; + *n1 = p-s; + *n2 = n-*n1; + } +} + +// Splitr splits s[0:n] just after last character of class cl. +// Answers go in (p1, n1) and (p2, n2). +// If no split, the whole thing goes in the last component. +// Note: answers contain pointers into original string. +void +_splitr(Rune* s, int n, Rune* cl, Rune** p1, int* n1, Rune** p2, int* n2) +{ + Rune* p; + + p = _Strnrclass(s, cl, n); + if(p == nil) { + *p1 = nil; + *n1 = 0; + *p2 = s; + *n2 = n; + } + else { + *p1 = s; + *p2 = p+1; + *n1 = *p2-s; + *n2 = n-*n1; + } +} + +// Splitall splits s[0:n] into parts that are separated by characters from class cl. +// Each part will have nonzero length. +// At most alen parts are found, and pointers to their starts go into +// the strarr array, while their lengths go into the lenarr array. +// The return value is the number of parts found. +int +_splitall(Rune* s, int n, Rune* cl, Rune** strarr, int* lenarr, int alen) +{ + int i; + Rune* p; + Rune* q; + Rune* slast; + + if(s == nil || n == 0) + return 0; + i = 0; + p = s; + slast = s+n; + while(p < slast && i < alen) { + while(p < slast && _inclass(*p, cl)) + p++; + if(p == slast) + break; + q = _Strnclass(p, cl, slast-p); + if(q == nil) + q = slast; + assert(q > p && q <= slast); + strarr[i] = p; + lenarr[i] = q-p; + i++; + p = q; + } + return i; +} + +// Find part of s that excludes leading and trailing whitespace, +// and return that part in *pans (and its length in *panslen). +void +_trimwhite(Rune* s, int n, Rune** pans, int* panslen) +{ + Rune* p; + Rune* q; + + p = nil; + if(n > 0) { + p = _Strnclass(s, notwhitespace, n); + if(p != nil) { + q = _Strnrclass(s, notwhitespace, n); + assert(q != nil); + n = q+1-p; + } + } + *pans = p; + *panslen = n; +} + +// _Strclass returns a pointer to the first element of s that is +// a member of class cl, nil if none. +Rune* +_Strclass(Rune* s, Rune* cl) +{ + Rune* p; + + for(p = s; *p != 0; p++) + if(_inclass(*p, cl)) + return p; + return nil; +} + +// _Strnclass returns a pointer to the first element of s[0:n] that is +// a member of class cl, nil if none. +Rune* +_Strnclass(Rune* s, Rune* cl, int n) +{ + Rune* p; + + for(p = s; n-- && *p != 0; p++) + if(_inclass(*p, cl)) + return p; + return nil; +} + +// _Strrclass returns a pointer to the last element of s that is +// a member of class cl, nil if none +Rune* +_Strrclass(Rune* s, Rune* cl) +{ + Rune* p; + + if(s == nil || *s == 0) + return nil; + p = s + runestrlen(s) - 1; + while(p >= s) { + if(_inclass(*p, cl)) + return p; + p--; + }; + return nil; +} + +// _Strnrclass returns a pointer to the last element of s[0:n] that is +// a member of class cl, nil if none +Rune* +_Strnrclass(Rune* s, Rune* cl, int n) +{ + Rune* p; + + if(s == nil || *s == 0 || n == 0) + return nil; + p = s + n - 1; + while(p >= s) { + if(_inclass(*p, cl)) + return p; + p--; + }; + return nil; +} + +// Is c in the class cl? +int +_inclass(Rune c, Rune* cl) +{ + int n; + int ans; + int negate; + int i; + + n = _Strlen(cl); + if(n == 0) + return 0; + ans = 0; + negate = 0; + if(cl[0] == '^') { + negate = 1; + cl++; + n--; + } + for(i = 0; i < n; i++) { + if(cl[i] == '-' && i > 0 && i < n - 1) { + if(c >= cl[i - 1] && c <= cl[i + 1]) { + ans = 1; + break; + } + i++; + } + else if(c == cl[i]) { + ans = 1; + break; + } + } + if(negate) + ans = !ans; + return ans; +} + +// Is pre a prefix of s? +int +_prefix(Rune* pre, Rune* s) +{ + int ns; + int n; + int k; + + ns = _Strlen(s); + n = _Strlen(pre); + if(ns < n) + return 0; + for(k = 0; k < n; k++) { + if(pre[k] != s[k]) + return 0; + } + return 1; +} + +// Number of runes in (null-terminated) s +int +_Strlen(Rune* s) +{ + if(s == nil) + return 0; + return runestrlen(s); +} + +// -1, 0, 1 as s1 is lexicographically less, equal greater than s2 +int +_Strcmp(Rune *s1, Rune *s2) +{ + if(s1 == nil) + return (s2 == nil || *s2 == 0) ? 0 : -1; + if(s2 == nil) + return (*s1 == 0) ? 0 : 1; + return runestrcmp(s1, s2); +} + +// Like Strcmp, but use exactly n chars of s1 (assume s1 has at least n chars). +// Also, do a case-insensitive match, assuming s2 +// has no chars in [A-Z], only their lowercase versions. +// (This routine is used for in-place keyword lookup, where s2 is in a keyword +// list and s1 is some substring, possibly mixed-case, in a buffer.) +int +_Strncmpci(Rune *s1, int n1, Rune *s2) +{ + Rune c1, c2; + + for(;;) { + if(n1-- == 0) { + if(*s2 == 0) + return 0; + return -1; + } + c1 = *s1++; + c2 = *s2++; + if(c1 >= 'A' && c1 <= 'Z') + c1 = c1 - 'A' + 'a'; + if(c1 != c2) { + if(c1 > c2) + return 1; + return -1; + } + } +} + +// emalloc and copy +Rune* +_Strdup(Rune* s) +{ + if(s == nil) + return nil; + return _Strndup(s, runestrlen(s)); +} + +// emalloc and copy n chars of s (assume s is at least that long), +// and add 0 terminator. +// Return nil if n==0. +Rune* +_Strndup(Rune* s, int n) +{ + Rune* ans; + + if(n <= 0) + return nil; + ans = _newstr(n); + memmove(ans, s, n*sizeof(Rune)); + ans[n] = 0; + return ans; +} +// emalloc enough room for n Runes, plus 1 null terminator. +// (Not initialized to anything.) +Rune* +_newstr(int n) +{ + return (Rune*)emalloc((n+1)*sizeof(Rune)); +} + +// emalloc and copy s+t +Rune* +_Strdup2(Rune* s, Rune* t) +{ + int ns, nt; + Rune* ans; + Rune* p; + + ns = _Strlen(s); + nt = _Strlen(t); + if(ns+nt == 0) + return nil; + ans = _newstr(ns+nt); + p = _Stradd(ans, s, ns); + p = _Stradd(p, t, nt); + *p = 0; + return ans; +} + +// Return emalloc'd substring s[start:stop], +Rune* +_Strsubstr(Rune* s, int start, int stop) +{ + Rune* t; + + if(start == stop) + return nil; + t = _Strndup(s+start, stop-start); + return t; +} + +// Copy n chars to s1 from s2, and return s1+n +Rune* +_Stradd(Rune* s1, Rune* s2, int n) +{ + if(n == 0) + return s1; + memmove(s1, s2, n*sizeof(Rune)); + return s1+n; +} + +// Like strtol, but converting from Rune* string + +//#define LONG_MAX 2147483647L +//#define LONG_MIN -2147483648L + +long +_Strtol(Rune* nptr, Rune** endptr, int base) +{ + Rune* p; + long n, nn; + int c, ovfl, v, neg, ndig; + + p = nptr; + neg = 0; + n = 0; + ndig = 0; + ovfl = 0; + + /* + * White space + */ + for(;;p++){ + switch(*p){ + case ' ': + case '\t': + case '\n': + case '\f': + case '\r': + case '\v': + continue; + } + break; + } + + /* + * Sign + */ + if(*p=='-' || *p=='+') + if(*p++ == '-') + neg = 1; + + /* + * Base + */ + if(base==0){ + if(*p != '0') + base = 10; + else{ + base = 8; + if(p[1]=='x' || p[1]=='X'){ + p += 2; + base = 16; + } + } + }else if(base==16 && *p=='0'){ + if(p[1]=='x' || p[1]=='X') + p += 2; + }else if(base<0 || 36= base) + break; + nn = n*base + v; + if(nn < n) + ovfl = 1; + n = nn; + } + + Return: + if(ndig == 0) + p = nptr; + if(endptr) + *endptr = p; + if(ovfl){ + if(neg) + return LONG_MIN; + return LONG_MAX; + } + if(neg) + return -n; + return n; +} + +// Convert buf[0:n], bytes whose character set is chset, +// into a emalloc'd null-terminated Unicode string. +Rune* +toStr(uchar* buf, int n, int chset) +{ + int i; + int m; + Rune ch; + Rune* ans; + + switch(chset) { + case US_Ascii: + case ISO_8859_1: + ans = (Rune*)emalloc((n+1)*sizeof(Rune)); + for(i = 0; i < n; i++) + ans[i] = buf[i]; + ans[n] = 0; + break; + + case UTF_8: + m = 0; + for(i = 0; i < n; ) { + i += chartorune(&ch, (char*)(buf+i)); + m++; + } + ans = (Rune*)emalloc((m+1)*sizeof(Rune)); + m = 0; + for(i = 0; i < n; ) { + i += chartorune(&ch, (char*)(buf+i)); + ans[m++] = ch; + } + ans[m] = 0; + break; + + default: + ans = nil; + assert(0); + } + return ans; +} + +// Convert buf[0:n], Unicode characters, +// into an emalloc'd null-terminated string in character set chset. +// Use 0x80 for unconvertable characters. +uchar* +fromStr(Rune* buf, int n, int chset) +{ + uchar* ans; + int i, lim, m; + Rune ch; + uchar* p; + uchar s[UTFmax]; + + ans = nil; + switch(chset) { + case US_Ascii: + case ISO_8859_1: + ans = (uchar*)emalloc(n+1); + lim = (chset==US_Ascii)? 127 : 255; + for(i = 0; i < n; i++) { + ch = buf[i]; + if(ch > lim) + ch = 0x80; + ans[i] = ch; + } + ans[n] = 0; + break; + + case UTF_8: + m = 0; + for(i = 0; i < n; i++) { + m += runetochar((char*)s, &buf[i]); + } + ans = (uchar*)emalloc(m+1); + p = ans; + for(i = 0; i < n; i++) + p += runetochar((char*)p, &buf[i]); + *p = 0; + break; + + default: + assert(0); + } + return ans; + +} + +// Convert n to emalloc'd String. +Rune* +_ltoStr(int n) +{ + int m; + uchar buf[20]; + + m = snprint((char*)buf, sizeof(buf), "%d", n); + return toStr(buf, m, US_Ascii); +} -- cgit v1.2.3