From 7cf289ca89a7416999ae02330236042b0d37e3db Mon Sep 17 00:00:00 2001 From: wkj Date: Tue, 6 Apr 2004 19:06:52 +0000 Subject: Import version of libhtml that might actually work with ANSI C. --- src/libhtml/lex.c | 1384 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 1384 insertions(+) create mode 100644 src/libhtml/lex.c (limited to 'src/libhtml/lex.c') diff --git a/src/libhtml/lex.c b/src/libhtml/lex.c new file mode 100644 index 00000000..99c5fc12 --- /dev/null +++ b/src/libhtml/lex.c @@ -0,0 +1,1384 @@ +#include +#include +#include +#include +#include +#include "impl.h" + +typedef struct TokenSource TokenSource; +struct TokenSource +{ + int i; // index of next byte to use + uchar* data; // all the data + int edata; // data[0:edata] is valid + int chset; // one of US_Ascii, etc. + int mtype; // TextHtml or TextPlain +}; + +enum { + EOF = -2, + EOB = -1 +}; + +#define ISNAMCHAR(c) ((c)<256 && (isalpha(c) || isdigit(c) || (c) == '-' || (c) == '.')) + +#define SMALLBUFSIZE 240 +#define BIGBUFSIZE 2000 + +// HTML 4.0 tag names. +// Keep sorted, and in correspondence with enum in iparse.h. +Rune **tagnames; +char *_tagnames[] = { + " ", + "!", + "a", + "abbr", + "acronym", + "address", + "applet", + "area", + "b", + "base", + "basefont", + "bdo", + "big", + "blink", + "blockquote", + "body", + "bq", + "br", + "button", + "caption", + "center", + "cite", + "code", + "col", + "colgroup", + "dd", + "del", + "dfn", + "dir", + "div", + "dl", + "dt", + "em", + "fieldset", + "font", + "form", + "frame", + "frameset", + "h1", + "h2", + "h3", + "h4", + "h5", + "h6", + "head", + "hr", + "html", + "i", + "iframe", + "img", + "input", + "ins", + "isindex", + "kbd", + "label", + "legend", + "li", + "link", + "map", + "menu", + "meta", + "nobr", + "noframes", + "noscript", + "object", + "ol", + "optgroup", + "option", + "p", + "param", + "pre", + "q", + "s", + "samp", + "script", + "select", + "small", + "span", + "strike", + "strong", + "style", + "sub", + "sup", + "table", + "tbody", + "td", + "textarea", + "tfoot", + "th", + "thead", + "title", + "tr", + "tt", + "u", + "ul", + "var" +}; + +// HTML 4.0 attribute names. +// Keep sorted, and in correspondence with enum in i.h. +Rune **attrnames; +char* _attrnames[] = { + "abbr", + "accept-charset", + "access-key", + "action", + "align", + "alink", + "alt", + "archive", + "axis", + "background", + "bgcolor", + "border", + "cellpadding", + "cellspacing", + "char", + "charoff", + "charset", + "checked", + "cite", + "class", + "classid", + "clear", + "code", + "codebase", + "codetype", + "color", + "cols", + "colspan", + "compact", + "content", + "coords", + "data", + "datetime", + "declare", + "defer", + "dir", + "disabled", + "enctype", + "face", + "for", + "frame", + "frameborder", + "headers", + "height", + "href", + "hreflang", + "hspace", + "http-equiv", + "id", + "ismap", + "label", + "lang", + "link", + "longdesc", + "marginheight", + "marginwidth", + "maxlength", + "media", + "method", + "multiple", + "name", + "nohref", + "noresize", + "noshade", + "nowrap", + "object", + "onblur", + "onchange", + "onclick", + "ondblclick", + "onfocus", + "onkeypress", + "onkeyup", + "onload", + "onmousedown", + "onmousemove", + "onmouseout", + "onmouseover", + "onmouseup", + "onreset", + "onselect", + "onsubmit", + "onunload", + "profile", + "prompt", + "readonly", + "rel", + "rev", + "rows", + "rowspan", + "rules", + "scheme", + "scope", + "scrolling", + "selected", + "shape", + "size", + "span", + "src", + "standby", + "start", + "style", + "summary", + "tabindex", + "target", + "text", + "title", + "type", + "usemap", + "valign", + "value", + "valuetype", + "version", + "vlink", + "vspace", + "width" +}; + + +// Character entity to unicode character number map. +// Keep sorted by name. +StringInt *chartab; +AsciiInt _chartab[142] = { + {"AElig", 198}, + {"Aacute", 193}, + {"Acirc", 194}, + {"Agrave", 192}, + {"Aring", 197}, + {"Atilde", 195}, + {"Auml", 196}, + {"Ccedil", 199}, + {"ETH", 208}, + {"Eacute", 201}, + {"Ecirc", 202}, + {"Egrave", 200}, + {"Euml", 203}, + {"Iacute", 205}, + {"Icirc", 206}, + {"Igrave", 204}, + {"Iuml", 207}, + {"Ntilde", 209}, + {"Oacute", 211}, + {"Ocirc", 212}, + {"Ograve", 210}, + {"Oslash", 216}, + {"Otilde", 213}, + {"Ouml", 214}, + {"THORN", 222}, + {"Uacute", 218}, + {"Ucirc", 219}, + {"Ugrave", 217}, + {"Uuml", 220}, + {"Yacute", 221}, + {"aacute", 225}, + {"acirc", 226}, + {"acute", 180}, + {"aelig", 230}, + {"agrave", 224}, + {"alpha", 945}, + {"amp", 38}, + {"aring", 229}, + {"atilde", 227}, + {"auml", 228}, + {"beta", 946}, + {"brvbar", 166}, + {"ccedil", 231}, + {"cdots", 8943}, + {"cedil", 184}, + {"cent", 162}, + {"chi", 967}, + {"copy", 169}, + {"curren", 164}, + {"ddots", 8945}, + {"deg", 176}, + {"delta", 948}, + {"divide", 247}, + {"eacute", 233}, + {"ecirc", 234}, + {"egrave", 232}, + {"emdash", 8212}, + {"emsp", 8195}, + {"endash", 8211}, + {"ensp", 8194}, + {"epsilon", 949}, + {"eta", 951}, + {"eth", 240}, + {"euml", 235}, + {"frac12", 189}, + {"frac14", 188}, + {"frac34", 190}, + {"gamma", 947}, + {"gt", 62}, + {"iacute", 237}, + {"icirc", 238}, + {"iexcl", 161}, + {"igrave", 236}, + {"iota", 953}, + {"iquest", 191}, + {"iuml", 239}, + {"kappa", 954}, + {"lambda", 955}, + {"laquo", 171}, + {"ldots", 8230}, + {"lt", 60}, + {"macr", 175}, + {"micro", 181}, + {"middot", 183}, + {"mu", 956}, + {"nbsp", 160}, + {"not", 172}, + {"ntilde", 241}, + {"nu", 957}, + {"oacute", 243}, + {"ocirc", 244}, + {"ograve", 242}, + {"omega", 969}, + {"omicron", 959}, + {"ordf", 170}, + {"ordm", 186}, + {"oslash", 248}, + {"otilde", 245}, + {"ouml", 246}, + {"para", 182}, + {"phi", 966}, + {"pi", 960}, + {"plusmn", 177}, + {"pound", 163}, + {"psi", 968}, + {"quad", 8193}, + {"quot", 34}, + {"raquo", 187}, + {"reg", 174}, + {"rho", 961}, + {"sect", 167}, + {"shy", 173}, + {"sigma", 963}, + {"sp", 8194}, + {"sup1", 185}, + {"sup2", 178}, + {"sup3", 179}, + {"szlig", 223}, + {"tau", 964}, + {"theta", 952}, + {"thinsp", 8201}, + {"thorn", 254}, + {"times", 215}, + {"trade", 8482}, + {"uacute", 250}, + {"ucirc", 251}, + {"ugrave", 249}, + {"uml", 168}, + {"upsilon", 965}, + {"uuml", 252}, + {"varepsilon", 8712}, + {"varphi", 981}, + {"varpi", 982}, + {"varrho", 1009}, + {"vdots", 8942}, + {"vsigma", 962}, + {"vtheta", 977}, + {"xi", 958}, + {"yacute", 253}, + {"yen", 165}, + {"yuml", 255}, + {"zeta", 950} +}; +#define NCHARTAB (sizeof(chartab)/sizeof(chartab[0])) + +// Characters Winstart..Winend are those that Windows +// uses interpolated into the Latin1 set. +// They aren't supposed to appear in HTML, but they do.... +enum { + Winstart = 127, + Winend = 159 +}; + +static int winchars[]= { 8226, // 8226 is a bullet + 8226, 8226, 8218, 402, 8222, 8230, 8224, 8225, + 710, 8240, 352, 8249, 338, 8226, 8226, 8226, + 8226, 8216, 8217, 8220, 8221, 8226, 8211, 8212, + 732, 8482, 353, 8250, 339, 8226, 8226, 376}; + +static StringInt* tagtable; // initialized from tagnames +static StringInt* attrtable; // initialized from attrnames + +static void lexinit(); +static int getplaindata(TokenSource* ts, Token* a, int* pai); +static int getdata(TokenSource* ts, int firstc, int starti, Token* a, int* pai); +static int getscriptdata(TokenSource* ts, int firstc, int starti, Token* a, int* pai); +static int gettag(TokenSource* ts, int starti, Token* a, int* pai); +static Rune* buftostr(Rune* s, Rune* buf, int j); +static int comment(TokenSource* ts); +static int findstr(TokenSource* ts, Rune* s); +static int ampersand(TokenSource* ts); +//static int lowerc(int c); +static int getchar(TokenSource* ts); +static void ungetchar(TokenSource* ts, int c); +static void backup(TokenSource* ts, int savei); +//static void freeinsidetoken(Token* t); +static void freeattrs(Attr* ahead); +static Attr* newattr(int attid, Rune* value, Attr* link); +static int Tconv(Fmt* f); + +int dbglex = 0; +static int lexinited = 0; + +static void +lexinit(void) +{ + chartab = cvtstringinttab(_chartab, nelem(_chartab)); + tagnames = cvtstringtab(_tagnames, nelem(_tagnames)); + tagtable = _makestrinttab(tagnames, Numtags); + attrnames = cvtstringtab(_attrnames, nelem(_attrnames)); + attrtable = _makestrinttab(attrnames, Numattrs); + fmtinstall('T', Tconv); + lexinited = 1; +} + +static TokenSource* +newtokensource(uchar* data, int edata, int chset, int mtype) +{ + TokenSource* ans; + + assert(chset == US_Ascii || chset == ISO_8859_1 || + chset == UTF_8 || chset == Unicode); + ans = (TokenSource*)emalloc(sizeof(TokenSource)); + ans->i = 0; + ans->data = data; + ans->edata = edata; + ans->chset = chset; + ans->mtype = mtype; + return ans; +} + +enum { + ToksChunk = 500 +}; + +// Call this to get the tokens. +// The number of returned tokens is returned in *plen. +Token* +_gettoks(uchar* data, int datalen, int chset, int mtype, int* plen) +{ + TokenSource* ts; + Token* a; + int alen; + int ai; + int starti; + int c; + int tag; + + if(!lexinited) + lexinit(); + ts = newtokensource(data, datalen, chset, mtype); + alen = ToksChunk; + a = (Token*)emalloc(alen * sizeof(Token)); + ai = 0; + if(dbglex) + fprint(2, "_gettoks starts, ts.i=%d, ts.edata=%d\n", ts->i, ts->edata); + if(ts->mtype == TextHtml) { + for(;;) { + if(ai == alen) { + a = (Token*)erealloc(a, (alen+ToksChunk)*sizeof(Token)); + alen += ToksChunk; + } + starti = ts->i; + c = getchar(ts); + if(c < 0) + break; + if(c == '<') { + tag = gettag(ts, starti, a, &ai); + if(tag == Tscript) { + // special rules for getting Data after.... + starti = ts->i; + c = getchar(ts); + tag = getscriptdata(ts, c, starti, a, &ai); + } + } + else + tag = getdata(ts, c, starti, a, &ai); + if(tag == -1) + break; + else if(dbglex > 1 && tag != Comment) + fprint(2, "lex: got token %T\n", &a[ai-1]); + } + } + else { + // plain text (non-html) tokens + for(;;) { + if(ai == alen) { + a = (Token*)erealloc(a, (alen+ToksChunk)*sizeof(Token)); + alen += ToksChunk; + } + tag = getplaindata(ts, a, &ai); + if(tag == -1) + break; + if(dbglex > 1) + fprint(2, "lex: got token %T\n", &a[ai]); + } + } + if(dbglex) + fprint(2, "lex: returning %d tokens\n", ai); + *plen = ai; + if(ai == 0) + return nil; + return a; +} + +// For case where source isn't HTML. +// Just make data tokens, one per line (or partial line, +// at end of buffer), ignoring non-whitespace control +// characters and dumping \r's. +// If find non-empty token, fill in a[*pai], bump *pai, and return Data. +// Otherwise return -1; +static int +getplaindata(TokenSource* ts, Token* a, int* pai) +{ + Rune* s; + int j; + int starti; + int c; + Token* tok; + Rune buf[BIGBUFSIZE]; + + s = nil; + j = 0; + starti = ts->i; + for(c = getchar(ts); c >= 0; c = getchar(ts)) { + if(c < ' ') { + if(isspace(c)) { + if(c == '\r') { + // ignore it unless no following '\n', + // in which case treat it like '\n' + c = getchar(ts); + if(c != '\n') { + if(c >= 0) + ungetchar(ts, c); + c = '\n'; + } + } + } + else + c = 0; + } + if(c != 0) { + buf[j++] = c; + if(j == sizeof(buf)-1) { + s = buftostr(s, buf, j); + j = 0; + } + } + if(c == '\n') + break; + } + s = buftostr(s, buf, j); + if(s == nil) + return -1; + tok = &a[(*pai)++]; + tok->tag = Data; + tok->text = s; + tok->attr = nil; + tok->starti = starti; + return Data; +} + +// Return concatenation of s and buf[0:j] +static Rune* +buftostr(Rune* s, Rune* buf, int j) +{ + buf[j] = 0; + if(s == nil) + s = _Strndup(buf, j); + else + s = _Strdup2(s, buf); + return s; +} + +// Gather data up to next start-of-tag or end-of-buffer. +// Translate entity references (&). +// Ignore non-whitespace control characters and get rid of \r's. +// If find non-empty token, fill in a[*pai], bump *pai, and return Data. +// Otherwise return -1; +static int +getdata(TokenSource* ts, int firstc, int starti, Token* a, int* pai) +{ + Rune* s; + int j; + int c; + Token* tok; + Rune buf[BIGBUFSIZE]; + + s = nil; + j = 0; + c = firstc; + while(c >= 0) { + if(c == '&') { + c = ampersand(ts); + if(c < 0) + break; + } + else if(c < ' ') { + if(isspace(c)) { + if(c == '\r') { + // ignore it unless no following '\n', + // in which case treat it like '\n' + c = getchar(ts); + if(c != '\n') { + if(c >= 0) + ungetchar(ts, c); + c = '\n'; + } + } + } + else { + if(warn) + fprint(2, "warning: non-whitespace control character %d ignored\n", c); + c = 0; + } + } + else if(c == '<') { + ungetchar(ts, c); + break; + } + if(c != 0) { + buf[j++] = c; + if(j == BIGBUFSIZE-1) { + s = buftostr(s, buf, j); + j = 0; + } + } + c = getchar(ts); + } + s = buftostr(s, buf, j); + if(s == nil) + return -1; + tok = &a[(*pai)++]; + tok->tag = Data; + tok->text = s; + tok->attr = nil; + tok->starti = starti; + return Data; +} + +// The rules for lexing scripts are different (ugh). +// Gather up everything until see a . +static int +getscriptdata(TokenSource* ts, int firstc, int starti, Token* a, int* pai) +{ + Rune* s; + int j; + int tstarti; + int savei; + int c; + int tag; + int done; + Token* tok; + Rune buf[BIGBUFSIZE]; + + s = nil; + j = 0; + tstarti = starti; + c = firstc; + done = 0; + while(c >= 0) { + if(c == '<') { + // other browsers ignore stuff to end of line after i; + c = getchar(ts); + if(c == '!') { + while(c >= 0 && c != '\n' && c != '\r') + c = getchar(ts); + if(c == '\r') + c = getchar(ts); + if(c == '\n') + c = getchar(ts); + } + else if(c >= 0) { + backup(ts, savei); + tag = gettag(ts, tstarti, a, pai); + if(tag == -1) + break; + if(tag != Comment) + (*pai)--; + backup(ts, tstarti); + if(tag == Tscript + RBRA) { + done = 1; + break; + } + // here tag was not , so take as regular data + c = getchar(ts); + } + } + if(c < 0) + break; + if(c != 0) { + buf[j++] = c; + if(j == BIGBUFSIZE-1) { + s = buftostr(s, buf, j); + j = 0; + } + } + tstarti = ts->i; + c = getchar(ts); + } + if(done || ts->i == ts->edata) { + s = buftostr(s, buf, j); + tok = &a[(*pai)++]; + tok->tag = Data; + tok->text = s; + tok->attr = nil; + tok->starti = starti; + return Data; + } + backup(ts, starti); + return -1; +} + +// We've just seen a '<'. Gather up stuff to closing '>' (if buffer +// ends before then, return -1). +// If it's a tag, look up the name, gather the attributes, and return +// the appropriate token. +// Else it's either just plain data or some kind of ignorable stuff: +// return Data or Comment as appropriate. +// If it's not a Comment, put it in a[*pai] and bump *pai. +static int +gettag(TokenSource* ts, int starti, Token* a, int* pai) +{ + int rbra; + int ans; + Attr* al; + int nexti; + int c; + int ti; + int afnd; + int attid; + int quote; + Rune* val; + int nv; + int i; + int tag; + Token* tok; + Rune buf[BIGBUFSIZE]; + + rbra = 0; + nexti = ts->i; + tok = &a[*pai]; + tok->tag = Notfound; + tok->text = nil; + tok->attr = nil; + tok->starti = starti; + c = getchar(ts); + if(c == '/') { + rbra = RBRA; + c = getchar(ts); + } + if(c < 0) + goto eob_done; + if(c >= 256 || !isalpha(c)) { + // not a tag + if(c == '!') { + ans = comment(ts); + if(ans != -1) + return ans; + goto eob_done; + } + else { + backup(ts, nexti); + tok->tag = Data; + tok->text = _Strdup(L(Llt)); + (*pai)++; + return Data; + } + } + // c starts a tagname + buf[0] = c; + i = 1; + while(1) { + c = getchar(ts); + if(c < 0) + goto eob_done; + if(!ISNAMCHAR(c)) + break; + // if name is bigger than buf it won't be found anyway... + if(i < BIGBUFSIZE) + buf[i++] = c; + } + if(_lookup(tagtable, Numtags, buf, i, &tag)) + tok->tag = tag + rbra; + else + tok->text = _Strndup(buf, i); // for warning print, in build + + // attribute gathering loop + al = nil; + while(1) { + // look for "ws name" or "ws name ws = ws val" (ws=whitespace) + // skip whitespace +attrloop_continue: + while(c < 256 && isspace(c)) { + c = getchar(ts); + if(c < 0) + goto eob_done; + } + if(c == '>') + goto attrloop_done; + if(c == '<') { + if(warn) + fprint(2, "warning: unclosed tag\n"); + ungetchar(ts, c); + goto attrloop_done; + } + if(c >= 256 || !isalpha(c)) { + if(warn) + fprint(2, "warning: expected attribute name\n"); + // skipt to next attribute name + while(1) { + c = getchar(ts); + if(c < 0) + goto eob_done; + if(c < 256 && isalpha(c)) + goto attrloop_continue; + if(c == '<') { + if(warn) + fprint(2, "warning: unclosed tag\n"); + ungetchar(ts, 60); + goto attrloop_done; + } + if(c == '>') + goto attrloop_done; + } + } + // gather attribute name + buf[0] = c; + i = 1; + while(1) { + c = getchar(ts); + if(c < 0) + goto eob_done; + if(!ISNAMCHAR(c)) + break; + if(i < BIGBUFSIZE-1) + buf[i++] = c; + } + afnd = _lookup(attrtable, Numattrs, buf, i, &attid); + if(warn && !afnd) { + buf[i] = 0; + fprint(2, "warning: unknown attribute name %S\n", buf); + } + // skip whitespace + while(c < 256 && isspace(c)) { + c = getchar(ts); + if(c < 0) + goto eob_done; + } + if(c != '=') { + if(afnd) + al = newattr(attid, nil, al); + goto attrloop_continue; + } + //# c is '=' here; skip whitespace + while(1) { + c = getchar(ts); + if(c < 0) + goto eob_done; + if(c >= 256 || !isspace(c)) + break; + } + quote = 0; + if(c == '\'' || c == '"') { + quote = c; + c = getchar(ts); + if(c < 0) + goto eob_done; + } + val = nil; + nv = 0; + while(1) { +valloop_continue: + if(c < 0) + goto eob_done; + if(c == '>') { + if(quote) { + // c might be part of string (though not good style) + // but if line ends before close quote, assume + // there was an unmatched quote + ti = ts->i; + while(1) { + c = getchar(ts); + if(c < 0) + goto eob_done; + if(c == quote) { + backup(ts, ti); + buf[nv++] = '>'; + if(nv == BIGBUFSIZE-1) { + val = buftostr(val, buf, nv); + nv = 0; + } + c = getchar(ts); + goto valloop_continue; + } + if(c == '\n') { + if(warn) + fprint(2, "warning: apparent unmatched quote\n"); + backup(ts, ti); + c = '>'; + goto valloop_done; + } + } + } + else + goto valloop_done; + } + if(quote) { + if(c == quote) { + c = getchar(ts); + if(c < 0) + goto eob_done; + goto valloop_done; + } + if(c == '\r') { + c = getchar(ts); + goto valloop_continue; + } + if(c == '\t' || c == '\n') + c = ' '; + } + else { + if(c < 256 && isspace(c)) + goto valloop_done; + } + if(c == '&') { + c = ampersand(ts); + if(c == -1) + goto eob_done; + } + buf[nv++] = c; + if(nv == BIGBUFSIZE-1) { + val = buftostr(val, buf, nv); + nv = 0; + } + c = getchar(ts); + } +valloop_done: + if(afnd) { + val = buftostr(val, buf, nv); + al = newattr(attid, val, al); + } + } + +attrloop_done: + tok->attr = al; + (*pai)++; + return tok->tag; + +eob_done: + if(warn) + fprint(2, "warning: incomplete tag at end of page\n"); + backup(ts, nexti); + tok->tag = Data; + tok->text = _Strdup(L(Llt)); + return Data; +} + +// We've just read a ' to close, or if none, > to close. +// If see to close. +// If no close before end of file, leave original characters in as literal data. +// +// If we see ignorable stuff, return Comment. +// Else return nil (caller should back up and try again when more data arrives, +// unless at end of file, in which case caller should just make '<' a data token). +static int +comment(TokenSource* ts) +{ + int nexti; + int havecomment; + int c; + + nexti = ts->i; + havecomment = 0; + c = getchar(ts); + if(c == '-') { + c = getchar(ts); + if(c == '-') { + if(findstr(ts, L(Larrow))) + havecomment = 1; + else + backup(ts, nexti); + } + } + if(!havecomment) { + if(c == '>') + havecomment = 1; + else if(c >= 0) { + if(findstr(ts, L(Lgt))) + havecomment = 1; + } + } + if(havecomment) + return Comment; + return -1; +} + +// Look for string s in token source. +// If found, return 1, with buffer at next char after s, +// else return 0 (caller should back up). +static int +findstr(TokenSource* ts, Rune* s) +{ + int c0; + int n; + int nexti; + int i; + int c; + + c0 = s[0]; + n = runestrlen(s); + while(1) { + c = getchar(ts); + if(c < 0) + break; + if(c == c0) { + if(n == 1) + return 1; + nexti = ts->i; + for(i = 1; i < n; i++) { + c = getchar(ts); + if(c < 0) + goto mainloop_done; + if(c != s[i]) + break; + } + if(i == n) + return 1; + backup(ts, nexti); + } + } +mainloop_done: + return 0; +} + +// We've just read an '&'; look for an entity reference +// name, and if found, return translated char. +// if there is a complete entity name but it isn't known, +// try prefixes (gets around some buggy HTML out there), +// and if that fails, back up to just past the '&' and return '&'. +// If the entity can't be completed in the current buffer, back up +// to the '&' and return -1. +static int +ampersand(TokenSource* ts) +{ + int savei; + int c; + int fnd; + int ans; + int v; + int i; + int k; + Rune buf[SMALLBUFSIZE]; + + savei = ts->i; + c = getchar(ts); + fnd = 0; + ans = -1; + if(c == '#') { + c = getchar(ts); + v = 0; + while(c >= 0) { + if(!(c < 256 && isdigit(c))) + break; + v = v*10 + c - 48; + c = getchar(ts); + } + if(c >= 0) { + if(!(c == ';' || c == '\n' || c == '\r')) + ungetchar(ts, c); + c = v; + if(c == 160) + c = 160; + if(c >= Winstart && c <= Winend) { + c = winchars[c - Winstart]; + } + ans = c; + fnd = 1; + } + } + else if(c < 256 && isalpha(c)) { + buf[0] = c; + k = 1; + while(1) { + c = getchar(ts); + if(c < 0) + break; + if(ISNAMCHAR(c)) { + if(k < SMALLBUFSIZE-1) + buf[k++] = c; + } + else { + if(!(c == ';' || c == '\n' || c == '\r')) + ungetchar(ts, c); + break; + } + } + if(c >= 0) { + fnd = _lookup(chartab, NCHARTAB, buf, k, &ans); + if(!fnd) { + // Try prefixes of s + if(c == ';' || c == '\n' || c == '\r') + ungetchar(ts, c); + i = k; + while(--k > 0) { + fnd = _lookup(chartab, NCHARTAB, buf, k, &ans); + if(fnd) { + while(i > k) { + i--; + ungetchar(ts, buf[i]); + } + break; + } + } + } + } + } + if(!fnd) { + backup(ts, savei); + ans = '&'; + } + return ans; +} + +// Get next char, obeying ts.chset. +// Returns -1 if no complete character left before current end of data. +static int +getchar(TokenSource* ts) +{ + uchar* buf; + int c; + int n; + int ok; + Rune r; + + if(ts->i >= ts->edata) + return -1; + buf = ts->data; + c = buf[ts->i]; + switch(ts->chset) { + case ISO_8859_1: + if(c >= Winstart && c <= Winend) + c = winchars[c - Winstart]; + ts->i++; + break; + case US_Ascii: + if(c > 127) { + if(warn) + fprint(2, "non-ascii char (%x) when US-ASCII specified\n", c); + } + ts->i++; + break; + case UTF_8: + ok = fullrune((char*)(buf+ts->i), ts->edata-ts->i); + n = chartorune(&r, (char*)(buf+ts->i)); + if(ok) { + if(warn && c == 0x80) + fprint(2, "warning: invalid utf-8 sequence (starts with %x)\n", ts->data[ts->i]); + ts->i += n; + c = r; + } + else { + // not enough bytes in buf to complete utf-8 char + ts->i = ts->edata; // mark "all used" + c = -1; + } + break; + case Unicode: + if(ts->i < ts->edata - 1) { + //standards say most-significant byte first + c = (c << 8)|(buf[ts->i + 1]); + ts->i += 2; + } + else { + ts->i = ts->edata; // mark "all used" + c = -1; + } + break; + } + return c; +} + +// Assuming c was the last character returned by getchar, set +// things up so that next getchar will get that same character +// followed by the current 'next character', etc. +static void +ungetchar(TokenSource* ts, int c) +{ + int n; + Rune r; + char a[UTFmax]; + + n = 1; + switch(ts->chset) { + case UTF_8: + if(c >= 128) { + r = c; + n = runetochar(a, &r); + } + break; + case Unicode: + n = 2; + break; + } + ts->i -= n; +} + +// Restore ts so that it is at the state where the index was savei. +static void +backup(TokenSource* ts, int savei) +{ + if(dbglex) + fprint(2, "lex: backup; i=%d, savei=%d\n", ts->i, savei); + ts->i = savei; +} + + +// Look for value associated with attribute attid in token t. +// If there is one, return 1 and put the value in *pans, +// else return 0. +// If xfer is true, transfer ownership of the string to the caller +// (nil it out here); otherwise, caller must duplicate the answer +// if it needs to save it. +// OK to have pans==0, in which case this is just looking +// to see if token is present. +int +_tokaval(Token* t, int attid, Rune** pans, int xfer) +{ + Attr* attr; + + attr = t->attr; + while(attr != nil) { + if(attr->attid == attid) { + if(pans != nil) + *pans = attr->value; + if(xfer) + attr->value = nil; + return 1; + } + attr = attr->next; + } + if(pans != nil) + *pans = nil; + return 0; +} + +static int +Tconv(Fmt *f) +{ + Token* t; + int i; + int tag; + char* srbra; + Rune* aname; + Rune* tname; + Attr* a; + char buf[BIGBUFSIZE]; + + t = va_arg(f->args, Token*); + if(t == nil) + sprint(buf, ""); + else { + i = 0; + if(dbglex > 1) + i = snprint(buf, sizeof(buf), "[%d]", t->starti); + tag = t->tag; + if(tag == Data) { + i += snprint(buf+i, sizeof(buf)-i-1, "'%S'", t->text); + } + else { + srbra = ""; + if(tag >= RBRA) { + tag -= RBRA; + srbra = "/"; + } + tname = tagnames[tag]; + if(tag == Notfound) + tname = L(Lquestion); + i += snprint(buf+i, sizeof(buf)-i-1, "<%s%S", srbra, tname); + for(a = t->attr; a != nil; a = a->next) { + aname = attrnames[a->attid]; + i += snprint(buf+i, sizeof(buf)-i-1, " %S", aname); + if(a->value != nil) + i += snprint(buf+i, sizeof(buf)-i-1, "=%S", a->value); + } + i += snprint(buf+i, sizeof(buf)-i-1, ">"); + } + buf[i] = 0; + } + return fmtstrcpy(f, buf); +} + +// Attrs own their constituent strings, but build may eventually +// transfer some values to its items and nil them out in the Attr. +static Attr* +newattr(int attid, Rune* value, Attr* link) +{ + Attr* ans; + + ans = (Attr*)emalloc(sizeof(Attr)); + ans->attid = attid; + ans->value = value; + ans->next = link; + return ans; +} + +// Free list of Attrs linked through next field +static void +freeattrs(Attr* ahead) +{ + Attr* a; + Attr* nexta; + + a = ahead; + while(a != nil) { + nexta = a->next; + free(a->value); + free(a); + a = nexta; + } +} + +// Free array of Tokens. +// Allocated space might have room for more than n tokens, +// but only n of them are initialized. +// If caller has transferred ownership of constitutent strings +// or attributes, it must have nil'd out the pointers in the Tokens. +void +_freetokens(Token* tarray, int n) +{ + int i; + Token* t; + + if(tarray == nil) + return; + for(i = 0; i < n; i++) { + t = &tarray[i]; + free(t->text); + freeattrs(t->attr); + } + free(tarray); +} -- cgit v1.2.3