aboutsummaryrefslogtreecommitdiff
path: root/src/libhtml/lex.c
diff options
context:
space:
mode:
authorwkj <devnull@localhost>2004-04-06 19:06:52 +0000
committerwkj <devnull@localhost>2004-04-06 19:06:52 +0000
commit7cf289ca89a7416999ae02330236042b0d37e3db (patch)
tree796d1363a7a53c72c28b199758ee674f1326a510 /src/libhtml/lex.c
parent3e3817f7c86658f60715dd93768eaf8285807985 (diff)
downloadplan9port-7cf289ca89a7416999ae02330236042b0d37e3db.tar.gz
plan9port-7cf289ca89a7416999ae02330236042b0d37e3db.tar.bz2
plan9port-7cf289ca89a7416999ae02330236042b0d37e3db.zip
Import version of libhtml that might actually work with ANSI C.
Diffstat (limited to 'src/libhtml/lex.c')
-rw-r--r--src/libhtml/lex.c1384
1 files changed, 1384 insertions, 0 deletions
diff --git a/src/libhtml/lex.c b/src/libhtml/lex.c
new file mode 100644
index 00000000..99c5fc12
--- /dev/null
+++ b/src/libhtml/lex.c
@@ -0,0 +1,1384 @@
+#include <u.h>
+#include <libc.h>
+#include <draw.h>
+#include <ctype.h>
+#include <html.h>
+#include "impl.h"
+
+// Cursor over the raw bytes being lexed, together with the
+// information needed to decode them into characters.
+typedef struct TokenSource TokenSource;
+struct TokenSource
+{
+ int i; // index into data of the next byte to consume
+ uchar* data; // the input bytes (referenced, not copied, by newtokensource)
+ int edata; // number of valid bytes: data[0:edata]
+ int chset; // character set: US_Ascii, ISO_8859_1, UTF_8 or Unicode
+ int mtype; // media type: TextHtml or TextPlain
+};
+
+// Negative sentinel values.  Nothing in the visible part of this file
+// refers to them by name; getchar() reports exhausted input with a literal -1.
+enum {
+ EOF = -2,
+ EOB = -1
+};
+
+// True if c may appear inside a tag, attribute or entity name:
+// a character code below 256 that is a letter, a digit, '-' or '.'.
+// c is evaluated several times, so pass an expression without side effects.
+// Every caller in this file checks c < 0 before applying the macro.
+#define ISNAMCHAR(c) ((c)<256 && (isalpha(c) || isdigit(c) || (c) == '-' || (c) == '.'))
+
+// Element counts of the on-stack scratch buffers used while lexing
+// (SMALLBUFSIZE for entity names, BIGBUFSIZE for everything else).
+#define SMALLBUFSIZE 240
+#define BIGBUFSIZE 2000
+
+// HTML 4.0 tag names.
+// Keep sorted, and in correspondence with enum in iparse.h.
+// _tagnames holds the names as C strings; lexinit() converts them with
+// cvtstringtab() and stores the result in tagnames, from which the
+// tag lookup table is then built.
+// The first two entries (" " and "!") are not real tag names;
+// presumably they line up with Notfound and Comment -- confirm against the enum.
+Rune **tagnames;
+char *_tagnames[] = {
+ " ",
+ "!",
+ "a",
+ "abbr",
+ "acronym",
+ "address",
+ "applet",
+ "area",
+ "b",
+ "base",
+ "basefont",
+ "bdo",
+ "big",
+ "blink",
+ "blockquote",
+ "body",
+ "bq",
+ "br",
+ "button",
+ "caption",
+ "center",
+ "cite",
+ "code",
+ "col",
+ "colgroup",
+ "dd",
+ "del",
+ "dfn",
+ "dir",
+ "div",
+ "dl",
+ "dt",
+ "em",
+ "fieldset",
+ "font",
+ "form",
+ "frame",
+ "frameset",
+ "h1",
+ "h2",
+ "h3",
+ "h4",
+ "h5",
+ "h6",
+ "head",
+ "hr",
+ "html",
+ "i",
+ "iframe",
+ "img",
+ "input",
+ "ins",
+ "isindex",
+ "kbd",
+ "label",
+ "legend",
+ "li",
+ "link",
+ "map",
+ "menu",
+ "meta",
+ "nobr",
+ "noframes",
+ "noscript",
+ "object",
+ "ol",
+ "optgroup",
+ "option",
+ "p",
+ "param",
+ "pre",
+ "q",
+ "s",
+ "samp",
+ "script",
+ "select",
+ "small",
+ "span",
+ "strike",
+ "strong",
+ "style",
+ "sub",
+ "sup",
+ "table",
+ "tbody",
+ "td",
+ "textarea",
+ "tfoot",
+ "th",
+ "thead",
+ "title",
+ "tr",
+ "tt",
+ "u",
+ "ul",
+ "var"
+};
+
+// HTML 4.0 attribute names.
+// Keep sorted, and in correspondence with enum in i.h.
+// _attrnames holds the names as C strings; lexinit() converts them with
+// cvtstringtab() and stores the result in attrnames, from which the
+// attribute lookup table is then built.
+Rune **attrnames;
+char* _attrnames[] = {
+ "abbr",
+ "accept-charset",
+ "access-key",
+ "action",
+ "align",
+ "alink",
+ "alt",
+ "archive",
+ "axis",
+ "background",
+ "bgcolor",
+ "border",
+ "cellpadding",
+ "cellspacing",
+ "char",
+ "charoff",
+ "charset",
+ "checked",
+ "cite",
+ "class",
+ "classid",
+ "clear",
+ "code",
+ "codebase",
+ "codetype",
+ "color",
+ "cols",
+ "colspan",
+ "compact",
+ "content",
+ "coords",
+ "data",
+ "datetime",
+ "declare",
+ "defer",
+ "dir",
+ "disabled",
+ "enctype",
+ "face",
+ "for",
+ "frame",
+ "frameborder",
+ "headers",
+ "height",
+ "href",
+ "hreflang",
+ "hspace",
+ "http-equiv",
+ "id",
+ "ismap",
+ "label",
+ "lang",
+ "link",
+ "longdesc",
+ "marginheight",
+ "marginwidth",
+ "maxlength",
+ "media",
+ "method",
+ "multiple",
+ "name",
+ "nohref",
+ "noresize",
+ "noshade",
+ "nowrap",
+ "object",
+ "onblur",
+ "onchange",
+ "onclick",
+ "ondblclick",
+ "onfocus",
+ "onkeypress",
+ "onkeyup",
+ "onload",
+ "onmousedown",
+ "onmousemove",
+ "onmouseout",
+ "onmouseover",
+ "onmouseup",
+ "onreset",
+ "onselect",
+ "onsubmit",
+ "onunload",
+ "profile",
+ "prompt",
+ "readonly",
+ "rel",
+ "rev",
+ "rows",
+ "rowspan",
+ "rules",
+ "scheme",
+ "scope",
+ "scrolling",
+ "selected",
+ "shape",
+ "size",
+ "span",
+ "src",
+ "standby",
+ "start",
+ "style",
+ "summary",
+ "tabindex",
+ "target",
+ "text",
+ "title",
+ "type",
+ "usemap",
+ "valign",
+ "value",
+ "valuetype",
+ "version",
+ "vlink",
+ "vspace",
+ "width"
+};
+
+
+// Character entity to unicode character number map.
+// Keep sorted by name.
+// _chartab is the compile-time table (142 entries, matching the explicit
+// array bound).  lexinit() converts it with cvtstringinttab() and stores
+// the result in chartab.  Note that chartab is a pointer, so
+// sizeof(chartab) is the size of a pointer, not of the table.
+StringInt *chartab;
+AsciiInt _chartab[142] = {
+ {"AElig", 198},
+ {"Aacute", 193},
+ {"Acirc", 194},
+ {"Agrave", 192},
+ {"Aring", 197},
+ {"Atilde", 195},
+ {"Auml", 196},
+ {"Ccedil", 199},
+ {"ETH", 208},
+ {"Eacute", 201},
+ {"Ecirc", 202},
+ {"Egrave", 200},
+ {"Euml", 203},
+ {"Iacute", 205},
+ {"Icirc", 206},
+ {"Igrave", 204},
+ {"Iuml", 207},
+ {"Ntilde", 209},
+ {"Oacute", 211},
+ {"Ocirc", 212},
+ {"Ograve", 210},
+ {"Oslash", 216},
+ {"Otilde", 213},
+ {"Ouml", 214},
+ {"THORN", 222},
+ {"Uacute", 218},
+ {"Ucirc", 219},
+ {"Ugrave", 217},
+ {"Uuml", 220},
+ {"Yacute", 221},
+ {"aacute", 225},
+ {"acirc", 226},
+ {"acute", 180},
+ {"aelig", 230},
+ {"agrave", 224},
+ {"alpha", 945},
+ {"amp", 38},
+ {"aring", 229},
+ {"atilde", 227},
+ {"auml", 228},
+ {"beta", 946},
+ {"brvbar", 166},
+ {"ccedil", 231},
+ {"cdots", 8943},
+ {"cedil", 184},
+ {"cent", 162},
+ {"chi", 967},
+ {"copy", 169},
+ {"curren", 164},
+ {"ddots", 8945},
+ {"deg", 176},
+ {"delta", 948},
+ {"divide", 247},
+ {"eacute", 233},
+ {"ecirc", 234},
+ {"egrave", 232},
+ {"emdash", 8212},
+ {"emsp", 8195},
+ {"endash", 8211},
+ {"ensp", 8194},
+ {"epsilon", 949},
+ {"eta", 951},
+ {"eth", 240},
+ {"euml", 235},
+ {"frac12", 189},
+ {"frac14", 188},
+ {"frac34", 190},
+ {"gamma", 947},
+ {"gt", 62},
+ {"iacute", 237},
+ {"icirc", 238},
+ {"iexcl", 161},
+ {"igrave", 236},
+ {"iota", 953},
+ {"iquest", 191},
+ {"iuml", 239},
+ {"kappa", 954},
+ {"lambda", 955},
+ {"laquo", 171},
+ {"ldots", 8230},
+ {"lt", 60},
+ {"macr", 175},
+ {"micro", 181},
+ {"middot", 183},
+ {"mu", 956},
+ {"nbsp", 160},
+ {"not", 172},
+ {"ntilde", 241},
+ {"nu", 957},
+ {"oacute", 243},
+ {"ocirc", 244},
+ {"ograve", 242},
+ {"omega", 969},
+ {"omicron", 959},
+ {"ordf", 170},
+ {"ordm", 186},
+ {"oslash", 248},
+ {"otilde", 245},
+ {"ouml", 246},
+ {"para", 182},
+ {"phi", 966},
+ {"pi", 960},
+ {"plusmn", 177},
+ {"pound", 163},
+ {"psi", 968},
+ {"quad", 8193},
+ {"quot", 34},
+ {"raquo", 187},
+ {"reg", 174},
+ {"rho", 961},
+ {"sect", 167},
+ {"shy", 173},
+ {"sigma", 963},
+ {"sp", 8194},
+ {"sup1", 185},
+ {"sup2", 178},
+ {"sup3", 179},
+ {"szlig", 223},
+ {"tau", 964},
+ {"theta", 952},
+ {"thinsp", 8201},
+ {"thorn", 254},
+ {"times", 215},
+ {"trade", 8482},
+ {"uacute", 250},
+ {"ucirc", 251},
+ {"ugrave", 249},
+ {"uml", 168},
+ {"upsilon", 965},
+ {"uuml", 252},
+ {"varepsilon", 8712},
+ {"varphi", 981},
+ {"varpi", 982},
+ {"varrho", 1009},
+ {"vdots", 8942},
+ {"vsigma", 962},
+ {"vtheta", 977},
+ {"xi", 958},
+ {"yacute", 253},
+ {"yen", 165},
+ {"yuml", 255},
+ {"zeta", 950}
+};
+// Number of entries in the character entity table.
+// This must be computed from the array _chartab: chartab is only a
+// pointer, so sizeof(chartab)/sizeof(chartab[0]) evaluates to 0 and
+// every entity lookup sized by it would fail.
+#define NCHARTAB (sizeof(_chartab)/sizeof(_chartab[0]))
+
+// Characters Winstart..Winend are those that Windows
+// uses interpolated into the Latin1 set.
+// They aren't supposed to appear in HTML, but they do....
+// Codes in this inclusive range are replaced via winchars[c - Winstart].
+enum {
+ Winstart = 127,
+ Winend = 159
+};
+
+// Replacement character numbers for codes Winstart..Winend, indexed by
+// (code - Winstart); 33 entries in all.  The many 8226 (bullet) entries
+// presumably stand in for codes with no better equivalent.
+static int winchars[]= { 8226, // 8226 is a bullet
+ 8226, 8226, 8218, 402, 8222, 8230, 8224, 8225,
+ 710, 8240, 352, 8249, 338, 8226, 8226, 8226,
+ 8226, 8216, 8217, 8220, 8221, 8226, 8211, 8212,
+ 732, 8482, 353, 8250, 339, 8226, 8226, 376};
+
+static StringInt* tagtable;	// built by lexinit() from tagnames
+static StringInt* attrtable;	// built by lexinit() from attrnames
+
+// Forward declarations of the file-local helpers.
+// lexinit is declared with (void) so that the declaration is a real
+// prototype and agrees with its definition.
+static void lexinit(void);
+static int getplaindata(TokenSource* ts, Token* a, int* pai);
+static int getdata(TokenSource* ts, int firstc, int starti, Token* a, int* pai);
+static int getscriptdata(TokenSource* ts, int firstc, int starti, Token* a, int* pai);
+static int gettag(TokenSource* ts, int starti, Token* a, int* pai);
+static Rune* buftostr(Rune* s, Rune* buf, int j);
+static int comment(TokenSource* ts);
+static int findstr(TokenSource* ts, Rune* s);
+static int ampersand(TokenSource* ts);
+//static int lowerc(int c);
+static int getchar(TokenSource* ts);
+static void ungetchar(TokenSource* ts, int c);
+static void backup(TokenSource* ts, int savei);
+//static void freeinsidetoken(Token* t);
+static void freeattrs(Attr* ahead);
+static Attr* newattr(int attid, Rune* value, Attr* link);
+static int Tconv(Fmt* f);
+
+int dbglex = 0;	// nonzero traces the lexer; > 1 also prints each token
+static int lexinited = 0;	// set to 1 once lexinit() has run
+
+// One-time setup, run by _gettoks on its first call:
+// convert the C-string tables (_chartab, _tagnames, _attrnames) into the
+// tables the lexer uses, build the tag and attribute lookup tables,
+// and install the %T print verb (Tconv).
+// tagnames/attrnames must be assigned before the lookup tables are
+// built from them, so the statement order matters.
+static void
+lexinit(void)
+{
+ chartab = cvtstringinttab(_chartab, nelem(_chartab));
+ tagnames = cvtstringtab(_tagnames, nelem(_tagnames));
+ tagtable = _makestrinttab(tagnames, Numtags);
+ attrnames = cvtstringtab(_attrnames, nelem(_attrnames));
+ attrtable = _makestrinttab(attrnames, Numattrs);
+ fmtinstall('T', Tconv);
+ lexinited = 1;
+}
+
+// Allocate a TokenSource positioned at the start of data[0:edata].
+// The bytes are referenced, not copied.  chset must be one of the four
+// supported character sets (enforced by assert); mtype is stored as given.
+static TokenSource*
+newtokensource(uchar* data, int edata, int chset, int mtype)
+{
+	TokenSource* src;
+
+	assert(chset == US_Ascii || chset == ISO_8859_1 ||
+ chset == UTF_8 || chset == Unicode);
+	src = (TokenSource*)emalloc(sizeof *src);
+	src->data = data;
+	src->edata = edata;
+	src->i = 0;
+	src->mtype = mtype;
+	src->chset = chset;
+	return src;
+}
+
+// The token array in _gettoks starts with this many slots and
+// grows by this many each time it fills up.
+enum {
+ ToksChunk = 500
+};
+
+// Make sure the token array a, which has *palen slots of which ai are
+// in use, has at least one free slot; grow it by ToksChunk if not.
+// Returns the (possibly moved) array and updates *palen.
+static Token*
+moretoks(Token* a, int ai, int* palen)
+{
+	if(ai >= *palen) {
+		a = (Token*)erealloc(a, (*palen+ToksChunk)*sizeof(Token));
+		*palen += ToksChunk;
+	}
+	return a;
+}
+
+// Call this to get the tokens.
+// Lexes data[0:datalen], decoded according to chset, as HTML when
+// mtype is TextHtml and as plain text (one Data token per line) otherwise.
+// The number of returned tokens is returned in *plen.
+// Returns the token array, or nil when no tokens were produced.
+// The array and the strings and attributes hanging off it are allocated
+// here; presumably the caller releases them with _freetokens.
+Token*
+_gettoks(uchar* data, int datalen, int chset, int mtype, int* plen)
+{
+	TokenSource* ts;
+	Token* a;
+	int alen;
+	int ai;
+	int starti;
+	int c;
+	int tag;
+
+	if(!lexinited)
+		lexinit();
+	ts = newtokensource(data, datalen, chset, mtype);
+	alen = ToksChunk;
+	a = (Token*)emalloc(alen * sizeof(Token));
+	ai = 0;
+	if(dbglex)
+		fprint(2, "_gettoks starts, ts.i=%d, ts.edata=%d\n", ts->i, ts->edata);
+	if(ts->mtype == TextHtml) {
+		for(;;) {
+			a = moretoks(a, ai, &alen);
+			starti = ts->i;
+			c = getchar(ts);
+			if(c < 0)
+				break;
+			if(c == '<') {
+				tag = gettag(ts, starti, a, &ai);
+				if(tag == Tscript) {
+					// special rules for getting Data after....
+					// gettag may just have used the last free slot,
+					// and getscriptdata writes to a[ai] as well.
+					a = moretoks(a, ai, &alen);
+					starti = ts->i;
+					c = getchar(ts);
+					tag = getscriptdata(ts, c, starti, a, &ai);
+				}
+			}
+			else
+				tag = getdata(ts, c, starti, a, &ai);
+			if(tag == -1)
+				break;
+			else if(dbglex > 1 && tag != Comment && ai > 0)
+				fprint(2, "lex: got token %T\n", &a[ai-1]);
+		}
+	}
+	else {
+		// plain text (non-html) tokens
+		for(;;) {
+			a = moretoks(a, ai, &alen);
+			tag = getplaindata(ts, a, &ai);
+			if(tag == -1)
+				break;
+			// getplaindata has already bumped ai past the new token
+			if(dbglex > 1)
+				fprint(2, "lex: got token %T\n", &a[ai-1]);
+		}
+	}
+	if(dbglex)
+		fprint(2, "lex: returning %d tokens\n", ai);
+	// the tokens never point into ts, so it can go now
+	free(ts);
+	*plen = ai;
+	if(ai == 0) {
+		free(a);
+		return nil;
+	}
+	return a;
+}
+
+// For case where source isn't HTML.
+// Just make data tokens, one per line (or partial line,
+// at end of buffer), ignoring non-whitespace control
+// characters and dumping \r's ("\r\n" and a lone '\r' both become '\n').
+// If find non-empty token, fill in a[*pai], bump *pai, and return Data.
+// Otherwise return -1;
+static int
+getplaindata(TokenSource* ts, Token* a, int* pai)
+{
+	Rune* s;
+	int j;
+	int starti;
+	int c;
+	Token* tok;
+	Rune buf[BIGBUFSIZE];
+
+	s = nil;
+	j = 0;
+	starti = ts->i;
+	for(c = getchar(ts); c >= 0; c = getchar(ts)) {
+		if(c < ' ') {
+			if(isspace(c)) {
+				if(c == '\r') {
+					// ignore it unless no following '\n',
+					// in which case treat it like '\n'
+					c = getchar(ts);
+					if(c != '\n') {
+						if(c >= 0)
+							ungetchar(ts, c);
+						c = '\n';
+					}
+				}
+			}
+			else
+				c = 0;
+		}
+		if(c != 0) {
+			buf[j++] = c;
+			// Flush one element short of full: buftostr stores a
+			// terminator at buf[j].  The limit is the element count,
+			// not sizeof(buf), which is a byte count.
+			if(j == BIGBUFSIZE-1) {
+				s = buftostr(s, buf, j);
+				j = 0;
+			}
+		}
+		if(c == '\n')
+			break;
+	}
+	s = buftostr(s, buf, j);
+	if(s == nil)
+		return -1;
+	tok = &a[(*pai)++];
+	tok->tag = Data;
+	tok->text = s;
+	tok->attr = nil;
+	tok->starti = starti;
+	return Data;
+}
+
+// Return concatenation of s and buf[0:j].
+// buf[j] is overwritten with a terminator, so buf must have room for it.
+// s may be nil.  When s is not nil it is consumed: callers must use
+// only the returned pointer afterwards (they all assign it back to s).
+static Rune*
+buftostr(Rune* s, Rune* buf, int j)
+{
+	Rune* old;
+
+	buf[j] = 0;
+	if(s == nil)
+		return _Strndup(buf, j);
+	if(j == 0)
+		return s;	// nothing to append
+	old = s;
+	s = _Strdup2(old, buf);
+	// NOTE(review): assumes _Strdup2 allocates a fresh string and
+	// leaves its first argument alone; without this free every
+	// flush of a long string leaked the previous piece.
+	if(s != old)
+		free(old);
+	return s;
+}
+
+// Gather data up to next start-of-tag or end-of-buffer.
+// firstc is the character the caller has already read; starti is
+// recorded as the token's starting index.
+// Translate entity references (&amp;) by way of ampersand().
+// Ignore non-whitespace control characters and get rid of \r's
+// ("\r\n" and a lone '\r' both become '\n').
+// A '<' ends the data and is pushed back for the caller.
+// If find non-empty token, fill in a[*pai], bump *pai, and return Data.
+// Otherwise return -1;
+static int
+getdata(TokenSource* ts, int firstc, int starti, Token* a, int* pai)
+{
+ Rune* s;
+ int j;
+ int c;
+ Token* tok;
+ Rune buf[BIGBUFSIZE];
+
+ s = nil;
+ j = 0;
+ c = firstc;
+ while(c >= 0) {
+ if(c == '&') {
+ c = ampersand(ts);
+ if(c < 0)
+ break;
+ }
+ else if(c < ' ') {
+ if(isspace(c)) {
+ if(c == '\r') {
+ // ignore it unless no following '\n',
+ // in which case treat it like '\n'
+ c = getchar(ts);
+ if(c != '\n') {
+ if(c >= 0)
+ ungetchar(ts, c);
+ c = '\n';
+ }
+ }
+ }
+ else {
+ if(warn)
+ fprint(2, "warning: non-whitespace control character %d ignored\n", c);
+ c = 0;
+ }
+ }
+ else if(c == '<') {
+ ungetchar(ts, c);
+ break;
+ }
+ // c == 0 marks a character to be dropped
+ if(c != 0) {
+ buf[j++] = c;
+ // flush one short of full: buftostr stores a terminator at buf[j]
+ if(j == BIGBUFSIZE-1) {
+ s = buftostr(s, buf, j);
+ j = 0;
+ }
+ }
+ c = getchar(ts);
+ }
+ s = buftostr(s, buf, j);
+ if(s == nil)
+ return -1;
+ tok = &a[(*pai)++];
+ tok->tag = Data;
+ tok->text = s;
+ tok->attr = nil;
+ tok->starti = starti;
+ return Data;
+}
+
+// The rules for lexing scripts are different (ugh).
+// Gather up everything until see a </SCRIPT>.
+// firstc is the character already read by the caller and starti its index.
+// Each '<' is tried as a tag by calling gettag with a[*pai] as scratch
+// space; the caller must therefore guarantee that a[*pai] is a valid slot.
+// The scratch token is always discarded again, so *pai only advances
+// for the Data token produced at the end.
+// On </SCRIPT> the source is left at its '<' so the caller lexes the
+// end tag normally.
+// Returns Data after filling in a[*pai] and bumping *pai, or -1 (with
+// the source backed up to starti) if neither </SCRIPT> nor the end of
+// the data was reached.
+static int
+getscriptdata(TokenSource* ts, int firstc, int starti, Token* a, int* pai)
+{
+	Rune* s;
+	int j;
+	int tstarti;
+	int savei;
+	int saveai;
+	int c;
+	int tag;
+	int done;
+	Token* tok;
+	Rune buf[BIGBUFSIZE];
+
+	s = nil;
+	j = 0;
+	tstarti = starti;
+	c = firstc;
+	done = 0;
+	while(c >= 0) {
+		if(c == '<') {
+			// other browsers ignore stuff to end of line after <!
+			savei = ts->i;
+			c = getchar(ts);
+			if(c == '!') {
+				while(c >= 0 && c != '\n' && c != '\r')
+					c = getchar(ts);
+				if(c == '\r')
+					c = getchar(ts);
+				if(c == '\n')
+					c = getchar(ts);
+			}
+			else if(c >= 0) {
+				backup(ts, savei);
+				saveai = *pai;
+				tag = gettag(ts, tstarti, a, pai);
+				// Throw the scratch token away whatever gettag did with
+				// it: gettag always initializes text and attr, but does
+				// not bump *pai on every path, so restore the saved
+				// index rather than decrementing.
+				tok = &a[saveai];
+				free(tok->text);
+				freeattrs(tok->attr);
+				tok->text = nil;
+				tok->attr = nil;
+				*pai = saveai;
+				if(tag == -1)
+					break;
+				backup(ts, tstarti);
+				if(tag == Tscript + RBRA) {
+					done = 1;
+					break;
+				}
+				// here tag was not </SCRIPT>, so take as regular data
+				c = getchar(ts);
+			}
+		}
+		if(c < 0)
+			break;
+		if(c != 0) {
+			buf[j++] = c;
+			// flush one short of full: buftostr stores a terminator at buf[j]
+			if(j == BIGBUFSIZE-1) {
+				s = buftostr(s, buf, j);
+				j = 0;
+			}
+		}
+		tstarti = ts->i;
+		c = getchar(ts);
+	}
+	if(done || ts->i == ts->edata) {
+		s = buftostr(s, buf, j);
+		tok = &a[(*pai)++];
+		tok->tag = Data;
+		tok->text = s;
+		tok->attr = nil;
+		tok->starti = starti;
+		return Data;
+	}
+	backup(ts, starti);
+	return -1;
+}
+
+// We've just seen a '<'.  Gather up stuff to closing '>'.
+// If it's a tag, look up the name, gather the attributes, and return
+// the appropriate token (tag number, plus RBRA for an end tag;
+// an unknown name leaves the tag as Notfound and saves the name in text).
+// Else it's either just plain data or some kind of ignorable stuff:
+// return Data or Comment as appropriate.
+// If the buffer ends before the tag is complete, back up to just after
+// the '<' and return a Data token holding "<", so that the rest is
+// lexed as ordinary data.
+// If it's not a Comment, put it in a[*pai] and bump *pai.
+// a[*pai] must be a valid slot on entry.
+// Attributes are linked in reverse order of appearance.
+static int
+gettag(TokenSource* ts, int starti, Token* a, int* pai)
+{
+	int rbra;
+	int ans;
+	Attr* al;
+	int nexti;
+	int c;
+	int ti;
+	int afnd;
+	int attid;
+	int quote;
+	Rune* val;
+	int nv;
+	int i;
+	int tag;
+	Token* tok;
+	Rune buf[BIGBUFSIZE];
+
+	rbra = 0;
+	// al and val are released at eob_done, which can be reached
+	// before the attribute loop has set them
+	al = nil;
+	val = nil;
+	nexti = ts->i;
+	tok = &a[*pai];
+	tok->tag = Notfound;
+	tok->text = nil;
+	tok->attr = nil;
+	tok->starti = starti;
+	c = getchar(ts);
+	if(c == '/') {
+		rbra = RBRA;
+		c = getchar(ts);
+	}
+	if(c < 0)
+		goto eob_done;
+	if(c >= 256 || !isalpha(c)) {
+		// not a tag
+		if(c == '!') {
+			ans = comment(ts);
+			if(ans != -1)
+				return ans;
+			goto eob_done;
+		}
+		else {
+			backup(ts, nexti);
+			tok->tag = Data;
+			tok->text = _Strdup(L(Llt));
+			(*pai)++;
+			return Data;
+		}
+	}
+	// c starts a tagname
+	buf[0] = c;
+	i = 1;
+	while(1) {
+		c = getchar(ts);
+		if(c < 0)
+			goto eob_done;
+		if(!ISNAMCHAR(c))
+			break;
+		// if name is bigger than buf it won't be found anyway...
+		if(i < BIGBUFSIZE)
+			buf[i++] = c;
+	}
+	if(_lookup(tagtable, Numtags, buf, i, &tag))
+		tok->tag = tag + rbra;
+	else
+		tok->text = _Strndup(buf, i);	// for warning print, in build
+
+	// attribute gathering loop
+	while(1) {
+		// look for "ws name" or "ws name ws = ws val"  (ws=whitespace)
+		// skip whitespace
+attrloop_continue:
+		while(c < 256 && isspace(c)) {
+			c = getchar(ts);
+			if(c < 0)
+				goto eob_done;
+		}
+		if(c == '>')
+			goto attrloop_done;
+		if(c == '<') {
+			if(warn)
+				fprint(2, "warning: unclosed tag\n");
+			ungetchar(ts, c);
+			goto attrloop_done;
+		}
+		if(c >= 256 || !isalpha(c)) {
+			if(warn)
+				fprint(2, "warning: expected attribute name\n");
+			// skip to next attribute name
+			while(1) {
+				c = getchar(ts);
+				if(c < 0)
+					goto eob_done;
+				if(c < 256 && isalpha(c))
+					goto attrloop_continue;
+				if(c == '<') {
+					if(warn)
+						fprint(2, "warning: unclosed tag\n");
+					ungetchar(ts, c);
+					goto attrloop_done;
+				}
+				if(c == '>')
+					goto attrloop_done;
+			}
+		}
+		// gather attribute name
+		buf[0] = c;
+		i = 1;
+		while(1) {
+			c = getchar(ts);
+			if(c < 0)
+				goto eob_done;
+			if(!ISNAMCHAR(c))
+				break;
+			// leave room for the terminator stored below
+			if(i < BIGBUFSIZE-1)
+				buf[i++] = c;
+		}
+		afnd = _lookup(attrtable, Numattrs, buf, i, &attid);
+		if(warn && !afnd) {
+			buf[i] = 0;
+			fprint(2, "warning: unknown attribute name %S\n", buf);
+		}
+		// skip whitespace
+		while(c < 256 && isspace(c)) {
+			c = getchar(ts);
+			if(c < 0)
+				goto eob_done;
+		}
+		if(c != '=') {
+			// attribute without a value
+			if(afnd)
+				al = newattr(attid, nil, al);
+			goto attrloop_continue;
+		}
+		//# c is '=' here;  skip whitespace
+		while(1) {
+			c = getchar(ts);
+			if(c < 0)
+				goto eob_done;
+			if(c >= 256 || !isspace(c))
+				break;
+		}
+		quote = 0;
+		if(c == '\'' || c == '"') {
+			quote = c;
+			c = getchar(ts);
+			if(c < 0)
+				goto eob_done;
+		}
+		val = nil;
+		nv = 0;
+		while(1) {
+valloop_continue:
+			if(c < 0)
+				goto eob_done;
+			if(c == '>') {
+				if(quote) {
+					// c might be part of string (though not good style)
+					// but if line ends before close quote, assume
+					// there was an unmatched quote
+					ti = ts->i;
+					while(1) {
+						c = getchar(ts);
+						if(c < 0)
+							goto eob_done;
+						if(c == quote) {
+							backup(ts, ti);
+							buf[nv++] = '>';
+							if(nv == BIGBUFSIZE-1) {
+								val = buftostr(val, buf, nv);
+								nv = 0;
+							}
+							c = getchar(ts);
+							goto valloop_continue;
+						}
+						if(c == '\n') {
+							if(warn)
+								fprint(2, "warning: apparent unmatched quote\n");
+							backup(ts, ti);
+							c = '>';
+							goto valloop_done;
+						}
+					}
+				}
+				else
+					goto valloop_done;
+			}
+			if(quote) {
+				if(c == quote) {
+					c = getchar(ts);
+					if(c < 0)
+						goto eob_done;
+					goto valloop_done;
+				}
+				if(c == '\r') {
+					c = getchar(ts);
+					goto valloop_continue;
+				}
+				if(c == '\t' || c == '\n')
+					c = ' ';
+			}
+			else {
+				if(c < 256 && isspace(c))
+					goto valloop_done;
+			}
+			if(c == '&') {
+				c = ampersand(ts);
+				if(c == -1)
+					goto eob_done;
+			}
+			buf[nv++] = c;
+			if(nv == BIGBUFSIZE-1) {
+				val = buftostr(val, buf, nv);
+				nv = 0;
+			}
+			c = getchar(ts);
+		}
+valloop_done:
+		if(afnd) {
+			val = buftostr(val, buf, nv);
+			al = newattr(attid, val, al);
+		}
+		else
+			free(val);	// value of an unknown attribute is not kept
+		// the attribute (if any) owns the string now
+		val = nil;
+	}
+
+attrloop_done:
+	tok->attr = al;
+	(*pai)++;
+	return tok->tag;
+
+eob_done:
+	if(warn)
+		fprint(2, "warning: incomplete tag at end of page\n");
+	backup(ts, nexti);
+	// release whatever was gathered for the unfinished tag
+	free(val);
+	freeattrs(al);
+	free(tok->text);
+	tok->tag = Data;
+	tok->text = _Strdup(L(Llt));
+	// this is a real token: count it, as for every other non-Comment result
+	(*pai)++;
+	return Data;
+}
+
+// Called by gettag after it has read '<!',
+// so this may be a comment or other ignored section, or it may
+// be just a literal string if there is no close before end of file
+// (other browsers do that).
+// The accepted practice seems to be (note: contrary to SGML spec!):
+// If see <!--, look for --> to close, or if none, > to close.
+// If see <!(not --), look for > to close.
+// If no close before end of file, leave original characters in as literal data.
+//
+// If we see ignorable stuff, return Comment, with the source positioned
+// just past the closing string.
+// Else return -1; the source is left wherever the search stopped and
+// the caller is expected to back up.
+static int
+comment(TokenSource* ts)
+{
+ int nexti;
+ int havecomment;
+ int c;
+
+ nexti = ts->i;
+ havecomment = 0;
+ c = getchar(ts);
+ if(c == '-') {
+ c = getchar(ts);
+ if(c == '-') {
+ if(findstr(ts, L(Larrow)))
+ havecomment = 1;
+ else
+ // no closing arrow: rewind and fall back to looking for '>' below
+ backup(ts, nexti);
+ }
+ }
+ if(!havecomment) {
+ if(c == '>')
+ havecomment = 1;
+ else if(c >= 0) {
+ if(findstr(ts, L(Lgt)))
+ havecomment = 1;
+ }
+ }
+ if(havecomment)
+ return Comment;
+ return -1;
+}
+
+// Scan forward in the token source for the string s.
+// On success return 1, leaving the source just past the match.
+// If the data runs out first return 0 (caller should back up).
+static int
+findstr(TokenSource* ts, Rune* s)
+{
+	int first;
+	int len;
+	int resume;
+	int k;
+	int ch;
+
+	first = s[0];
+	len = runestrlen(s);
+	for(;;) {
+		ch = getchar(ts);
+		if(ch < 0)
+			return 0;
+		if(ch != first)
+			continue;
+		if(len == 1)
+			return 1;
+		// remember where to resume scanning if the rest doesn't match
+		resume = ts->i;
+		k = 1;
+		while(k < len) {
+			ch = getchar(ts);
+			if(ch < 0)
+				return 0;
+			if(ch != s[k])
+				break;
+			k++;
+		}
+		if(k == len)
+			return 1;
+		backup(ts, resume);
+	}
+}
+
+// We've just read an '&'; look for a character reference and,
+// if found, return the translated char.
+// Numeric references may be decimal (&#123;) or hexadecimal (&#x7B;)
+// and need at least one digit; values in Winstart..Winend are mapped
+// through winchars.
+// Named references are looked up in the entity table;
+// if there is a complete entity name but it isn't known,
+// try prefixes (gets around some buggy HTML out there).
+// A ';', '\n' or '\r' right after the reference is swallowed.
+// If nothing matches, or the data ends inside the reference,
+// back up to just past the '&' and return '&'.
+static int
+ampersand(TokenSource* ts)
+{
+	enum { Maxcode = 0x10FFFF };	// largest Unicode code point
+	int savei;
+	int c;
+	int fnd;
+	int ans;
+	int v;
+	int base;
+	int d;
+	int ndig;
+	int i;
+	int k;
+	Rune buf[SMALLBUFSIZE];
+
+	savei = ts->i;
+	c = getchar(ts);
+	fnd = 0;
+	ans = -1;
+	if(c == '#') {
+		c = getchar(ts);
+		base = 10;
+		if(c == 'x' || c == 'X') {
+			base = 16;
+			c = getchar(ts);
+		}
+		v = 0;
+		ndig = 0;
+		while(c >= 0) {
+			if(c < 256 && isdigit(c))
+				d = c - '0';
+			else if(base == 16 && c < 256 && isxdigit(c))
+				d = tolower(c) - 'a' + 10;
+			else
+				break;
+			// stop accumulating once past the Unicode range, so a
+			// long run of digits cannot overflow v
+			if(v <= Maxcode)
+				v = v*base + d;
+			ndig++;
+			c = getchar(ts);
+		}
+		// "&#" with no digits is not a reference at all
+		if(c >= 0 && ndig > 0) {
+			if(!(c == ';' || c == '\n' || c == '\r'))
+				ungetchar(ts, c);
+			c = v;
+			if(c >= Winstart && c <= Winend) {
+				c = winchars[c - Winstart];
+			}
+			ans = c;
+			fnd = 1;
+		}
+	}
+	else if(c < 256 && isalpha(c)) {
+		buf[0] = c;
+		k = 1;
+		while(1) {
+			c = getchar(ts);
+			if(c < 0)
+				break;
+			if(ISNAMCHAR(c)) {
+				if(k < SMALLBUFSIZE-1)
+					buf[k++] = c;
+			}
+			else {
+				if(!(c == ';' || c == '\n' || c == '\r'))
+					ungetchar(ts, c);
+				break;
+			}
+		}
+		if(c >= 0) {
+			// The entry count is taken from the array _chartab;
+			// chartab is a pointer, so sizeof on it cannot give the count.
+			fnd = _lookup(chartab, nelem(_chartab), buf, k, &ans);
+			if(!fnd) {
+				// Try prefixes of the name, longest first
+				while(--k > 0) {
+					fnd = _lookup(chartab, nelem(_chartab), buf, k, &ans);
+					if(fnd) {
+						// reposition just past the k-character prefix;
+						// rescanning is right even if the name was
+						// too long to fit in buf
+						backup(ts, savei);
+						for(i = 0; i < k; i++)
+							getchar(ts);
+						break;
+					}
+				}
+			}
+		}
+	}
+	if(!fnd) {
+		backup(ts, savei);
+		ans = '&';
+	}
+	return ans;
+}
+
+// Get next char, obeying ts.chset.
+// Returns -1 if no complete character left before current end of data
+// (in which case all remaining bytes are marked as used).
+// ISO_8859_1: one byte per character; bytes in Winstart..Winend are
+// mapped through winchars.
+// US_Ascii: one byte per character; bytes above 127 are passed through,
+// with a warning if warn is set.
+// UTF_8: one rune, decoded with chartorune.
+// Unicode: two bytes per character, most significant first.
+static int
+getchar(TokenSource* ts)
+{
+	uchar* buf;
+	int c;
+	int n;
+	int ok;
+	Rune r;
+
+	if(ts->i >= ts->edata)
+		return -1;
+	buf = ts->data;
+	c = buf[ts->i];
+	switch(ts->chset) {
+	case ISO_8859_1:
+		if(c >= Winstart && c <= Winend)
+			c = winchars[c - Winstart];
+		ts->i++;
+		break;
+	case US_Ascii:
+		if(c > 127) {
+			if(warn)
+				fprint(2, "non-ascii char (%x) when US-ASCII specified\n", c);
+		}
+		ts->i++;
+		break;
+	case UTF_8:
+		ok = fullrune((char*)(buf+ts->i), ts->edata-ts->i);
+		if(ok) {
+			// decode only once fullrune has confirmed that the whole
+			// sequence lies inside data[0:edata]
+			n = chartorune(&r, (char*)(buf+ts->i));
+			if(warn && c == 0x80)
+				fprint(2, "warning: invalid utf-8 sequence (starts with %x)\n", ts->data[ts->i]);
+			ts->i += n;
+			c = r;
+		}
+		else {
+			// not enough bytes in buf to complete utf-8 char
+			ts->i = ts->edata;	// mark "all used"
+			c = -1;
+		}
+		break;
+	case Unicode:
+		if(ts->i < ts->edata - 1) {
+			//standards say most-significant byte first
+			c = (c << 8)|(buf[ts->i + 1]);
+			ts->i += 2;
+		}
+		else {
+			ts->i = ts->edata;	// mark "all used"
+			c = -1;
+		}
+		break;
+	}
+	return c;
+}
+
+// Push back c, which must be the character getchar returned last:
+// the source index is moved back by the number of bytes c occupies
+// in the current character set, so the next getchar yields c again.
+static void
+ungetchar(TokenSource* ts, int c)
+{
+	Rune r;
+	char enc[UTFmax];
+	int width;
+
+	width = 1;
+	if(ts->chset == Unicode)
+		width = 2;
+	else if(ts->chset == UTF_8 && c >= 128) {
+		// re-encode to find out how many bytes the rune took
+		r = c;
+		width = runetochar(enc, &r);
+	}
+	ts->i -= width;
+}
+
+// Restore ts so that it is at the state where the index was savei.
+// savei should be a value previously read from ts->i.
+// When dbglex is set the move is traced on file descriptor 2.
+static void
+backup(TokenSource* ts, int savei)
+{
+ if(dbglex)
+ fprint(2, "lex: backup; i=%d, savei=%d\n", ts->i, savei);
+ ts->i = savei;
+}
+
+
+// Search token t for attribute attid.
+// Found: return 1 and, if pans is not nil, store the value in *pans.
+// Not found: return 0 and, if pans is not nil, store nil in *pans.
+// With xfer set, ownership of the value string passes to the caller
+// and the attribute's own pointer is cleared; without it the caller
+// must copy the string if it wants to keep it.
+// pans may be nil when only the presence of the attribute matters.
+int
+_tokaval(Token* t, int attid, Rune** pans, int xfer)
+{
+	Attr* p;
+
+	for(p = t->attr; p != nil; p = p->next) {
+		if(p->attid != attid)
+			continue;
+		if(pans != nil)
+			*pans = p->value;
+		if(xfer)
+			p->value = nil;
+		return 1;
+	}
+	if(pans != nil)
+		*pans = nil;
+	return 0;
+}
+
+// Print verb %T (installed by lexinit): format a Token* for debugging.
+// A nil token prints as <null>.  Data tokens print their text in single
+// quotes; any other token prints as <name attr=value ...>, with a '/'
+// before the name for tags at or above RBRA and '?' as the name for
+// Notfound.  When dbglex > 1 the token's starting index comes first,
+// in brackets.  The text is assembled in a fixed-size local buffer.
+static int
+Tconv(Fmt *f)
+{
+ Token* t;
+ int i;
+ int tag;
+ char* srbra;
+ Rune* aname;
+ Rune* tname;
+ Attr* a;
+ char buf[BIGBUFSIZE];
+
+ t = va_arg(f->args, Token*);
+ if(t == nil)
+ sprint(buf, "<null>");
+ else {
+ // i counts the characters placed in buf so far
+ i = 0;
+ if(dbglex > 1)
+ i = snprint(buf, sizeof(buf), "[%d]", t->starti);
+ tag = t->tag;
+ if(tag == Data) {
+ i += snprint(buf+i, sizeof(buf)-i-1, "'%S'", t->text);
+ }
+ else {
+ srbra = "";
+ if(tag >= RBRA) {
+ // end tag: strip the RBRA offset to get the name index
+ tag -= RBRA;
+ srbra = "/";
+ }
+ tname = tagnames[tag];
+ if(tag == Notfound)
+ tname = L(Lquestion);
+ i += snprint(buf+i, sizeof(buf)-i-1, "<%s%S", srbra, tname);
+ for(a = t->attr; a != nil; a = a->next) {
+ aname = attrnames[a->attid];
+ i += snprint(buf+i, sizeof(buf)-i-1, " %S", aname);
+ if(a->value != nil)
+ i += snprint(buf+i, sizeof(buf)-i-1, "=%S", a->value);
+ }
+ i += snprint(buf+i, sizeof(buf)-i-1, ">");
+ }
+ buf[i] = 0;
+ }
+ return fmtstrcpy(f, buf);
+}
+
+// Allocate an Attr for attribute attid with the given value and put it
+// in front of the list link; the new head is returned.
+// The Attr takes ownership of value, although build may later move
+// some values into its items and nil them out here.
+static Attr*
+newattr(int attid, Rune* value, Attr* link)
+{
+	Attr* node;
+
+	node = (Attr*)emalloc(sizeof *node);
+	node->next = link;
+	node->value = value;
+	node->attid = attid;
+	return node;
+}
+
+// Release every Attr on the list starting at ahead (linked through
+// the next field), together with each Attr's value string.
+static void
+freeattrs(Attr* ahead)
+{
+	Attr* rest;
+
+	for(; ahead != nil; ahead = rest) {
+		// grab the link before the node goes away
+		rest = ahead->next;
+		free(ahead->value);
+		free(ahead);
+	}
+}
+
+// Release an array of Tokens along with the text and attributes of
+// its first n entries.
+// The allocation may have room for more than n tokens, but only
+// the first n are initialized.
+// A caller that has taken ownership of a token's text or attributes
+// must have set the corresponding pointer to nil beforehand.
+// A nil tarray is ignored.
+void
+_freetokens(Token* tarray, int n)
+{
+	int k;
+
+	if(tarray == nil)
+		return;
+	k = 0;
+	while(k < n) {
+		free(tarray[k].text);
+		freeattrs(tarray[k].attr);
+		k++;
+	}
+	free(tarray);
+}