diff options
author | wkj <devnull@localhost> | 2004-04-06 19:06:52 +0000 |
---|---|---|
committer | wkj <devnull@localhost> | 2004-04-06 19:06:52 +0000 |
commit | 7cf289ca89a7416999ae02330236042b0d37e3db (patch) | |
tree | 796d1363a7a53c72c28b199758ee674f1326a510 /src/libhtml | |
parent | 3e3817f7c86658f60715dd93768eaf8285807985 (diff) | |
download | plan9port-7cf289ca89a7416999ae02330236042b0d37e3db.tar.gz plan9port-7cf289ca89a7416999ae02330236042b0d37e3db.tar.bz2 plan9port-7cf289ca89a7416999ae02330236042b0d37e3db.zip |
Import version of libhtml that might actually work with ANSI C.
Diffstat (limited to 'src/libhtml')
-rw-r--r-- | src/libhtml/build.c | 4238 | ||||
-rw-r--r-- | src/libhtml/impl.h | 163 | ||||
-rw-r--r-- | src/libhtml/lex.c | 1384 | ||||
-rw-r--r-- | src/libhtml/mkfile | 22 | ||||
-rw-r--r-- | src/libhtml/runetab.c | 83 | ||||
-rw-r--r-- | src/libhtml/runetab.h | 59 | ||||
-rw-r--r-- | src/libhtml/strinttab.c | 64 | ||||
-rw-r--r-- | src/libhtml/utils.c | 591 |
8 files changed, 6604 insertions, 0 deletions
diff --git a/src/libhtml/build.c b/src/libhtml/build.c new file mode 100644 index 00000000..32e64015 --- /dev/null +++ b/src/libhtml/build.c @@ -0,0 +1,4238 @@ +#include <u.h> +#include <libc.h> +#include <draw.h> +#include <ctype.h> +#include <html.h> +#include "impl.h" + +// A stack for holding integer values +enum { + Nestmax = 40 // max nesting level of lists, font styles, etc. +}; + +struct Stack { + int n; // next available slot (top of stack is stack[n-1]) + int slots[Nestmax]; // stack entries +}; + +// Parsing state +struct Pstate +{ + Pstate* next; // in stack of Pstates + int skipping; // true when we shouldn't add items + int skipwhite; // true when we should strip leading space + int curfont; // font index for current font + int curfg; // current foreground color + Background curbg; // current background + int curvoff; // current baseline offset + uchar curul; // current underline/strike state + uchar curjust; // current justify state + int curanchor; // current (href) anchor id (if in one), or 0 + int curstate; // current value of item state + int literal; // current literal state + int inpar; // true when in a paragraph-like construct + int adjsize; // current font size adjustment + Item* items; // dummy head of item list we're building + Item* lastit; // tail of item list we're building + Item* prelastit; // item before lastit + Stack fntstylestk; // style stack + Stack fntsizestk; // size stack + Stack fgstk; // text color stack + Stack ulstk; // underline stack + Stack voffstk; // vertical offset stack + Stack listtypestk; // list type stack + Stack listcntstk; // list counter stack + Stack juststk; // justification stack + Stack hangstk; // hanging stack +}; + +// Per-document state handed to getitems: the document, the Pstate stack, +// id counters, and the currently open form, map, table and frameset +struct ItemSource +{ + Docinfo* doc; // document info being filled in (set by newitemsource) + Pstate* psstk; // top of the Pstate stack + int nforms; // pre-incremented to number each new form + int ntables; // starts at 0; presumably counts tables (use not visible here) + int nanchors; // pre-incremented to index each new anchor / dest anchor + int nframes; // pre-incremented to generate names for unnamed frames + Form* curform; // form currently open, or nil + Map* curmap; // map looked up for the most recent <MAP name=...>, or nil + Table* tabstk; // table stack; getitems takes its current table from here + Kidinfo* kidstk; // innermost open frameset (linked via nextframeset), or nil +}; + +// Some layout parameters +enum { + FRKIDMARGIN = 6, // default margin around kid frames + IMGHSPACE 
= 0, // default hspace for images (0 matches IE, Netscape) + IMGVSPACE = 0, // default vspace for images + FLTIMGHSPACE = 2, // default hspace for float images + TABSP = 5, // default cellspacing for tables + TABPAD = 1, // default cell padding for tables + LISTTAB = 1, // number of tabs to indent lists + BQTAB = 1, // number of tabs to indent blockquotes + HRSZ = 2, // thickness of horizontal rules + SUBOFF = 4, // vertical offset for subscripts + SUPOFF = 6, // vertical offset for superscripts + NBSP = 160 // non-breaking space character +}; + +// These tables must be sorted +// The N...TAB macros must count the static _xxx_tab arrays: each xxx_tab is only a +// pointer filled in by buildinit, so sizeof(xxx_tab)/sizeof(StringInt) is not an element count +static StringInt *align_tab; +static AsciiInt _align_tab[] = { + {"baseline", ALbaseline}, + {"bottom", ALbottom}, + {"center", ALcenter}, + {"char", ALchar}, + {"justify", ALjustify}, + {"left", ALleft}, + {"middle", ALmiddle}, + {"right", ALright}, + {"top", ALtop} +}; +#define NALIGNTAB (sizeof(_align_tab)/sizeof(_align_tab[0])) + +static StringInt *input_tab; +static AsciiInt _input_tab[] = { + {"button", Fbutton}, + {"checkbox", Fcheckbox}, + {"file", Ffile}, + {"hidden", Fhidden}, + {"image", Fimage}, + {"password", Fpassword}, + {"radio", Fradio}, + {"reset", Freset}, + {"submit", Fsubmit}, + {"text", Ftext} +}; +#define NINPUTTAB (sizeof(_input_tab)/sizeof(_input_tab[0])) + +static StringInt *clear_tab; +static AsciiInt _clear_tab[] = { + {"all", IFcleft|IFcright}, + {"left", IFcleft}, + {"right", IFcright} +}; +#define NCLEARTAB (sizeof(_clear_tab)/sizeof(_clear_tab[0])) + +static StringInt *fscroll_tab; +static AsciiInt _fscroll_tab[] = { + {"auto", FRhscrollauto|FRvscrollauto}, + {"no", FRnoscroll}, + {"yes", FRhscroll|FRvscroll}, +}; +#define NFSCROLLTAB (sizeof(_fscroll_tab)/sizeof(_fscroll_tab[0])) + +static StringInt *shape_tab; +static AsciiInt _shape_tab[] = { + {"circ", SHcircle}, + {"circle", SHcircle}, + {"poly", SHpoly}, + {"polygon", SHpoly}, + {"rect", SHrect}, + {"rectangle", SHrect} +}; +#define NSHAPETAB (sizeof(_shape_tab)/sizeof(_shape_tab[0])) + +static StringInt *method_tab; +static 
AsciiInt _method_tab[] = { + {"get", HGet}, + {"post", HPost} +}; +// count the static _method_tab array; method_tab is only a pointer set up by buildinit +#define NMETHODTAB (sizeof(_method_tab)/sizeof(_method_tab[0])) + +static Rune** roman; +static char* _roman[15]= { + "I", "II", "III", "IV", "V", "VI", "VII", "VIII", "IX", "X", + "XI", "XII", "XIII", "XIV", "XV" +}; +#define NROMAN 15 + +// List number types +enum { + LTdisc, LTsquare, LTcircle, LT1, LTa, LTA, LTi, LTI +}; + +enum { + SPBefore = 2, + SPAfter = 4, + BL = 1, + BLBA = (BL|SPBefore|SPAfter) +}; + +// blockbrk[tag] is break info for a block level element, or one +// of a few others that get the same treatment re ending open paragraphs +// and requiring a line break / vertical space before them. +// If we want a line of space before the given element, SPBefore is OR'd in. +// If we want a line of space after the given element, SPAfter is OR'd in. + +static uchar blockbrk[Numtags]= { + [Taddress] BLBA, [Tblockquote] BLBA, [Tcenter] BL, + [Tdir] BLBA, [Tdiv] BL, [Tdd] BL, [Tdl] BLBA, + [Tdt] BL, [Tform] BLBA, + // headings and tables get breaks added manually + [Th1] BL, [Th2] BL, [Th3] BL, + [Th4] BL, [Th5] BL, [Th6] BL, + [Thr] BL, [Tisindex] BLBA, [Tli] BL, [Tmenu] BLBA, + [Tol] BLBA, [Tp] BLBA, [Tpre] BLBA, + [Tul] BLBA +}; + +enum { + AGEN = 1 +}; + +// attrinfo is information about attributes. 
+// The AGEN value means that the attribute is generic (applies to almost all elements) +static uchar attrinfo[Numattrs]= { + [Aid] AGEN, [Aclass] AGEN, [Astyle] AGEN, [Atitle] AGEN, + [Aonblur] AGEN, [Aonchange] AGEN, [Aonclick] AGEN, + [Aondblclick] AGEN, [Aonfocus] AGEN, [Aonkeypress] AGEN, + [Aonkeyup] AGEN, [Aonload] AGEN, [Aonmousedown] AGEN, + [Aonmousemove] AGEN, [Aonmouseout] AGEN, [Aonmouseover] AGEN, + [Aonmouseup] AGEN, [Aonreset] AGEN, [Aonselect] AGEN, + [Aonsubmit] AGEN, [Aonunload] AGEN +}; + +static uchar scriptev[Numattrs]= { + [Aonblur] SEonblur, [Aonchange] SEonchange, [Aonclick] SEonclick, + [Aondblclick] SEondblclick, [Aonfocus] SEonfocus, [Aonkeypress] SEonkeypress, + [Aonkeyup] SEonkeyup, [Aonload] SEonload, [Aonmousedown] SEonmousedown, + [Aonmousemove] SEonmousemove, [Aonmouseout] SEonmouseout, [Aonmouseover] SEonmouseover, + [Aonmouseup] SEonmouseup, [Aonreset] SEonreset, [Aonselect] SEonselect, + [Aonsubmit] SEonsubmit, [Aonunload] SEonunload +}; + +// Color lookup table +static StringInt *color_tab; +static AsciiInt _color_tab[] = { + {"aqua", 0x00FFFF}, + {"black", 0x000000}, + {"blue", 0x0000CC}, + {"fuchsia", 0xFF00FF}, + {"gray", 0x808080}, + {"green", 0x008000}, + {"lime", 0x00FF00}, + {"maroon", 0x800000}, + {"navy", 0x000080,}, + {"olive", 0x808000}, + {"purple", 0x800080}, + {"red", 0xFF0000}, + {"silver", 0xC0C0C0}, + {"teal", 0x008080}, + {"white", 0xFFFFFF}, + {"yellow", 0xFFFF00} +}; +// count the static _color_tab array; color_tab is only a pointer set up by buildinit +#define NCOLORS (sizeof(_color_tab)/sizeof(_color_tab[0])) + +static StringInt *targetmap; +static int targetmapsize; +static int ntargets; + +static int buildinited = 0; + +#define SMALLBUFSIZE 240 +#define BIGBUFSIZE 2000 + +int dbgbuild = 0; +int warn = 0; + +static Align aalign(Token* tok); +static int acolorval(Token* tok, int attid, int dflt); +static void addbrk(Pstate* ps, int sp, int clr); +static void additem(Pstate* ps, Item* it, Token* tok); +static void addlinebrk(Pstate* ps, int clr); +static void addnbsp(Pstate* ps); +static void 
addtext(Pstate* ps, Rune* s); +static Dimen adimen(Token* tok, int attid); +static int aflagval(Token* tok, int attid); +static int aintval(Token* tok, int attid, int dflt); +static Rune* astrval(Token* tok, int attid, Rune* dflt); +static int atabval(Token* tok, int attid, StringInt* tab, int ntab, int dflt); +static int atargval(Token* tok, int dflt); +static int auintval(Token* tok, int attid, int dflt); +static Rune* aurlval(Token* tok, int attid, Rune* dflt, Rune* base); +static Rune* aval(Token* tok, int attid); +static void buildinit(void); +static Pstate* cell_pstate(Pstate* oldps, int ishead); +static void changehang(Pstate* ps, int delta); +static void changeindent(Pstate* ps, int delta); +static int color(Rune* s, int dflt); +static void copystack(Stack* tostk, Stack* fromstk); +static int dimprint(char* buf, int nbuf, Dimen d); +static Pstate* finishcell(Table* curtab, Pstate* psstk); +static void finish_table(Table* t); +static void freeanchor(Anchor* a); +static void freedestanchor(DestAnchor* da); +static void freeform(Form* f); +static void freeformfield(Formfield* ff); +static void freeitem(Item* it); +static void freepstate(Pstate* p); +static void freepstatestack(Pstate* pshead); +static void freescriptevents(SEvent* ehead); +static void freetable(Table* t); +static Map* getmap(Docinfo* di, Rune* name); +static Rune* getpcdata(Token* toks, int tokslen, int* ptoki); +static Pstate* lastps(Pstate* psl); +static Rune* listmark(uchar ty, int n); +static int listtyval(Token* tok, int dflt); +static Align makealign(int halign, int valign); +static Background makebackground(Rune* imgurl, int color); +static Dimen makedimen(int kind, int spec); +static Anchor* newanchor(int index, Rune* name, Rune* href, int target, Anchor* link); +static Area* newarea(int shape, Rune* href, int target, Area* link); +static DestAnchor* newdestanchor(int index, Rune* name, Item* item, DestAnchor* link); +static Docinfo* newdocinfo(void); +static Genattr* newgenattr(Rune* 
id, Rune* class, Rune* style, Rune* title, SEvent* events); +static Form* newform(int formid, Rune* name, Rune* action, + int target, int method, Form* link); +static Formfield* newformfield(int ftype, int fieldid, Form* form, Rune* name, + Rune* value, int size, int maxlength, Formfield* link); +static Item* newifloat(Item* it, int side); +static Item* newiformfield(Formfield* ff); +static Item* newiimage(Rune* src, Rune* altrep, int align, int width, int height, + int hspace, int vspace, int border, int ismap, Map* map); +static Item* newirule(int align, int size, int noshade, Dimen wspec); +static Item* newispacer(int spkind); +static Item* newitable(Table* t); +static ItemSource* newitemsource(Docinfo* di); +static Item* newitext(Rune* s, int fnt, int fg, int voff, int ul); +static Kidinfo* newkidinfo(int isframeset, Kidinfo* link); +static Option* newoption(int selected, Rune* value, Rune* display, Option* link); +static Pstate* newpstate(Pstate* link); +static SEvent* newscriptevent(int type, Rune* script, SEvent* link); +static Table* newtable(int tableid, Align align, Dimen width, int border, + int cellspacing, int cellpadding, Background bg, Token* tok, Table* link); +static Tablecell* newtablecell(int cellid, int rowspan, int colspan, Align align, Dimen wspec, + int hspec, Background bg, int flags, Tablecell* link); +static Tablerow* newtablerow(Align align, Background bg, int flags, Tablerow* link); +static Dimen parsedim(Rune* s, int ns); +static void pop(Stack* stk); +static void popfontsize(Pstate* ps); +static void popfontstyle(Pstate* ps); +static void popjust(Pstate* ps); +static int popretnewtop(Stack* stk, int dflt); +static int push(Stack* stk, int val); +static void pushfontsize(Pstate* ps, int sz); +static void pushfontstyle(Pstate* ps, int sty); +static void pushjust(Pstate* ps, int j); +static Item* textit(Pstate* ps, Rune* s); +static Rune* removeallwhite(Rune* s); +static void resetdocinfo(Docinfo* d); +static void setcurfont(Pstate* ps); 
+static void setcurjust(Pstate* ps); +static void setdimarray(Token* tok, int attid, Dimen** pans, int* panslen); +static Rune* stringalign(int a); +static void targetmapinit(void); +static int toint(Rune* s); +static int top(Stack* stk, int dflt); +static void trim_cell(Tablecell* c); +static int validalign(Align a); +static int validdimen(Dimen d); +static int validformfield(Formfield* f); +static int validhalign(int a); +static int validptr(void* p); +static int validStr(Rune* s); +static int validtable(Table* t); +static int validtablerow(Tablerow* r); +static int validtablecol(Tablecol* c); +static int validtablecell(Tablecell* c); +static int validvalign(int a); +static int Iconv(Fmt *f); + +static void +buildinit(void) +{ + runetabinit(); + roman = cvtstringtab(_roman, nelem(_roman)); + color_tab = cvtstringinttab(_color_tab, nelem(_color_tab)); + method_tab = cvtstringinttab(_method_tab, nelem(_method_tab)); + shape_tab = cvtstringinttab(_shape_tab, nelem(_shape_tab)); + fscroll_tab = cvtstringinttab(_fscroll_tab, nelem(_fscroll_tab)); + clear_tab = cvtstringinttab(_clear_tab, nelem(_clear_tab)); + input_tab = cvtstringinttab(_input_tab, nelem(_input_tab)); + align_tab = cvtstringinttab(_align_tab, nelem(_align_tab)); + + fmtinstall('I', Iconv); + targetmapinit(); + buildinited = 1; +} + +static ItemSource* +newitemsource(Docinfo* di) +{ + ItemSource* is; + Pstate* ps; + + ps = newpstate(nil); + if(di->mediatype != TextHtml) { + ps->curstate &= ~IFwrap; + ps->literal = 1; + pushfontstyle(ps, FntT); + } + is = (ItemSource*)emalloc(sizeof(ItemSource)); + is->doc = di; + is->psstk = ps; + is->nforms = 0; + is->ntables = 0; + is->nanchors = 0; + is->nframes = 0; + is->curform = nil; + is->curmap = nil; + is->tabstk = nil; + is->kidstk = nil; + return is; +} + +static Item *getitems(ItemSource* is, uchar* data, int datalen); + +// Parse an html document and create a list of layout items. +// Allocate and return document info in *pdi. 
+// When caller is done with the items, it should call +// freeitems on the returned result, and then +// freedocinfo(*pdi). +Item* +parsehtml(uchar* data, int datalen, Rune* pagesrc, int mtype, int chset, Docinfo** pdi) +{ + Docinfo* doc; + ItemSource* source; + Item* items; + + // src and base each get their own _Strdup of pagesrc + doc = newdocinfo(); + doc->mediatype = mtype; + doc->chset = chset; + doc->src = _Strdup(pagesrc); + doc->base = _Strdup(pagesrc); + *pdi = doc; + + source = newitemsource(doc); + items = getitems(source, data, datalen); + + // the item source and its Pstate stack are released here; + // the returned items and *pdi are left for the caller + freepstatestack(source->psstk); + free(source); + return items; +} + +// Get a group of tokens for lexer, parse them, and create +// a list of layout items. +// When caller is done with the items, it should call +// freeitems on the returned result. +static Item* +getitems(ItemSource* is, uchar* data, int datalen) +{ + int i; + int j; + int nt; + int pt; + int doscripts; + int tokslen; + int toki; + int h; + int sz; + int method; + int n; + int nblank; + int norsz; + int bramt; + int sty; + int nosh; + int oldcuranchor; + int dfltbd; + int v; + int hang; + int isempty; + int tag; + int brksp; + int target; + uchar brk; + uchar flags; + uchar align; + uchar al; + uchar ty; + uchar ty2; + Pstate* ps; + Pstate* nextps; + Pstate* outerps; + Table* curtab; + Token* tok; + Token* toks; + Docinfo* di; + Item* ans; + Item* img; + Item* ffit; + Item* tabitem; + Rune* s; + Rune* t; + Rune* name; + Rune* enctype; + Rune* usemap; + Rune* prompt; + Rune* equiv; + Rune* val; + Rune* nsz; + Rune* script; + Map* map; + Form* frm; + Iimage* ii; + Kidinfo* kd; + Kidinfo* ks; + Kidinfo* pks; + Dimen wd; + Option* option; + Table* tab; + Tablecell* c; + Tablerow* tr; + Formfield* field; + Formfield* ff; + Rune* href; + Rune* src; + Rune* scriptsrc; + Rune* bgurl; + Rune* action; + Background bg; + + if(!buildinited) + buildinit(); + doscripts = 0; // for now + ps = is->psstk; + curtab = is->tabstk; + di = is->doc; + toks = _gettoks(data, datalen, di->chset, di->mediatype, &tokslen); + toki = 0; + for(; toki < tokslen; toki++) { 
+ tok = &toks[toki]; + if(dbgbuild > 1) + fprint(2, "build: curstate %ux, token %T\n", ps->curstate, tok); + tag = tok->tag; + brk = 0; + brksp = 0; + if(tag < Numtags) { + brk = blockbrk[tag]; + if(brk&SPBefore) + brksp = 1; + } + else if(tag < Numtags + RBRA) { + brk = blockbrk[tag - RBRA]; + if(brk&SPAfter) + brksp = 1; + } + if(brk) { + addbrk(ps, brksp, 0); + if(ps->inpar) { + popjust(ps); + ps->inpar = 0; + } + } + // check common case first (Data), then switch statement on tag + if(tag == Data) { + // Lexing didn't pay attention to SGML record boundary rules: + // \n after start tag or before end tag to be discarded. + // (Lex has already discarded all \r's). + // Some pages assume this doesn't happen in <PRE> text, + // so we won't do it if literal is true. + // BUG: won't discard \n before a start tag that begins + // the next bufferful of tokens. + s = tok->text; + n = _Strlen(s); + if(!ps->literal) { + i = 0; + j = n; + if(toki > 0) { + pt = toks[toki - 1].tag; + // IE and Netscape both ignore this rule (contrary to spec) + // if previous tag was img + if(pt < Numtags && pt != Timg && j > 0 && s[0] == '\n') + i++; + } + if(toki < tokslen - 1) { + nt = toks[toki + 1].tag; + if(nt >= RBRA && nt < Numtags + RBRA && j > i && s[j - 1] == '\n') + j--; + } + if(i > 0 || j < n) { + t = s; + s = _Strsubstr(s, i, j); + free(t); + n = j-i; + } + } + if(ps->skipwhite) { + _trimwhite(s, n, &t, &nt); + if(t == nil) { + free(s); + s = nil; + } + else if(t != s) { + t = _Strndup(t, nt); + free(s); + s = t; + } + if(s != nil) + ps->skipwhite = 0; + } + tok->text = nil; // token doesn't own string anymore + if(s != nil) + addtext(ps, s); + } + else + switch(tag) { + // Some abbrevs used in following DTD comments + // %text = #PCDATA + // | TT | I | B | U | STRIKE | BIG | SMALL | SUB | SUP + // | EM | STRONG | DFN | CODE | SAMP | KBD | VAR | CITE + // | A | IMG | APPLET | FONT | BASEFONT | BR | SCRIPT | MAP + // | INPUT | SELECT | TEXTAREA + // %block = P | UL | OL | DIR | 
MENU | DL | PRE | DL | DIV | CENTER + // | BLOCKQUOTE | FORM | ISINDEX | HR | TABLE + // %flow = (%text | %block)* + // %body.content = (%heading | %text | %block | ADDRESS)* + + // <!ELEMENT A - - (%text) -(A)> + // Anchors are not supposed to be nested, but you sometimes see + // href anchors inside destination anchors. + case Ta: + if(ps->curanchor != 0) { + if(warn) + fprint(2, "warning: nested <A> or missing </A>\n"); + ps->curanchor = 0; + } + name = aval(tok, Aname); + href = aurlval(tok, Ahref, nil, di->base); + // ignore rel, rev, and title attrs + if(href != nil) { + target = atargval(tok, di->target); + di->anchors = newanchor(++is->nanchors, name, href, target, di->anchors); + if(name != nil) + name = _Strdup(name); // for DestAnchor construction, below + ps->curanchor = is->nanchors; + ps->curfg = push(&ps->fgstk, di->link); + ps->curul = push(&ps->ulstk, ULunder); + } + if(name != nil) { + // add a null item to be destination + additem(ps, newispacer(ISPnull), tok); + di->dests = newdestanchor(++is->nanchors, name, ps->lastit, di->dests); + } + break; + + case Ta+RBRA : + if(ps->curanchor != 0) { + ps->curfg = popretnewtop(&ps->fgstk, di->text); + ps->curul = popretnewtop(&ps->ulstk, ULnone); + ps->curanchor = 0; + } + break; + + // <!ELEMENT APPLET - - (PARAM | %text)* > + // We can't do applets, so ignore PARAMS, and let + // the %text contents appear for the alternative rep + case Tapplet: + case Tapplet+RBRA: + if(warn && tag == Tapplet) + fprint(2, "warning: <APPLET> ignored\n"); + break; + + // <!ELEMENT AREA - O EMPTY> + case Tarea: + map = di->maps; + if(map == nil) { + if(warn) + fprint(2, "warning: <AREA> not inside <MAP>\n"); + continue; + } + map->areas = newarea(atabval(tok, Ashape, shape_tab, NSHAPETAB, SHrect), + aurlval(tok, Ahref, nil, di->base), + atargval(tok, di->target), + map->areas); + setdimarray(tok, Acoords, &map->areas->coords, &map->areas->ncoords); + break; + + // <!ELEMENT (B|STRONG) - - (%text)*> + case Tb: + case 
Tstrong: + pushfontstyle(ps, FntB); + break; + + case Tb+RBRA: + case Tcite+RBRA: + case Tcode+RBRA: + case Tdfn+RBRA: + case Tem+RBRA: + case Tkbd+RBRA: + case Ti+RBRA: + case Tsamp+RBRA: + case Tstrong+RBRA: + case Ttt+RBRA: + case Tvar+RBRA : + case Taddress+RBRA: + popfontstyle(ps); + break; + + // <!ELEMENT BASE - O EMPTY> + case Tbase: + t = di->base; + di->base = aurlval(tok, Ahref, di->base, di->base); + if(t != nil) + free(t); + di->target = atargval(tok, di->target); + break; + + // <!ELEMENT BASEFONT - O EMPTY> + case Tbasefont: + ps->adjsize = aintval(tok, Asize, 3) - 3; + break; + + // <!ELEMENT (BIG|SMALL) - - (%text)*> + case Tbig: + case Tsmall: + sz = ps->adjsize; + if(tag == Tbig) + sz += Large; + else + sz += Small; + pushfontsize(ps, sz); + break; + + case Tbig+RBRA: + case Tsmall+RBRA: + popfontsize(ps); + break; + + // <!ELEMENT BLOCKQUOTE - - %body.content> + case Tblockquote: + changeindent(ps, BQTAB); + break; + + case Tblockquote+RBRA: + changeindent(ps, -BQTAB); + break; + + // <!ELEMENT BODY O O %body.content> + case Tbody: + ps->skipping = 0; + bg = makebackground(nil, acolorval(tok, Abgcolor, di->background.color)); + bgurl = aurlval(tok, Abackground, nil, di->base); + if(bgurl != nil) { + if(di->backgrounditem != nil) + freeitem((Item*)di->backgrounditem); + // really should remove old item from di->images list, + // but there should only be one BODY element ... 
+ di->backgrounditem = (Iimage*)newiimage(bgurl, nil, ALnone, 0, 0, 0, 0, 0, 0, nil); + di->backgrounditem->nextimage = di->images; + di->images = di->backgrounditem; + } + ps->curbg = bg; + di->background = bg; + di->text = acolorval(tok, Atext, di->text); + di->link = acolorval(tok, Alink, di->link); + di->vlink = acolorval(tok, Avlink, di->vlink); + di->alink = acolorval(tok, Aalink, di->alink); + if(di->text != ps->curfg) { + ps->curfg = di->text; + ps->fgstk.n = 0; + } + break; + + case Tbody+RBRA: + // HTML spec says ignore things after </body>, + // but IE and Netscape don't + // ps.skipping = 1; + break; + + // <!ELEMENT BR - O EMPTY> + case Tbr: + addlinebrk(ps, atabval(tok, Aclear, clear_tab, NCLEARTAB, 0)); + break; + + // <!ELEMENT CAPTION - - (%text;)*> + case Tcaption: + if(curtab == nil) { + if(warn) + fprint(2, "warning: <CAPTION> outside <TABLE>\n"); + continue; + } + if(curtab->caption != nil) { + if(warn) + fprint(2, "warning: more than one <CAPTION> in <TABLE>\n"); + continue; + } + ps = newpstate(ps); + curtab->caption_place = atabval(tok, Aalign, align_tab, NALIGNTAB, ALtop); + break; + + case Tcaption+RBRA: + nextps = ps->next; + if(curtab == nil || nextps == nil) { + if(warn) + fprint(2, "warning: unexpected </CAPTION>\n"); + continue; + } + curtab->caption = ps->items->next; + free(ps); + ps = nextps; + break; + + case Tcenter: + case Tdiv: + if(tag == Tcenter) + al = ALcenter; + else + al = atabval(tok, Aalign, align_tab, NALIGNTAB, ps->curjust); + pushjust(ps, al); + break; + + case Tcenter+RBRA: + case Tdiv+RBRA: + popjust(ps); + break; + + // <!ELEMENT DD - O %flow > + case Tdd: + if(ps->hangstk.n == 0) { + if(warn) + fprint(2, "warning: <DD> not inside <DL\n"); + continue; + } + h = top(&ps->hangstk, 0); + if(h != 0) + changehang(ps, -10*LISTTAB); + else + addbrk(ps, 0, 0); + push(&ps->hangstk, 0); + break; + + //<!ELEMENT (DIR|MENU) - - (LI)+ -(%block) > + //<!ELEMENT (OL|UL) - - (LI)+> + case Tdir: + case Tmenu: + case Tol: + case 
Tul: + changeindent(ps, LISTTAB); + push(&ps->listtypestk, listtyval(tok, (tag==Tol)? LT1 : LTdisc)); + push(&ps->listcntstk, aintval(tok, Astart, 1)); + break; + + case Tdir+RBRA: + case Tmenu+RBRA: + case Tol+RBRA: + case Tul+RBRA: + if(ps->listtypestk.n == 0) { + if(warn) + fprint(2, "warning: %T ended no list\n", tok); + continue; + } + addbrk(ps, 0, 0); + pop(&ps->listtypestk); + pop(&ps->listcntstk); + changeindent(ps, -LISTTAB); + break; + + // <!ELEMENT DL - - (DT|DD)+ > + case Tdl: + changeindent(ps, LISTTAB); + push(&ps->hangstk, 0); + break; + + case Tdl+RBRA: + if(ps->hangstk.n == 0) { + if(warn) + fprint(2, "warning: unexpected </DL>\n"); + continue; + } + changeindent(ps, -LISTTAB); + if(top(&ps->hangstk, 0) != 0) + changehang(ps, -10*LISTTAB); + pop(&ps->hangstk); + break; + + // <!ELEMENT DT - O (%text)* > + case Tdt: + if(ps->hangstk.n == 0) { + if(warn) + fprint(2, "warning: <DT> not inside <DL>\n"); + continue; + } + h = top(&ps->hangstk, 0); + pop(&ps->hangstk); + if(h != 0) + changehang(ps, -10*LISTTAB); + changehang(ps, 10*LISTTAB); + push(&ps->hangstk, 1); + break; + + // <!ELEMENT FONT - - (%text)*> + case Tfont: + sz = top(&ps->fntsizestk, Normal); + if(_tokaval(tok, Asize, &nsz, 0)) { + if(_prefix(L(Lplus), nsz)) + sz = Normal + _Strtol(nsz+1, nil, 10) + ps->adjsize; + else if(_prefix(L(Lminus), nsz)) + sz = Normal - _Strtol(nsz+1, nil, 10) + ps->adjsize; + else if(nsz != nil) + sz = Normal + (_Strtol(nsz, nil, 10) - 3); + } + ps->curfg = push(&ps->fgstk, acolorval(tok, Acolor, ps->curfg)); + pushfontsize(ps, sz); + break; + + case Tfont+RBRA: + if(ps->fgstk.n == 0) { + if(warn) + fprint(2, "warning: unexpected </FONT>\n"); + continue; + } + ps->curfg = popretnewtop(&ps->fgstk, di->text); + popfontsize(ps); + break; + + // <!ELEMENT FORM - - %body.content -(FORM) > + case Tform: + if(is->curform != nil) { + if(warn) + fprint(2, "warning: <FORM> nested inside another\n"); + continue; + } + action = aurlval(tok, Aaction, di->base, di->base); 
+ s = aval(tok, Aid); + name = astrval(tok, Aname, s); + if(s) + free(s); + target = atargval(tok, di->target); + method = atabval(tok, Amethod, method_tab, NMETHODTAB, HGet); + if(warn && _tokaval(tok, Aenctype, &enctype, 0) && + _Strcmp(enctype, L(Lappl_form))) + fprint(2, "form enctype %S not handled\n", enctype); + frm = newform(++is->nforms, name, action, target, method, di->forms); + di->forms = frm; + is->curform = frm; + break; + + case Tform+RBRA: + if(is->curform == nil) { + if(warn) + fprint(2, "warning: unexpected </FORM>\n"); + continue; + } + // put fields back in input order + is->curform->fields = (Formfield*)_revlist((List*)is->curform->fields); + is->curform = nil; + break; + + // <!ELEMENT FRAME - O EMPTY> + case Tframe: + ks = is->kidstk; + if(ks == nil) { + if(warn) + fprint(2, "warning: <FRAME> not in <FRAMESET>\n"); + continue; + } + ks->kidinfos = kd = newkidinfo(0, ks->kidinfos); + kd->src = aurlval(tok, Asrc, nil, di->base); + kd->name = aval(tok, Aname); + if(kd->name == nil) { + s = _ltoStr(++is->nframes); + kd->name = _Strdup2(L(Lfr), s); + free(s); + } + kd->marginw = auintval(tok, Amarginwidth, 0); + kd->marginh = auintval(tok, Amarginheight, 0); + kd->framebd = auintval(tok, Aframeborder, 1); + kd->flags = atabval(tok, Ascrolling, fscroll_tab, NFSCROLLTAB, kd->flags); + norsz = aflagval(tok, Anoresize); + if(norsz) + kd->flags |= FRnoresize; + break; + + // <!ELEMENT FRAMESET - - (FRAME|FRAMESET)+> + case Tframeset: + ks = newkidinfo(1, nil); + pks = is->kidstk; + if(pks == nil) + di->kidinfo = ks; + else { + ks->next = pks->kidinfos; + pks->kidinfos = ks; + } + ks->nextframeset = pks; + is->kidstk = ks; + setdimarray(tok, Arows, &ks->rows, &ks->nrows); + if(ks->nrows == 0) { + ks->rows = (Dimen*)emalloc(sizeof(Dimen)); + ks->nrows = 1; + ks->rows[0] = makedimen(Dpercent, 100); + } + setdimarray(tok, Acols, &ks->cols, &ks->ncols); + if(ks->ncols == 0) { + ks->cols = (Dimen*)emalloc(sizeof(Dimen)); + ks->ncols = 1; + ks->cols[0] = 
makedimen(Dpercent, 100); + } + break; + + case Tframeset+RBRA: + if(is->kidstk == nil) { + if(warn) + fprint(2, "warning: unexpected </FRAMESET>\n"); + continue; + } + ks = is->kidstk; + // put kids back in original order + // and add blank frames to fill out cells + n = ks->nrows*ks->ncols; + nblank = n - _listlen((List*)ks->kidinfos); + while(nblank-- > 0) + ks->kidinfos = newkidinfo(0, ks->kidinfos); + ks->kidinfos = (Kidinfo*)_revlist((List*)ks->kidinfos); + is->kidstk = is->kidstk->nextframeset; + if(is->kidstk == nil) { + // end input + ans = nil; + goto return_ans; + } + break; + + // <!ELEMENT H1 - - (%text;)*>, etc. + case Th1: + case Th2: + case Th3: + case Th4: + case Th5: + case Th6: + bramt = 1; + if(ps->items == ps->lastit) + bramt = 0; + addbrk(ps, bramt, IFcleft|IFcright); + sz = Verylarge - (tag - Th1); + if(sz < Tiny) + sz = Tiny; + pushfontsize(ps, sz); + sty = top(&ps->fntstylestk, FntR); + if(tag == Th1) + sty = FntB; + pushfontstyle(ps, sty); + pushjust(ps, atabval(tok, Aalign, align_tab, NALIGNTAB, ps->curjust)); + ps->skipwhite = 1; + break; + + case Th1+RBRA: + case Th2+RBRA: + case Th3+RBRA: + case Th4+RBRA: + case Th5+RBRA: + case Th6+RBRA: + addbrk(ps, 1, IFcleft|IFcright); + popfontsize(ps); + popfontstyle(ps); + popjust(ps); + break; + + case Thead: + // HTML spec says ignore regular markup in head, + // but Netscape and IE don't + // ps.skipping = 1; + break; + + case Thead+RBRA: + ps->skipping = 0; + break; + + // <!ELEMENT HR - O EMPTY> + case Thr: + al = atabval(tok, Aalign, align_tab, NALIGNTAB, ALcenter); + sz = auintval(tok, Asize, HRSZ); + wd = adimen(tok, Awidth); + if(dimenkind(wd) == Dnone) + wd = makedimen(Dpercent, 100); + nosh = aflagval(tok, Anoshade); + additem(ps, newirule(al, sz, nosh, wd), tok); + addbrk(ps, 0, 0); + break; + + case Ti: + case Tcite: + case Tdfn: + case Tem: + case Tvar: + case Taddress: + pushfontstyle(ps, FntI); + break; + + // <!ELEMENT IMG - O EMPTY> + case Timg: + map = nil; + oldcuranchor = 
ps->curanchor; + if(_tokaval(tok, Ausemap, &usemap, 0)) { + if(!_prefix(L(Lhash), usemap)) { + if(warn) + fprint(2, "warning: can't handle non-local map %S\n", usemap); + } + else { + map = getmap(di, usemap+1); + if(ps->curanchor == 0) { + di->anchors = newanchor(++is->nanchors, nil, nil, di->target, di->anchors); + ps->curanchor = is->nanchors; + } + } + } + align = atabval(tok, Aalign, align_tab, NALIGNTAB, ALbottom); + dfltbd = 0; + if(ps->curanchor != 0) + dfltbd = 2; + src = aurlval(tok, Asrc, nil, di->base); + if(src == nil) { + if(warn) + fprint(2, "warning: <img> has no src attribute\n"); + ps->curanchor = oldcuranchor; + continue; + } + img = newiimage(src, + aval(tok, Aalt), + align, + auintval(tok, Awidth, 0), + auintval(tok, Aheight, 0), + auintval(tok, Ahspace, IMGHSPACE), + auintval(tok, Avspace, IMGVSPACE), + auintval(tok, Aborder, dfltbd), + aflagval(tok, Aismap), + map); + if(align == ALleft || align == ALright) { + additem(ps, newifloat(img, align), tok); + // if no hspace specified, use FLTIMGHSPACE + if(!_tokaval(tok, Ahspace, &val, 0)) + ((Iimage*)img)->hspace = FLTIMGHSPACE; + } + else { + ps->skipwhite = 0; + additem(ps, img, tok); + } + if(!ps->skipping) { + ((Iimage*)img)->nextimage = di->images; + di->images = (Iimage*)img; + } + ps->curanchor = oldcuranchor; + break; + + // <!ELEMENT INPUT - O EMPTY> + case Tinput: + ps->skipwhite = 0; + if(is->curform == nil) { + if(warn) + fprint(2, "<INPUT> not inside <FORM>\n"); + continue; + } + is->curform->fields = field = newformfield( + atabval(tok, Atype, input_tab, NINPUTTAB, Ftext), + ++is->curform->nfields, + is->curform, + aval(tok, Aname), + aval(tok, Avalue), + auintval(tok, Asize, 0), + auintval(tok, Amaxlength, 1000), + is->curform->fields); + if(aflagval(tok, Achecked)) + field->flags = FFchecked; + + switch(field->ftype) { + case Ftext: + case Fpassword: + case Ffile: + if(field->size == 0) + field->size = 20; + break; + + case Fcheckbox: + if(field->name == nil) { + if(warn) + 
fprint(2, "warning: checkbox form field missing name\n"); + continue; + } + if(field->value == nil) + field->value = _Strdup(L(Lone)); + break; + + case Fradio: + if(field->name == nil || field->value == nil) { + if(warn) + fprint(2, "warning: radio form field missing name or value\n"); + continue; + } + break; + + case Fsubmit: + if(field->value == nil) + field->value = _Strdup(L(Lsubmit)); + if(field->name == nil) + field->name = _Strdup(L(Lnoname)); + break; + + case Fimage: + src = aurlval(tok, Asrc, nil, di->base); + if(src == nil) { + if(warn) + fprint(2, "warning: image form field missing src\n"); + continue; + } + // width and height attrs aren't specified in HTML 3.2, + // but some people provide them and they help avoid + // a relayout + field->image = newiimage(src, + astrval(tok, Aalt, L(Lsubmit)), + atabval(tok, Aalign, align_tab, NALIGNTAB, ALbottom), + auintval(tok, Awidth, 0), auintval(tok, Aheight, 0), + 0, 0, 0, 0, nil); + ii = (Iimage*)field->image; + ii->nextimage = di->images; + di->images = ii; + break; + + case Freset: + if(field->value == nil) + field->value = _Strdup(L(Lreset)); + break; + + case Fbutton: + if(field->value == nil) + field->value = _Strdup(L(Lspace)); + break; + } + ffit = newiformfield(field); + additem(ps, ffit, tok); + if(ffit->genattr != nil) + field->events = ffit->genattr->events; + break; + + // <!ENTITY ISINDEX - O EMPTY> + case Tisindex: + ps->skipwhite = 0; + prompt = astrval(tok, Aprompt, L(Lindex)); + target = atargval(tok, di->target); + additem(ps, textit(ps, prompt), tok); + frm = newform(++is->nforms, + nil, + di->base, + target, + HGet, + di->forms); + di->forms = frm; + ff = newformfield(Ftext, + 1, + frm, + _Strdup(L(Lisindex)), + nil, + 50, + 1000, + nil); + frm->fields = ff; + frm->nfields = 1; + additem(ps, newiformfield(ff), tok); + addbrk(ps, 1, 0); + break; + + // <!ELEMENT LI - O %flow> + case Tli: + if(ps->listtypestk.n == 0) { + if(warn) + fprint(2, "<LI> not in list\n"); + continue; + } + ty = 
top(&ps->listtypestk, 0); + ty2 = listtyval(tok, ty); + if(ty != ty2) { + ty = ty2; + push(&ps->listtypestk, ty2); + } + v = aintval(tok, Avalue, top(&ps->listcntstk, 1)); + if(ty == LTdisc || ty == LTsquare || ty == LTcircle) + hang = 10*LISTTAB - 3; + else + hang = 10*LISTTAB - 1; + changehang(ps, hang); + addtext(ps, listmark(ty, v)); + push(&ps->listcntstk, v + 1); + changehang(ps, -hang); + ps->skipwhite = 1; + break; + + // <!ELEMENT MAP - - (AREA)+> + case Tmap: + if(_tokaval(tok, Aname, &name, 0)) + is->curmap = getmap(di, name); + break; + + case Tmap+RBRA: + map = is->curmap; + if(map == nil) { + if(warn) + fprint(2, "warning: unexpected </MAP>\n"); + continue; + } + map->areas = (Area*)_revlist((List*)map->areas); + break; + + case Tmeta: + if(ps->skipping) + continue; + if(_tokaval(tok, Ahttp_equiv, &equiv, 0)) { + val = aval(tok, Acontent); + n = _Strlen(equiv); + if(!_Strncmpci(equiv, n, L(Lrefresh))) + di->refresh = val; + else if(!_Strncmpci(equiv, n, L(Lcontent))) { + n = _Strlen(val); + if(!_Strncmpci(val, n, L(Ljavascript)) + || !_Strncmpci(val, n, L(Ljscript1)) + || !_Strncmpci(val, n, L(Ljscript))) + di->scripttype = TextJavascript; + else { + if(warn) + fprint(2, "unimplemented script type %S\n", val); + di->scripttype = UnknownType; + } + } + } + break; + + // Nobr is NOT in HMTL 4.0, but it is ubiquitous on the web + case Tnobr: + ps->skipwhite = 0; + ps->curstate &= ~IFwrap; + break; + + case Tnobr+RBRA: + ps->curstate |= IFwrap; + break; + + // We do frames, so skip stuff in noframes + case Tnoframes: + ps->skipping = 1; + break; + + case Tnoframes+RBRA: + ps->skipping = 0; + break; + + // We do scripts (if enabled), so skip stuff in noscripts + case Tnoscript: + if(doscripts) + ps->skipping = 1; + break; + + case Tnoscript+RBRA: + if(doscripts) + ps->skipping = 0; + break; + + // <!ELEMENT OPTION - O ( //PCDATA)> + case Toption: + if(is->curform == nil || is->curform->fields == nil) { + if(warn) + fprint(2, "warning: <OPTION> not in 
<SELECT>\n"); + continue; + } + field = is->curform->fields; + if(field->ftype != Fselect) { + if(warn) + fprint(2, "warning: <OPTION> not in <SELECT>\n"); + continue; + } + val = aval(tok, Avalue); + option = newoption(aflagval(tok, Aselected), val, nil, field->options); + field->options = option; + option->display = getpcdata(toks, tokslen, &toki); + if(val == nil) + option->value = _Strdup(option->display); + break; + + // <!ELEMENT P - O (%text)* > + case Tp: + pushjust(ps, atabval(tok, Aalign, align_tab, NALIGNTAB, ps->curjust)); + ps->inpar = 1; + ps->skipwhite = 1; + break; + + case Tp+RBRA: + break; + + // <!ELEMENT PARAM - O EMPTY> + // Do something when we do applets... + case Tparam: + break; + + // <!ELEMENT PRE - - (%text)* -(IMG|BIG|SMALL|SUB|SUP|FONT) > + case Tpre: + ps->curstate &= ~IFwrap; + ps->literal = 1; + ps->skipwhite = 0; + pushfontstyle(ps, FntT); + break; + + case Tpre+RBRA: + ps->curstate |= IFwrap; + if(ps->literal) { + popfontstyle(ps); + ps->literal = 0; + } + break; + + // <!ELEMENT SCRIPT - - CDATA> + case Tscript: + if(doscripts) { + if(!di->hasscripts) { + if(di->scripttype == TextJavascript) { + // TODO: initialize script if nec. 
+ // initjscript(di); + di->hasscripts = 1; + } + } + } + if(!di->hasscripts) { + if(warn) + fprint(2, "warning: <SCRIPT> ignored\n"); + ps->skipping = 1; + } + else { + scriptsrc = aurlval(tok, Asrc, nil, di->base); + script = nil; + if(scriptsrc != nil) { + if(warn) + fprint(2, "warning: non-local <SCRIPT> ignored\n"); + free(scriptsrc); + } + else { + script = getpcdata(toks, tokslen, &toki); + } + if(script != nil) { + if(warn) + fprint(2, "script ignored\n"); + free(script); + } + } + break; + + case Tscript+RBRA: + ps->skipping = 0; + break; + + // <!ELEMENT SELECT - - (OPTION+)> + case Tselect: + if(is->curform == nil) { + if(warn) + fprint(2, "<SELECT> not inside <FORM>\n"); + continue; + } + field = newformfield(Fselect, + ++is->curform->nfields, + is->curform, + aval(tok, Aname), + nil, + auintval(tok, Asize, 0), + 0, + is->curform->fields); + is->curform->fields = field; + if(aflagval(tok, Amultiple)) + field->flags = FFmultiple; + ffit = newiformfield(field); + additem(ps, ffit, tok); + if(ffit->genattr != nil) + field->events = ffit->genattr->events; + // throw away stuff until next tag (should be <OPTION>) + s = getpcdata(toks, tokslen, &toki); + if(s != nil) + free(s); + break; + + case Tselect+RBRA: + if(is->curform == nil || is->curform->fields == nil) { + if(warn) + fprint(2, "warning: unexpected </SELECT>\n"); + continue; + } + field = is->curform->fields; + if(field->ftype != Fselect) + continue; + // put options back in input order + field->options = (Option*)_revlist((List*)field->options); + break; + + // <!ELEMENT (STRIKE|U) - - (%text)*> + case Tstrike: + case Tu: + ps->curul = push(&ps->ulstk, (tag==Tstrike)? 
ULmid : ULunder); + break; + + case Tstrike+RBRA: + case Tu+RBRA: + if(ps->ulstk.n == 0) { + if(warn) + fprint(2, "warning: unexpected %T\n", tok); + continue; + } + ps->curul = popretnewtop(&ps->ulstk, ULnone); + break; + + // <!ELEMENT STYLE - - CDATA> + case Tstyle: + if(warn) + fprint(2, "warning: unimplemented <STYLE>\n"); + ps->skipping = 1; + break; + + case Tstyle+RBRA: + ps->skipping = 0; + break; + + // <!ELEMENT (SUB|SUP) - - (%text)*> + case Tsub: + case Tsup: + if(tag == Tsub) + ps->curvoff += SUBOFF; + else + ps->curvoff -= SUPOFF; + push(&ps->voffstk, ps->curvoff); + sz = top(&ps->fntsizestk, Normal); + pushfontsize(ps, sz - 1); + break; + + case Tsub+RBRA: + case Tsup+RBRA: + if(ps->voffstk.n == 0) { + if(warn) + fprint(2, "warning: unexpected %T\n", tok); + continue; + } + ps->curvoff = popretnewtop(&ps->voffstk, 0); + popfontsize(ps); + break; + + // <!ELEMENT TABLE - - (CAPTION?, TR+)> + case Ttable: + ps->skipwhite = 0; + tab = newtable(++is->ntables, + aalign(tok), + adimen(tok, Awidth), + aflagval(tok, Aborder), + auintval(tok, Acellspacing, TABSP), + auintval(tok, Acellpadding, TABPAD), + makebackground(nil, acolorval(tok, Abgcolor, ps->curbg.color)), + tok, + is->tabstk); + is->tabstk = tab; + curtab = tab; + break; + + case Ttable+RBRA: + if(curtab == nil) { + if(warn) + fprint(2, "warning: unexpected </TABLE>\n"); + continue; + } + isempty = (curtab->cells == nil); + if(isempty) { + if(warn) + fprint(2, "warning: <TABLE> has no cells\n"); + } + else { + ps = finishcell(curtab, ps); + if(curtab->rows != nil) + curtab->rows->flags = 0; + finish_table(curtab); + } + ps->skipping = 0; + if(!isempty) { + tabitem = newitable(curtab); + al = curtab->align.halign; + switch(al) { + case ALleft: + case ALright: + additem(ps, newifloat(tabitem, al), tok); + break; + default: + if(al == ALcenter) + pushjust(ps, ALcenter); + addbrk(ps, 0, 0); + if(ps->inpar) { + popjust(ps); + ps->inpar = 0; + } + additem(ps, tabitem, curtab->tabletok); + if(al == 
ALcenter) + popjust(ps); + break; + } + } + if(is->tabstk == nil) { + if(warn) + fprint(2, "warning: table stack is wrong\n"); + } + else + is->tabstk = is->tabstk->next; + curtab->next = di->tables; + di->tables = curtab; + curtab = is->tabstk; + if(!isempty) + addbrk(ps, 0, 0); + break; + + // <!ELEMENT (TH|TD) - O %body.content> + // Cells for a row are accumulated in reverse order. + // We push ps on a stack, and use a new one to accumulate + // the contents of the cell. + case Ttd: + case Tth: + if(curtab == nil) { + if(warn) + fprint(2, "%T outside <TABLE>\n", tok); + continue; + } + if(ps->inpar) { + popjust(ps); + ps->inpar = 0; + } + ps = finishcell(curtab, ps); + tr = nil; + if(curtab->rows != nil) + tr = curtab->rows; + if(tr == nil || !tr->flags) { + if(warn) + fprint(2, "%T outside row\n", tok); + tr = newtablerow(makealign(ALnone, ALnone), + makebackground(nil, curtab->background.color), + TFparsing, + curtab->rows); + curtab->rows = tr; + } + ps = cell_pstate(ps, tag == Tth); + flags = TFparsing; + if(aflagval(tok, Anowrap)) { + flags |= TFnowrap; + ps->curstate &= ~IFwrap; + } + if(tag == Tth) + flags |= TFisth; + c = newtablecell(curtab->cells==nil? 
1 : curtab->cells->cellid+1, + auintval(tok, Arowspan, 1), + auintval(tok, Acolspan, 1), + aalign(tok), + adimen(tok, Awidth), + auintval(tok, Aheight, 0), + makebackground(nil, acolorval(tok, Abgcolor, tr->background.color)), + flags, + curtab->cells); + curtab->cells = c; + ps->curbg = c->background; + if(c->align.halign == ALnone) { + if(tr->align.halign != ALnone) + c->align.halign = tr->align.halign; + else if(tag == Tth) + c->align.halign = ALcenter; + else + c->align.halign = ALleft; + } + if(c->align.valign == ALnone) { + if(tr->align.valign != ALnone) + c->align.valign = tr->align.valign; + else + c->align.valign = ALmiddle; + } + c->nextinrow = tr->cells; + tr->cells = c; + break; + + case Ttd+RBRA: + case Tth+RBRA: + if(curtab == nil || curtab->cells == nil) { + if(warn) + fprint(2, "unexpected %T\n", tok); + continue; + } + ps = finishcell(curtab, ps); + break; + + // <!ELEMENT TEXTAREA - - ( //PCDATA)> + case Ttextarea: + if(is->curform == nil) { + if(warn) + fprint(2, "<TEXTAREA> not inside <FORM>\n"); + continue; + } + field = newformfield(Ftextarea, + ++is->curform->nfields, + is->curform, + aval(tok, Aname), + nil, + 0, + 0, + is->curform->fields); + is->curform->fields = field; + field->rows = auintval(tok, Arows, 3); + field->cols = auintval(tok, Acols, 50); + field->value = getpcdata(toks, tokslen, &toki); + if(warn && toki < tokslen - 1 && toks[toki + 1].tag != Ttextarea + RBRA) + fprint(2, "warning: <TEXTAREA> data ended by %T\n", &toks[toki + 1]); + ffit = newiformfield(field); + additem(ps, ffit, tok); + if(ffit->genattr != nil) + field->events = ffit->genattr->events; + break; + + // <!ELEMENT TITLE - - ( //PCDATA)* -(%head.misc)> + case Ttitle: + di->doctitle = getpcdata(toks, tokslen, &toki); + if(warn && toki < tokslen - 1 && toks[toki + 1].tag != Ttitle + RBRA) + fprint(2, "warning: <TITLE> data ended by %T\n", &toks[toki + 1]); + break; + + // <!ELEMENT TR - O (TH|TD)+> + // rows are accumulated in reverse order in curtab->rows + case 
Ttr: + if(curtab == nil) { + if(warn) + fprint(2, "warning: <TR> outside <TABLE>\n"); + continue; + } + if(ps->inpar) { + popjust(ps); + ps->inpar = 0; + } + ps = finishcell(curtab, ps); + if(curtab->rows != nil) + curtab->rows->flags = 0; + curtab->rows = newtablerow(aalign(tok), + makebackground(nil, acolorval(tok, Abgcolor, curtab->background.color)), + TFparsing, + curtab->rows); + break; + + case Ttr+RBRA: + if(curtab == nil || curtab->rows == nil) { + if(warn) + fprint(2, "warning: unexpected </TR>\n"); + continue; + } + ps = finishcell(curtab, ps); + tr = curtab->rows; + if(tr->cells == nil) { + if(warn) + fprint(2, "warning: empty row\n"); + curtab->rows = tr->next; + tr->next = nil; + } + else + tr->flags = 0; + break; + + // <!ELEMENT (TT|CODE|KBD|SAMP) - - (%text)*> + case Ttt: + case Tcode: + case Tkbd: + case Tsamp: + pushfontstyle(ps, FntT); + break; + + // Tags that have empty action + case Tabbr: + case Tabbr+RBRA: + case Tacronym: + case Tacronym+RBRA: + case Tarea+RBRA: + case Tbase+RBRA: + case Tbasefont+RBRA: + case Tbr+RBRA: + case Tdd+RBRA: + case Tdt+RBRA: + case Tframe+RBRA: + case Thr+RBRA: + case Thtml: + case Thtml+RBRA: + case Timg+RBRA: + case Tinput+RBRA: + case Tisindex+RBRA: + case Tli+RBRA: + case Tlink: + case Tlink+RBRA: + case Tmeta+RBRA: + case Toption+RBRA: + case Tparam+RBRA: + case Ttextarea+RBRA: + case Ttitle+RBRA: + break; + + + // Tags not implemented + case Tbdo: + case Tbdo+RBRA: + case Tbutton: + case Tbutton+RBRA: + case Tdel: + case Tdel+RBRA: + case Tfieldset: + case Tfieldset+RBRA: + case Tiframe: + case Tiframe+RBRA: + case Tins: + case Tins+RBRA: + case Tlabel: + case Tlabel+RBRA: + case Tlegend: + case Tlegend+RBRA: + case Tobject: + case Tobject+RBRA: + case Toptgroup: + case Toptgroup+RBRA: + case Tspan: + case Tspan+RBRA: + if(warn) { + if(tag > RBRA) + tag -= RBRA; + fprint(2, "warning: unimplemented HTML tag: %S\n", tagnames[tag]); + } + break; + + default: + if(warn) + fprint(2, "warning: unknown HTML tag: 
%S\n", tok->text); + break; + } + } + // some pages omit trailing </table> + while(curtab != nil) { + if(warn) + fprint(2, "warning: <TABLE> not closed\n"); + if(curtab->cells != nil) { + ps = finishcell(curtab, ps); + if(curtab->cells == nil) { + if(warn) + fprint(2, "warning: empty table\n"); + } + else { + if(curtab->rows != nil) + curtab->rows->flags = 0; + finish_table(curtab); + ps->skipping = 0; + additem(ps, newitable(curtab), curtab->tabletok); + addbrk(ps, 0, 0); + } + } + if(is->tabstk != nil) + is->tabstk = is->tabstk->next; + curtab->next = di->tables; + di->tables = curtab; + curtab = is->tabstk; + } + outerps = lastps(ps); + ans = outerps->items->next; + // note: ans may be nil and di->kids not nil, if there's a frameset! + outerps->items = newispacer(ISPnull); + outerps->lastit = outerps->items; + is->psstk = ps; + if(ans != nil && di->hasscripts) { + // TODO evalscript(nil); + ; + } + +return_ans: + if(dbgbuild) { + assert(validitems(ans)); + if(ans == nil) + fprint(2, "getitems returning nil\n"); + else + printitems(ans, "getitems returning:"); + } + return ans; +} + +// Concatenate together maximal set of Data tokens, starting at toks[toki+1]. +// Lexer has ensured that there will either be a following non-data token or +// we will be at eof. 
+// Return emallocd trimmed concatenation, and update *ptoki to last used toki +static Rune* +getpcdata(Token* toks, int tokslen, int* ptoki) +{ + Rune* ans; + Rune* p; + Rune* trimans; + int anslen; + int trimanslen; + int toki; + Token* tok; + + ans = nil; + anslen = 0; + // first find length of answer + toki = (*ptoki) + 1; + while(toki < tokslen) { + tok = &toks[toki]; + if(tok->tag == Data) { + toki++; + anslen += _Strlen(tok->text); + } + else + break; + } + // now make up the initial answer + if(anslen > 0) { + ans = _newstr(anslen); + p = ans; + toki = (*ptoki) + 1; + while(toki < tokslen) { + tok = &toks[toki]; + if(tok->tag == Data) { + toki++; + p = _Stradd(p, tok->text, _Strlen(tok->text)); + } + else + break; + } + *p = 0; + _trimwhite(ans, anslen, &trimans, &trimanslen); + if(trimanslen != anslen) { + p = ans; + ans = _Strndup(trimans, trimanslen); + free(p); + } + } + *ptoki = toki-1; + return ans; +} + +// If still parsing head of curtab->cells list, finish it off +// by transferring the items on the head of psstk to the cell. +// Then pop the psstk and return the new psstk. +static Pstate* +finishcell(Table* curtab, Pstate* psstk) +{ + Tablecell* c; + Pstate* psstknext; + + c = curtab->cells; + if(c != nil) { + if((c->flags&TFparsing)) { + psstknext = psstk->next; + if(psstknext == nil) { + if(warn) + fprint(2, "warning: parse state stack is wrong\n"); + } + else { + c->content = psstk->items->next; + c->flags &= ~TFparsing; + freepstate(psstk); + psstk = psstknext; + } + } + } + return psstk; +} + +// Make a new Pstate for a cell, based on the old pstate, oldps. +// Also, put the new ps on the head of the oldps stack. 
+static Pstate* +cell_pstate(Pstate* oldps, int ishead) +{ + Pstate* ps; + int sty; + + ps = newpstate(oldps); + ps->skipwhite = 1; + ps->curanchor = oldps->curanchor; + copystack(&ps->fntstylestk, &oldps->fntstylestk); + copystack(&ps->fntsizestk, &oldps->fntsizestk); + ps->curfont = oldps->curfont; + ps->curfg = oldps->curfg; + ps->curbg = oldps->curbg; + copystack(&ps->fgstk, &oldps->fgstk); + ps->adjsize = oldps->adjsize; + if(ishead) { + sty = ps->curfont%NumSize; + ps->curfont = FntB*NumSize + sty; + } + return ps; +} + +// Return a new Pstate with default starting state. +// Use link to add it to head of a list, if any. +static Pstate* +newpstate(Pstate* link) +{ + Pstate* ps; + + ps = (Pstate*)emalloc(sizeof(Pstate)); + ps->curfont = DefFnt; + ps->curfg = Black; + ps->curbg.image = nil; + ps->curbg.color = White; + ps->curul = ULnone; + ps->curjust = ALleft; + ps->curstate = IFwrap; + ps->items = newispacer(ISPnull); + ps->lastit = ps->items; + ps->prelastit = nil; + ps->next = link; + return ps; +} + +// Return last Pstate on psl list +static Pstate* +lastps(Pstate* psl) +{ + assert(psl != nil); + while(psl->next != nil) + psl = psl->next; + return psl; +} + +// Add it to end of ps item chain, adding in current state from ps. +// Also, if tok is not nil, scan it for generic attributes and assign +// the genattr field of the item accordingly. 
+static void +additem(Pstate* ps, Item* it, Token* tok) +{ + int aid; + int any; + Rune* i; + Rune* c; + Rune* s; + Rune* t; + Attr* a; + SEvent* e; + + if(ps->skipping) { + if(warn) + fprint(2, "warning: skipping item: %I\n", it); + return; + } + it->anchorid = ps->curanchor; + it->state |= ps->curstate; + if(tok != nil) { + any = 0; + i = nil; + c = nil; + s = nil; + t = nil; + e = nil; + for(a = tok->attr; a != nil; a = a->next) { + aid = a->attid; + if(!attrinfo[aid]) + continue; + switch(aid) { + case Aid: + i = a->value; + break; + + case Aclass: + c = a->value; + break; + + case Astyle: + s = a->value; + break; + + case Atitle: + t = a->value; + break; + + default: + assert(aid >= Aonblur && aid <= Aonunload); + e = newscriptevent(scriptev[a->attid], a->value, e); + break; + } + a->value = nil; + any = 1; + } + if(any) + it->genattr = newgenattr(i, c, s, t, e); + } + ps->curstate &= ~(IFbrk|IFbrksp|IFnobrk|IFcleft|IFcright); + ps->prelastit = ps->lastit; + ps->lastit->next = it; + ps->lastit = it; +} + +// Make a text item out of s, +// using current font, foreground, vertical offset and underline state. +static Item* +textit(Pstate* ps, Rune* s) +{ + assert(s != nil); + return newitext(s, ps->curfont, ps->curfg, ps->curvoff + Voffbias, ps->curul); +} + +// Add text item or items for s, paying attention to +// current font, foreground, baseline offset, underline state, +// and literal mode. Unless we're in literal mode, compress +// whitespace to single blank, and, if curstate has a break, +// trim any leading whitespace. Whether in literal mode or not, +// turn nonbreaking spaces into spacer items with IFnobrk set. +// +// In literal mode, break up s at newlines and add breaks instead. +// Also replace tabs appropriate number of spaces. +// In nonliteral mode, break up the items every 100 or so characters +// just to make the layout algorithm not go quadratic. +// +// addtext assumes ownership of s. 
+static void +addtext(Pstate* ps, Rune* s) +{ + int n; + int i; + int j; + int k; + int col; + int c; + int nsp; + Item* it; + Rune* ss; + Rune* p; + Rune buf[SMALLBUFSIZE]; + + assert(s != nil); + n = runestrlen(s); + i = 0; + j = 0; + if(ps->literal) { + col = 0; + while(i < n) { + if(s[i] == '\n') { + if(i > j) { + // trim trailing blanks from line + for(k = i; k > j; k--) + if(s[k - 1] != ' ') + break; + if(k > j) + additem(ps, textit(ps, _Strndup(s+j, k-j)), nil); + } + addlinebrk(ps, 0); + j = i + 1; + col = 0; + } + else { + if(s[i] == '\t') { + col += i - j; + nsp = 8 - (col%8); + // make ss = s[j:i] + nsp spaces + ss = _newstr(i-j+nsp); + p = _Stradd(ss, s+j, i-j); + p = _Stradd(p, L(Ltab2space), nsp); + *p = 0; + additem(ps, textit(ps, ss), nil); + col += nsp; + j = i + 1; + } + else if(s[i] == NBSP) { + if(i > j) + additem(ps, textit(ps, _Strndup(s+j, i-j)), nil); + addnbsp(ps); + col += (i - j) + 1; + j = i + 1; + } + } + i++; + } + if(i > j) { + if(j == 0 && i == n) { + // just transfer s over + additem(ps, textit(ps, s), nil); + } + else { + additem(ps, textit(ps, _Strndup(s+j, i-j)), nil); + free(s); + } + } + } + else { // not literal mode + if((ps->curstate&IFbrk) || ps->lastit == ps->items) + while(i < n) { + c = s[i]; + if(c >= 256 || !isspace(c)) + break; + i++; + } + p = buf; + for(j = i; i < n; i++) { + assert(p+i-j < buf+SMALLBUFSIZE-1); + c = s[i]; + if(c == NBSP) { + if(i > j) + p = _Stradd(p, s+j, i-j); + if(p > buf) + additem(ps, textit(ps, _Strndup(buf, p-buf)), nil); + p = buf; + addnbsp(ps); + j = i + 1; + continue; + } + if(c < 256 && isspace(c)) { + if(i > j) + p = _Stradd(p, s+j, i-j); + *p++ = ' '; + while(i < n - 1) { + c = s[i + 1]; + if(c >= 256 || !isspace(c)) + break; + i++; + } + j = i + 1; + } + if(i - j >= 100) { + p = _Stradd(p, s+j, i+1-j); + j = i + 1; + } + if(p-buf >= 100) { + additem(ps, textit(ps, _Strndup(buf, p-buf)), nil); + p = buf; + } + } + if(i > j && j < n) { + assert(p+i-j < buf+SMALLBUFSIZE-1); + p = 
_Stradd(p, s+j, i-j); + } + // don't add a space if previous item ended in a space + if(p-buf == 1 && buf[0] == ' ' && ps->lastit != nil) { + it = ps->lastit; + if(it->tag == Itexttag) { + ss = ((Itext*)it)->s; + k = _Strlen(ss); + if(k > 0 && ss[k] == ' ') + p = buf; + } + } + if(p > buf) + additem(ps, textit(ps, _Strndup(buf, p-buf)), nil); + free(s); + } +} + +// Add a break to ps->curstate, with extra space if sp is true. +// If there was a previous break, combine this one's parameters +// with that to make the amt be the max of the two and the clr +// be the most general. (amt will be 0 or 1) +// Also, if the immediately preceding item was a text item, +// trim any whitespace from the end of it, if not in literal mode. +// Finally, if this is at the very beginning of the item list +// (the only thing there is a null spacer), then don't add the space. +static void +addbrk(Pstate* ps, int sp, int clr) +{ + int state; + Rune* l; + int nl; + Rune* r; + int nr; + Itext* t; + Rune* s; + + state = ps->curstate; + clr = clr|(state&(IFcleft|IFcright)); + if(sp && !(ps->lastit == ps->items)) + sp = IFbrksp; + else + sp = 0; + ps->curstate = IFbrk|sp|(state&~(IFcleft|IFcright))|clr; + if(ps->lastit != ps->items) { + if(!ps->literal && ps->lastit->tag == Itexttag) { + t = (Itext*)ps->lastit; + _splitr(t->s, _Strlen(t->s), notwhitespace, &l, &nl, &r, &nr); + // try to avoid making empty items + // but not crucial f the occasional one gets through + if(nl == 0 && ps->prelastit != nil) { + ps->lastit = ps->prelastit; + ps->lastit->next = nil; + ps->prelastit = nil; + } + else { + s = t->s; + if(nl == 0) { + // need a non-nil pointer to empty string + // (_Strdup(L(Lempty)) returns nil) + t->s = emalloc(sizeof(Rune)); + t->s[0] = 0; + } + else + t->s = _Strndup(l, nl); + if(s) + free(s); + } + } + } +} + +// Add break due to a <br> or a newline within a preformatted section. 
+// We add a null item first, with current font's height and ascent, to make +// sure that the current line takes up at least that amount of vertical space. +// This ensures that <br>s on empty lines cause blank lines, and that +// multiple <br>s in a row give multiple blank lines. +// However don't add the spacer if the previous item was something that +// takes up space itself. +static void +addlinebrk(Pstate* ps, int clr) +{ + int obrkstate; + int b; + int addit; + + // don't want break before our null item unless the previous item + // was also a null item for the purposes of line breaking + obrkstate = ps->curstate&(IFbrk|IFbrksp); + b = IFnobrk; + addit = 0; + if(ps->lastit != nil) { + if(ps->lastit->tag == Ispacertag) { + if(((Ispacer*)ps->lastit)->spkind == ISPvline) + b = IFbrk; + addit = 1; + } + else if(ps->lastit->tag == Ifloattag) + addit = 1; + } + if(addit) { + ps->curstate = (ps->curstate&~(IFbrk|IFbrksp))|b; + additem(ps, newispacer(ISPvline), nil); + ps->curstate = (ps->curstate&~(IFbrk|IFbrksp))|obrkstate; + } + addbrk(ps, 0, clr); +} + +// Add a nonbreakable space +static void +addnbsp(Pstate* ps) +{ + // if nbsp comes right where a break was specified, + // do the break anyway (nbsp is being used to generate undiscardable + // space rather than to prevent a break) + if((ps->curstate&IFbrk) == 0) + ps->curstate |= IFnobrk; + additem(ps, newispacer(ISPhspace), nil); + // but definitely no break on next item + ps->curstate |= IFnobrk; +} + +// Change hang in ps.curstate by delta. +// The amount is in 1/10ths of tabs, and is the amount that +// the current contiguous set of items with a hang value set +// is to be shifted left from its normal (indented) place. +static void +changehang(Pstate* ps, int delta) +{ + int amt; + + amt = (ps->curstate&IFhangmask) + delta; + if(amt < 0) { + if(warn) + fprint(2, "warning: hang went negative\n"); + amt = 0; + } + ps->curstate = (ps->curstate&~IFhangmask)|amt; +} + +// Change indent in ps.curstate by delta. 
+static void +changeindent(Pstate* ps, int delta) +{ + int amt; + + amt = ((ps->curstate&IFindentmask) >> IFindentshift) + delta; + if(amt < 0) { + if(warn) + fprint(2, "warning: indent went negative\n"); + amt = 0; + } + ps->curstate = (ps->curstate&~IFindentmask)|(amt << IFindentshift); +} + +// Push val on top of stack, and also return value pushed +static int +push(Stack* stk, int val) +{ + if(stk->n == Nestmax) { + if(warn) + fprint(2, "warning: build stack overflow\n"); + } + else + stk->slots[stk->n++] = val; + return val; +} + +// Pop top of stack +static void +pop(Stack* stk) +{ + if(stk->n > 0) + --stk->n; +} + +//Return top of stack, using dflt if stack is empty +static int +top(Stack* stk, int dflt) +{ + if(stk->n == 0) + return dflt; + return stk->slots[stk->n-1]; +} + +// pop, then return new top, with dflt if empty +static int +popretnewtop(Stack* stk, int dflt) +{ + if(stk->n == 0) + return dflt; + stk->n--; + if(stk->n == 0) + return dflt; + return stk->slots[stk->n-1]; +} + +// Copy fromstk entries into tostk +static void +copystack(Stack* tostk, Stack* fromstk) +{ + int n; + + n = fromstk->n; + tostk->n = n; + memmove(tostk->slots, fromstk->slots, n*sizeof(int)); +} + +static void +popfontstyle(Pstate* ps) +{ + pop(&ps->fntstylestk); + setcurfont(ps); +} + +static void +pushfontstyle(Pstate* ps, int sty) +{ + push(&ps->fntstylestk, sty); + setcurfont(ps); +} + +static void +popfontsize(Pstate* ps) +{ + pop(&ps->fntsizestk); + setcurfont(ps); +} + +static void +pushfontsize(Pstate* ps, int sz) +{ + push(&ps->fntsizestk, sz); + setcurfont(ps); +} + +static void +setcurfont(Pstate* ps) +{ + int sty; + int sz; + + sty = top(&ps->fntstylestk, FntR); + sz = top(&ps->fntsizestk, Normal); + if(sz < Tiny) + sz = Tiny; + if(sz > Verylarge) + sz = Verylarge; + ps->curfont = sty*NumSize + sz; +} + +static void +popjust(Pstate* ps) +{ + pop(&ps->juststk); + setcurjust(ps); +} + +static void +pushjust(Pstate* ps, int j) +{ + push(&ps->juststk, j); + 
setcurjust(ps); +} + +static void +setcurjust(Pstate* ps) +{ + int j; + int state; + + j = top(&ps->juststk, ALleft); + if(j != ps->curjust) { + ps->curjust = j; + state = ps->curstate; + state &= ~(IFrjust|IFcjust); + if(j == ALcenter) + state |= IFcjust; + else if(j == ALright) + state |= IFrjust; + ps->curstate = state; + } +} + +// Do final rearrangement after table parsing is finished +// and assign cells to grid points +static void +finish_table(Table* t) +{ + int ncol; + int nrow; + int r; + Tablerow* rl; + Tablecell* cl; + int* rowspancnt; + Tablecell** rowspancell; + int ri; + int ci; + Tablecell* c; + Tablecell* cnext; + Tablerow* row; + Tablerow* rownext; + int rcols; + int newncol; + int k; + int j; + int cspan; + int rspan; + int i; + + rl = t->rows; + t->nrow = nrow = _listlen((List*)rl); + t->rows = (Tablerow*)emalloc(nrow * sizeof(Tablerow)); + ncol = 0; + r = nrow - 1; + for(row = rl; row != nil; row = rownext) { + // copy the data from the allocated Tablerow into the array slot + t->rows[r] = *row; + rownext = row->next; + row = &t->rows[r]; + r--; + rcols = 0; + c = row->cells; + + // If rowspan is > 1 but this is the last row, + // reset the rowspan + if(c != nil && c->rowspan > 1 && r == nrow-2) + c->rowspan = 1; + + // reverse row->cells list (along nextinrow pointers) + row->cells = nil; + while(c != nil) { + cnext = c->nextinrow; + c->nextinrow = row->cells; + row->cells = c; + rcols += c->colspan; + c = cnext; + } + if(rcols > ncol) + ncol = rcols; + } + t->ncol = ncol; + t->cols = (Tablecol*)emalloc(ncol * sizeof(Tablecol)); + + // Reverse cells just so they are drawn in source order. + // Also, trim their contents so they don't end in whitespace. 
+ t->cells = (Tablecell*)_revlist((List*)t->cells); + for(c = t->cells; c != nil; c= c->next) + trim_cell(c); + t->grid = (Tablecell***)emalloc(nrow * sizeof(Tablecell**)); + for(i = 0; i < nrow; i++) + t->grid[i] = (Tablecell**)emalloc(ncol * sizeof(Tablecell*)); + + // The following arrays keep track of cells that are spanning + // multiple rows; rowspancnt[i] is the number of rows left + // to be spanned in column i. + // When done, cell's (row,col) is upper left grid point. + rowspancnt = (int*)emalloc(ncol * sizeof(int)); + rowspancell = (Tablecell**)emalloc(ncol * sizeof(Tablecell*)); + for(ri = 0; ri < nrow; ri++) { + row = &t->rows[ri]; + cl = row->cells; + ci = 0; + while(ci < ncol || cl != nil) { + if(ci < ncol && rowspancnt[ci] > 0) { + t->grid[ri][ci] = rowspancell[ci]; + rowspancnt[ci]--; + ci++; + } + else { + if(cl == nil) { + ci++; + continue; + } + c = cl; + cl = cl->nextinrow; + cspan = c->colspan; + rspan = c->rowspan; + if(ci + cspan > ncol) { + // because of row spanning, we calculated + // ncol incorrectly; adjust it + newncol = ci + cspan; + t->cols = (Tablecol*)erealloc(t->cols, newncol * sizeof(Tablecol)); + rowspancnt = (int*)erealloc(rowspancnt, newncol * sizeof(int)); + rowspancell = (Tablecell**)erealloc(rowspancell, newncol * sizeof(Tablecell*)); + k = newncol-ncol; + memset(t->cols+ncol, 0, k*sizeof(Tablecol)); + memset(rowspancnt+ncol, 0, k*sizeof(int)); + memset(rowspancell+ncol, 0, k*sizeof(Tablecell*)); + for(j = 0; j < nrow; j++) { + t->grid[j] = (Tablecell**)erealloc(t->grid[j], newncol * sizeof(Tablecell*)); + memset(t->grid[j], 0, k*sizeof(Tablecell*)); + } + t->ncol = ncol = newncol; + } + c->row = ri; + c->col = ci; + for(i = 0; i < cspan; i++) { + t->grid[ri][ci] = c; + if(rspan > 1) { + rowspancnt[ci] = rspan - 1; + rowspancell[ci] = c; + } + ci++; + } + } + } + } +} + +// Remove tail of cell content until it isn't whitespace. 
+static void +trim_cell(Tablecell* c) +{ + int dropping; + Rune* s; + Rune* x; + Rune* y; + int nx; + int ny; + Item* p; + Itext* q; + Item* pprev; + + dropping = 1; + while(c->content != nil && dropping) { + p = c->content; + pprev = nil; + while(p->next != nil) { + pprev = p; + p = p->next; + } + dropping = 0; + if(!(p->state&IFnobrk)) { + if(p->tag == Itexttag) { + q = (Itext*)p; + s = q->s; + _splitr(s, _Strlen(s), notwhitespace, &x, &nx, &y, &ny); + if(nx != 0 && ny != 0) { + q->s = _Strndup(x, nx); + free(s); + } + break; + } + } + if(dropping) { + if(pprev == nil) + c->content = nil; + else + pprev->next = nil; + freeitem(p); + } + } +} + +// Caller must free answer (eventually). +static Rune* +listmark(uchar ty, int n) +{ + Rune* s; + Rune* t; + int n2; + int i; + + s = nil; + switch(ty) { + case LTdisc: + case LTsquare: + case LTcircle: + s = _newstr(1); + s[0] = (ty == LTdisc)? 0x2022 // bullet + : ((ty == LTsquare)? 0x220e // filled square + : 0x2218); // degree + s[1] = 0; + break; + + case LT1: + t = _ltoStr(n); + n2 = _Strlen(t); + s = _newstr(n2+1); + t = _Stradd(s, t, n2); + *t++ = '.'; + *t = 0; + break; + + case LTa: + case LTA: + n--; + i = 0; + if(n < 0) + n = 0; + s = _newstr((n <= 25)? 2 : 3); + if(n > 25) { + n2 = n%26; + n /= 26; + if(n2 > 25) + n2 = 25; + s[i++] = n2 + (ty == LTa)? 'a' : 'A'; + } + s[i++] = n + (ty == LTa)? 'a' : 'A'; + s[i++] = '.'; + s[i] = 0; + break; + + case LTi: + case LTI: + if(n >= NROMAN) { + if(warn) + fprint(2, "warning: unimplemented roman number > %d\n", NROMAN); + n = NROMAN; + } + t = roman[n - 1]; + n2 = _Strlen(t); + s = _newstr(n2+1); + for(i = 0; i < n2; i++) + s[i] = (ty == LTi)? tolower(t[i]) : t[i]; + s[i++] = '.'; + s[i] = 0; + break; + } + return s; +} + +// Find map with given name in di.maps. +// If not there, add one, copying name. +// Ownership of map remains with di->maps list. 
+static Map* +getmap(Docinfo* di, Rune* name) +{ + Map* m; + + for(m = di->maps; m != nil; m = m->next) { + if(!_Strcmp(name, m->name)) + return m; + } + m = (Map*)emalloc(sizeof(Map)); + m->name = _Strdup(name); + m->areas = nil; + m->next = di->maps; + di->maps = m; + return m; +} + +// Transfers ownership of href to Area +static Area* +newarea(int shape, Rune* href, int target, Area* link) +{ + Area* a; + + a = (Area*)emalloc(sizeof(Area)); + a->shape = shape; + a->href = href; + a->target = target; + a->next = link; + return a; +} + +// Return string value associated with attid in tok, nil if none. +// Caller must free the result (eventually). +static Rune* +aval(Token* tok, int attid) +{ + Rune* ans; + + _tokaval(tok, attid, &ans, 1); // transfers string ownership from token to ans + return ans; +} + +// Like aval, but use dflt if there was no such attribute in tok. +// Caller must free the result (eventually). +static Rune* +astrval(Token* tok, int attid, Rune* dflt) +{ + Rune* ans; + + if(_tokaval(tok, attid, &ans, 1)) + return ans; // transfers string ownership from token to ans + else + return _Strdup(dflt); +} + +// Here we're supposed to convert to an int, +// and have a default when not found +static int +aintval(Token* tok, int attid, int dflt) +{ + Rune* ans; + + if(!_tokaval(tok, attid, &ans, 0) || ans == nil) + return dflt; + else + return toint(ans); +} + +// Like aintval, but result should be >= 0 +static int +auintval(Token* tok, int attid, int dflt) +{ + Rune* ans; + int v; + + if(!_tokaval(tok, attid, &ans, 0) || ans == nil) + return dflt; + else { + v = toint(ans); + return v >= 0? 
v : 0; + } +} + +// int conversion, but with possible error check (if warning) +static int +toint(Rune* s) +{ + int ans; + Rune* eptr; + + ans = _Strtol(s, &eptr, 10); + if(warn) { + if(*eptr != 0) { + eptr = _Strclass(eptr, notwhitespace); + if(eptr != nil) + fprint(2, "warning: expected integer, got %S\n", s); + } + } + return ans; +} + +// Attribute value when need a table to convert strings to ints +static int +atabval(Token* tok, int attid, StringInt* tab, int ntab, int dflt) +{ + Rune* aval; + int ans; + + ans = dflt; + if(_tokaval(tok, attid, &aval, 0)) { + if(!_lookup(tab, ntab, aval, _Strlen(aval), &ans)) { + ans = dflt; + if(warn) + fprint(2, "warning: name not found in table lookup: %S\n", aval); + } + } + return ans; +} + +// Attribute value when supposed to be a color +static int +acolorval(Token* tok, int attid, int dflt) +{ + Rune* aval; + int ans; + + ans = dflt; + if(_tokaval(tok, attid, &aval, 0)) + ans = color(aval, dflt); + return ans; +} + +// Attribute value when supposed to be a target frame name +static int +atargval(Token* tok, int dflt) +{ + int ans; + Rune* aval; + + ans = dflt; + if(_tokaval(tok, Atarget, &aval, 0)){ + ans = targetid(aval); + } + return ans; +} + +// special for list types, where "i" and "I" are different, +// but "square" and "SQUARE" are the same +static int +listtyval(Token* tok, int dflt) +{ + Rune* aval; + int ans; + int n; + + ans = dflt; + if(_tokaval(tok, Atype, &aval, 0)) { + n = _Strlen(aval); + if(n == 1) { + switch(aval[0]) { + case '1': + ans = LT1; + break; + case 'A': + ans = LTA; + break; + case 'I': + ans = LTI; + break; + case 'a': + ans = LTa; + break; + case 'i': + ans = LTi; + default: + if(warn) + fprint(2, "warning: unknown list element type %c\n", aval[0]); + } + } + else { + if(!_Strncmpci(aval, n, L(Lcircle))) + ans = LTcircle; + else if(!_Strncmpci(aval, n, L(Ldisc))) + ans = LTdisc; + else if(!_Strncmpci(aval, n, L(Lsquare))) + ans = LTsquare; + else { + if(warn) + fprint(2, "warning: unknown 
list element type %S\n", aval); + } + } + } + return ans; +} + +// Attribute value when value is a URL, possibly relative to base. +// FOR NOW: leave the url relative. +// Caller must free the result (eventually). +static Rune* +aurlval(Token* tok, int attid, Rune* dflt, Rune* base) +{ + Rune* ans; + Rune* url; + + USED(base); + ans = nil; + if(_tokaval(tok, attid, &url, 0) && url != nil) + ans = removeallwhite(url); + if(ans == nil) + ans = _Strdup(dflt); + return ans; +} + +// Return copy of s but with all whitespace (even internal) removed. +// This fixes some buggy URL specification strings. +static Rune* +removeallwhite(Rune* s) +{ + int j; + int n; + int i; + int c; + Rune* ans; + + j = 0; + n = _Strlen(s); + for(i = 0; i < n; i++) { + c = s[i]; + if(c >= 256 || !isspace(c)) + j++; + } + if(j < n) { + ans = _newstr(j); + j = 0; + for(i = 0; i < n; i++) { + c = s[i]; + if(c >= 256 || !isspace(c)) + ans[j++] = c; + } + ans[j] = 0; + } + else + ans = _Strdup(s); + return ans; +} + +// Attribute value when mere presence of attr implies value of 1, +// but if there is an integer there, return it as the value. 
+static int +aflagval(Token* tok, int attid) +{ + int val; + Rune* sval; + + val = 0; + if(_tokaval(tok, attid, &sval, 0)) { + val = 1; + if(sval != nil) + val = toint(sval); + } + return val; +} + +static Align +makealign(int halign, int valign) +{ + Align al; + + al.halign = halign; + al.valign = valign; + return al; +} + +// Make an Align (two alignments, horizontal and vertical) +static Align +aalign(Token* tok) +{ + return makealign( + atabval(tok, Aalign, align_tab, NALIGNTAB, ALnone), + atabval(tok, Avalign, align_tab, NALIGNTAB, ALnone)); +} + +// Make a Dimen, based on value of attid attr +static Dimen +adimen(Token* tok, int attid) +{ + Rune* wd; + + if(_tokaval(tok, attid, &wd, 0)) + return parsedim(wd, _Strlen(wd)); + else + return makedimen(Dnone, 0); +} + +// Parse s[0:n] as num[.[num]][unit][%|*] +static Dimen +parsedim(Rune* s, int ns) +{ + int kind; + int spec; + Rune* l; + int nl; + Rune* r; + int nr; + int mul; + int i; + Rune* f; + int nf; + int Tkdpi; + Rune* units; + + kind = Dnone; + spec = 0; + _splitl(s, ns, L(Lnot0to9), &l, &nl, &r, &nr); + if(nl != 0) { + spec = 1000*_Strtol(l, nil, 10); + if(nr > 0 && r[0] == '.') { + _splitl(r+1, nr-1, L(Lnot0to9), &f, &nf, &r, &nr); + if(nf != 0) { + mul = 100; + for(i = 0; i < nf; i++) { + spec = spec + mul*(f[i]-'0'); + mul = mul/10; + } + } + } + kind = Dpixels; + if(nr != 0) { + if(nr >= 2) { + Tkdpi = 100; + units = r; + r = r+2; + nr -= 2; + if(!_Strncmpci(units, 2, L(Lpt))) + spec = (spec*Tkdpi)/72; + else if(!_Strncmpci(units, 2, L(Lpi))) + spec = (spec*12*Tkdpi)/72; + else if(!_Strncmpci(units, 2, L(Lin))) + spec = spec*Tkdpi; + else if(!_Strncmpci(units, 2, L(Lcm))) + spec = (spec*100*Tkdpi)/254; + else if(!_Strncmpci(units, 2, L(Lmm))) + spec = (spec*10*Tkdpi)/254; + else if(!_Strncmpci(units, 2, L(Lem))) + spec = spec*15; + else { + if(warn) + fprint(2, "warning: unknown units %C%Cs\n", units[0], units[1]); + } + } + if(nr >= 1) { + if(r[0] == '%') + kind = Dpercent; + else if(r[0] == '*') 
+ kind = Drelative; + } + } + spec = spec/1000; + } + else if(nr == 1 && r[0] == '*') { + spec = 1; + kind = Drelative; + } + return makedimen(kind, spec); +} + +static void +setdimarray(Token* tok, int attid, Dimen** pans, int* panslen) +{ + Rune* s; + Dimen* d; + int k; + int nc; + Rune* a[SMALLBUFSIZE]; + int an[SMALLBUFSIZE]; + + if(_tokaval(tok, attid, &s, 0)) { + nc = _splitall(s, _Strlen(s), L(Lcommaspace), a, an, SMALLBUFSIZE); + if(nc > 0) { + d = (Dimen*)emalloc(nc * sizeof(Dimen)); + for(k = 0; k < nc; k++) { + d[k] = parsedim(a[k], an[k]); + } + *pans = d; + *panslen = nc; + return; + } + } + *pans = nil; + *panslen = 0; +} + +static Background +makebackground(Rune* imageurl, int color) +{ + Background bg; + + bg.image = imageurl; + bg.color = color; + return bg; +} + +static Item* +newitext(Rune* s, int fnt, int fg, int voff, int ul) +{ + Itext* t; + + assert(s != nil); + t = (Itext*)emalloc(sizeof(Itext)); + t->item.tag = Itexttag; + t->s = s; + t->fnt = fnt; + t->fg = fg; + t->voff = voff; + t->ul = ul; + return (Item*)t; +} + +static Item* +newirule(int align, int size, int noshade, Dimen wspec) +{ + Irule* r; + + r = (Irule*)emalloc(sizeof(Irule)); + r->item.tag = Iruletag; + r->align = align; + r->size = size; + r->noshade = noshade; + r->wspec = wspec; + return (Item*)r; +} + +// Map is owned elsewhere. 
+static Item* +newiimage(Rune* src, Rune* altrep, int align, int width, int height, + int hspace, int vspace, int border, int ismap, Map* map) +{ + Iimage* i; + int state; + + state = 0; + if(ismap) + state = IFsmap; + i = (Iimage*)emalloc(sizeof(Iimage)); + i->item.tag = Iimagetag; + i->item.state = state; + i->imsrc = src; + i->altrep = altrep; + i->align = align; + i->imwidth = width; + i->imheight = height; + i->hspace = hspace; + i->vspace = vspace; + i->border = border; + i->map = map; + i->ctlid = -1; + return (Item*)i; +} + +static Item* +newiformfield(Formfield* ff) +{ + Iformfield* f; + + f = (Iformfield*)emalloc(sizeof(Iformfield)); + f->item.tag = Iformfieldtag; + f->formfield = ff; + return (Item*)f; +} + +static Item* +newitable(Table* tab) +{ + Itable* t; + + t = (Itable*)emalloc(sizeof(Itable)); + t->item.tag = Itabletag; + t->table = tab; + return (Item*)t; +} + +static Item* +newifloat(Item* it, int side) +{ + Ifloat* f; + + f = (Ifloat*)emalloc(sizeof(Ifloat)); + f->_item.tag = Ifloattag; + f->_item.state = IFwrap; + f->item = it; + f->side = side; + return (Item*)f; +} + +static Item* +newispacer(int spkind) +{ + Ispacer* s; + + s = (Ispacer*)emalloc(sizeof(Ispacer)); + s->item.tag = Ispacertag; + s->spkind = spkind; + return (Item*)s; +} + +// Free one item (caller must deal with next pointer) +static void +freeitem(Item* it) +{ + Iimage* ii; + Genattr* ga; + + if(it == nil) + return; + + switch(it->tag) { + case Itexttag: + free(((Itext*)it)->s); + break; + case Iimagetag: + ii = (Iimage*)it; + free(ii->imsrc); + free(ii->altrep); + break; + case Iformfieldtag: + freeformfield(((Iformfield*)it)->formfield); + break; + case Itabletag: + freetable(((Itable*)it)->table); + break; + case Ifloattag: + freeitem(((Ifloat*)it)->item); + break; + } + ga = it->genattr; + if(ga != nil) { + free(ga->id); + free(ga->class); + free(ga->style); + free(ga->title); + freescriptevents(ga->events); + } + free(it); +} + +// Free list of items chained through next 
pointer +void +freeitems(Item* ithead) +{ + Item* it; + Item* itnext; + + it = ithead; + while(it != nil) { + itnext = it->next; + freeitem(it); + it = itnext; + } +} + +static void +freeformfield(Formfield* ff) +{ + Option* o; + Option* onext; + + if(ff == nil) + return; + + free(ff->name); + free(ff->value); + for(o = ff->options; o != nil; o = onext) { + onext = o->next; + free(o->value); + free(o->display); + } + free(ff); +} + +static void +freetable(Table* t) +{ + int i; + Tablecell* c; + Tablecell* cnext; + + if(t == nil) + return; + + // We'll find all the unique cells via t->cells and next pointers. + // (Other pointers to cells in the table are duplicates of these) + for(c = t->cells; c != nil; c = cnext) { + cnext = c->next; + freeitems(c->content); + } + if(t->grid != nil) { + for(i = 0; i < t->nrow; i++) + free(t->grid[i]); + free(t->grid); + } + free(t->rows); + free(t->cols); + freeitems(t->caption); + free(t); +} + +static void +freeform(Form* f) +{ + if(f == nil) + return; + + free(f->name); + free(f->action); + // Form doesn't own its fields (Iformfield items do) + free(f); +} + +static void +freeforms(Form* fhead) +{ + Form* f; + Form* fnext; + + for(f = fhead; f != nil; f = fnext) { + fnext = f->next; + freeform(f); + } +} + +static void +freeanchor(Anchor* a) +{ + if(a == nil) + return; + + free(a->name); + free(a->href); + free(a); +} + +static void +freeanchors(Anchor* ahead) +{ + Anchor* a; + Anchor* anext; + + for(a = ahead; a != nil; a = anext) { + anext = a->next; + freeanchor(a); + } +} + +static void +freedestanchor(DestAnchor* da) +{ + if(da == nil) + return; + + free(da->name); + free(da); +} + +static void +freedestanchors(DestAnchor* dahead) +{ + DestAnchor* da; + DestAnchor* danext; + + for(da = dahead; da != nil; da = danext) { + danext = da->next; + freedestanchor(da); + } +} + +static void +freearea(Area* a) +{ + if(a == nil) + return; + free(a->href); + free(a->coords); +} + +static void freekidinfos(Kidinfo* khead); + +static 
void +freekidinfo(Kidinfo* k) +{ + if(k->isframeset) { + free(k->rows); + free(k->cols); + freekidinfos(k->kidinfos); + } + else { + free(k->src); + free(k->name); + } + free(k); +} + +static void +freekidinfos(Kidinfo* khead) +{ + Kidinfo* k; + Kidinfo* knext; + + for(k = khead; k != nil; k = knext) { + knext = k->next; + freekidinfo(k); + } +} + +static void +freemap(Map* m) +{ + Area* a; + Area* anext; + + if(m == nil) + return; + + free(m->name); + for(a = m->areas; a != nil; a = anext) { + anext = a->next; + freearea(a); + } + free(m); +} + +static void +freemaps(Map* mhead) +{ + Map* m; + Map* mnext; + + for(m = mhead; m != nil; m = mnext) { + mnext = m->next; + freemap(m); + } +} + +void +freedocinfo(Docinfo* d) +{ + if(d == nil) + return; + free(d->src); + free(d->base); + freeitem((Item*)d->backgrounditem); + free(d->refresh); + freekidinfos(d->kidinfo); + freeanchors(d->anchors); + freedestanchors(d->dests); + freeforms(d->forms); + freemaps(d->maps); + // tables, images, and formfields are freed when + // the items pointing at them are freed + free(d); +} + +// Currently, someone else owns all the memory +// pointed to by things in a Pstate. +static void +freepstate(Pstate* p) +{ + free(p); +} + +static void +freepstatestack(Pstate* pshead) +{ + Pstate* p; + Pstate* pnext; + + for(p = pshead; p != nil; p = pnext) { + pnext = p->next; + free(p); + } +} + +static int +Iconv(Fmt *f) +{ + Item* it; + Itext* t; + Irule* r; + Iimage* i; + Ifloat* fl; + int state; + Formfield* ff; + Rune* ty; + Tablecell* c; + Table* tab; + char* p; + int cl; + int hang; + int indent; + int bi; + int nbuf; + char buf[BIGBUFSIZE]; + + it = va_arg(f->args, Item*); + bi = 0; + nbuf = sizeof(buf); + state = it->state; + nbuf = nbuf-1; + if(state&IFbrk) { + cl = state&(IFcleft|IFcright); + p = ""; + if(cl) { + if(cl == (IFcleft|IFcright)) + p = " both"; + else if(cl == IFcleft) + p = " left"; + else + p = " right"; + } + bi = snprint(buf, nbuf, "brk(%d%s)", (state&IFbrksp)? 
1 : 0, p); + } + if(state&IFnobrk) + bi += snprint(buf+bi, nbuf-bi, " nobrk"); + if(!(state&IFwrap)) + bi += snprint(buf+bi, nbuf-bi, " nowrap"); + if(state&IFrjust) + bi += snprint(buf+bi, nbuf-bi, " rjust"); + if(state&IFcjust) + bi += snprint(buf+bi, nbuf-bi, " cjust"); + if(state&IFsmap) + bi += snprint(buf+bi, nbuf-bi, " smap"); + indent = (state&IFindentmask) >> IFindentshift; + if(indent > 0) + bi += snprint(buf+bi, nbuf-bi, " indent=%d", indent); + hang = state&IFhangmask; + if(hang > 0) + bi += snprint(buf+bi, nbuf-bi, " hang=%d", hang); + + switch(it->tag) { + case Itexttag: + t = (Itext*)it; + bi += snprint(buf+bi, nbuf-bi, " Text '%S', fnt=%d, fg=%x", t->s, t->fnt, t->fg); + break; + + case Iruletag: + r = (Irule*)it; + bi += snprint(buf+bi, nbuf-bi, "Rule size=%d, al=%S, wspec=", r->size, stringalign(r->align)); + bi += dimprint(buf+bi, nbuf-bi, r->wspec); + break; + + case Iimagetag: + i = (Iimage*)it; + bi += snprint(buf+bi, nbuf-bi, + "Image src=%S, alt=%S, al=%S, w=%d, h=%d hsp=%d, vsp=%d, bd=%d, map=%S", + i->imsrc, i->altrep? i->altrep : L(Lempty), stringalign(i->align), i->imwidth, i->imheight, + i->hspace, i->vspace, i->border, i->map?i->map->name : L(Lempty)); + break; + + case Iformfieldtag: + ff = ((Iformfield*)it)->formfield; + if(ff->ftype == Ftextarea) + ty = L(Ltextarea); + else if(ff->ftype == Fselect) + ty = L(Lselect); + else { + ty = _revlookup(input_tab, NINPUTTAB, ff->ftype); + if(ty == nil) + ty = L(Lnone); + } + bi += snprint(buf+bi, nbuf-bi, "Formfield %S, fieldid=%d, formid=%d, name=%S, value=%S", + ty, ff->fieldid, ff->form->formid, ff->name? ff->name : L(Lempty), + ff->value? 
ff->value : L(Lempty)); + break; + + case Itabletag: + tab = ((Itable*)it)->table; + bi += snprint(buf+bi, nbuf-bi, "Table tableid=%d, width=", tab->tableid); + bi += dimprint(buf+bi, nbuf-bi, tab->width); + bi += snprint(buf+bi, nbuf-bi, ", nrow=%d, ncol=%d, ncell=%d, totw=%d, toth=%d\n", + tab->nrow, tab->ncol, tab->ncell, tab->totw, tab->toth); + for(c = tab->cells; c != nil; c = c->next) + bi += snprint(buf+bi, nbuf-bi, "Cell %d.%d, at (%d,%d) ", + tab->tableid, c->cellid, c->row, c->col); + bi += snprint(buf+bi, nbuf-bi, "End of Table %d", tab->tableid); + break; + + case Ifloattag: + fl = (Ifloat*)it; + bi += snprint(buf+bi, nbuf-bi, "Float, x=%d y=%d, side=%S, it=%I", + fl->x, fl->y, stringalign(fl->side), fl->item); + bi += snprint(buf+bi, nbuf-bi, "\n\t"); + break; + + case Ispacertag: + p = ""; + switch(((Ispacer*)it)->spkind) { + case ISPnull: + p = "null"; + break; + case ISPvline: + p = "vline"; + break; + case ISPhspace: + p = "hspace"; + break; + } + bi += snprint(buf+bi, nbuf-bi, "Spacer %s ", p); + break; + } + bi += snprint(buf+bi, nbuf-bi, " w=%d, h=%d, a=%d, anchor=%d\n", + it->width, it->height, it->ascent, it->anchorid); + buf[bi] = 0; + return fmtstrcpy(f, buf); +} + +// String version of alignment 'a' +static Rune* +stringalign(int a) +{ + Rune* s; + + s = _revlookup(align_tab, NALIGNTAB, a); + if(s == nil) + s = L(Lnone); + return s; +} + +// Put at most nbuf chars of representation of d into buf, +// and return number of characters put +static int +dimprint(char* buf, int nbuf, Dimen d) +{ + int n; + int k; + + n = 0; + n += snprint(buf, nbuf, "%d", dimenspec(d)); + k = dimenkind(d); + if(k == Dpercent) + buf[n++] = '%'; + if(k == Drelative) + buf[n++] = '*'; + return n; +} + +void +printitems(Item* items, char* msg) +{ + Item* il; + + fprint(2, "%s\n", msg); + il = items; + while(il != nil) { + fprint(2, "%I", il); + il = il->next; + } +} + +static Genattr* +newgenattr(Rune* id, Rune* class, Rune* style, Rune* title, SEvent* events) +{ + 
Genattr* g; + + g = (Genattr*)emalloc(sizeof(Genattr)); + g->id = id; + g->class = class; + g->style = style; + g->title = title; + g->events = events; + return g; +} + +static Formfield* +newformfield(int ftype, int fieldid, Form* form, Rune* name, + Rune* value, int size, int maxlength, Formfield* link) +{ + Formfield* ff; + + ff = (Formfield*)emalloc(sizeof(Formfield)); + ff->ftype = ftype; + ff->fieldid = fieldid; + ff->form = form; + ff->name = name; + ff->value = value; + ff->size = size; + ff->maxlength = maxlength; + ff->ctlid = -1; + ff->next = link; + return ff; +} + +// Transfers ownership of value and display to Option. +static Option* +newoption(int selected, Rune* value, Rune* display, Option* link) +{ + Option *o; + + o = (Option*)emalloc(sizeof(Option)); + o->selected = selected; + o->value = value; + o->display = display; + o->next = link; + return o; +} + +static Form* +newform(int formid, Rune* name, Rune* action, int target, int method, Form* link) +{ + Form* f; + + f = (Form*)emalloc(sizeof(Form)); + f->formid = formid; + f->name = name; + f->action = action; + f->target = target; + f->method = method; + f->nfields = 0; + f->fields = nil; + f->next = link; + return f; +} + +static Table* +newtable(int tableid, Align align, Dimen width, int border, + int cellspacing, int cellpadding, Background bg, Token* tok, Table* link) +{ + Table* t; + + t = (Table*)emalloc(sizeof(Table)); + t->tableid = tableid; + t->align = align; + t->width = width; + t->border = border; + t->cellspacing = cellspacing; + t->cellpadding = cellpadding; + t->background = bg; + t->caption_place = ALbottom; + t->caption_lay = nil; + t->tabletok = tok; + t->tabletok = nil; + t->next = link; + return t; +} + +static Tablerow* +newtablerow(Align align, Background bg, int flags, Tablerow* link) +{ + Tablerow* tr; + + tr = (Tablerow*)emalloc(sizeof(Tablerow)); + tr->align = align; + tr->background = bg; + tr->flags = flags; + tr->next = link; + return tr; +} + +static Tablecell* 
+newtablecell(int cellid, int rowspan, int colspan, Align align, Dimen wspec, int hspec, + Background bg, int flags, Tablecell* link) +{ + Tablecell* c; + + c = (Tablecell*)emalloc(sizeof(Tablecell)); + c->cellid = cellid; + c->lay = nil; + c->rowspan = rowspan; + c->colspan = colspan; + c->align = align; + c->flags = flags; + c->wspec = wspec; + c->hspec = hspec; + c->background = bg; + c->next = link; + return c; +} + +static Anchor* +newanchor(int index, Rune* name, Rune* href, int target, Anchor* link) +{ + Anchor* a; + + a = (Anchor*)emalloc(sizeof(Anchor)); + a->index = index; + a->name = name; + a->href = href; + a->target = target; + a->next = link; + return a; +} + +static DestAnchor* +newdestanchor(int index, Rune* name, Item* item, DestAnchor* link) +{ + DestAnchor* d; + + d = (DestAnchor*)emalloc(sizeof(DestAnchor)); + d->index = index; + d->name = name; + d->item = item; + d->next = link; + return d; +} + +static SEvent* +newscriptevent(int type, Rune* script, SEvent* link) +{ + SEvent* ans; + + ans = (SEvent*)emalloc(sizeof(SEvent)); + ans->type = type; + ans->script = script; + ans->next = link; + return ans; +} + +static void +freescriptevents(SEvent* ehead) +{ + SEvent* e; + SEvent* nexte; + + e = ehead; + while(e != nil) { + nexte = e->next; + free(e->script); + free(e); + e = nexte; + } +} + +static Dimen +makedimen(int kind, int spec) +{ + Dimen d; + + if(spec&Dkindmask) { + if(warn) + fprint(2, "warning: dimension spec too big: %d\n", spec); + spec = 0; + } + d.kindspec = kind|spec; + return d; +} + +int +dimenkind(Dimen d) +{ + return (d.kindspec&Dkindmask); +} + +int +dimenspec(Dimen d) +{ + return (d.kindspec&Dspecmask); +} + +static Kidinfo* +newkidinfo(int isframeset, Kidinfo* link) +{ + Kidinfo* ki; + + ki = (Kidinfo*)emalloc(sizeof(Kidinfo)); + ki->isframeset = isframeset; + if(!isframeset) { + ki->flags = FRhscrollauto|FRvscrollauto; + ki->marginw = FRKIDMARGIN; + ki->marginh = FRKIDMARGIN; + ki->framebd = 1; + } + ki->next = link; + 
return ki; +} + +static Docinfo* +newdocinfo(void) +{ + Docinfo* d; + + d = (Docinfo*)emalloc(sizeof(Docinfo)); + resetdocinfo(d); + return d; +} + +static void +resetdocinfo(Docinfo* d) +{ + memset(d, 0, sizeof(Docinfo)); + d->background = makebackground(nil, White); + d->text = Black; + d->link = Blue; + d->vlink = Blue; + d->alink = Blue; + d->target = FTself; + d->chset = ISO_8859_1; + d->scripttype = TextJavascript; + d->frameid = -1; +} + +// Use targetmap array to keep track of name <-> targetid mapping. +// Use real malloc(), and never free +static void +targetmapinit(void) +{ + targetmapsize = 10; + targetmap = (StringInt*)emalloc(targetmapsize*sizeof(StringInt)); + memset(targetmap, 0, targetmapsize*sizeof(StringInt)); + targetmap[0].key = _Strdup(L(L_top)); + targetmap[0].val = FTtop; + targetmap[1].key = _Strdup(L(L_self)); + targetmap[1].val = FTself; + targetmap[2].key = _Strdup(L(L_parent)); + targetmap[2].val = FTparent; + targetmap[3].key = _Strdup(L(L_blank)); + targetmap[3].val = FTblank; + ntargets = 4; +} + +int +targetid(Rune* s) +{ + int i; + int n; + + n = _Strlen(s); + if(n == 0) + return FTself; + for(i = 0; i < ntargets; i++) + if(_Strcmp(s, targetmap[i].key) == 0) + return targetmap[i].val; + if(i >= targetmapsize) { + targetmapsize += 10; + targetmap = (StringInt*)erealloc(targetmap, targetmapsize*sizeof(StringInt)); + } + targetmap[i].key = (Rune*)emalloc((n+1)*sizeof(Rune)); + memmove(targetmap[i].key, s, (n+1)*sizeof(Rune)); + targetmap[i].val = i; + ntargets++; + return i; +} + +Rune* +targetname(int targid) +{ + int i; + + for(i = 0; i < ntargets; i++) + if(targetmap[i].val == targid) + return targetmap[i].key; + return L(Lquestion); +} + +// Convert HTML color spec to RGB value, returning dflt if can't. +// Argument is supposed to be a valid HTML color, or "". +// Return the RGB value of the color, using dflt if s +// is nil or an invalid color. 
+static int +color(Rune* s, int dflt) +{ + int v; + Rune* rest; + + if(s == nil) + return dflt; + if(_lookup(color_tab, NCOLORS, s, _Strlen(s), &v)) + return v; + if(s[0] == '#') + s++; + v = _Strtol(s, &rest, 16); + if(*rest == 0) + return v; + return dflt; +} + +// Debugging + +#define HUGEPIX 10000 + +// A "shallow" validitem, that doesn't follow next links +// or descend into tables. +static int +validitem(Item* i) +{ + int ok; + Itext* ti; + Irule* ri; + Iimage* ii; + Ifloat* fi; + int a; + + ok = (i->tag >= Itexttag && i->tag <= Ispacertag) && + (i->next == nil || validptr(i->next)) && + (i->width >= 0 && i->width < HUGEPIX) && + (i->height >= 0 && i->height < HUGEPIX) && + (i->ascent > -HUGEPIX && i->ascent < HUGEPIX) && + (i->anchorid >= 0) && + (i->genattr == nil || validptr(i->genattr)); + // also, could check state for ridiculous combinations + // also, could check anchorid for within-doc-range + if(ok) + switch(i->tag) { + case Itexttag: + ti = (Itext*)i; + ok = validStr(ti->s) && + (ti->fnt >= 0 && ti->fnt < NumStyle*NumSize) && + (ti->ul == ULnone || ti->ul == ULunder || ti->ul == ULmid); + break; + case Iruletag: + ri = (Irule*)i; + ok = (validvalign(ri->align) || validhalign(ri->align)) && + (ri->size >=0 && ri->size < HUGEPIX); + break; + case Iimagetag: + ii = (Iimage*)i; + ok = (ii->imsrc == nil || validptr(ii->imsrc)) && + (ii->item.width >= 0 && ii->item.width < HUGEPIX) && + (ii->item.height >= 0 && ii->item.height < HUGEPIX) && + (ii->imwidth >= 0 && ii->imwidth < HUGEPIX) && + (ii->imheight >= 0 && ii->imheight < HUGEPIX) && + (ii->altrep == nil || validStr(ii->altrep)) && + (ii->map == nil || validptr(ii->map)) && + (validvalign(ii->align) || validhalign(ii->align)) && + (ii->nextimage == nil || validptr(ii->nextimage)); + break; + case Iformfieldtag: + ok = validformfield(((Iformfield*)i)->formfield); + break; + case Itabletag: + ok = validptr((Itable*)i); + break; + case Ifloattag: + fi = (Ifloat*)i; + ok = (fi->side == ALleft || fi->side 
== ALright) && + validitem(fi->item) && + (fi->item->tag == Iimagetag || fi->item->tag == Itabletag); + break; + case Ispacertag: + a = ((Ispacer*)i)->spkind; + ok = a==ISPnull || a==ISPvline || a==ISPhspace || a==ISPgeneral; + break; + default: + ok = 0; + } + return ok; +} + +// "deep" validation, that checks whole list of items, +// and descends into tables and floated tables. +// nil is ok for argument. +int +validitems(Item* i) +{ + int ok; + Item* ii; + + ok = 1; + while(i != nil && ok) { + ok = validitem(i); + if(ok) { + if(i->tag == Itabletag) { + ok = validtable(((Itable*)i)->table); + } + else if(i->tag == Ifloattag) { + ii = ((Ifloat*)i)->item; + if(ii->tag == Itabletag) + ok = validtable(((Itable*)ii)->table); + } + } + if(!ok) { + fprint(2, "invalid item: %I\n", i); + } + i = i->next; + } + return ok; +} + +static int +validformfield(Formfield* f) +{ + int ok; + + ok = (f->next == nil || validptr(f->next)) && + (f->ftype >= 0 && f->ftype <= Ftextarea) && + f->fieldid >= 0 && + (f->form == nil || validptr(f->form)) && + (f->name == nil || validStr(f->name)) && + (f->value == nil || validStr(f->value)) && + (f->options == nil || validptr(f->options)) && + (f->image == nil || validitem(f->image)) && + (f->events == nil || validptr(f->events)); + // when all built, should have f->fieldid < f->form->nfields, + // but this may be called during build... 
+ return ok; +} + +// "deep" validation -- checks cell contents too +static int +validtable(Table* t) +{ + int ok; + int i, j; + Tablecell* c; + + ok = (t->next == nil || validptr(t->next)) && + t->nrow >= 0 && + t->ncol >= 0 && + t->ncell >= 0 && + validalign(t->align) && + validdimen(t->width) && + (t->border >= 0 && t->border < HUGEPIX) && + (t->cellspacing >= 0 && t->cellspacing < HUGEPIX) && + (t->cellpadding >= 0 && t->cellpadding < HUGEPIX) && + validitems(t->caption) && + (t->caption_place == ALtop || t->caption_place == ALbottom) && + (t->totw >= 0 && t->totw < HUGEPIX) && + (t->toth >= 0 && t->toth < HUGEPIX) && + (t->tabletok == nil || validptr(t->tabletok)); + // during parsing, t->rows has list; + // only when parsing is done is t->nrow set > 0 + if(ok && t->nrow > 0 && t->ncol > 0) { + // table is "finished" + for(i = 0; i < t->nrow && ok; i++) + ok = validtablerow(t->rows+i); + for(j = 0; j < t->ncol && ok; j++) + ok = validtablecol(t->cols+j); + for(c = t->cells; c != nil && ok; c = c->next) + ok = validtablecell(c); + for(i = 0; i < t->nrow && ok; i++) + for(j = 0; j < t->ncol && ok; j++) + ok = validptr(t->grid[i][j]); + } + return ok; +} + +static int +validvalign(int a) +{ + return a == ALnone || a == ALmiddle || a == ALbottom || a == ALtop || a == ALbaseline; +} + +static int +validhalign(int a) +{ + return a == ALnone || a == ALleft || a == ALcenter || a == ALright || + a == ALjustify || a == ALchar; +} + +static int +validalign(Align a) +{ + return validhalign(a.halign) && validvalign(a.valign); +} + +static int +validdimen(Dimen d) +{ + int ok; + int s; + + ok = 0; + s = d.kindspec&Dspecmask; + switch(d.kindspec&Dkindmask) { + case Dnone: + ok = s==0; + break; + case Dpixels: + ok = s < HUGEPIX; + break; + case Dpercent: + case Drelative: + ok = 1; + break; + } + return ok; +} + +static int +validtablerow(Tablerow* r) +{ + return (r->cells == nil || validptr(r->cells)) && + (r->height >= 0 && r->height < HUGEPIX) && + (r->ascent > -HUGEPIX 
&& r->ascent < HUGEPIX) && + validalign(r->align); +} + +static int +validtablecol(Tablecol* c) +{ + return c->width >= 0 && c->width < HUGEPIX + && validalign(c->align); +} + +static int +validtablecell(Tablecell* c) +{ + int ok; + + ok = (c->next == nil || validptr(c->next)) && + (c->nextinrow == nil || validptr(c->nextinrow)) && + (c->content == nil || validptr(c->content)) && + (c->lay == nil || validptr(c->lay)) && + c->rowspan >= 0 && + c->colspan >= 0 && + validalign(c->align) && + validdimen(c->wspec) && + c->row >= 0 && + c->col >= 0; + if(ok) { + if(c->content != nil) + ok = validitems(c->content); + } + return ok; +} + +static int +validptr(void* p) +{ + // TODO: a better job of this. + // For now, just dereference, which cause a bomb + // if not valid + static char c; + + c = *((char*)p); + return 1; +} + +static int +validStr(Rune* s) +{ + return s != nil && validptr(s); +} diff --git a/src/libhtml/impl.h b/src/libhtml/impl.h new file mode 100644 index 00000000..f8c79ea3 --- /dev/null +++ b/src/libhtml/impl.h @@ -0,0 +1,163 @@ + +// UTILS +typedef struct List List; +typedef struct Strlist Strlist; + +// List of integers (and also generic list with next pointer at beginning) +struct List +{ + List* next; + int val; +}; + +struct Strlist +{ + Strlist* next; + Rune* val; +}; + +extern int _inclass(Rune c, Rune* cl); +extern int _listlen(List* l); +extern Rune* _ltoStr(int n); +extern List* _newlist(int val, List* rest); +extern Rune* _newstr(int n); +extern int _prefix(Rune* pre, Rune* s); +extern List* _revlist(List* l); +extern void _splitl(Rune* s, int n, Rune* cl, Rune** p1, int* n1, Rune** p2, int* n2); +extern void _splitr(Rune* s, int n, Rune* cl, Rune** p1, int* n1, Rune** p2, int* n2); +extern int _splitall(Rune* s, int n, Rune* cl, Rune** strarr, int* lenarr, int alen); +extern Rune* _Stradd(Rune*s1, Rune* s2, int n); +extern Rune* _Strclass(Rune* s, Rune* cl); +extern int _Strcmp(Rune* s1, Rune* s2); +extern Rune* _Strdup(Rune* s); +extern 
Rune* _Strdup2(Rune* s, Rune* t); +extern int _Streqn(Rune* s1, int n1, Rune* s2); +extern int _Strlen(Rune* s); +extern Rune* _Strnclass(Rune* s, Rune* cl, int n); +extern int _Strncmpci(Rune* s1, int n1, Rune* s2); +extern Rune* _Strndup(Rune* s, int n); +extern Rune* _Strnrclass(Rune* s, Rune* cl, int n); +extern Rune* _Strrclass(Rune* s, Rune* cl); +extern Rune* _Strsubstr(Rune* s, int start, int stop); +extern long _Strtol(Rune* s, Rune** eptr, int base); +extern void _trimwhite(Rune* s, int n, Rune** pans, int* panslen); + +extern Rune notwhitespace[]; +extern Rune whitespace[]; + +// STRINTTAB +typedef struct StringInt StringInt; + +// Element of String-Int table (used for keyword lookup) +struct StringInt +{ + Rune* key; + int val; +}; + +extern int _lookup(StringInt* t, int n, Rune* key, int keylen, int* pans); +extern StringInt* _makestrinttab(Rune** a, int n); +extern Rune* _revlookup(StringInt* t, int n, int val); + +// Colors, in html format, not Plan 9 format. (RGB values in bottom 3 bytes) +enum { + White = 0xFFFFFF, + Black = 0x000000, + Blue = 0x0000CC, +}; + +// LEX + +// HTML 4.0 tags (plus blink, nobr) +// sorted in lexical order; used as array indices +enum { + Notfound, + Comment, + Ta, Tabbr, Tacronym, Taddress, Tapplet, Tarea, + Tb, Tbase, Tbasefont, Tbdo, Tbig, Tblink, + Tblockquote, Tbody, Tbq, Tbr, Tbutton, + Tcaption, Tcenter, Tcite, Tcode, Tcol, Tcolgroup, + Tdd, Tdel, Tdfn, Tdir, Tdiv, Tdl, Tdt, + Tem, + Tfieldset, Tfont, Tform, Tframe, Tframeset, + Th1, Th2, Th3, Th4, Th5, Th6, + Thead, Thr, Thtml, + Ti, Tiframe, Timg, Tinput, Tins, Tisindex, + Tkbd, + Tlabel, Tlegend, Tli, Tlink, + Tmap, Tmenu, Tmeta, + Tnobr, Tnoframes, Tnoscript, + Tobject, Tol, Toptgroup, Toption, + Tp, Tparam, Tpre, + Tq, + Ts, Tsamp, Tscript, Tselect, Tsmall, + Tspan, Tstrike, Tstrong, Tstyle, Tsub, Tsup, + Ttable, Ttbody, Ttd, Ttextarea, Ttfoot, + Tth, Tthead, Ttitle, Ttr, Ttt, + Tu, Tul, + Tvar, + Numtags, + RBRA = Numtags, + Data = Numtags+RBRA +}; + +// HTML 
4.0 tag attributes +// Keep sorted in lexical order +enum { + Aabbr, Aaccept_charset, Aaccess_key, Aaction, + Aalign, Aalink, Aalt, Aarchive, Aaxis, + Abackground, Abgcolor, Aborder, + Acellpadding, Acellspacing, Achar, Acharoff, + Acharset, Achecked, Acite, Aclass, Aclassid, + Aclear, Acode, Acodebase, Acodetype, Acolor, + Acols, Acolspan, Acompact, Acontent, Acoords, + Adata, Adatetime, Adeclare, Adefer, Adir, Adisabled, + Aenctype, + Aface, Afor, Aframe, Aframeborder, + Aheaders, Aheight, Ahref, Ahreflang, Ahspace, Ahttp_equiv, + Aid, Aismap, + Alabel, Alang, Alink, Alongdesc, + Amarginheight, Amarginwidth, Amaxlength, + Amedia, Amethod, Amultiple, + Aname, Anohref, Anoresize, Anoshade, Anowrap, + Aobject, Aonblur, Aonchange, Aonclick, Aondblclick, + Aonfocus, Aonkeypress, Aonkeyup, Aonload, + Aonmousedown, Aonmousemove, Aonmouseout, + Aonmouseover, Aonmouseup, Aonreset, Aonselect, + Aonsubmit, Aonunload, + Aprofile, Aprompt, + Areadonly, Arel, Arev, Arows, Arowspan, Arules, + Ascheme, Ascope, Ascrolling, Aselected, Ashape, + Asize, Aspan, Asrc, Astandby, Astart, Astyle, Asummary, + Atabindex, Atarget, Atext, Atitle, Atype, + Ausemap, + Avalign, Avalue, Avaluetype, Aversion, Avlink, Avspace, + Awidth, + Numattrs +}; + +struct Attr +{ + Attr* next; // in list of attrs for a token + int attid; // Aabbr, etc. 
+ Rune* value; +}; + +struct Token +{ + int tag; // Ta, etc + Rune* text; // text in Data, attribute text in tag + Attr* attr; // list of Attrs + int starti; // index into source buffer of token start +}; + +extern Rune** tagnames; +extern Rune** attrnames; + +extern void _freetokens(Token* tarray, int n); +extern Token* _gettoks(uchar* data, int datalen, int chset, int mtype, int* plen); +extern int _tokaval(Token* t, int attid, Rune** pans, int xfer); + +#pragma varargck type "T" Token* + +#include "runetab.h" diff --git a/src/libhtml/lex.c b/src/libhtml/lex.c new file mode 100644 index 00000000..99c5fc12 --- /dev/null +++ b/src/libhtml/lex.c @@ -0,0 +1,1384 @@ +#include <u.h> +#include <libc.h> +#include <draw.h> +#include <ctype.h> +#include <html.h> +#include "impl.h" + +typedef struct TokenSource TokenSource; +struct TokenSource +{ + int i; // index of next byte to use + uchar* data; // all the data + int edata; // data[0:edata] is valid + int chset; // one of US_Ascii, etc. + int mtype; // TextHtml or TextPlain +}; + +enum { + EOF = -2, + EOB = -1 +}; + +#define ISNAMCHAR(c) ((c)<256 && (isalpha(c) || isdigit(c) || (c) == '-' || (c) == '.')) + +#define SMALLBUFSIZE 240 +#define BIGBUFSIZE 2000 + +// HTML 4.0 tag names. +// Keep sorted, and in correspondence with enum in iparse.h. 
+Rune **tagnames; +char *_tagnames[] = { + " ", + "!", + "a", + "abbr", + "acronym", + "address", + "applet", + "area", + "b", + "base", + "basefont", + "bdo", + "big", + "blink", + "blockquote", + "body", + "bq", + "br", + "button", + "caption", + "center", + "cite", + "code", + "col", + "colgroup", + "dd", + "del", + "dfn", + "dir", + "div", + "dl", + "dt", + "em", + "fieldset", + "font", + "form", + "frame", + "frameset", + "h1", + "h2", + "h3", + "h4", + "h5", + "h6", + "head", + "hr", + "html", + "i", + "iframe", + "img", + "input", + "ins", + "isindex", + "kbd", + "label", + "legend", + "li", + "link", + "map", + "menu", + "meta", + "nobr", + "noframes", + "noscript", + "object", + "ol", + "optgroup", + "option", + "p", + "param", + "pre", + "q", + "s", + "samp", + "script", + "select", + "small", + "span", + "strike", + "strong", + "style", + "sub", + "sup", + "table", + "tbody", + "td", + "textarea", + "tfoot", + "th", + "thead", + "title", + "tr", + "tt", + "u", + "ul", + "var" +}; + +// HTML 4.0 attribute names. +// Keep sorted, and in correspondence with enum in i.h. 
+Rune **attrnames; +char* _attrnames[] = { + "abbr", + "accept-charset", + "access-key", + "action", + "align", + "alink", + "alt", + "archive", + "axis", + "background", + "bgcolor", + "border", + "cellpadding", + "cellspacing", + "char", + "charoff", + "charset", + "checked", + "cite", + "class", + "classid", + "clear", + "code", + "codebase", + "codetype", + "color", + "cols", + "colspan", + "compact", + "content", + "coords", + "data", + "datetime", + "declare", + "defer", + "dir", + "disabled", + "enctype", + "face", + "for", + "frame", + "frameborder", + "headers", + "height", + "href", + "hreflang", + "hspace", + "http-equiv", + "id", + "ismap", + "label", + "lang", + "link", + "longdesc", + "marginheight", + "marginwidth", + "maxlength", + "media", + "method", + "multiple", + "name", + "nohref", + "noresize", + "noshade", + "nowrap", + "object", + "onblur", + "onchange", + "onclick", + "ondblclick", + "onfocus", + "onkeypress", + "onkeyup", + "onload", + "onmousedown", + "onmousemove", + "onmouseout", + "onmouseover", + "onmouseup", + "onreset", + "onselect", + "onsubmit", + "onunload", + "profile", + "prompt", + "readonly", + "rel", + "rev", + "rows", + "rowspan", + "rules", + "scheme", + "scope", + "scrolling", + "selected", + "shape", + "size", + "span", + "src", + "standby", + "start", + "style", + "summary", + "tabindex", + "target", + "text", + "title", + "type", + "usemap", + "valign", + "value", + "valuetype", + "version", + "vlink", + "vspace", + "width" +}; + + +// Character entity to unicode character number map. +// Keep sorted by name. 
+StringInt *chartab; +AsciiInt _chartab[142] = { + {"AElig", 198}, + {"Aacute", 193}, + {"Acirc", 194}, + {"Agrave", 192}, + {"Aring", 197}, + {"Atilde", 195}, + {"Auml", 196}, + {"Ccedil", 199}, + {"ETH", 208}, + {"Eacute", 201}, + {"Ecirc", 202}, + {"Egrave", 200}, + {"Euml", 203}, + {"Iacute", 205}, + {"Icirc", 206}, + {"Igrave", 204}, + {"Iuml", 207}, + {"Ntilde", 209}, + {"Oacute", 211}, + {"Ocirc", 212}, + {"Ograve", 210}, + {"Oslash", 216}, + {"Otilde", 213}, + {"Ouml", 214}, + {"THORN", 222}, + {"Uacute", 218}, + {"Ucirc", 219}, + {"Ugrave", 217}, + {"Uuml", 220}, + {"Yacute", 221}, + {"aacute", 225}, + {"acirc", 226}, + {"acute", 180}, + {"aelig", 230}, + {"agrave", 224}, + {"alpha", 945}, + {"amp", 38}, + {"aring", 229}, + {"atilde", 227}, + {"auml", 228}, + {"beta", 946}, + {"brvbar", 166}, + {"ccedil", 231}, + {"cdots", 8943}, + {"cedil", 184}, + {"cent", 162}, + {"chi", 967}, + {"copy", 169}, + {"curren", 164}, + {"ddots", 8945}, + {"deg", 176}, + {"delta", 948}, + {"divide", 247}, + {"eacute", 233}, + {"ecirc", 234}, + {"egrave", 232}, + {"emdash", 8212}, + {"emsp", 8195}, + {"endash", 8211}, + {"ensp", 8194}, + {"epsilon", 949}, + {"eta", 951}, + {"eth", 240}, + {"euml", 235}, + {"frac12", 189}, + {"frac14", 188}, + {"frac34", 190}, + {"gamma", 947}, + {"gt", 62}, + {"iacute", 237}, + {"icirc", 238}, + {"iexcl", 161}, + {"igrave", 236}, + {"iota", 953}, + {"iquest", 191}, + {"iuml", 239}, + {"kappa", 954}, + {"lambda", 955}, + {"laquo", 171}, + {"ldots", 8230}, + {"lt", 60}, + {"macr", 175}, + {"micro", 181}, + {"middot", 183}, + {"mu", 956}, + {"nbsp", 160}, + {"not", 172}, + {"ntilde", 241}, + {"nu", 957}, + {"oacute", 243}, + {"ocirc", 244}, + {"ograve", 242}, + {"omega", 969}, + {"omicron", 959}, + {"ordf", 170}, + {"ordm", 186}, + {"oslash", 248}, + {"otilde", 245}, + {"ouml", 246}, + {"para", 182}, + {"phi", 966}, + {"pi", 960}, + {"plusmn", 177}, + {"pound", 163}, + {"psi", 968}, + {"quad", 8193}, + {"quot", 34}, + {"raquo", 187}, + {"reg", 
174}, + {"rho", 961}, + {"sect", 167}, + {"shy", 173}, + {"sigma", 963}, + {"sp", 8194}, + {"sup1", 185}, + {"sup2", 178}, + {"sup3", 179}, + {"szlig", 223}, + {"tau", 964}, + {"theta", 952}, + {"thinsp", 8201}, + {"thorn", 254}, + {"times", 215}, + {"trade", 8482}, + {"uacute", 250}, + {"ucirc", 251}, + {"ugrave", 249}, + {"uml", 168}, + {"upsilon", 965}, + {"uuml", 252}, + {"varepsilon", 8712}, + {"varphi", 981}, + {"varpi", 982}, + {"varrho", 1009}, + {"vdots", 8942}, + {"vsigma", 962}, + {"vtheta", 977}, + {"xi", 958}, + {"yacute", 253}, + {"yen", 165}, + {"yuml", 255}, + {"zeta", 950} +}; +#define NCHARTAB (sizeof(chartab)/sizeof(chartab[0])) + +// Characters Winstart..Winend are those that Windows +// uses interpolated into the Latin1 set. +// They aren't supposed to appear in HTML, but they do.... +enum { + Winstart = 127, + Winend = 159 +}; + +static int winchars[]= { 8226, // 8226 is a bullet + 8226, 8226, 8218, 402, 8222, 8230, 8224, 8225, + 710, 8240, 352, 8249, 338, 8226, 8226, 8226, + 8226, 8216, 8217, 8220, 8221, 8226, 8211, 8212, + 732, 8482, 353, 8250, 339, 8226, 8226, 376}; + +static StringInt* tagtable; // initialized from tagnames +static StringInt* attrtable; // initialized from attrnames + +static void lexinit(); +static int getplaindata(TokenSource* ts, Token* a, int* pai); +static int getdata(TokenSource* ts, int firstc, int starti, Token* a, int* pai); +static int getscriptdata(TokenSource* ts, int firstc, int starti, Token* a, int* pai); +static int gettag(TokenSource* ts, int starti, Token* a, int* pai); +static Rune* buftostr(Rune* s, Rune* buf, int j); +static int comment(TokenSource* ts); +static int findstr(TokenSource* ts, Rune* s); +static int ampersand(TokenSource* ts); +//static int lowerc(int c); +static int getchar(TokenSource* ts); +static void ungetchar(TokenSource* ts, int c); +static void backup(TokenSource* ts, int savei); +//static void freeinsidetoken(Token* t); +static void freeattrs(Attr* ahead); +static Attr* 
newattr(int attid, Rune* value, Attr* link); +static int Tconv(Fmt* f); + +int dbglex = 0; +static int lexinited = 0; + +static void +lexinit(void) +{ + chartab = cvtstringinttab(_chartab, nelem(_chartab)); + tagnames = cvtstringtab(_tagnames, nelem(_tagnames)); + tagtable = _makestrinttab(tagnames, Numtags); + attrnames = cvtstringtab(_attrnames, nelem(_attrnames)); + attrtable = _makestrinttab(attrnames, Numattrs); + fmtinstall('T', Tconv); + lexinited = 1; +} + +static TokenSource* +newtokensource(uchar* data, int edata, int chset, int mtype) +{ + TokenSource* ans; + + assert(chset == US_Ascii || chset == ISO_8859_1 || + chset == UTF_8 || chset == Unicode); + ans = (TokenSource*)emalloc(sizeof(TokenSource)); + ans->i = 0; + ans->data = data; + ans->edata = edata; + ans->chset = chset; + ans->mtype = mtype; + return ans; +} + +enum { + ToksChunk = 500 +}; + +// Call this to get the tokens. +// The number of returned tokens is returned in *plen. +Token* +_gettoks(uchar* data, int datalen, int chset, int mtype, int* plen) +{ + TokenSource* ts; + Token* a; + int alen; + int ai; + int starti; + int c; + int tag; + + if(!lexinited) + lexinit(); + ts = newtokensource(data, datalen, chset, mtype); + alen = ToksChunk; + a = (Token*)emalloc(alen * sizeof(Token)); + ai = 0; + if(dbglex) + fprint(2, "_gettoks starts, ts.i=%d, ts.edata=%d\n", ts->i, ts->edata); + if(ts->mtype == TextHtml) { + for(;;) { + if(ai == alen) { + a = (Token*)erealloc(a, (alen+ToksChunk)*sizeof(Token)); + alen += ToksChunk; + } + starti = ts->i; + c = getchar(ts); + if(c < 0) + break; + if(c == '<') { + tag = gettag(ts, starti, a, &ai); + if(tag == Tscript) { + // special rules for getting Data after.... 
+ starti = ts->i; + c = getchar(ts); + tag = getscriptdata(ts, c, starti, a, &ai); + } + } + else + tag = getdata(ts, c, starti, a, &ai); + if(tag == -1) + break; + else if(dbglex > 1 && tag != Comment) + fprint(2, "lex: got token %T\n", &a[ai-1]); + } + } + else { + // plain text (non-html) tokens + for(;;) { + if(ai == alen) { + a = (Token*)erealloc(a, (alen+ToksChunk)*sizeof(Token)); + alen += ToksChunk; + } + tag = getplaindata(ts, a, &ai); + if(tag == -1) + break; + if(dbglex > 1) + fprint(2, "lex: got token %T\n", &a[ai]); + } + } + if(dbglex) + fprint(2, "lex: returning %d tokens\n", ai); + *plen = ai; + if(ai == 0) + return nil; + return a; +} + +// For case where source isn't HTML. +// Just make data tokens, one per line (or partial line, +// at end of buffer), ignoring non-whitespace control +// characters and dumping \r's. +// If find non-empty token, fill in a[*pai], bump *pai, and return Data. +// Otherwise return -1; +static int +getplaindata(TokenSource* ts, Token* a, int* pai) +{ + Rune* s; + int j; + int starti; + int c; + Token* tok; + Rune buf[BIGBUFSIZE]; + + s = nil; + j = 0; + starti = ts->i; + for(c = getchar(ts); c >= 0; c = getchar(ts)) { + if(c < ' ') { + if(isspace(c)) { + if(c == '\r') { + // ignore it unless no following '\n', + // in which case treat it like '\n' + c = getchar(ts); + if(c != '\n') { + if(c >= 0) + ungetchar(ts, c); + c = '\n'; + } + } + } + else + c = 0; + } + if(c != 0) { + buf[j++] = c; + if(j == sizeof(buf)-1) { + s = buftostr(s, buf, j); + j = 0; + } + } + if(c == '\n') + break; + } + s = buftostr(s, buf, j); + if(s == nil) + return -1; + tok = &a[(*pai)++]; + tok->tag = Data; + tok->text = s; + tok->attr = nil; + tok->starti = starti; + return Data; +} + +// Return concatenation of s and buf[0:j] +static Rune* +buftostr(Rune* s, Rune* buf, int j) +{ + buf[j] = 0; + if(s == nil) + s = _Strndup(buf, j); + else + s = _Strdup2(s, buf); + return s; +} + +// Gather data up to next start-of-tag or end-of-buffer. 
// Translate entity references (&).
// Ignore non-whitespace control characters and get rid of \r's.
// If find non-empty token, fill in a[*pai], bump *pai, and return Data.
// Otherwise return -1;
//
// firstc is the character already read by the caller and starti is the
// value recorded in the token's starti field.
static int
getdata(TokenSource* ts, int firstc, int starti, Token* a, int* pai)
{
	Rune* s;
	int j;
	int c;
	Token* tok;
	Rune buf[BIGBUFSIZE];

	s = nil;
	j = 0;
	c = firstc;
	while(c >= 0) {
		if(c == '&') {
			c = ampersand(ts);
			if(c < 0)
				break;
		}
		else if(c < ' ') {
			if(isspace(c)) {
				if(c == '\r') {
					// ignore it unless no following '\n',
					// in which case treat it like '\n'
					c = getchar(ts);
					if(c != '\n') {
						if(c >= 0)
							ungetchar(ts, c);
						c = '\n';
					}
				}
			}
			else {
				if(warn)
					fprint(2, "warning: non-whitespace control character %d ignored\n", c);
				c = 0;
			}
		}
		else if(c == '<') {
			// leave the '<' for the caller's next getchar
			ungetchar(ts, c);
			break;
		}
		if(c != 0) {
			buf[j++] = c;
			// flush one short of full: buftostr writes buf[j] = 0
			if(j == BIGBUFSIZE-1) {
				s = buftostr(s, buf, j);
				j = 0;
			}
		}
		c = getchar(ts);
	}
	s = buftostr(s, buf, j);
	if(s == nil)
		return -1;
	tok = &a[(*pai)++];
	tok->tag = Data;
	tok->text = s;
	tok->attr = nil;
	tok->starti = starti;
	return Data;
}

// The rules for lexing scripts are different (ugh).
// Gather up everything until see a </SCRIPT>.
//
// On success the script text becomes one Data token in a[*pai]
// (and *pai is bumped); the </SCRIPT> itself is left unread.
// If the input ends before a </SCRIPT> is seen without all of the
// data being consumed, the source is backed up to starti and -1
// is returned.
static int
getscriptdata(TokenSource* ts, int firstc, int starti, Token* a, int* pai)
{
	Rune* s;
	int j;
	int tstarti;
	int savei;
	int c;
	int tag;
	int done;
	Token* tok;
	Rune buf[BIGBUFSIZE];

	s = nil;
	j = 0;
	tstarti = starti;
	c = firstc;
	done = 0;
	while(c >= 0) {
		if(c == '<') {
			// other browsers ignore stuff to end of line after <!
			savei = ts->i;
			c = getchar(ts);
			if(c == '!') {
				while(c >= 0 && c != '\n' && c != '\r')
					c = getchar(ts);
				if(c == '\r')
					c = getchar(ts);
				if(c == '\n')
					c = getchar(ts);
			}
			else if(c >= 0) {
				// trial-lex the tag only to learn what it is
				backup(ts, savei);
				tag = gettag(ts, tstarti, a, pai);
				if(tag == -1)
					break;
				// gettag bumps *pai for everything but comments;
				// undo that, the trial token is not kept
				if(tag != Comment)
					(*pai)--;
				// rewind to the '<' (tstarti is its index)
				backup(ts, tstarti);
				if(tag == Tscript + RBRA) {
					done = 1;
					break;
				}
				// here tag was not </SCRIPT>, so take as regular data
				c = getchar(ts);
			}
		}
		if(c < 0)
			break;
		if(c != 0) {
			buf[j++] = c;
			if(j == BIGBUFSIZE-1) {
				s = buftostr(s, buf, j);
				j = 0;
			}
		}
		// remember where the next character starts, in case it is a '<'
		tstarti = ts->i;
		c = getchar(ts);
	}
	if(done || ts->i == ts->edata) {
		s = buftostr(s, buf, j);
		tok = &a[(*pai)++];
		tok->tag = Data;
		tok->text = s;
		tok->attr = nil;
		tok->starti = starti;
		return Data;
	}
	backup(ts, starti);
	return -1;
}

// We've just seen a '<'.  Gather up stuff to closing '>' (if buffer
// ends before then, return -1).
// If it's a tag, look up the name, gather the attributes, and return
// the appropriate token.
// Else it's either just plain data or some kind of ignorable stuff:
// return Data or Comment as appropriate.
// If it's not a Comment, put it in a[*pai] and bump *pai.
// ts is positioned just after the '<'; starti is recorded in the token.
// The token is built in a[*pai].  *pai is bumped on the normal exits
// (tag recognized or not, and the "not a tag" Data case) but not on
// the end-of-buffer exit or when a comment is returned.
// Attributes are prepended as they are found, so tok->attr lists them
// in reverse source order.
static int
gettag(TokenSource* ts, int starti, Token* a, int* pai)
{
	int rbra;
	int ans;
	Attr* al;
	int nexti;
	int c;
	int ti;
	int afnd;
	int attid;
	int quote;
	Rune* val;
	int nv;
	int i;
	int tag;
	Token* tok;
	Rune buf[BIGBUFSIZE];

	rbra = 0;
	nexti = ts->i;
	tok = &a[*pai];
	tok->tag = Notfound;
	tok->text = nil;
	tok->attr = nil;
	tok->starti = starti;
	c = getchar(ts);
	if(c == '/') {
		// end tag: the tag value will be offset by RBRA
		rbra = RBRA;
		c = getchar(ts);
	}
	if(c < 0)
		goto eob_done;
	if(c >= 256 || !isalpha(c)) {
		// not a tag
		if(c == '!') {
			ans = comment(ts);
			if(ans != -1)
				return ans;
			goto eob_done;
		}
		else {
			// emit the '<' as literal data and resume right after it
			backup(ts, nexti);
			tok->tag = Data;
			tok->text = _Strdup(L(Llt));
			(*pai)++;
			return Data;
		}
	}
	// c starts a tagname
	buf[0] = c;
	i = 1;
	while(1) {
		c = getchar(ts);
		if(c < 0)
			goto eob_done;
		if(!ISNAMCHAR(c))
			break;
		// if name is bigger than buf it won't be found anyway...
		if(i < BIGBUFSIZE)
			buf[i++] = c;
	}
	if(_lookup(tagtable, Numtags, buf, i, &tag))
		tok->tag = tag + rbra;
	else
		tok->text = _Strndup(buf, i);	// for warning print, in build

	// attribute gathering loop
	al = nil;
	while(1) {
		// look for "ws name" or "ws name ws = ws val" (ws=whitespace)
		// skip whitespace
attrloop_continue:
		while(c < 256 && isspace(c)) {
			c = getchar(ts);
			if(c < 0)
				goto eob_done;
		}
		if(c == '>')
			goto attrloop_done;
		if(c == '<') {
			if(warn)
				fprint(2, "warning: unclosed tag\n");
			ungetchar(ts, c);
			goto attrloop_done;
		}
		if(c >= 256 || !isalpha(c)) {
			if(warn)
				fprint(2, "warning: expected attribute name\n");
			// skip to next attribute name
			while(1) {
				c = getchar(ts);
				if(c < 0)
					goto eob_done;
				if(c < 256 && isalpha(c))
					goto attrloop_continue;
				if(c == '<') {
					if(warn)
						fprint(2, "warning: unclosed tag\n");
					ungetchar(ts, 60);	// 60 is '<'
					goto attrloop_done;
				}
				if(c == '>')
					goto attrloop_done;
			}
		}
		// gather attribute name
		buf[0] = c;
		i = 1;
		while(1) {
			c = getchar(ts);
			if(c < 0)
				goto eob_done;
			if(!ISNAMCHAR(c))
				break;
			// leave room for the terminator written below
			if(i < BIGBUFSIZE-1)
				buf[i++] = c;
		}
		afnd = _lookup(attrtable, Numattrs, buf, i, &attid);
		if(warn && !afnd) {
			buf[i] = 0;
			fprint(2, "warning: unknown attribute name %S\n", buf);
		}
		// skip whitespace
		while(c < 256 && isspace(c)) {
			c = getchar(ts);
			if(c < 0)
				goto eob_done;
		}
		if(c != '=') {
			// attribute without a value
			if(afnd)
				al = newattr(attid, nil, al);
			goto attrloop_continue;
		}
		// c is '=' here; skip whitespace
		while(1) {
			c = getchar(ts);
			if(c < 0)
				goto eob_done;
			if(c >= 256 || !isspace(c))
				break;
		}
		quote = 0;
		if(c == '\'' || c == '"') {
			quote = c;
			c = getchar(ts);
			if(c < 0)
				goto eob_done;
		}
		val = nil;
		nv = 0;
		while(1) {
valloop_continue:
			if(c < 0)
				goto eob_done;
			if(c == '>') {
				if(quote) {
					// c might be part of string (though not good style)
					// but if line ends before close quote, assume
					// there was an unmatched quote
					ti = ts->i;
					while(1) {
						c = getchar(ts);
						if(c < 0)
							goto eob_done;
						if(c == quote) {
							// a close quote follows on this line:
							// keep the '>' as part of the value
							backup(ts, ti);
							buf[nv++] = '>';
							if(nv == BIGBUFSIZE-1) {
								val = buftostr(val, buf, nv);
								nv = 0;
							}
							c = getchar(ts);
							goto valloop_continue;
						}
						if(c == '\n') {
							if(warn)
								fprint(2, "warning: apparent unmatched quote\n");
							backup(ts, ti);
							c = '>';
							goto valloop_done;
						}
					}
				}
				else
					goto valloop_done;
			}
			if(quote) {
				if(c == quote) {
					c = getchar(ts);
					if(c < 0)
						goto eob_done;
					goto valloop_done;
				}
				if(c == '\r') {
					// drop \r inside quoted values
					c = getchar(ts);
					goto valloop_continue;
				}
				if(c == '\t' || c == '\n')
					c = ' ';
			}
			else {
				if(c < 256 && isspace(c))
					goto valloop_done;
			}
			if(c == '&') {
				c = ampersand(ts);
				if(c == -1)
					goto eob_done;
			}
			buf[nv++] = c;
			if(nv == BIGBUFSIZE-1) {
				val = buftostr(val, buf, nv);
				nv = 0;
			}
			c = getchar(ts);
		}
valloop_done:
		// values of unknown attributes are discarded
		if(afnd) {
			val = buftostr(val, buf, nv);
			al = newattr(attid, val, al);
		}
	}

attrloop_done:
	tok->attr = al;
	(*pai)++;
	return tok->tag;

eob_done:
	// Input ended inside the tag: rewind to just after the '<' and
	// describe it as literal data.  Note that *pai is not bumped here.
	if(warn)
		fprint(2, "warning: incomplete tag at end of page\n");
	backup(ts, nexti);
	tok->tag = Data;
	tok->text = _Strdup(L(Llt));
	return Data;
}

// We've just read a '<!' (ts is positioned right after it),
// so this may be a comment or other ignored section, or it may
// be just a literal string if there is no close before end of file
// (other browsers do that).
// The accepted practice seems to be (note: contrary to SGML spec!):
// If see <!--, look for --> to close, or if none, > to close.
// If see <!(not --), look for > to close.
// If no close before end of file, leave original characters in as literal data.
//
// If we see ignorable stuff, return Comment.
// Else return -1 (caller should back up and try again when more data arrives,
// unless at end of file, in which case caller should just make '<' a data token).
static int
comment(TokenSource* ts)
{
	int nexti;
	int havecomment;
	int c;

	nexti = ts->i;
	havecomment = 0;
	c = getchar(ts);
	if(c == '-') {
		c = getchar(ts);
		if(c == '-') {
			if(findstr(ts, L(Larrow)))
				havecomment = 1;
			else
				backup(ts, nexti);	// no "-->": retry below with plain '>'
		}
	}
	if(!havecomment) {
		if(c == '>')
			havecomment = 1;
		else if(c >= 0) {
			if(findstr(ts, L(Lgt)))
				havecomment = 1;
		}
	}
	if(havecomment)
		return Comment;
	return -1;
}

// Look for string s in token source.
// If found, return 1, with buffer at next char after s,
// else return 0 (caller should back up).
+static int +findstr(TokenSource* ts, Rune* s) +{ + int c0; + int n; + int nexti; + int i; + int c; + + c0 = s[0]; + n = runestrlen(s); + while(1) { + c = getchar(ts); + if(c < 0) + break; + if(c == c0) { + if(n == 1) + return 1; + nexti = ts->i; + for(i = 1; i < n; i++) { + c = getchar(ts); + if(c < 0) + goto mainloop_done; + if(c != s[i]) + break; + } + if(i == n) + return 1; + backup(ts, nexti); + } + } +mainloop_done: + return 0; +} + +// We've just read an '&'; look for an entity reference +// name, and if found, return translated char. +// if there is a complete entity name but it isn't known, +// try prefixes (gets around some buggy HTML out there), +// and if that fails, back up to just past the '&' and return '&'. +// If the entity can't be completed in the current buffer, back up +// to the '&' and return -1. +static int +ampersand(TokenSource* ts) +{ + int savei; + int c; + int fnd; + int ans; + int v; + int i; + int k; + Rune buf[SMALLBUFSIZE]; + + savei = ts->i; + c = getchar(ts); + fnd = 0; + ans = -1; + if(c == '#') { + c = getchar(ts); + v = 0; + while(c >= 0) { + if(!(c < 256 && isdigit(c))) + break; + v = v*10 + c - 48; + c = getchar(ts); + } + if(c >= 0) { + if(!(c == ';' || c == '\n' || c == '\r')) + ungetchar(ts, c); + c = v; + if(c == 160) + c = 160; + if(c >= Winstart && c <= Winend) { + c = winchars[c - Winstart]; + } + ans = c; + fnd = 1; + } + } + else if(c < 256 && isalpha(c)) { + buf[0] = c; + k = 1; + while(1) { + c = getchar(ts); + if(c < 0) + break; + if(ISNAMCHAR(c)) { + if(k < SMALLBUFSIZE-1) + buf[k++] = c; + } + else { + if(!(c == ';' || c == '\n' || c == '\r')) + ungetchar(ts, c); + break; + } + } + if(c >= 0) { + fnd = _lookup(chartab, NCHARTAB, buf, k, &ans); + if(!fnd) { + // Try prefixes of s + if(c == ';' || c == '\n' || c == '\r') + ungetchar(ts, c); + i = k; + while(--k > 0) { + fnd = _lookup(chartab, NCHARTAB, buf, k, &ans); + if(fnd) { + while(i > k) { + i--; + ungetchar(ts, buf[i]); + } + break; + } + } + } + } + } + 
if(!fnd) { + backup(ts, savei); + ans = '&'; + } + return ans; +} + +// Get next char, obeying ts.chset. +// Returns -1 if no complete character left before current end of data. +static int +getchar(TokenSource* ts) +{ + uchar* buf; + int c; + int n; + int ok; + Rune r; + + if(ts->i >= ts->edata) + return -1; + buf = ts->data; + c = buf[ts->i]; + switch(ts->chset) { + case ISO_8859_1: + if(c >= Winstart && c <= Winend) + c = winchars[c - Winstart]; + ts->i++; + break; + case US_Ascii: + if(c > 127) { + if(warn) + fprint(2, "non-ascii char (%x) when US-ASCII specified\n", c); + } + ts->i++; + break; + case UTF_8: + ok = fullrune((char*)(buf+ts->i), ts->edata-ts->i); + n = chartorune(&r, (char*)(buf+ts->i)); + if(ok) { + if(warn && c == 0x80) + fprint(2, "warning: invalid utf-8 sequence (starts with %x)\n", ts->data[ts->i]); + ts->i += n; + c = r; + } + else { + // not enough bytes in buf to complete utf-8 char + ts->i = ts->edata; // mark "all used" + c = -1; + } + break; + case Unicode: + if(ts->i < ts->edata - 1) { + //standards say most-significant byte first + c = (c << 8)|(buf[ts->i + 1]); + ts->i += 2; + } + else { + ts->i = ts->edata; // mark "all used" + c = -1; + } + break; + } + return c; +} + +// Assuming c was the last character returned by getchar, set +// things up so that next getchar will get that same character +// followed by the current 'next character', etc. +static void +ungetchar(TokenSource* ts, int c) +{ + int n; + Rune r; + char a[UTFmax]; + + n = 1; + switch(ts->chset) { + case UTF_8: + if(c >= 128) { + r = c; + n = runetochar(a, &r); + } + break; + case Unicode: + n = 2; + break; + } + ts->i -= n; +} + +// Restore ts so that it is at the state where the index was savei. +static void +backup(TokenSource* ts, int savei) +{ + if(dbglex) + fprint(2, "lex: backup; i=%d, savei=%d\n", ts->i, savei); + ts->i = savei; +} + + +// Look for value associated with attribute attid in token t. 
+// If there is one, return 1 and put the value in *pans, +// else return 0. +// If xfer is true, transfer ownership of the string to the caller +// (nil it out here); otherwise, caller must duplicate the answer +// if it needs to save it. +// OK to have pans==0, in which case this is just looking +// to see if token is present. +int +_tokaval(Token* t, int attid, Rune** pans, int xfer) +{ + Attr* attr; + + attr = t->attr; + while(attr != nil) { + if(attr->attid == attid) { + if(pans != nil) + *pans = attr->value; + if(xfer) + attr->value = nil; + return 1; + } + attr = attr->next; + } + if(pans != nil) + *pans = nil; + return 0; +} + +static int +Tconv(Fmt *f) +{ + Token* t; + int i; + int tag; + char* srbra; + Rune* aname; + Rune* tname; + Attr* a; + char buf[BIGBUFSIZE]; + + t = va_arg(f->args, Token*); + if(t == nil) + sprint(buf, "<null>"); + else { + i = 0; + if(dbglex > 1) + i = snprint(buf, sizeof(buf), "[%d]", t->starti); + tag = t->tag; + if(tag == Data) { + i += snprint(buf+i, sizeof(buf)-i-1, "'%S'", t->text); + } + else { + srbra = ""; + if(tag >= RBRA) { + tag -= RBRA; + srbra = "/"; + } + tname = tagnames[tag]; + if(tag == Notfound) + tname = L(Lquestion); + i += snprint(buf+i, sizeof(buf)-i-1, "<%s%S", srbra, tname); + for(a = t->attr; a != nil; a = a->next) { + aname = attrnames[a->attid]; + i += snprint(buf+i, sizeof(buf)-i-1, " %S", aname); + if(a->value != nil) + i += snprint(buf+i, sizeof(buf)-i-1, "=%S", a->value); + } + i += snprint(buf+i, sizeof(buf)-i-1, ">"); + } + buf[i] = 0; + } + return fmtstrcpy(f, buf); +} + +// Attrs own their constituent strings, but build may eventually +// transfer some values to its items and nil them out in the Attr. 
+static Attr* +newattr(int attid, Rune* value, Attr* link) +{ + Attr* ans; + + ans = (Attr*)emalloc(sizeof(Attr)); + ans->attid = attid; + ans->value = value; + ans->next = link; + return ans; +} + +// Free list of Attrs linked through next field +static void +freeattrs(Attr* ahead) +{ + Attr* a; + Attr* nexta; + + a = ahead; + while(a != nil) { + nexta = a->next; + free(a->value); + free(a); + a = nexta; + } +} + +// Free array of Tokens. +// Allocated space might have room for more than n tokens, +// but only n of them are initialized. +// If caller has transferred ownership of constitutent strings +// or attributes, it must have nil'd out the pointers in the Tokens. +void +_freetokens(Token* tarray, int n) +{ + int i; + Token* t; + + if(tarray == nil) + return; + for(i = 0; i < n; i++) { + t = &tarray[i]; + free(t->text); + freeattrs(t->attr); + } + free(tarray); +} diff --git a/src/libhtml/mkfile b/src/libhtml/mkfile new file mode 100644 index 00000000..0952c451 --- /dev/null +++ b/src/libhtml/mkfile @@ -0,0 +1,22 @@ +<$SYS9/$systype/$objtype/mkfile + +LIB=$LIB9/libhtml.a + +OFILES=\ + build.$O\ + lex.$O\ + strinttab.$O\ + utils.$O\ + runetab.$O\ + +HFILES=\ + $SYS9/sys/include/html.h\ + impl.h\ + +UPDATE=\ + mkfile\ + $HFILES\ + ${OFILES:%.$O=%.c}\ + ${LIB:$SYS9/$systype/$objtype/%=$SYS9/$systype/386/%}\ + +<$SYS9/sys/src/cmd/mksyslib diff --git a/src/libhtml/runetab.c b/src/libhtml/runetab.c new file mode 100644 index 00000000..abd0a50f --- /dev/null +++ b/src/libhtml/runetab.c @@ -0,0 +1,83 @@ +#include <u.h> +#include <libc.h> +#include <draw.h> +#include <html.h> +#include "impl.h" + +Rune **runeconsttab; +char *_runeconsttab[] = { + " ", + " ", + "", + "#", + "+", + ", ", + "-", + "-->", + "1", + "<", + ">", + "?", + "Index search terms:", + "Reset", + "Submit", + "^0-9", + "_ISINDEX_", + "_blank", + "_fr", + "_no_name_submit_", + "_parent", + "_self", + "_top", + "application/x-www-form-urlencoded", + "circle", + "cm", + "content-script-type", + "disc", 
+ "em", + "in", + "javascript", + "jscript", + "jscript1.1", + "mm", + "none", + "pi", + "pt", + "refresh", + "select", + "square", + "textarea", +}; + +Rune** +cvtstringtab(char **tab, int n) +{ + int i; + Rune **rtab; + + rtab = emalloc(n*sizeof(rtab[0])); + for(i=0; i<n; i++) + rtab[i] = toStr(tab[i], strlen(tab[i]), US_Ascii); + return rtab; +} + +StringInt* +cvtstringinttab(AsciiInt *tab, int n) +{ + int i; + StringInt *stab; + + stab = emalloc(n*sizeof(stab[0])); + for(i=0; i<n; i++){ + stab[i].key = toStr(tab[i].key, strlen(tab[i].key), US_Ascii); + stab[i].val = tab[i].val; + } + return stab; +} + +void +runetabinit(void) +{ + runeconsttab = cvtstringtab(_runeconsttab, nelem(_runeconsttab)); + return; +} diff --git a/src/libhtml/runetab.h b/src/libhtml/runetab.h new file mode 100644 index 00000000..edde98c8 --- /dev/null +++ b/src/libhtml/runetab.h @@ -0,0 +1,59 @@ +typedef struct AsciiInt AsciiInt; + +struct AsciiInt { + char* key; + int val; +}; + +enum { + Ltab2space, + Lspace, + Lempty, + Lhash, + Lplus, + Lcommaspace, + Lminus, + Larrow, + Lone, + Llt, + Lgt, + Lquestion, + Lindex, + Lreset, + Lsubmit, + Lnot0to9, + Lisindex, + L_blank, + Lfr, + Lnoname, + L_parent, + L_self, + L_top, + Lappl_form, + Lcircle, + Lcm, + Lcontent, + Ldisc, + Lem, + Lin, + Ljavascript, + Ljscript, + Ljscript1, + Lmm, + Lnone, + Lpi, + Lpt, + Lrefresh, + Lselect, + Lsquare, + Ltextarea, +}; + +#define L(x) runeconsttab[(x)] + +extern Rune **runeconsttab; + +/* XXX: for unix port only */ +Rune **cvtstringtab(char**, int); +StringInt *cvtstringinttab(AsciiInt*, int); +void runetabinit(void); diff --git a/src/libhtml/strinttab.c b/src/libhtml/strinttab.c new file mode 100644 index 00000000..7883c044 --- /dev/null +++ b/src/libhtml/strinttab.c @@ -0,0 +1,64 @@ +#include <u.h> +#include <libc.h> +#include <draw.h> +#include <html.h> +#include "impl.h" + +// Do case-insensitive lookup of key[0:keylen] in t[0:n] (key part), +// returning 1 if found, 0 if not. 
+// Array t must be sorted in increasing lexicographic order of key. +// If found, return corresponding val in *pans. +int +_lookup(StringInt* t, int n, Rune* key, int keylen, int* pans) +{ + int min; + int max; + int try; + int cmpresult; + + min = 0; + max = n - 1; + while(min <= max) { + try = (min + max)/2; + cmpresult = _Strncmpci(key, keylen, t[try].key); + if(cmpresult > 0) + min = try + 1; + else if(cmpresult < 0) + max = try - 1; + else { + *pans = t[try].val; + return 1; + } + } + return 0; +} + +// Return first key in t[0:n] that corresponds to val, +// nil if none. +Rune* +_revlookup(StringInt* t, int n, int val) +{ + int i; + + for(i = 0; i < n; i++) + if(t[i].val == val) + return t[i].key; + return nil; +} + +// Make a StringInt table out of a[0:n], mapping each string +// to its index. Check that entries are in alphabetical order. +StringInt* +_makestrinttab(Rune** a, int n) +{ + StringInt* ans; + int i; + + ans = (StringInt*)emalloc(n * sizeof(StringInt)); + for(i = 0; i < n; i++) { + ans[i].key = a[i]; + ans[i].val = i; + assert(i == 0 || runestrcmp(a[i], a[i - 1]) >= 0); + } + return ans; +} diff --git a/src/libhtml/utils.c b/src/libhtml/utils.c new file mode 100644 index 00000000..db22bba7 --- /dev/null +++ b/src/libhtml/utils.c @@ -0,0 +1,591 @@ +#include <u.h> +#include <libc.h> +#include <draw.h> +#include <html.h> +#include "impl.h" + +Rune whitespace[] = { ' ', '\t', '\n', '\r', '\0' }; +Rune notwhitespace[] = { '^', ' ', '\t', '\n', '\r' , '\0'}; + +// All lists start out like List structure. +// List itself can be used as list of int. 
+int +_listlen(List* l) +{ + int n = 0; + + while(l != nil) { + l = l->next; + n++; + } + return n; +} + +// Cons +List* +_newlist(int val, List* rest) +{ + List* ans; + + ans = (List*)emalloc(sizeof(List)); + ans->val = val; + ans->next = rest; + return ans; +} + +// Reverse a list in place +List* +_revlist(List* l) +{ + List* newl; + List* nextl; + + newl = nil; + while(l != nil) { + nextl = l->next; + l->next = newl; + newl = l; + l = nextl; + } + return newl; +} + +// The next few routines take a "character class" as argument. +// e.g., "a-zA-Z", or "^ \t\n" +// (ranges indicated by - except in first position; +// ^ is first position means "not in" the following class) + +// Splitl splits s[0:n] just before first character of class cl. +// Answers go in (p1, n1) and (p2, n2). +// If no split, the whole thing goes in the first component. +// Note: answers contain pointers into original string. +void +_splitl(Rune* s, int n, Rune* cl, Rune** p1, int* n1, Rune** p2, int* n2) +{ + Rune* p; + + p = _Strnclass(s, cl, n); + *p1 = s; + if(p == nil) { + *n1 = n; + *p2 = nil; + *n2 = 0; + } + else { + *p2 = p; + *n1 = p-s; + *n2 = n-*n1; + } +} + +// Splitr splits s[0:n] just after last character of class cl. +// Answers go in (p1, n1) and (p2, n2). +// If no split, the whole thing goes in the last component. +// Note: answers contain pointers into original string. +void +_splitr(Rune* s, int n, Rune* cl, Rune** p1, int* n1, Rune** p2, int* n2) +{ + Rune* p; + + p = _Strnrclass(s, cl, n); + if(p == nil) { + *p1 = nil; + *n1 = 0; + *p2 = s; + *n2 = n; + } + else { + *p1 = s; + *p2 = p+1; + *n1 = *p2-s; + *n2 = n-*n1; + } +} + +// Splitall splits s[0:n] into parts that are separated by characters from class cl. +// Each part will have nonzero length. +// At most alen parts are found, and pointers to their starts go into +// the strarr array, while their lengths go into the lenarr array. +// The return value is the number of parts found. 
+int +_splitall(Rune* s, int n, Rune* cl, Rune** strarr, int* lenarr, int alen) +{ + int i; + Rune* p; + Rune* q; + Rune* slast; + + if(s == nil || n == 0) + return 0; + i = 0; + p = s; + slast = s+n; + while(p < slast && i < alen) { + while(p < slast && _inclass(*p, cl)) + p++; + if(p == slast) + break; + q = _Strnclass(p, cl, slast-p); + if(q == nil) + q = slast; + assert(q > p && q <= slast); + strarr[i] = p; + lenarr[i] = q-p; + i++; + p = q; + } + return i; +} + +// Find part of s that excludes leading and trailing whitespace, +// and return that part in *pans (and its length in *panslen). +void +_trimwhite(Rune* s, int n, Rune** pans, int* panslen) +{ + Rune* p; + Rune* q; + + p = nil; + if(n > 0) { + p = _Strnclass(s, notwhitespace, n); + if(p != nil) { + q = _Strnrclass(s, notwhitespace, n); + assert(q != nil); + n = q+1-p; + } + } + *pans = p; + *panslen = n; +} + +// _Strclass returns a pointer to the first element of s that is +// a member of class cl, nil if none. +Rune* +_Strclass(Rune* s, Rune* cl) +{ + Rune* p; + + for(p = s; *p != 0; p++) + if(_inclass(*p, cl)) + return p; + return nil; +} + +// _Strnclass returns a pointer to the first element of s[0:n] that is +// a member of class cl, nil if none. 
+Rune* +_Strnclass(Rune* s, Rune* cl, int n) +{ + Rune* p; + + for(p = s; n-- && *p != 0; p++) + if(_inclass(*p, cl)) + return p; + return nil; +} + +// _Strrclass returns a pointer to the last element of s that is +// a member of class cl, nil if none +Rune* +_Strrclass(Rune* s, Rune* cl) +{ + Rune* p; + + if(s == nil || *s == 0) + return nil; + p = s + runestrlen(s) - 1; + while(p >= s) { + if(_inclass(*p, cl)) + return p; + p--; + }; + return nil; +} + +// _Strnrclass returns a pointer to the last element of s[0:n] that is +// a member of class cl, nil if none +Rune* +_Strnrclass(Rune* s, Rune* cl, int n) +{ + Rune* p; + + if(s == nil || *s == 0 || n == 0) + return nil; + p = s + n - 1; + while(p >= s) { + if(_inclass(*p, cl)) + return p; + p--; + }; + return nil; +} + +// Is c in the class cl? +int +_inclass(Rune c, Rune* cl) +{ + int n; + int ans; + int negate; + int i; + + n = _Strlen(cl); + if(n == 0) + return 0; + ans = 0; + negate = 0; + if(cl[0] == '^') { + negate = 1; + cl++; + n--; + } + for(i = 0; i < n; i++) { + if(cl[i] == '-' && i > 0 && i < n - 1) { + if(c >= cl[i - 1] && c <= cl[i + 1]) { + ans = 1; + break; + } + i++; + } + else if(c == cl[i]) { + ans = 1; + break; + } + } + if(negate) + ans = !ans; + return ans; +} + +// Is pre a prefix of s? +int +_prefix(Rune* pre, Rune* s) +{ + int ns; + int n; + int k; + + ns = _Strlen(s); + n = _Strlen(pre); + if(ns < n) + return 0; + for(k = 0; k < n; k++) { + if(pre[k] != s[k]) + return 0; + } + return 1; +} + +// Number of runes in (null-terminated) s +int +_Strlen(Rune* s) +{ + if(s == nil) + return 0; + return runestrlen(s); +} + +// -1, 0, 1 as s1 is lexicographically less, equal greater than s2 +int +_Strcmp(Rune *s1, Rune *s2) +{ + if(s1 == nil) + return (s2 == nil || *s2 == 0) ? 0 : -1; + if(s2 == nil) + return (*s1 == 0) ? 0 : 1; + return runestrcmp(s1, s2); +} + +// Like Strcmp, but use exactly n chars of s1 (assume s1 has at least n chars). 
+// Also, do a case-insensitive match, assuming s2 +// has no chars in [A-Z], only their lowercase versions. +// (This routine is used for in-place keyword lookup, where s2 is in a keyword +// list and s1 is some substring, possibly mixed-case, in a buffer.) +int +_Strncmpci(Rune *s1, int n1, Rune *s2) +{ + Rune c1, c2; + + for(;;) { + if(n1-- == 0) { + if(*s2 == 0) + return 0; + return -1; + } + c1 = *s1++; + c2 = *s2++; + if(c1 >= 'A' && c1 <= 'Z') + c1 = c1 - 'A' + 'a'; + if(c1 != c2) { + if(c1 > c2) + return 1; + return -1; + } + } +} + +// emalloc and copy +Rune* +_Strdup(Rune* s) +{ + if(s == nil) + return nil; + return _Strndup(s, runestrlen(s)); +} + +// emalloc and copy n chars of s (assume s is at least that long), +// and add 0 terminator. +// Return nil if n==0. +Rune* +_Strndup(Rune* s, int n) +{ + Rune* ans; + + if(n <= 0) + return nil; + ans = _newstr(n); + memmove(ans, s, n*sizeof(Rune)); + ans[n] = 0; + return ans; +} +// emalloc enough room for n Runes, plus 1 null terminator. +// (Not initialized to anything.) 
+Rune* +_newstr(int n) +{ + return (Rune*)emalloc((n+1)*sizeof(Rune)); +} + +// emalloc and copy s+t +Rune* +_Strdup2(Rune* s, Rune* t) +{ + int ns, nt; + Rune* ans; + Rune* p; + + ns = _Strlen(s); + nt = _Strlen(t); + if(ns+nt == 0) + return nil; + ans = _newstr(ns+nt); + p = _Stradd(ans, s, ns); + p = _Stradd(p, t, nt); + *p = 0; + return ans; +} + +// Return emalloc'd substring s[start:stop], +Rune* +_Strsubstr(Rune* s, int start, int stop) +{ + Rune* t; + + if(start == stop) + return nil; + t = _Strndup(s+start, stop-start); + return t; +} + +// Copy n chars to s1 from s2, and return s1+n +Rune* +_Stradd(Rune* s1, Rune* s2, int n) +{ + if(n == 0) + return s1; + memmove(s1, s2, n*sizeof(Rune)); + return s1+n; +} + +// Like strtol, but converting from Rune* string + +//#define LONG_MAX 2147483647L +//#define LONG_MIN -2147483648L + +long +_Strtol(Rune* nptr, Rune** endptr, int base) +{ + Rune* p; + long n, nn; + int c, ovfl, v, neg, ndig; + + p = nptr; + neg = 0; + n = 0; + ndig = 0; + ovfl = 0; + + /* + * White space + */ + for(;;p++){ + switch(*p){ + case ' ': + case '\t': + case '\n': + case '\f': + case '\r': + case '\v': + continue; + } + break; + } + + /* + * Sign + */ + if(*p=='-' || *p=='+') + if(*p++ == '-') + neg = 1; + + /* + * Base + */ + if(base==0){ + if(*p != '0') + base = 10; + else{ + base = 8; + if(p[1]=='x' || p[1]=='X'){ + p += 2; + base = 16; + } + } + }else if(base==16 && *p=='0'){ + if(p[1]=='x' || p[1]=='X') + p += 2; + }else if(base<0 || 36<base) + goto Return; + + /* + * Non-empty sequence of digits + */ + for(;; p++,ndig++){ + c = *p; + v = base; + if('0'<=c && c<='9') + v = c - '0'; + else if('a'<=c && c<='z') + v = c - 'a' + 10; + else if('A'<=c && c<='Z') + v = c - 'A' + 10; + if(v >= base) + break; + nn = n*base + v; + if(nn < n) + ovfl = 1; + n = nn; + } + + Return: + if(ndig == 0) + p = nptr; + if(endptr) + *endptr = p; + if(ovfl){ + if(neg) + return LONG_MIN; + return LONG_MAX; + } + if(neg) + return -n; + return n; +} + +// 
Convert buf[0:n], bytes whose character set is chset, +// into a emalloc'd null-terminated Unicode string. +Rune* +toStr(uchar* buf, int n, int chset) +{ + int i; + int m; + Rune ch; + Rune* ans; + + switch(chset) { + case US_Ascii: + case ISO_8859_1: + ans = (Rune*)emalloc((n+1)*sizeof(Rune)); + for(i = 0; i < n; i++) + ans[i] = buf[i]; + ans[n] = 0; + break; + + case UTF_8: + m = 0; + for(i = 0; i < n; ) { + i += chartorune(&ch, (char*)(buf+i)); + m++; + } + ans = (Rune*)emalloc((m+1)*sizeof(Rune)); + m = 0; + for(i = 0; i < n; ) { + i += chartorune(&ch, (char*)(buf+i)); + ans[m++] = ch; + } + ans[m] = 0; + break; + + default: + ans = nil; + assert(0); + } + return ans; +} + +// Convert buf[0:n], Unicode characters, +// into an emalloc'd null-terminated string in character set chset. +// Use 0x80 for unconvertable characters. +uchar* +fromStr(Rune* buf, int n, int chset) +{ + uchar* ans; + int i, lim, m; + Rune ch; + uchar* p; + uchar s[UTFmax]; + + ans = nil; + switch(chset) { + case US_Ascii: + case ISO_8859_1: + ans = (uchar*)emalloc(n+1); + lim = (chset==US_Ascii)? 127 : 255; + for(i = 0; i < n; i++) { + ch = buf[i]; + if(ch > lim) + ch = 0x80; + ans[i] = ch; + } + ans[n] = 0; + break; + + case UTF_8: + m = 0; + for(i = 0; i < n; i++) { + m += runetochar((char*)s, &buf[i]); + } + ans = (uchar*)emalloc(m+1); + p = ans; + for(i = 0; i < n; i++) + p += runetochar((char*)p, &buf[i]); + *p = 0; + break; + + default: + assert(0); + } + return ans; + +} + +// Convert n to emalloc'd String. +Rune* +_ltoStr(int n) +{ + int m; + uchar buf[20]; + + m = snprint((char*)buf, sizeof(buf), "%d", n); + return toStr(buf, m, US_Ascii); +} |