diff options
author | wkj <devnull@localhost> | 2004-04-06 19:06:52 +0000 |
---|---|---|
committer | wkj <devnull@localhost> | 2004-04-06 19:06:52 +0000 |
commit | 7cf289ca89a7416999ae02330236042b0d37e3db (patch) | |
tree | 796d1363a7a53c72c28b199758ee674f1326a510 /src/cmd/htmlfmt | |
parent | 3e3817f7c86658f60715dd93768eaf8285807985 (diff) | |
download | plan9port-7cf289ca89a7416999ae02330236042b0d37e3db.tar.gz plan9port-7cf289ca89a7416999ae02330236042b0d37e3db.tar.bz2 plan9port-7cf289ca89a7416999ae02330236042b0d37e3db.zip |
Import version of libhtml that might actually work with ANSI C.
Diffstat (limited to 'src/cmd/htmlfmt')
-rw-r--r-- | src/cmd/htmlfmt/dat.h | 53 | ||||
-rw-r--r-- | src/cmd/htmlfmt/html.c | 399 | ||||
-rw-r--r-- | src/cmd/htmlfmt/main.c | 78 | ||||
-rw-r--r-- | src/cmd/htmlfmt/mkfile | 30 | ||||
-rw-r--r-- | src/cmd/htmlfmt/util.c | 137 |
5 files changed, 697 insertions, 0 deletions
diff --git a/src/cmd/htmlfmt/dat.h b/src/cmd/htmlfmt/dat.h
new file mode 100644
index 00000000..f3b05605
--- /dev/null
+++ b/src/cmd/htmlfmt/dat.h
@@ -0,0 +1,53 @@
+typedef struct Bytes Bytes;
+typedef struct URLwin URLwin;
+
+enum
+{
+	STACK = 8192,
+	EVENTSIZE = 256,
+};
+
+/* Growable byte buffer; growbytes keeps b[n] == '\0'. */
+struct Bytes
+{
+	uchar *b;	/* data, nil until the first growbytes */
+	long n;	/* bytes in use, not counting the terminating NUL */
+	long nalloc;	/* bytes allocated for b */
+};
+
+/* One HTML document being formatted. */
+struct URLwin
+{
+	int infd;	/* descriptor the HTML was read from */
+	int outfd;	/* descriptor the formatted text is written to */
+	int type;	/* media type handed to parsehtml (TextHtml) */
+
+	char *url;	/* document URL, used to resolve relative links */
+	Item *items;	/* item list returned by parsehtml */
+	Docinfo *docinfo;	/* document info (anchors) filled in by parsehtml */
+};
+
+extern char* url;
+extern int aflag;
+extern int width;
+extern int defcharset;
+
+extern char* loadhtml(int);
+
+extern char* readfile(char*, char*, int*);
+extern int charset(char*);
+extern void* emalloc(ulong);
+extern void* erealloc(void*, ulong);
+extern char* estrdup(char*);
+extern char* estrstrdup(char*, char*);
+extern char* egrow(char*, char*, char*);
+extern char* eappend(char*, char*, char*);
+extern void error(char*, ...);
+
+extern void growbytes(Bytes*, char*, long);
+
+extern void rendertext(URLwin*, Bytes*);
+extern void rerender(URLwin*);
+extern void freeurlwin(URLwin*);
+
+#pragma varargck argpos error 1
diff --git a/src/cmd/htmlfmt/html.c b/src/cmd/htmlfmt/html.c
new file mode 100644
index 00000000..4f2e436f
--- /dev/null
+++ b/src/cmd/htmlfmt/html.c
@@ -0,0 +1,399 @@
+#include <u.h>
+#include <libc.h>
+#include <bio.h>
+#include <draw.h>
+#include <regexp.h>
+#include <html.h>
+#include <ctype.h>
+#include "dat.h"
+
+char urlexpr[] = "^(https?|ftp|file|gopher|mailto|news|nntp|telnet|wais|prospero)://([a-zA-Z0-9_@\\-]+([.:][a-zA-Z0-9_@\\-]+)*)";
+Reprog *urlprog;
+
+/* line-filling state shared by renderrunes and emitword; reset by render */
+int inword = 0;	/* inside a word that has not been emitted yet */
+int col = 0;	/* runes already written on the current output line */
+int wordi = 0;	/* index in the current rune string where that word starts */
+
+/*
+ * Read all of fd, format it as HTML and write the text to fd 1.
+ * Returns nil on success (an empty input is not an error) or a
+ * pointer to a static error string if the read failed.
+ */
+char*
+loadhtml(int fd)
+{
+	static char err[ERRMAX];
+	URLwin *u;
+	Bytes *b;
+	int n;
+	char buf[4096];
+
+	b = emalloc(sizeof(Bytes));
+	while((n = read(fd, buf, sizeof buf)) > 0)
+		growbytes(b, buf, n);
+	if(n < 0){
+		rerrstr(err, sizeof err);
+		free(b->b);
+		free(b);
+		return err;
+	}
+	if(b->b != nil){	/* nothing to do for an empty file */
+		u = emalloc(sizeof(URLwin));
+		u->infd = fd;
+		u->outfd = 1;
+		u->url = estrdup(url);
+		u->type = TextHtml;
+		rendertext(u, b);
+		freeurlwin(u);
+	}
+	free(b->b);
+	free(b);
+	return nil;
+}
+
+/*
+ * Convert the first n runes of r to a freshly allocated string.
+ * Never returns nil: allocation failure is fatal.
+ */
+char*
+runetobyte(Rune *r, int n)
+{
+	char *s;
+
+	if(n == 0)
+		return emalloc(1);
+	s = smprint("%.*S", n, r);
+	if(s == nil)
+		error("malloc failed");
+	return s;
+}
+
+/*
+ * Report whether rune c is punctuation that should attach to the
+ * preceding word with no space.  strchr converts its argument to
+ * char, so restrict the lookup to ASCII; otherwise a rune such as
+ * U+012E would be mistaken for '.'.
+ */
+int
+closingpunct(int c)
+{
+	if(c <= 0 || c >= Runeself)
+		return 0;
+	return strchr(".,:;'\")]}>!?", c) != nil;
+}
+
+/*
+ * Append the nr-rune word at r to b, breaking the line first if the
+ * word would run past width columns.  Columns are counted in runes.
+ */
+void
+emitword(Bytes *b, Rune *r, int nr)
+{
+	char *s;
+	int space;
+
+	if(nr == 0)
+		return;
+	s = runetobyte(r, nr);	/* exits rather than returning nil */
+	space = (b->n>0) && !isspace(b->b[b->n-1]) && !closingpunct(r[0]);
+	if(col>0 && col+space+nr > width){
+		growbytes(b, "\n", 1);
+		space = 0;
+		col = 0;
+	}
+	if(space && col>0){
+		growbytes(b, " ", 1);
+		col++;
+	}
+	growbytes(b, s, strlen(s));
+	col += nr;
+	free(s);
+	inword = 0;
+}
+
+/*
+ * Feed the NUL-terminated rune string r through the line filler:
+ * blanks separate words, a newline ends the current line, and runs
+ * of newlines are squeezed to at most one blank line.
+ */
+void
+renderrunes(Bytes *b, Rune *r)
+{
+	int i, n;
+
+	n = runestrlen(r);
+	for(i=0; i<n; i++){
+		switch(r[i]){
+		case '\n':
+			if(inword)
+				emitword(b, r+wordi, i-wordi);
+			col = 0;
+			if(b->n == 0)
+				break;	/* don't start with blank lines */
+			if(b->n<2 || b->b[b->n-1]!='\n' || b->b[b->n-2]!='\n')
+				growbytes(b, "\n", 1);
+			break;
+		case ' ':
+			if(inword)
+				emitword(b, r+wordi, i-wordi);
+			break;
+		default:
+			if(!inword)
+				wordi = i;
+			inword = 1;
+			break;
+		}
+	}
+	if(inword)
+		emitword(b, r+wordi, i-wordi);
+}
+
+/* Format like print(2) and pass the result to renderrunes. */
+void
+renderbytes(Bytes *b, char *fmt, ...)
+{
+	Rune *r;
+	va_list arg;
+
+	va_start(arg, fmt);
+	r = runevsmprint(fmt, arg);
+	va_end(arg);
+	if(r == nil)
+		error("malloc failed");
+	renderrunes(b, r);
+	free(r);
+}
+
+/*
+ * Return a freshly allocated copy of url cut back to its directory:
+ * everything before the last '/' that follows the scheme://host
+ * prefix, or just that prefix if there is no such '/'.  Returns nil
+ * if url is nil or does not start with a recognized scheme://host.
+ */
+char*
+baseurl(char *url)
+{
+	char *base, *slash;
+	Resub rs[10];
+	long nhost;
+
+	if(url == nil)
+		return nil;
+	if(urlprog == nil){
+		urlprog = regcomp(urlexpr);
+		if(urlprog == nil)
+			error("can't compile URL regexp");
+	}
+	memset(rs, 0, sizeof rs);
+	if(regexec(urlprog, url, rs, nelem(rs)) <= 0)	/* no match or error */
+		return nil;
+	nhost = rs[0].e.p - rs[0].s.p;	/* length of the matched prefix */
+	base = estrdup(url);
+	slash = strrchr(base, '/');
+	if(slash!=nil && slash>=&base[nhost])
+		*slash = '\0';
+	else
+		base[nhost] = '\0';
+	return base;
+}
+
+/*
+ * Return a freshly allocated absolute form of rhref.  If rhref does
+ * not itself start with scheme://host it is appended to the base of
+ * u->url, when that has one.  The caller frees the result.
+ */
+char*
+fullurl(URLwin *u, Rune *rhref)
+{
+	char *base, *href, *hrefbase;
+	char *result;
+
+	if(rhref == nil)
+		return estrdup("NULL URL");
+	href = runetobyte(rhref, runestrlen(rhref));	/* never nil */
+	hrefbase = baseurl(href);
+	result = nil;
+	if(hrefbase==nil && (base = baseurl(u->url))!=nil){
+		result = estrdup(base);
+		if(base[strlen(base)-1]!='/' && href[0]!='/')
+			result = eappend(result, "/", "");
+		free(base);
+	}
+	if(result != nil)
+		result = eappend(result, "", href);
+	else
+		result = estrdup(href);
+	free(hrefbase);
+	free(href);	/* both branches above copied it */
+	return result;
+}
+
+/*
+ * Append the text form of the item list to t.  Tables and floats
+ * are rendered by recursion.  With aflag set, images, form fields
+ * and link targets are shown in square brackets.  curanchor is the
+ * most recent anchor id seen, so a link target is printed once per
+ * run of items that share an anchor.
+ */
+void
+render(URLwin *u, Bytes *t, Item *items, int curanchor)
+{
+	Item *il;
+	Itext *it;
+	Ifloat *ifl;
+	Ispacer *is;
+	Itable *ita;
+	Iimage *im;
+	Anchor *a;
+	Table *tab;
+	Tablecell *cell;
+	char *href;
+
+	inword = 0;
+	col = 0;
+	wordi = 0;
+
+	for(il=items; il!=nil; il=il->next){
+		if(il->state & IFbrk)
+			renderbytes(t, "\n");
+		if(il->state & IFbrksp)
+			renderbytes(t, "\n");
+
+		switch(il->tag){
+		case Itexttag:
+			it = (Itext*)il;
+			renderrunes(t, it->s);
+			break;
+		case Iruletag:
+			if(t->n>0 && t->b[t->n-1]!='\n')
+				renderbytes(t, "\n");
+			renderbytes(t, "=======\n");
+			break;
+		case Iimagetag:
+			if(!aflag)
+				break;
+			im = (Iimage*)il;
+			if(im->imsrc){
+				href = fullurl(u, im->imsrc);
+				renderbytes(t, "[image %s]", href);
+				free(href);
+			}
+			break;
+		case Iformfieldtag:
+			if(aflag)
+				renderbytes(t, "[formfield]");
+			break;
+		case Itabletag:
+			ita = (Itable*)il;
+			tab = ita->table;
+			for(cell=tab->cells; cell!=nil; cell=cell->next){
+				render(u, t, cell->content, curanchor);
+			}
+			if(t->n>0 && t->b[t->n-1]!='\n')
+				renderbytes(t, "\n");
+			break;
+		case Ifloattag:
+			ifl = (Ifloat*)il;
+			render(u, t, ifl->item, curanchor);
+			break;
+		case Ispacertag:
+			is = (Ispacer*)il;
+			if(is->spkind != ISPnull)
+				renderbytes(t, " ");
+			break;
+		default:
+			error("unknown item tag %d", il->tag);
+		}
+		if(il->anchorid != 0 && il->anchorid!=curanchor){
+			if(aflag && u->docinfo != nil)
+				for(a=u->docinfo->anchors; a!=nil; a=a->next)
+					if(a->index == il->anchorid){
+						href = fullurl(u, a->href);
+						renderbytes(t, "[%s]", href);
+						free(href);
+						break;
+					}
+			curanchor = il->anchorid;
+		}
+	}
+	if(t->n>0 && t->b[t->n-1]!='\n')
+		renderbytes(t, "\n");
+}
+
+/* Render u->items from scratch and write the text to u->outfd. */
+void
+rerender(URLwin *u)
+{
+	Bytes *t;
+
+	t = emalloc(sizeof(Bytes));
+
+	render(u, t, u->items, 0);
+
+	if(t->n > 0 && write(u->outfd, (char*)t->b, t->n) != t->n)
+		error("write error: %r");
+	free(t->b);
+	free(t);
+}
+
+/*
+ * Somewhat of a hack.  Not a full parse: just looks for the first
+ * "charset=" at or after the first <meta tag in s, ignoring case.
+ * Returns UTF_8 if it names utf-8 or utf8, otherwise defcharset
+ * (which is set to ISO_8859_1 here if it was still 0).
+ */
+int
+charset(char *s)
+{
+	char *meta, *charset;
+
+	if(defcharset == 0)
+		defcharset = ISO_8859_1;
+	meta = cistrstr(s, "<meta");
+	if(meta == nil)
+		return defcharset;
+	charset = cistrstr(meta, "charset=");
+	if(charset == nil)
+		return defcharset;
+	charset += 8;
+	if(*charset == '"' || *charset == '\'')
+		charset++;
+	/* cistrncmp returns 0 on a match, like strncmp */
+	if(cistrncmp(charset, "utf-8", 5) == 0 || cistrncmp(charset, "utf8", 4) == 0)
+		return UTF_8;
+	return defcharset;
+}
+
+/* Parse the HTML in b as the document u->url, then render it. */
+void
+rendertext(URLwin *u, Bytes *b)
+{
+	Rune *rurl;
+
+	rurl = toStr((uchar*)u->url, strlen(u->url), ISO_8859_1);
+	u->items = parsehtml(b->b, b->n, rurl, u->type, charset((char*)b->b), &u->docinfo);
+	/*
+	 * NOTE(review): rurl is deliberately not freed here, as in the
+	 * imported code; confirm whether parsehtml keeps the pointer
+	 * before adding free(rurl).
+	 */
+
+	rerender(u);
+}
+
+/* Release u and everything it owns; u may be nil. */
+void
+freeurlwin(URLwin *u)
+{
+	if(u == nil)
+		return;
+	if(u->items != nil)
+		freeitems(u->items);
+	u->items = nil;
+	if(u->docinfo != nil)
+		freedocinfo(u->docinfo);
+	u->docinfo = nil;
+	free(u->url);	/* estrdup'd by loadhtml */
+	free(u);
+}
diff --git a/src/cmd/htmlfmt/main.c b/src/cmd/htmlfmt/main.c
new file mode 100644
index 00000000..f85bbb48
--- /dev/null
+++ b/src/cmd/htmlfmt/main.c
@@ -0,0 +1,78 @@
+#include <u.h>
+#include <libc.h>
+#include <bio.h>
+#include <draw.h>
+#include <html.h>
+#include "dat.h"
+
+char *url = "";	/* -u: URL of the document, for relative links */
+int aflag;	/* -a (or -u): show link and image targets */
+int width = 70;	/* -l/-w: maximum output line width */
+int defcharset;	/* -c: charset assumed when the document names none */
+
+void
+usage(void)
+{
+	fprint(2, "usage: htmlfmt [-c charset] [-u URL] [-a] [-l length] [file ...]\n");
+	exits("usage");
+}
+
+/*
+ * Format each named HTML file (standard input if none) as plain
+ * text on standard output, stopping at the first file that fails.
+ */
+void
+main(int argc, char *argv[])
+{
+	int i, fd;
+	char *p, *err, *file;
+	char errbuf[ERRMAX];
+
+	ARGBEGIN{
+	case 'a':
+		aflag++;
+		break;
+	case 'c':
+		/* reuse charset()'s scanner by wrapping the name in a meta tag */
+		p = smprint("<meta charset=\"%s\">", EARGF(usage()));
+		if(p == nil)
+			error("malloc failed");
+		defcharset = charset(p);
+		free(p);
+		break;
+	case 'l': case 'w':
+		p = EARGF(usage());
+		width = atoi(p);
+		if(width <= 0)
+			usage();
+		break;
+	case 'u':
+		url = EARGF(usage());
+		aflag++;
+		break;
+	default:
+		usage();
+	}ARGEND
+
+	err = nil;
+	file = "<stdin>";
+	if(argc == 0)
+		err = loadhtml(0);
+	else
+		for(i=0; i<argc; i++){
+			file = argv[i];
+			fd = open(file, OREAD);
+			if(fd < 0){
+				rerrstr(errbuf, sizeof errbuf);
+				err = errbuf;
+				break;
+			}
+			err = loadhtml(fd);
+			close(fd);
+			if(err)
+				break;
+		}
+	if(err)
+		fprint(2, "htmlfmt: processing %s: %s\n", file, err);
+	exits(err);
+}
diff --git a/src/cmd/htmlfmt/mkfile b/src/cmd/htmlfmt/mkfile
new file mode 100644
index 00000000..5b263532
--- /dev/null
+++ b/src/cmd/htmlfmt/mkfile
@@ -0,0 +1,30 @@
+<$SYS9/$systype/$objtype/mkfile
+
+TARG=htmlfmt
+OFILES=\
+	main.$O\
+	html.$O\
+	util.$O\
+
+HFILES=\
+	dat.h\
+	$SYS9/sys/include/html.h\
+
+LIB=$SYS9/$systype/$objtype/lib/libbio.a\
+	$SYS9/$systype/$objtype/lib/libregexp.a\
+	$SYS9/$systype/$objtype/lib/libhtml.a\
+	$SYS9/$systype/$objtype/lib/lib9c.a
+
+BIN=$SYS9/$systype/$objtype/bin
+
+UPDATE=\
+	mkfile\
+	$HFILES\
+	${OFILES:%.$O=%.c}
+
+<$SYS9/sys/src/cmd/mkone
+
+CFLAGS=$CFLAGS
+
+#$O.out: $OFILES
+#	$LD -o $target $LDFLAGS $OFILES
diff --git a/src/cmd/htmlfmt/util.c b/src/cmd/htmlfmt/util.c
new file mode 100644
index 00000000..b22b0ab5
--- /dev/null
+++ b/src/cmd/htmlfmt/util.c
@@ -0,0 +1,137 @@
+#include <u.h>
+#include <libc.h>
+#include <bio.h>
+#include <draw.h>
+#include <html.h>
+#include "dat.h"
+
+/* malloc n zeroed bytes; exits via error() on failure. */
+void*
+emalloc(ulong n)
+{
+	void *p;
+
+	p = malloc(n);
+	if(p == nil)
+		error("can't malloc: %r");
+	memset(p, 0, n);
+	return p;
+}
+
+/* realloc p to n bytes; exits via error() on failure. */
+void*
+erealloc(void *p, ulong n)
+{
+	p = realloc(p, n);
+	if(p == nil)
+		error("can't realloc: %r");
+	return p;
+}
+
+/* Return a freshly allocated copy of s. */
+char*
+estrdup(char *s)
+{
+	char *t;
+
+	t = emalloc(strlen(s)+1);
+	strcpy(t, s);
+	return t;
+}
+
+/* Return a freshly allocated concatenation of s and t. */
+char*
+estrstrdup(char *s, char *t)
+{
+	long ns, nt;
+	char *u;
+
+	ns = strlen(s);
+	nt = strlen(t);
+	/* use malloc to avoid memset */
+	u = malloc(ns+nt+1);
+	if(u == nil)
+		error("can't malloc: %r");
+	memmove(u, s, ns);
+	memmove(u+ns, t, nt);
+	u[ns+nt] = '\0';
+	return u;
+}
+
+/*
+ * Return a freshly allocated s+sep+t (t may be nil, meaning "").
+ * Frees s; sep and t are left alone.
+ */
+char*
+eappend(char *s, char *sep, char *t)
+{
+	long ns, nsep, nt;
+	char *u;
+
+	if(t == nil)
+		u = estrstrdup(s, sep);
+	else{
+		ns = strlen(s);
+		nsep = strlen(sep);
+		nt = strlen(t);
+		/* use malloc to avoid memset */
+		u = malloc(ns+nsep+nt+1);
+		if(u == nil)
+			error("can't malloc: %r");
+		memmove(u, s, ns);
+		memmove(u+ns, sep, nsep);
+		memmove(u+ns+nsep, t, nt);
+		u[ns+nsep+nt] = '\0';
+	}
+	free(s);
+	return u;
+}
+
+/* Like eappend, but frees t as well as s. */
+char*
+egrow(char *s, char *sep, char *t)
+{
+	s = eappend(s, sep, t);
+	free(t);
+	return s;
+}
+
+/*
+ * Print "htmlfmt: ", the formatted message and a newline on fd 2,
+ * then exit with the unformatted fmt as the exit status.
+ */
+void
+error(char *fmt, ...)
+{
+	va_list arg;
+	char buf[256];
+	Fmt f;
+
+	fmtfdinit(&f, 2, buf, sizeof buf);
+	fmtprint(&f, "htmlfmt: ");
+	va_start(arg, fmt);
+	fmtvprint(&f, fmt, arg);
+	va_end(arg);
+	fmtprint(&f, "\n");
+	fmtfdflush(&f);
+	exits(fmt);
+}
+
+/*
+ * Append the ns bytes at s to b, growing the buffer as needed and
+ * keeping it NUL-terminated (the NUL is not counted in b->n).
+ */
+void
+growbytes(Bytes *b, char *s, long ns)
+{
+	if(b->nalloc < b->n + ns + 1){
+		b->nalloc = b->n + ns + 8000;
+		/* use realloc to avoid memset */
+		b->b = realloc(b->b, b->nalloc);
+		if(b->b == nil)
+			error("growbytes: can't realloc: %r");
+	}
+	memmove(b->b+b->n, s, ns);
+	b->n += ns;
+	b->b[b->n] = '\0';
+}