From 7cf289ca89a7416999ae02330236042b0d37e3db Mon Sep 17 00:00:00 2001 From: wkj Date: Tue, 6 Apr 2004 19:06:52 +0000 Subject: Import version of libhtml that might actually work with ANSI C. --- src/cmd/htmlfmt/dat.h | 50 ++++++++ src/cmd/htmlfmt/html.c | 331 +++++++++++++++++++++++++++++++++++++++++++++++++ src/cmd/htmlfmt/main.c | 71 +++++++++++ src/cmd/htmlfmt/mkfile | 30 +++++ src/cmd/htmlfmt/util.c | 120 ++++++++++++++++++ 5 files changed, 602 insertions(+) create mode 100644 src/cmd/htmlfmt/dat.h create mode 100644 src/cmd/htmlfmt/html.c create mode 100644 src/cmd/htmlfmt/main.c create mode 100644 src/cmd/htmlfmt/mkfile create mode 100644 src/cmd/htmlfmt/util.c (limited to 'src/cmd/htmlfmt') diff --git a/src/cmd/htmlfmt/dat.h b/src/cmd/htmlfmt/dat.h new file mode 100644 index 00000000..f3b05605 --- /dev/null +++ b/src/cmd/htmlfmt/dat.h @@ -0,0 +1,50 @@ +typedef struct Bytes Bytes; +typedef struct URLwin URLwin; + +enum +{ + STACK = 8192, + EVENTSIZE = 256, +}; + +struct Bytes +{ + uchar *b; + long n; + long nalloc; +}; + +struct URLwin +{ + int infd; + int outfd; + int type; + + char *url; + Item *items; + Docinfo *docinfo; +}; + +extern char* url; +extern int aflag; +extern int width; +extern int defcharset; + +extern char* loadhtml(int); + +extern char* readfile(char*, char*, int*); +extern int charset(char*); +extern void* emalloc(ulong); +extern char* estrdup(char*); +extern char* estrstrdup(char*, char*); +extern char* egrow(char*, char*, char*); +extern char* eappend(char*, char*, char*); +extern void error(char*, ...); + +extern void growbytes(Bytes*, char*, long); + +extern void rendertext(URLwin*, Bytes*); +extern void rerender(URLwin*); +extern void freeurlwin(URLwin*); + +#pragma varargck argpos error 1 diff --git a/src/cmd/htmlfmt/html.c b/src/cmd/htmlfmt/html.c new file mode 100644 index 00000000..4f2e436f --- /dev/null +++ b/src/cmd/htmlfmt/html.c @@ -0,0 +1,331 @@ +#include +#include +#include +#include +#include +#include +#include +#include "dat.h" + +char urlexpr[] = "^(https?|ftp|file|gopher|mailto|news|nntp|telnet|wais|prospero)://([a-zA-Z0-9_@\\-]+([.:][a-zA-Z0-9_@\\-]+)*)"; +Reprog *urlprog; + +int inword = 0; +int col = 0; +int wordi = 0; + +char* +loadhtml(int fd) +{ + URLwin *u; + Bytes *b; + int n; + char buf[4096]; + + u = emalloc(sizeof(URLwin)); + u->infd = fd; + u->outfd = 1; + u->url = estrdup(url); + u->type = TextHtml; + + b = emalloc(sizeof(Bytes)); + while((n = read(fd, buf, sizeof buf)) > 0) + growbytes(b, buf, n); + if(b->b == nil) + return nil; /* empty file */ + rendertext(u, b); + freeurlwin(u); + return nil; +} + +char* +runetobyte(Rune *r, int n) +{ + char *s; + + if(n == 0) + return emalloc(1); + s = smprint("%.*S", n, r); + if(s == nil) + error("malloc failed"); + return s; +} + +int +closingpunct(int c) +{ + return strchr(".,:;'\")]}>!?", c) != nil; +} + +void +emitword(Bytes *b, Rune *r, int nr) +{ + char *s; + int space; + + if(nr == 0) + return; + s = smprint("%.*S", nr, r); + space = (b->n>0) && !isspace(b->b[b->n-1]) && !closingpunct(r[0]); + if(col>0 && col+space+nr > width){ + growbytes(b, "\n", 1); + space = 0; + col = 0; + } + if(space && col>0){ + growbytes(b, " ", 1); + col++; + } + growbytes(b, s, strlen(s)); + col += nr; + free(s); + inword = 0; +} + +void +renderrunes(Bytes *b, Rune *r) +{ + int i, n; + + n = runestrlen(r); + for(i=0; in == 0) + break; /* don't start with blank lines */ + if(b->n<2 || b->b[b->n-1]!='\n' || b->b[b->n-2]!='\n') + growbytes(b, "\n", 1); + break; + case ' ': + if(inword) + emitword(b, r+wordi, i-wordi); + break; + default: + if(!inword) + wordi = i; + inword = 1; + break; + } + } + if(inword) + emitword(b, r+wordi, i-wordi); +} + +void +renderbytes(Bytes *b, char *fmt, ...) +{ + Rune *r; + va_list arg; + + va_start(arg, fmt); + r = runevsmprint(fmt, arg); + va_end(arg); + renderrunes(b, r); + free(r); +} + +char* +baseurl(char *url) +{ + char *base, *slash; + Resub rs[10]; + + if(url == nil) + return nil; + if(urlprog == nil){ + urlprog = regcomp(urlexpr); + if(urlprog == nil) + error("can't compile URL regexp"); + } + memset(rs, 0, sizeof rs); + if(regexec(urlprog, url, rs, nelem(rs)) == 0) + return nil; + base = estrdup(url); + slash = strrchr(base, '/'); + if(slash!=nil && slash>=&base[rs[0].e.p-rs[0].s.p]) + *slash = '\0'; + else + base[rs[0].e.p-rs[0].s.p] = '\0'; + return base; +} + +char* +fullurl(URLwin *u, Rune *rhref) +{ + char *base, *href, *hrefbase; + char *result; + + if(rhref == nil) + return estrdup("NULL URL"); + href = runetobyte(rhref, runestrlen(rhref)); + hrefbase = baseurl(href); + result = nil; + if(hrefbase==nil && (base = baseurl(u->url))!=nil){ + result = estrdup(base); + if(base[strlen(base)-1]!='/' && (href==nil || href[0]!='/')) + result = eappend(result, "/", ""); + free(base); + } + if(href){ + if(result) + result = eappend(result, "", href); + else + result = estrdup(href); + } + free(hrefbase); + if(result == nil) + return estrdup("***unknown***"); + return result; +} + +void +render(URLwin *u, Bytes *t, Item *items, int curanchor) +{ + Item *il; + Itext *it; + Ifloat *ifl; + Ispacer *is; + Itable *ita; + Iimage *im; + Anchor *a; + Table *tab; + Tablecell *cell; + char *href; + + inword = 0; + col = 0; + wordi = 0; + + for(il=items; il!=nil; il=il->next){ + if(il->state & IFbrk) + renderbytes(t, "\n"); + if(il->state & IFbrksp) + renderbytes(t, "\n"); + + switch(il->tag){ + case Itexttag: + it = (Itext*)il; + renderrunes(t, it->s); + break; + case Iruletag: + if(t->n>0 && t->b[t->n-1]!='\n') + renderbytes(t, "\n"); + renderbytes(t, "=======\n"); + break; + case Iimagetag: + if(!aflag) + break; + im = (Iimage*)il; + if(im->imsrc){ + href = fullurl(u, im->imsrc); + renderbytes(t, "[image %s]", href); + free(href); + } + break; + case Iformfieldtag: + if(aflag) + renderbytes(t, "[formfield]"); + break; + case Itabletag: + ita = (Itable*)il; + tab = ita->table; + for(cell=tab->cells; cell!=nil; cell=cell->next){ + render(u, t, cell->content, curanchor); + } + if(t->n>0 && t->b[t->n-1]!='\n') + renderbytes(t, "\n"); + break; + case Ifloattag: + ifl = (Ifloat*)il; + render(u, t, ifl->item, curanchor); + break; + case Ispacertag: + is = (Ispacer*)il; + if(is->spkind != ISPnull) + renderbytes(t, " "); + break; + default: + error("unknown item tag %d\n", il->tag); + } + if(il->anchorid != 0 && il->anchorid!=curanchor){ + for(a=u->docinfo->anchors; a!=nil; a=a->next) + if(aflag && a->index == il->anchorid){ + href = fullurl(u, a->href); + renderbytes(t, "[%s]", href); + free(href); + break; + } + curanchor = il->anchorid; + } + } + if(t->n>0 && t->b[t->n-1]!='\n') + renderbytes(t, "\n"); +} + +void +rerender(URLwin *u) +{ + Bytes *t; + + t = emalloc(sizeof(Bytes)); + + render(u, t, u->items, 0); + + if(t->n) + write(u->outfd, (char*)t->b, t->n); + free(t->b); + free(t); +} + +/* + * Somewhat of a hack. Not a full parse, just looks for strings in the beginning + * of the document (cistrstr only looks at first somewhat bytes). + */ +int +charset(char *s) +{ + char *meta, *emeta, *charset; + + if(defcharset == 0) + defcharset = ISO_8859_1; + meta = cistrstr(s, "url, strlen(u->url), ISO_8859_1); + u->items = parsehtml(b->b, b->n, rurl, u->type, charset((char*)b->b), &u->docinfo); +// free(rurl); + + rerender(u); +} + + +void +freeurlwin(URLwin *u) +{ + freeitems(u->items); + u->items = nil; + freedocinfo(u->docinfo); + u->docinfo = nil; + free(u); +} diff --git a/src/cmd/htmlfmt/main.c b/src/cmd/htmlfmt/main.c new file mode 100644 index 00000000..f85bbb48 --- /dev/null +++ b/src/cmd/htmlfmt/main.c @@ -0,0 +1,71 @@ +#include +#include +#include +#include +#include +#include "dat.h" + +char *url = ""; +int aflag; +int width = 70; +int defcharset; + +void +usage(void) +{ + fprint(2, "usage: htmlfmt [-c charset] [-u URL] [-a] [-l length] [file ...]\n"); + exits("usage"); +} + +void +main(int argc, char *argv[]) +{ + int i, fd; + char *p, *err, *file; + char errbuf[ERRMAX]; + + ARGBEGIN{ + case 'a': + aflag++; + break; + case 'c': + p = smprint("", EARGF(usage())); + defcharset = charset(p); + free(p); + break; + case 'l': case 'w': + err = EARGF(usage()); + width = atoi(err); + if(width <= 0) + usage(); + break; + case 'u': + url = EARGF(usage()); + aflag++; + break; + default: + usage(); + }ARGEND + + err = nil; + file = ""; + if(argc == 0) + err = loadhtml(0); + else + for(i=0; err==nil && i +#include +#include +#include +#include +#include "dat.h" + +void* +emalloc(ulong n) +{ + void *p; + + p = malloc(n); + if(p == nil) + error("can't malloc: %r"); + memset(p, 0, n); + return p; +} + +void* +erealloc(void *p, ulong n) +{ + p = realloc(p, n); + if(p == nil) + error("can't malloc: %r"); + return p; +} + +char* +estrdup(char *s) +{ + char *t; + + t = emalloc(strlen(s)+1); + strcpy(t, s); + return t; +} + +char* +estrstrdup(char *s, char *t) +{ + long ns, nt; + char *u; + + ns = strlen(s); + nt = strlen(t); + /* use malloc to avoid memset */ + u = malloc(ns+nt+1); + if(u == nil) + error("can't malloc: %r"); + memmove(u, s, ns); + memmove(u+ns, t, nt); + u[ns+nt] = '\0'; + return u; +} + +char* +eappend(char *s, char *sep, char *t) +{ + long ns, nsep, nt; + char *u; + + if(t == nil) + u = estrstrdup(s, sep); + else{ + ns = strlen(s); + nsep = strlen(sep); + nt = strlen(t); + /* use malloc to avoid memset */ + u = malloc(ns+nsep+nt+1); + if(u == nil) + error("can't malloc: %r"); + memmove(u, s, ns); + memmove(u+ns, sep, nsep); + memmove(u+ns+nsep, t, nt); + u[ns+nsep+nt] = '\0'; + } + free(s); + return u; +} + +char* +egrow(char *s, char *sep, char *t) +{ + s = eappend(s, sep, t); + free(t); + return s; +} + +void +error(char *fmt, ...) +{ + va_list arg; + char buf[256]; + Fmt f; + + fmtfdinit(&f, 2, buf, sizeof buf); + fmtprint(&f, "Mail: "); + va_start(arg, fmt); + fmtvprint(&f, fmt, arg); + va_end(arg); + fmtprint(&f, "\n"); + fmtfdflush(&f); + exits(fmt); +} + +void +growbytes(Bytes *b, char *s, long ns) +{ + if(b->nalloc < b->n + ns + 1){ + b->nalloc = b->n + ns + 8000; + /* use realloc to avoid memset */ + b->b = realloc(b->b, b->nalloc); + if(b->b == nil) + error("growbytes: can't realloc: %r"); + } + memmove(b->b+b->n, s, ns); + b->n += ns; + b->b[b->n] = '\0'; +} -- cgit v1.2.3