aboutsummaryrefslogtreecommitdiff
path: root/src/cmd/htmlfmt
diff options
context:
space:
mode:
authorwkj <devnull@localhost>2004-04-06 19:06:52 +0000
committerwkj <devnull@localhost>2004-04-06 19:06:52 +0000
commit7cf289ca89a7416999ae02330236042b0d37e3db (patch)
tree796d1363a7a53c72c28b199758ee674f1326a510 /src/cmd/htmlfmt
parent3e3817f7c86658f60715dd93768eaf8285807985 (diff)
downloadplan9port-7cf289ca89a7416999ae02330236042b0d37e3db.tar.gz
plan9port-7cf289ca89a7416999ae02330236042b0d37e3db.tar.bz2
plan9port-7cf289ca89a7416999ae02330236042b0d37e3db.zip
Import version of libhtml that might actually work with ANSI C.
Diffstat (limited to 'src/cmd/htmlfmt')
-rw-r--r--src/cmd/htmlfmt/dat.h50
-rw-r--r--src/cmd/htmlfmt/html.c331
-rw-r--r--src/cmd/htmlfmt/main.c71
-rw-r--r--src/cmd/htmlfmt/mkfile30
-rw-r--r--src/cmd/htmlfmt/util.c120
5 files changed, 602 insertions, 0 deletions
diff --git a/src/cmd/htmlfmt/dat.h b/src/cmd/htmlfmt/dat.h
new file mode 100644
index 00000000..f3b05605
--- /dev/null
+++ b/src/cmd/htmlfmt/dat.h
@@ -0,0 +1,50 @@
+typedef struct Bytes Bytes;
+typedef struct URLwin URLwin;
+
+/* Size constants.  Neither name is referenced by html.c, main.c or util.c in this change. */
+enum
+{
+ STACK = 8192,
+ EVENTSIZE = 256,
+};
+
+/* Growable byte buffer; growbytes() appends to it. */
+struct Bytes
+{
+ uchar *b; /* data; growbytes() keeps b[n] set to NUL */
+ long n; /* number of bytes in use */
+ long nalloc; /* number of bytes allocated for b */
+};
+
+/* State for one document being formatted; created in loadhtml(), released by freeurlwin(). */
+struct URLwin
+{
+ int infd; /* descriptor the HTML was read from (set by loadhtml) */
+ int outfd; /* descriptor rerender() writes the formatted text to */
+ int type; /* media type handed to parsehtml(); loadhtml uses TextHtml */
+
+ char *url; /* document URL; fullurl() resolves relative links against it */
+ Item *items; /* item list returned by parsehtml() */
+ Docinfo *docinfo; /* filled in by parsehtml(); render() walks its anchors list */
+};
+
+/* Option state, defined in main.c. */
+extern char* url; /* -u: URL of the document */
+extern int aflag; /* -a (also set by -u): show anchors, images and form fields */
+extern int width; /* -l/-w: output line width */
+extern int defcharset; /* -c: charset assumed when the document names none */
+
+/* html.c: read, format and print one document. */
+extern char* loadhtml(int);
+
+/* NOTE(review): readfile is not defined in html.c, main.c or util.c as shown here. */
+extern char* readfile(char*, char*, int*);
+extern int charset(char*);
+/* util.c: allocation and string helpers that call error() instead of returning nil. */
+extern void* emalloc(ulong);
+extern char* estrdup(char*);
+extern char* estrstrdup(char*, char*);
+extern char* egrow(char*, char*, char*);
+extern char* eappend(char*, char*, char*);
+extern void error(char*, ...);
+
+extern void growbytes(Bytes*, char*, long);
+
+/* html.c: rendering entry points. */
+extern void rendertext(URLwin*, Bytes*);
+extern void rerender(URLwin*);
+extern void freeurlwin(URLwin*);
+
+/* Tell the Plan 9 compiler that error()'s first argument is a print format. */
+#pragma varargck argpos error 1
diff --git a/src/cmd/htmlfmt/html.c b/src/cmd/htmlfmt/html.c
new file mode 100644
index 00000000..4f2e436f
--- /dev/null
+++ b/src/cmd/htmlfmt/html.c
@@ -0,0 +1,331 @@
+#include <u.h>
+#include <libc.h>
+#include <bio.h>
+#include <draw.h>
+#include <regexp.h>
+#include <html.h>
+#include <ctype.h>
+#include "dat.h"
+
+/* Matches a leading scheme://host prefix; baseurl() compiles it into urlprog on first use. */
+char urlexpr[] = "^(https?|ftp|file|gopher|mailto|news|nntp|telnet|wais|prospero)://([a-zA-Z0-9_@\\-]+([.:][a-zA-Z0-9_@\\-]+)*)";
+Reprog *urlprog;
+
+/* Word-wrapping state shared by renderrunes() and emitword(); render() resets it on entry. */
+int inword = 0; /* nonzero while the scan is inside a word */
+int col = 0; /* characters emitted so far on the current output line */
+int wordi = 0; /* index in the current rune string where the pending word began */
+
+/*
+ * Read all of fd as HTML, format it and write the text to file
+ * descriptor 1.
+ *
+ * Returns nil on success (an empty input produces no output and is
+ * not an error).  If read() fails, nothing is rendered and the
+ * system error string is returned in a static buffer.
+ * The caller keeps ownership of fd.
+ */
+char*
+loadhtml(int fd)
+{
+	static char errbuf[ERRMAX];
+	URLwin *u;
+	Bytes *b;
+	int n;
+	char buf[4096];
+	char *err;
+
+	/* slurp the whole input; growbytes keeps b->b NUL-terminated */
+	b = emalloc(sizeof(Bytes));
+	while((n = read(fd, buf, sizeof buf)) > 0)
+		growbytes(b, buf, n);
+
+	err = nil;
+	if(n < 0){
+		/* report the failure rather than formatting a truncated document */
+		rerrstr(errbuf, sizeof errbuf);
+		err = errbuf;
+	}else if(b->b != nil){
+		/* b->b stays nil only if nothing was read: empty file */
+		u = emalloc(sizeof(URLwin));
+		u->infd = fd;
+		u->outfd = 1;
+		u->url = estrdup(url);
+		u->type = TextHtml;
+		rendertext(u, b);
+		freeurlwin(u);
+	}
+
+	/* the input buffer is not needed once rendering is finished */
+	free(b->b);
+	free(b);
+	return err;
+}
+
+/*
+ * Format the first n runes of r with the %S print verb into a newly
+ * allocated string, which the caller must free.  n == 0 yields an
+ * empty string.  Calls error() if the allocation fails.
+ */
+char*
+runetobyte(Rune *r, int n)
+{
+	char *out;
+
+	if(n != 0){
+		out = smprint("%.*S", n, r);
+		if(out == nil)
+			error("malloc failed");
+		return out;
+	}
+	/* emalloc zeroes its result, so one byte is the empty string */
+	return emalloc(1);
+}
+
+/*
+ * Report whether c is an ASCII closing punctuation character, one
+ * that emitword() attaches to the preceding word without a space.
+ * Returns nonzero if so, zero otherwise.
+ */
+int
+closingpunct(int c)
+{
+	/*
+	 * strchr converts its argument to char, so a rune above 0xFF could
+	 * alias an ASCII character and 0 would match the terminator.
+	 * Only genuine ASCII values can be in the set.
+	 */
+	if(c <= 0 || c >= 0x80)
+		return 0;
+	return strchr(".,:;'\")]}>!?", c) != nil;
+}
+
+/*
+ * Append the nr-rune word at r to b, wrapping at the global width.
+ *
+ * A separating space is inserted unless the buffer is empty, already
+ * ends in white space, the word starts with closing punctuation, or
+ * the word begins a line.  If the word (plus separator) would pass
+ * width on a non-empty line, a newline is emitted first.
+ * Updates col and clears inword.  Does nothing when nr is 0.
+ */
+void
+emitword(Bytes *b, Rune *r, int nr)
+{
+	char *s;
+	int space;
+
+	if(nr == 0)
+		return;
+	/* runetobyte checks the allocation; a bare smprint could hand strlen a nil */
+	s = runetobyte(r, nr);
+	space = (b->n>0) && !isspace(b->b[b->n-1]) && !closingpunct(r[0]);
+	if(col>0 && col+space+nr > width){
+		growbytes(b, "\n", 1);
+		space = 0;
+		col = 0;
+	}
+	if(space && col>0){
+		growbytes(b, " ", 1);
+		col++;
+	}
+	growbytes(b, s, strlen(s));
+	col += nr;	/* columns are counted in runes, not bytes */
+	free(s);
+	inword = 0;
+}
+
+/*
+ * Split the NUL-terminated rune string r into words at spaces and
+ * newlines and pass each word to emitword().  A newline also resets
+ * col and appends "\n" to b, except at the very start of the output
+ * or when b already ends in two newlines.
+ */
+void
+renderrunes(Bytes *b, Rune *r)
+{
+	int i, len;
+	Rune c;
+
+	len = runestrlen(r);
+	for(i = 0; i < len; i++){
+		c = r[i];
+		if(c != '\n' && c != ' '){
+			/* ordinary character: remember where the word began */
+			if(!inword)
+				wordi = i;
+			inword = 1;
+			continue;
+		}
+
+		/* separator: flush the word collected so far */
+		if(inword)
+			emitword(b, r+wordi, i-wordi);
+		if(c == ' ')
+			continue;
+
+		col = 0;
+		if(b->n == 0)
+			continue;	/* don't start with blank lines */
+		if(b->n >= 2 && b->b[b->n-1] == '\n' && b->b[b->n-2] == '\n')
+			continue;	/* at most one blank line in a row */
+		growbytes(b, "\n", 1);
+	}
+	/* flush a word that runs to the end of the string */
+	if(inword)
+		emitword(b, r+wordi, i-wordi);
+}
+
+/*
+ * Format fmt and its arguments into a rune string and feed the
+ * result through renderrunes().  Calls error() if the formatted
+ * string cannot be allocated.
+ */
+void
+renderbytes(Bytes *b, char *fmt, ...)
+{
+	Rune *r;
+	va_list arg;
+
+	va_start(arg, fmt);
+	r = runevsmprint(fmt, arg);
+	va_end(arg);
+	/* renderrunes would hand a nil result straight to runestrlen */
+	if(r == nil)
+		error("malloc failed");
+	renderrunes(b, r);
+	free(r);
+}
+
+/*
+ * Return a newly allocated copy of url cut back to its base: the
+ * text before the last '/', but never shorter than the part matched
+ * by urlexpr.  Returns nil if url is nil or urlexpr does not match.
+ * The caller frees the result.
+ */
+char*
+baseurl(char *url)
+{
+	Resub match[10];
+	char *copy, *last;
+	long prefix;
+
+	if(url == nil)
+		return nil;
+
+	/* compile the pattern the first time it is needed */
+	if(urlprog == nil){
+		urlprog = regcomp(urlexpr);
+		if(urlprog == nil)
+			error("can't compile URL regexp");
+	}
+
+	memset(match, 0, sizeof match);
+	if(regexec(urlprog, url, match, nelem(match)) == 0)
+		return nil;
+
+	/* length of the whole match, i.e. the scheme://host part */
+	prefix = match[0].e.p - match[0].s.p;
+	copy = estrdup(url);
+	last = strrchr(copy, '/');
+	if(last != nil && last - copy >= prefix)
+		*last = '\0';
+	else
+		copy[prefix] = '\0';
+	return copy;
+}
+
+/*
+ * Build a printable URL for the link rhref found in document u.
+ *
+ * If rhref itself matches urlexpr it is returned unchanged.
+ * Otherwise, when the document URL has a base, the result is that
+ * base joined to rhref, with a '/' inserted if neither side has one.
+ * A nil rhref yields the string "NULL URL".
+ * The result is always newly allocated; the caller frees it.
+ */
+char*
+fullurl(URLwin *u, Rune *rhref)
+{
+	char *base, *href, *hrefbase;
+	char *result;
+
+	if(rhref == nil)
+		return estrdup("NULL URL");
+	/* runetobyte never returns nil: it calls error() on failure */
+	href = runetobyte(rhref, runestrlen(rhref));
+	hrefbase = baseurl(href);
+	result = nil;
+	if(hrefbase==nil && (base = baseurl(u->url))!=nil){
+		result = estrdup(base);
+		if(base[strlen(base)-1]!='/' && href[0]!='/')
+			result = eappend(result, "/", "");
+		free(base);
+	}
+	if(result)
+		result = eappend(result, "", href);
+	else
+		result = estrdup(href);
+	free(hrefbase);
+	/* href has been copied into result in both branches */
+	free(href);
+	return result;
+}
+
+/*
+ * Append a plain-text rendering of the item list `items' to t.
+ *
+ * u supplies the document URL and anchor table; curanchor is the id
+ * of the anchor whose link was most recently printed (0 for none),
+ * so one anchor spanning several items is printed once.  Calls
+ * itself for table cells and floats.  Links, images and form fields
+ * are printed only when aflag is set.
+ */
+void
+render(URLwin *u, Bytes *t, Item *items, int curanchor)
+{
+ Item *il;
+ Itext *it;
+ Ifloat *ifl;
+ Ispacer *is;
+ Itable *ita;
+ Iimage *im;
+ Anchor *a;
+ Table *tab;
+ Tablecell *cell;
+ char *href;
+
+ /* start with clean word-wrapping state; recursive calls reset it as well */
+ inword = 0;
+ col = 0;
+ wordi = 0;
+
+ for(il=items; il!=nil; il=il->next){
+ /* each break flag requests a newline; renderrunes limits runs of them */
+ if(il->state & IFbrk)
+ renderbytes(t, "\n");
+ if(il->state & IFbrksp)
+ renderbytes(t, "\n");
+
+ switch(il->tag){
+ case Itexttag:
+ it = (Itext*)il;
+ renderrunes(t, it->s);
+ break;
+ case Iruletag:
+ /* a rule is drawn on a line of its own */
+ if(t->n>0 && t->b[t->n-1]!='\n')
+ renderbytes(t, "\n");
+ renderbytes(t, "=======\n");
+ break;
+ case Iimagetag:
+ if(!aflag)
+ break;
+ im = (Iimage*)il;
+ if(im->imsrc){
+ href = fullurl(u, im->imsrc);
+ renderbytes(t, "[image %s]", href);
+ free(href);
+ }
+ break;
+ case Iformfieldtag:
+ if(aflag)
+ renderbytes(t, "[formfield]");
+ break;
+ case Itabletag:
+ /* tables are flattened: each cell's content is rendered in list order */
+ ita = (Itable*)il;
+ tab = ita->table;
+ for(cell=tab->cells; cell!=nil; cell=cell->next){
+ render(u, t, cell->content, curanchor);
+ }
+ if(t->n>0 && t->b[t->n-1]!='\n')
+ renderbytes(t, "\n");
+ break;
+ case Ifloattag:
+ ifl = (Ifloat*)il;
+ render(u, t, ifl->item, curanchor);
+ break;
+ case Ispacertag:
+ /* NOTE(review): renderrunes treats a lone space only as a word separator */
+ is = (Ispacer*)il;
+ if(is->spkind != ISPnull)
+ renderbytes(t, " ");
+ break;
+ default:
+ error("unknown item tag %d\n", il->tag);
+ }
+ /* on entering a different anchor, print its target after the item */
+ if(il->anchorid != 0 && il->anchorid!=curanchor){
+ for(a=u->docinfo->anchors; a!=nil; a=a->next)
+ if(aflag && a->index == il->anchorid){
+ href = fullurl(u, a->href);
+ renderbytes(t, "[%s]", href);
+ free(href);
+ break;
+ }
+ curanchor = il->anchorid;
+ }
+ }
+ /* make sure non-empty output ends with a newline */
+ if(t->n>0 && t->b[t->n-1]!='\n')
+ renderbytes(t, "\n");
+}
+
+/*
+ * Render u->items into a temporary buffer and write any resulting
+ * text to u->outfd in a single write.
+ */
+void
+rerender(URLwin *u)
+{
+	Bytes *out;
+
+	out = emalloc(sizeof(Bytes));
+	render(u, out, u->items, 0);
+	if(out->n != 0)
+		write(u->outfd, (char*)out->b, out->n);
+	free(out->b);
+	free(out);
+}
+
+/*
+ * Somewhat of a hack.  Not a full parse: guess the character set of
+ * the NUL-terminated HTML text s.
+ *
+ * Returns UTF_8 if s contains "<meta" and a "charset=" whose value,
+ * optionally quoted, begins with utf-8 or utf8 (case-insensitive).
+ * Otherwise returns defcharset, which is first set to ISO_8859_1 if
+ * it is still 0.
+ */
+int
+charset(char *s)
+{
+	char *cs;
+
+	if(defcharset == 0)
+		defcharset = ISO_8859_1;
+	if(cistrstr(s, "<meta") == nil)
+		return defcharset;
+	cs = cistrstr(s, "charset=");
+	if(cs == nil)
+		return defcharset;
+	cs += 8;	/* skip over "charset=" */
+	if(*cs == '"' || *cs == '\'')
+		cs++;
+	/* cistrncmp returns 0 on a match, so compare explicitly against 0 */
+	if(cistrncmp(cs, "utf-8", 5) == 0 || cistrncmp(cs, "utf8", 4) == 0)
+		return UTF_8;
+	return defcharset;
+}
+
+/*
+ * Parse the HTML held in b, storing the item list and document info
+ * in u, then print the formatted text with rerender().
+ */
+void
+rendertext(URLwin *u, Bytes *b)
+{
+	Rune *src;
+	int cs;
+
+	src = toStr((uchar*)u->url, strlen(u->url), ISO_8859_1);
+	cs = charset((char*)b->b);
+	u->items = parsehtml(b->b, b->n, src, u->type, cs, &u->docinfo);
+	/* src is deliberately not freed here */
+
+	rerender(u);
+}
+
+
+/*
+ * Release everything owned by u: the parsed items, the document
+ * info, the URL string and u itself.  u must not be used afterwards.
+ */
+void
+freeurlwin(URLwin *u)
+{
+	freeitems(u->items);
+	u->items = nil;
+	freedocinfo(u->docinfo);
+	u->docinfo = nil;
+	/* url is an estrdup'ed copy made in loadhtml */
+	free(u->url);
+	u->url = nil;
+	free(u);
+}
diff --git a/src/cmd/htmlfmt/main.c b/src/cmd/htmlfmt/main.c
new file mode 100644
index 00000000..f85bbb48
--- /dev/null
+++ b/src/cmd/htmlfmt/main.c
@@ -0,0 +1,71 @@
+#include <u.h>
+#include <libc.h>
+#include <bio.h>
+#include <draw.h>
+#include <html.h>
+#include "dat.h"
+
+/* Command-line option state; declared extern in dat.h. */
+char *url = ""; /* -u: URL of the document being formatted */
+int aflag; /* -a, also raised by -u: print link targets, images and form fields */
+int width = 70; /* -l or -w: output line width */
+int defcharset; /* -c: fallback charset; 0 until charset() picks a default */
+
+/* Print the usage line on standard error and exit with status "usage". */
+void
+usage(void)
+{
+ fprint(2, "usage: htmlfmt [-c charset] [-u URL] [-a] [-l length] [file ...]\n");
+ exits("usage");
+}
+
+/*
+ * htmlfmt: format each named HTML file, or standard input when no
+ * files are given, as plain text on standard output.  Stops at the
+ * first file that cannot be opened or processed, prints a diagnostic
+ * and exits with the error string as status.
+ */
+void
+main(int argc, char *argv[])
+{
+	int i, fd;
+	char *meta, *arg, *err, *file;
+	char errbuf[ERRMAX];
+
+	ARGBEGIN{
+	case 'a':
+		aflag++;
+		break;
+	case 'c':
+		/* wrap the name in a meta tag so charset() can interpret it */
+		meta = smprint("<meta charset=\"%s\">", EARGF(usage()));
+		defcharset = charset(meta);
+		free(meta);
+		break;
+	case 'l': case 'w':
+		arg = EARGF(usage());
+		width = atoi(arg);
+		if(width <= 0)
+			usage();
+		break;
+	case 'u':
+		url = EARGF(usage());
+		aflag++;
+		break;
+	default:
+		usage();
+	}ARGEND
+
+	err = nil;
+	file = "<stdin>";
+	if(argc == 0){
+		err = loadhtml(0);
+	}else{
+		for(i = 0; i < argc; i++){
+			file = argv[i];
+			fd = open(file, OREAD);
+			if(fd < 0){
+				errstr(errbuf, sizeof errbuf);
+				err = errbuf;
+				break;
+			}
+			err = loadhtml(fd);
+			close(fd);
+			if(err != nil)
+				break;
+		}
+	}
+	if(err != nil)
+		fprint(2, "htmlfmt: processing %s: %s\n", file, err);
+	exits(err);
+}
diff --git a/src/cmd/htmlfmt/mkfile b/src/cmd/htmlfmt/mkfile
new file mode 100644
index 00000000..5b263532
--- /dev/null
+++ b/src/cmd/htmlfmt/mkfile
@@ -0,0 +1,30 @@
+<$SYS9/$systype/$objtype/mkfile
+
+TARG=htmlfmt
+OFILES=\
+ main.$O\
+ html.$O\
+ util.$O\
+
+HFILES=\
+ dat.h\
+ $SYS9/sys/include/html.h\
+
+LIB=$SYS9/$systype/$objtype/lib/libbio.a\
+ $SYS9/$systype/$objtype/lib/libregexp.a\
+ $SYS9/$systype/$objtype/lib/libhtml.a\
+ $SYS9/$systype/$objtype/lib/lib9c.a
+
+BIN=$SYS9/$systype/$objtype/bin
+
+UPDATE=\
+ mkfile\
+ $HFILES\
+ ${OFILES:%.$O=%.c}
+
+<$SYS9/sys/src/cmd/mkone
+
+CFLAGS=$CFLAGS
+
+#$O.out: $OFILES
+# $LD -o $target $LDFLAGS $OFILES
diff --git a/src/cmd/htmlfmt/util.c b/src/cmd/htmlfmt/util.c
new file mode 100644
index 00000000..b22b0ab5
--- /dev/null
+++ b/src/cmd/htmlfmt/util.c
@@ -0,0 +1,120 @@
+#include <u.h>
+#include <libc.h>
+#include <bio.h>
+#include <draw.h>
+#include <html.h>
+#include "dat.h"
+
+/*
+ * Allocate n bytes of zeroed memory.  Calls error(), which exits,
+ * if the allocation fails.
+ */
+void*
+emalloc(ulong n)
+{
+	void *mem;
+
+	if((mem = malloc(n)) == nil)
+		error("can't malloc: %r");
+	memset(mem, 0, n);
+	return mem;
+}
+
+/*
+ * Resize the allocation p to n bytes and return the new pointer.
+ * Calls error(), which exits, if realloc returns nil.
+ */
+void*
+erealloc(void *p, ulong n)
+{
+	p = realloc(p, n);
+	if(p == nil)
+		error("can't realloc: %r");	/* name the call that actually failed */
+	return p;
+}
+
+/*
+ * Return a newly allocated copy of the NUL-terminated string s.
+ * Does not return if memory is exhausted (emalloc calls error()).
+ */
+char*
+estrdup(char *s)
+{
+	char *copy;
+	ulong size;
+
+	size = strlen(s) + 1;	/* include the terminating NUL */
+	copy = emalloc(size);
+	memmove(copy, s, size);
+	return copy;
+}
+
+/*
+ * Return a newly allocated string holding s followed by t.
+ * Neither argument is freed.  Calls error() if malloc fails.
+ */
+char*
+estrstrdup(char *s, char *t)
+{
+	long slen, tlen;
+	char *joined;
+
+	slen = strlen(s);
+	tlen = strlen(t);
+	/* plain malloc: every byte is overwritten below, so zeroing is wasted */
+	joined = malloc(slen + tlen + 1);
+	if(joined == nil)
+		error("can't malloc: %r");
+	memmove(joined, s, slen);
+	/* copying tlen+1 bytes brings t's terminating NUL along */
+	memmove(joined + slen, t, tlen + 1);
+	return joined;
+}
+
+/*
+ * Return a newly allocated string holding s, sep and t joined in
+ * that order; a nil t is treated as the empty string.  s is freed,
+ * sep and t are not.  Calls error() if malloc fails.
+ */
+char*
+eappend(char *s, char *sep, char *t)
+{
+	long slen, seplen, tlen;
+	char *joined, *w;
+
+	if(t == nil)
+		t = "";
+	slen = strlen(s);
+	seplen = strlen(sep);
+	tlen = strlen(t);
+	/* plain malloc: every byte is overwritten below */
+	joined = malloc(slen + seplen + tlen + 1);
+	if(joined == nil)
+		error("can't malloc: %r");
+	w = joined;
+	memmove(w, s, slen);
+	w += slen;
+	memmove(w, sep, seplen);
+	w += seplen;
+	memmove(w, t, tlen);
+	w[tlen] = '\0';
+	free(s);
+	return joined;
+}
+
+/*
+ * Like eappend(), but also frees t: returns s, sep and t joined in
+ * a new string and releases both s and t.
+ */
+char*
+egrow(char *s, char *sep, char *t)
+{
+	char *joined;
+
+	joined = eappend(s, sep, t);
+	free(t);
+	return joined;
+}
+
+/*
+ * Print "htmlfmt: " followed by the formatted message and a newline
+ * on file descriptor 2, then exit.  Does not return.
+ * The exit status is the format string itself, not the expanded
+ * message.
+ */
+void
+error(char *fmt, ...)
+{
+	va_list arg;
+	char buf[256];
+	Fmt f;
+
+	fmtfdinit(&f, 2, buf, sizeof buf);
+	/* use this program's name, matching the diagnostic in main.c */
+	fmtprint(&f, "htmlfmt: ");
+	va_start(arg, fmt);
+	fmtvprint(&f, fmt, arg);
+	va_end(arg);
+	fmtprint(&f, "\n");
+	fmtfdflush(&f);
+	exits(fmt);
+}
+
+/*
+ * Append the ns bytes at s to b, enlarging the buffer when needed,
+ * and keep the data NUL-terminated.  Calls error() if realloc fails.
+ */
+void
+growbytes(Bytes *b, char *s, long ns)
+{
+	long need;
+
+	need = b->n + ns + 1;	/* new contents plus the trailing NUL */
+	if(need > b->nalloc){
+		/* over-allocate so that small appends rarely reallocate */
+		b->nalloc = b->n + ns + 8000;
+		/* realloc rather than emalloc: no need to zero the new space */
+		b->b = realloc(b->b, b->nalloc);
+		if(b->b == nil)
+			error("growbytes: can't realloc: %r");
+	}
+	memmove(b->b + b->n, s, ns);
+	b->n += ns;
+	b->b[b->n] = '\0';
+}