diff options
Diffstat (limited to 'parser.go')
-rw-r--r-- | parser.go | 67 |
1 files changed, 67 insertions, 0 deletions
diff --git a/parser.go b/parser.go new file mode 100644 index 0000000..119c689 --- /dev/null +++ b/parser.go @@ -0,0 +1,67 @@ +package main + +import ( + "io" + "strconv" + "strings" + "time" + + "golang.org/x/net/html" +) + +// parse takes an io.Reader which is supposed to be an entire web page (like http.Response.Body) +// and returns the separate posts found inside. +func parse(r io.Reader) (ps []post, err error) { + doc, err := html.Parse(r) + if err != nil { + return + } + + var f func(*html.Node) + f = func(n *html.Node) { + if n.Type == html.ElementNode && n.Data == "div" { + for _, a := range n.Attr { + //if a.Key == "class" && strings.Contains(a.Val, "text_exposed_root") { + if a.Key == "class" && strings.Contains(a.Val, "userContentWrapper") { + var p post + parseNode(n, &p) + ps = append(ps, p) + break + } + } + } + + // loop further down + for c := n.FirstChild; c != nil; c = c.NextSibling { + f(c) + } + } + f(doc) + + return +} + +func parseNode(n *html.Node, p *post) { + if n.Type == html.ElementNode { + if n.Data == "abbr" { + for _, attr := range n.Attr { + if attr.Key == "data-utime" { + unix, _ := strconv.ParseInt(attr.Val, 10, 64) + p.Time = time.Unix(unix, 0) + return + } + } + } + } + + if n.Type == html.TextNode { + if n.Data != "..." { + p.Content += n.Data + "\n" + } + } + + // loop deeper + for c := n.FirstChild; c != nil; c = c.NextSibling { + parseNode(c, p) + } +} |