package main import ( "io" "strconv" "strings" "time" "golang.org/x/net/html" ) // parse takes an io.Reader which is supposed to be an entire web page (like http.Response.Body) // and returns the separate posts found inside. func parse(r io.Reader) (c *channel, err error) { c = &channel{} doc, err := html.Parse(r) if err != nil { return } var f func(*html.Node) f = func(n *html.Node) { if n.Type == html.ElementNode && n.Data == "meta" { var prop string for _, a := range n.Attr { if a.Key == "property" { prop = a.Val } if a.Key == "content" { switch prop { case "og:title": c.Title = a.Val case "og:description": c.Description = a.Val } } } } if n.Type == html.ElementNode && n.Data == "div" { for _, a := range n.Attr { if a.Key == "class" && strings.Contains(a.Val, "userContentWrapper") { var p post parseNode(n, &p) c.Items = append(c.Items, &p) break } } } // loop further down for c := n.FirstChild; c != nil; c = c.NextSibling { f(c) } } f(doc) return } func parseNode(n *html.Node, p *post) { if n.Type == html.ElementNode { switch n.Data { // fetch time stamp from header case "abbr": for _, attr := range n.Attr { if attr.Key == "data-utime" { unix, _ := strconv.ParseInt(attr.Val, 10, 64) p.Time = time.Unix(unix, 0) return } } // skip post header with name of group and check-in case "h5": return // skip "show more" and aria-hidden stuff case "span": fallthrough case "a": for _, attr := range n.Attr { if attr.Key == "class" && attr.Val == "text_exposed_hide" { return } if attr.Key == "aria-hidden" && attr.Val == "true" { return } } // parse paragraphs case "p": parseParagraph(n, p) p.Content += "\n\n" return // get images case "img": parseImage(n, p) return } } // fetch all pure text elements outside proper post paragraphs if n.Type == html.TextNode { p.Content += n.Data + "\n" } // loop deeper for c := n.FirstChild; c != nil; c = c.NextSibling { parseNode(c, p) } } func parseParagraph(n *html.Node, p *post) { // fetch all pure text elements if n.Type == html.TextNode { p.Content += n.Data } // loop deeper for c := n.FirstChild; c != nil; c = c.NextSibling { parseParagraph(c, p) } } func parseImage(n *html.Node, p *post) { img := &image{} for _, attr := range n.Attr { switch attr.Key { case "src": img.Source = attr.Val case "alt": fallthrough case "title": fallthrough case "aria-label": img.Caption = attr.Val } } p.Images = append(p.Images, img) }