package main import ( "io" "net/url" "strconv" "strings" "time" "golang.org/x/net/html" ) // parse takes an io.Reader which is supposed to be an entire web page (like http.Response.Body) // and returns the separate posts found inside. func parse(r io.Reader) (c *channel, err error) { c = &channel{} doc, err := html.Parse(r) if err != nil { return } var f func(*html.Node) f = func(n *html.Node) { if n.Type == html.ElementNode && n.Data == "meta" { var prop string for _, a := range n.Attr { if a.Key == "property" { prop = a.Val } if a.Key == "content" { switch prop { case "og:title": c.Title = a.Val case "og:description": c.Description = a.Val } } } } if n.Type == html.ElementNode && n.Data == "div" { for _, a := range n.Attr { if a.Key == "class" && strings.Contains(a.Val, "userContentWrapper") { var p post parseNode(n, &p) c.Items = append(c.Items, &p) break } } } // loop further down for c := n.FirstChild; c != nil; c = c.NextSibling { f(c) } } f(doc) return } func parseNode(n *html.Node, p *post) { if n.Type == html.ElementNode { switch n.Data { // fetch time stamp from header case "abbr": for _, attr := range n.Attr { if attr.Key == "data-utime" { unix, _ := strconv.ParseInt(attr.Val, 10, 64) p.Time = time.Unix(unix, 0) return } } // skip post header with name of group and check-in case "h5": return // skip "show more", aria-hidden and accessible stuff case "span": fallthrough case "a": fallthrough case "div": for _, attr := range n.Attr { if attr.Key == "class" && attr.Val == "text_exposed_hide" { return } if attr.Key == "class" && strings.Contains(attr.Val, "accessible_elem") { return } if attr.Key == "aria-hidden" && attr.Val == "true" { return } } // parse paragraphs case "p": parseParagraph(n, p) p.Content += "\n\n" return // get images case "img": parseImage(n, p) return } } // fetch all pure text elements outside proper post paragraphs if n.Type == html.TextNode { p.Content += n.Data if n.Parent.Type == html.ElementNode && n.Parent.Data == "a" { parseLink(n.Parent, p) } p.Content += "\n" } // loop deeper for c := n.FirstChild; c != nil; c = c.NextSibling { parseNode(c, p) } } func parseParagraph(n *html.Node, p *post) { // fetch all pure text elements if n.Type == html.TextNode { p.Content += n.Data if n.Parent.Type == html.ElementNode && n.Parent.Data == "a" { parseLink(n.Parent, p) return } } // normal linebreak within paragraphs if n.Type == html.ElementNode && n.Data == "br" { p.Content += "\n" return } // loop deeper for c := n.FirstChild; c != nil; c = c.NextSibling { parseParagraph(c, p) } } func parseImage(n *html.Node, p *post) { img := &image{} for _, attr := range n.Attr { switch attr.Key { case "src": img.Source = attr.Val case "alt": fallthrough case "title": fallthrough case "aria-label": img.Caption = attr.Val } } p.Images = append(p.Images, img) } func parseLink(n *html.Node, p *post) { for _, attr := range n.Attr { switch attr.Key { case "href": urlraw, _ := url.PathUnescape(attr.Val) url, _ := url.Parse(urlraw) external := url.Query().Get("u") if external != "" { p.Content += " [" + external + "]" } else { base := strings.Split(urlraw, "?")[0] if base[0] == '/' { base = "https://www.facebook.com" + base } p.Content += " [" + base + "]" } } } }