diff options
Diffstat (limited to 'parser.go')
-rw-r--r-- | parser.go | 39 |
1 files changed, 31 insertions, 8 deletions
@@ -40,7 +40,6 @@ func parse(r io.Reader) (c *channel, err error) { if n.Type == html.ElementNode && n.Data == "div" { for _, a := range n.Attr { - //if a.Key == "class" && strings.Contains(a.Val, "text_exposed_root") { if a.Key == "class" && strings.Contains(a.Val, "userContentWrapper") { var p post parseNode(n, &p) @@ -62,8 +61,9 @@ func parse(r io.Reader) (c *channel, err error) { func parseNode(n *html.Node, p *post) { if n.Type == html.ElementNode { + switch n.Data { // fetch time stamp from header - if n.Data == "abbr" { + case "abbr": for _, attr := range n.Attr { if attr.Key == "data-utime" { unix, _ := strconv.ParseInt(attr.Val, 10, 64) @@ -71,13 +71,15 @@ func parseNode(n *html.Node, p *post) { return } } - } + // skip post header with name of group and check-in - if n.Data == "h5" { + case "h5": return - } + // skip "show more" and aria-hidden stuff - if n.Data == "span" { + case "span": + fallthrough + case "a": for _, attr := range n.Attr { if attr.Key == "class" && attr.Val == "text_exposed_hide" { return @@ -86,13 +88,17 @@ func parseNode(n *html.Node, p *post) { return } } - } // parse paragraphs - if n.Data == "p" { + case "p": parseParagraph(n, p) p.Content += "\n\n" return + + // get images + case "img": + parseImage(n, p) + return } } @@ -118,3 +124,20 @@ func parseParagraph(n *html.Node, p *post) { parseParagraph(c, p) } } + +func parseImage(n *html.Node, p *post) { + img := &image{} + for _, attr := range n.Attr { + switch attr.Key { + case "src": + img.Source = attr.Val + case "alt": + fallthrough + case "title": + fallthrough + case "aria-label": + img.Caption = attr.Val + } + } + p.Images = append(p.Images, img) +} |