// parser.go

package main

import (
	"io"
	"strconv"
	"strings"
	"time"

	"golang.org/x/net/html"
)

// parse reads an entire HTML page from r (for example an http.Response.Body)
// and collects the channel metadata and the posts found in it.
//
// The channel title and description come from <meta> elements whose
// "property" attribute is "og:title" or "og:description"; the value is taken
// from the element's "content" attribute, regardless of the order in which the
// two attributes appear. Every <div> whose class attribute contains
// "userContentWrapper" is handed to parseNode and appended to the channel's
// Items as one post.
//
// The returned channel is never nil. If html.Parse fails, its error is
// returned together with the still-empty channel.
func parse(r io.Reader) (c *channel, err error) {
	c = &channel{}
	doc, err := html.Parse(r)
	if err != nil {
		return c, err
	}

	var walk func(*html.Node)
	walk = func(n *html.Node) {
		if n.Type == html.ElementNode {
			switch n.Data {
			case "meta":
				// Gather both attributes first so that the result does not
				// depend on whether "content" precedes "property".
				var prop, content string
				var hasContent bool
				for _, a := range n.Attr {
					switch a.Key {
					case "property":
						prop = a.Val
					case "content":
						content = a.Val
						hasContent = true
					}
				}
				if hasContent {
					switch prop {
					case "og:title":
						c.Title = content
					case "og:description":
						c.Description = content
					}
				}

			case "div":
				// A post is recognised by its wrapper class (an earlier
				// version matched "text_exposed_root" instead).
				for _, a := range n.Attr {
					if a.Key == "class" && strings.Contains(a.Val, "userContentWrapper") {
						var p post
						parseNode(n, &p)
						c.Items = append(c.Items, &p)
						break
					}
				}
			}
		}

		// Keep descending; the children of a post wrapper are visited too.
		for child := n.FirstChild; child != nil; child = child.NextSibling {
			walk(child)
		}
	}
	walk(doc)

	return c, nil
}

// parseNode walks the subtree rooted at n and fills p from it.
//
// Element handling:
//   - <abbr> with a "data-utime" attribute: the value is parsed as a base-10
//     Unix timestamp in seconds and stored in p.Time; the element's children
//     are not visited. A value that does not parse leaves p.Time untouched.
//   - <h5>: skipped entirely (post header with name of group and check-in).
//   - <span> whose class is exactly "text_exposed_hide" or that has
//     aria-hidden="true": skipped entirely ("show more" and hidden content).
//   - <p>: its text is collected by parseParagraph and followed by "\n\n";
//     the children are not visited again here.
//
// Text nodes outside of paragraphs are appended to p.Content, each followed
// by "\n". All other nodes are descended into recursively.
func parseNode(n *html.Node, p *post) {
	if n.Type == html.ElementNode {
		switch n.Data {
		case "abbr":
			// fetch time stamp from header
			for _, attr := range n.Attr {
				if attr.Key != "data-utime" {
					continue
				}
				// Only accept a well-formed timestamp: a malformed value must
				// not overwrite p.Time with the Unix epoch.
				if unix, err := strconv.ParseInt(attr.Val, 10, 64); err == nil {
					p.Time = time.Unix(unix, 0)
				}
				return
			}

		case "h5":
			// skip post header with name of group and check-in
			return

		case "span":
			// skip "show more" and aria-hidden stuff
			for _, attr := range n.Attr {
				if attr.Key == "class" && attr.Val == "text_exposed_hide" {
					return
				}
				if attr.Key == "aria-hidden" && attr.Val == "true" {
					return
				}
			}

		case "p":
			// parse paragraphs; a blank line separates them in the content
			parseParagraph(n, p)
			p.Content += "\n\n"
			return
		}
	}

	// fetch all pure text elements outside proper post paragraphs
	if n.Type == html.TextNode {
		p.Content += n.Data + "\n"
	}

	// loop deeper
	for child := n.FirstChild; child != nil; child = child.NextSibling {
		parseNode(child, p)
	}
}

// parseParagraph appends the data of every text node in the subtree rooted
// at n to p.Content, in document order and without any separator.
func parseParagraph(n *html.Node, p *post) {
	// A text node contributes its raw data as-is.
	if n.Type == html.TextNode {
		p.Content = p.Content + n.Data
	}

	// Visit the children from first to last, recursing into each one.
	child := n.FirstChild
	for child != nil {
		parseParagraph(child, p)
		child = child.NextSibling
	}
}