1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
|
package main
import (
"io"
"strconv"
"strings"
"time"
"golang.org/x/net/html"
)
// parse reads an entire HTML page from r (for example an http.Response.Body)
// and collects its Open Graph metadata and its posts into a channel.
//
// Title and Description are taken from the content attribute of <meta>
// elements whose property attribute is "og:title" or "og:description",
// regardless of the order in which the two attributes appear. Every <div>
// whose class attribute contains "userContentWrapper" yields one post, which
// is filled in by parseNode and appended to Items in document order.
//
// If the HTML cannot be parsed, the error from html.Parse is returned along
// with an empty, non-nil channel.
func parse(r io.Reader) (c *channel, err error) {
	c = &channel{}
	doc, err := html.Parse(r)
	if err != nil {
		return c, err
	}

	var walk func(*html.Node)
	walk = func(n *html.Node) {
		if n.Type == html.ElementNode {
			switch n.Data {
			case "meta":
				// Gather both attributes before acting on them: attribute
				// order is not guaranteed, so content may precede property.
				var prop, content string
				hasContent := false
				for _, a := range n.Attr {
					switch a.Key {
					case "property":
						prop = a.Val
					case "content":
						content = a.Val
						hasContent = true
					}
				}
				if hasContent {
					switch prop {
					case "og:title":
						c.Title = content
					case "og:description":
						c.Description = content
					}
				}
			case "div":
				for _, a := range n.Attr {
					if a.Key == "class" && strings.Contains(a.Val, "userContentWrapper") {
						var p post
						parseNode(n, &p)
						c.Items = append(c.Items, &p)
						break
					}
				}
			}
		}

		// Keep descending, including below a matched div, so that metadata
		// and further wrapper divs anywhere in the tree are found.
		for child := n.FirstChild; child != nil; child = child.NextSibling {
			walk(child)
		}
	}
	walk(doc)
	return c, nil
}
// parseNode walks the subtree rooted at n and fills in p.
//
// It sets p.Time from the data-utime attribute of an <abbr> element and
// appends text to p.Content: the text of each <p> element (collected by
// parseParagraph) followed by a blank line, and every other text node followed
// by a newline. The following subtrees are skipped entirely: an <abbr>
// carrying data-utime, any <h5>, and any <span> whose class is exactly
// "text_exposed_hide" or whose aria-hidden attribute is "true".
func parseNode(n *html.Node, p *post) {
	if n.Type == html.ElementNode {
		switch n.Data {
		case "abbr":
			// Fetch the time stamp from the header; data-utime is read as
			// Unix seconds.
			for _, attr := range n.Attr {
				if attr.Key != "data-utime" {
					continue
				}
				// Only store a well-formed value. On a parse error p.Time is
				// left untouched instead of being set to a bogus instant
				// (ParseInt yields 0 or a clamped extreme on failure).
				if unix, err := strconv.ParseInt(attr.Val, 10, 64); err == nil {
					p.Time = time.Unix(unix, 0)
				}
				return
			}
		case "h5":
			// Skip the post header (name of group and check-in).
			return
		case "span":
			// Skip "show more" and aria-hidden stuff.
			for _, attr := range n.Attr {
				if attr.Key == "class" && attr.Val == "text_exposed_hide" {
					return
				}
				if attr.Key == "aria-hidden" && attr.Val == "true" {
					return
				}
			}
		case "p":
			// Paragraph text is concatenated without per-node newlines and
			// terminated by a blank line.
			parseParagraph(n, p)
			p.Content += "\n\n"
			return
		}
	}

	// Collect pure text found outside proper post paragraphs.
	if n.Type == html.TextNode {
		p.Content += n.Data + "\n"
	}

	// Descend into the children.
	for child := n.FirstChild; child != nil; child = child.NextSibling {
		parseNode(child, p)
	}
}
// parseParagraph appends the data of every text node in the subtree rooted at
// n to p.Content, visiting n first and then its children from first to last.
// No separators are inserted between the pieces.
func parseParagraph(n *html.Node, p *post) {
	switch n.Type {
	case html.TextNode:
		p.Content = p.Content + n.Data
	}

	// Recurse into each child in sibling order.
	child := n.FirstChild
	for child != nil {
		parseParagraph(child, p)
		child = child.NextSibling
	}
}
|