From e618c5b876155c71cf4f8aebe373fc9bc29716f9 Mon Sep 17 00:00:00 2001 From: Petter Rodhelind Date: Sun, 21 Feb 2021 17:40:05 +0100 Subject: RSS and HTML presentation of the feed. --- .gitignore | 2 +- README.md | 14 ++++++-- main.go | 110 ++++++++++++++++++++++++++++++++++++++++++++++++----------- parser.go | 46 +++++++++++++++++++++++-- templates.go | 78 ++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 224 insertions(+), 26 deletions(-) create mode 100644 templates.go diff --git a/.gitignore b/.gitignore index 8c047f4..f953b83 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,2 @@ .DS_Store -fbrss +fbfeed diff --git a/README.md b/README.md index 97d30ce..17fd76b 100644 --- a/README.md +++ b/README.md @@ -1,8 +1,18 @@ -# fbrss +# fbfeed Web scraper that presents a public Facebook group's post as an RSS -feed. +feed or HTML page. This is particular good for those of you who do not use Facebook, but still end up in social networks and local communities where gatherings and events are all organized using Facebook. + +## Usage + +Compile and run. Host it somewhere or locally. + +For HTML: `/nameOfGroup` + +For RSS: `/nameOfGroup.rss` + +`nameOfGroup` is the @-name in Facebook. diff --git a/main.go b/main.go index 51f17e6..0e7dbba 100644 --- a/main.go +++ b/main.go @@ -3,42 +3,112 @@ package main import ( "fmt" "net/http" + "strings" "time" ) +var outputMode string + +type channel struct { + Title string + Link string + Description string + Items []*post +} + +func (c *channel) String() string { + var template string + switch outputMode { + case "html": + template = htmlRoot + case "rss": + template = rssRoot + } + + var s string + s = strings.Replace(template, "{{title}}", c.Title, -1) + s = strings.Replace(s, "{{link}}", c.Link, -1) + s = strings.Replace(s, "{{description}}", c.Description, -1) + var items string + for i := range c.Items { + item := c.Items[i].String() + item = strings.Replace(item, "{{link}}", c.Link, -1) + items += item + } + s = strings.Replace(s, "{{items}}", items, 1) + return s +} + type post struct { Time time.Time + Link string Content string } -func main() { - var groups []string - groups = append(groups, "") +func (p *post) String() string { + var template string + switch outputMode { + case "html": + template = htmlItem + case "rss": + template = rssItem + } + + var s string + // time format: Mon Jan 2 15:04:05 -0700 MST 2006 + s = strings.Replace(template, "{{time}}", p.Time.Format("Mon, 2 Jan 2006 15:04:05 MST"), 2) + s = strings.Replace(s, "{{content}}", p.Content, 1) + return s +} - for i := range groups { - url := "https://www.facebook.com/pg/" + groups[i] + "/posts/" +type handler struct{} - resp, err := http.Get(url) - if err != nil { - panic(err) - } - defer resp.Body.Close() +func (h handler) ServeHTTP(w http.ResponseWriter, r *http.Request) { + group := r.URL.Path[1:] + if strings.HasSuffix(group, ".rss") { + group = strings.TrimSuffix(group, ".rss") + outputMode = "rss" + } - ps, err := parse(resp.Body) - if err != nil { - panic(err) - } + c, err := fetch(group) + if err != nil { + http.Error(w, fmt.Sprintf("error: %s", err), 400) + return + } - fmt.Println(url) - fmt.Printf("%s", present(ps)) + if c == nil || len(c.Items) < 1 { + http.Error(w, fmt.Sprintf("%s", "group not found"), 400) + return } + + fmt.Fprintf(w, "%s\n", c.String()) +} + +func main() { + outputMode = "html" + fmt.Println("Serving: http://localhost:1212") + http.ListenAndServe(":1212", handler{}) } -func present(ps []post) (s string) { - for i := range ps { - s += ps[i].Time.String() + "\n" - s += ps[i].Content + "\n\n" +func fetch(group string) (c *channel, err error) { + if group == "" { + return } + url := "https://www.facebook.com/pg/" + group + "/posts/" + + resp, err := http.Get(url) + if err != nil { + return + } + defer resp.Body.Close() + + c, err = parse(resp.Body) + if err != nil { + return + } + + c.Link = url + return } diff --git a/parser.go b/parser.go index 119c689..8bae5a9 100644 --- a/parser.go +++ b/parser.go @@ -11,7 +11,8 @@ import ( // parse takes an io.Reader which is supposed to be an entire web page (like http.Response.Body) // and returns the separate posts found inside. -func parse(r io.Reader) (ps []post, err error) { +func parse(r io.Reader) (c *channel, err error) { + c = &channel{} doc, err := html.Parse(r) if err != nil { return @@ -19,13 +20,31 @@ func parse(r io.Reader) (ps []post, err error) { var f func(*html.Node) f = func(n *html.Node) { + if n.Type == html.ElementNode && n.Data == "meta" { + var prop string + for _, a := range n.Attr { + if a.Key == "property" { + prop = a.Val + } + if a.Key == "content" { + switch prop { + case "og:title": + c.Title = a.Val + case "og:description": + c.Description = a.Val + } + } + } + + } + if n.Type == html.ElementNode && n.Data == "div" { for _, a := range n.Attr { //if a.Key == "class" && strings.Contains(a.Val, "text_exposed_root") { if a.Key == "class" && strings.Contains(a.Val, "userContentWrapper") { var p post parseNode(n, &p) - ps = append(ps, p) + c.Items = append(c.Items, &p) break } } @@ -43,6 +62,7 @@ func parse(r io.Reader) (ps []post, err error) { func parseNode(n *html.Node, p *post) { if n.Type == html.ElementNode { + // fetch time stamp from header if n.Data == "abbr" { for _, attr := range n.Attr { if attr.Key == "data-utime" { @@ -52,11 +72,31 @@ func parseNode(n *html.Node, p *post) { } } } + // skip post header with name of group and check-in + if n.Data == "h5" { + return + } + // skip "show more" and aria-hidden stuff + if n.Data == "span" { + for _, attr := range n.Attr { + if attr.Key == "class" && attr.Val == "text_exposed_hide" { + return + } + if attr.Key == "aria-hidden" && attr.Val == "true" { + return + } + } + } } + // fetch all pure text elements if n.Type == html.TextNode { + nl := "\n" // newlines + if n.Parent.Type == html.ElementNode && n.Parent.Data == "p" { + nl = "\n\n" + } if n.Data != "..." { - p.Content += n.Data + "\n" + p.Content += strings.Trim(n.Data, " ") + nl } } diff --git a/templates.go b/templates.go new file mode 100644 index 0000000..8f6453f --- /dev/null +++ b/templates.go @@ -0,0 +1,78 @@ +package main + +const rssRoot = ` +hejhej + + + {{title}} + {{link}} + {{description}} + fbfeed + {{items}} + + +` + +const rssItem = ` + + {{time}} + {{time}} + {{link}} + {{content}} + +` + +const htmlRoot = ` + + + + {{title}}· fbfeed + + + +
+

{{title}}

+ {{link}} +

{{description}}

+
+
+
+ {{items}} +
+
+ + + + +` + +const htmlItem = ` +
+{{time}} +
{{content}}
+
+` -- cgit v1.2.3