aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorPetter Rodhelind <petter.rodhelind@gmail.com>2021-02-21 17:40:05 +0100
committerPetter Rodhelind <petter.rodhelind@gmail.com>2021-02-21 17:40:05 +0100
commite618c5b876155c71cf4f8aebe373fc9bc29716f9 (patch)
tree6531fb7c0fa367c6a1125e1a421f5ee90b577ee8
parentdc79f4c18a76f5f907b3061e2e08ae4923da40e2 (diff)
downloadfbfeed-e618c5b876155c71cf4f8aebe373fc9bc29716f9.tar.gz
fbfeed-e618c5b876155c71cf4f8aebe373fc9bc29716f9.tar.bz2
fbfeed-e618c5b876155c71cf4f8aebe373fc9bc29716f9.zip
RSS and HTML presentation of the feed.
-rw-r--r--.gitignore2
-rw-r--r--README.md14
-rw-r--r--main.go110
-rw-r--r--parser.go46
-rw-r--r--templates.go78
5 files changed, 224 insertions, 26 deletions
diff --git a/.gitignore b/.gitignore
index 8c047f4..f953b83 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,2 +1,2 @@
.DS_Store
-fbrss
+fbfeed
diff --git a/README.md b/README.md
index 97d30ce..17fd76b 100644
--- a/README.md
+++ b/README.md
@@ -1,8 +1,18 @@
-# fbrss
+# fbfeed
Web scraper that presents a public Facebook group's posts as an RSS
-feed.
+feed or HTML page.
This is particularly good for those of you who do not use Facebook,
but still end up in social networks and local communities where
gatherings and events are all organized using Facebook.
+
+## Usage
+
+Compile and run. Host it somewhere or locally.
+
+For HTML: `/nameOfGroup`
+
+For RSS: `/nameOfGroup.rss`
+
+`nameOfGroup` is the @-name in Facebook.
diff --git a/main.go b/main.go
index 51f17e6..0e7dbba 100644
--- a/main.go
+++ b/main.go
@@ -3,42 +3,112 @@ package main
import (
"fmt"
"net/http"
+ "strings"
"time"
)
+var outputMode string
+
+type channel struct {
+ Title string
+ Link string
+ Description string
+ Items []*post
+}
+
+func (c *channel) String() string {
+ var template string
+ switch outputMode {
+ case "html":
+ template = htmlRoot
+ case "rss":
+ template = rssRoot
+ }
+
+ var s string
+ s = strings.Replace(template, "{{title}}", c.Title, -1)
+ s = strings.Replace(s, "{{link}}", c.Link, -1)
+ s = strings.Replace(s, "{{description}}", c.Description, -1)
+ var items string
+ for i := range c.Items {
+ item := c.Items[i].String()
+ item = strings.Replace(item, "{{link}}", c.Link, -1)
+ items += item
+ }
+ s = strings.Replace(s, "{{items}}", items, 1)
+ return s
+}
+
type post struct {
Time time.Time
+ Link string
Content string
}
-func main() {
- var groups []string
- groups = append(groups, "")
+func (p *post) String() string {
+ var template string
+ switch outputMode {
+ case "html":
+ template = htmlItem
+ case "rss":
+ template = rssItem
+ }
+
+ var s string
+ // time format: Mon Jan 2 15:04:05 -0700 MST 2006
+ s = strings.Replace(template, "{{time}}", p.Time.Format("Mon, 2 Jan 2006 15:04:05 MST"), 2)
+ s = strings.Replace(s, "{{content}}", p.Content, 1)
+ return s
+}
- for i := range groups {
- url := "https://www.facebook.com/pg/" + groups[i] + "/posts/"
+type handler struct{}
- resp, err := http.Get(url)
- if err != nil {
- panic(err)
- }
- defer resp.Body.Close()
+func (h handler) ServeHTTP(w http.ResponseWriter, r *http.Request) {
+ group := r.URL.Path[1:]
+ if strings.HasSuffix(group, ".rss") {
+ group = strings.TrimSuffix(group, ".rss")
+ outputMode = "rss"
+ }
- ps, err := parse(resp.Body)
- if err != nil {
- panic(err)
- }
+ c, err := fetch(group)
+ if err != nil {
+ http.Error(w, fmt.Sprintf("error: %s", err), 400)
+ return
+ }
- fmt.Println(url)
- fmt.Printf("%s", present(ps))
+ if c == nil || len(c.Items) < 1 {
+ http.Error(w, fmt.Sprintf("%s", "group not found"), 400)
+ return
}
+
+ fmt.Fprintf(w, "%s\n", c.String())
+}
+
+func main() {
+ outputMode = "html"
+ fmt.Println("Serving: http://localhost:1212")
+ http.ListenAndServe(":1212", handler{})
}
-func present(ps []post) (s string) {
- for i := range ps {
- s += ps[i].Time.String() + "\n"
- s += ps[i].Content + "\n\n"
+func fetch(group string) (c *channel, err error) {
+ if group == "" {
+ return
}
+ url := "https://www.facebook.com/pg/" + group + "/posts/"
+
+ resp, err := http.Get(url)
+ if err != nil {
+ return
+ }
+ defer resp.Body.Close()
+
+ c, err = parse(resp.Body)
+ if err != nil {
+ return
+ }
+
+ c.Link = url
+
return
}
diff --git a/parser.go b/parser.go
index 119c689..8bae5a9 100644
--- a/parser.go
+++ b/parser.go
@@ -11,7 +11,8 @@ import (
// parse takes an io.Reader which is supposed to be an entire web page (like http.Response.Body)
// and returns the separate posts found inside.
-func parse(r io.Reader) (ps []post, err error) {
+func parse(r io.Reader) (c *channel, err error) {
+ c = &channel{}
doc, err := html.Parse(r)
if err != nil {
return
@@ -19,13 +20,31 @@ func parse(r io.Reader) (ps []post, err error) {
var f func(*html.Node)
f = func(n *html.Node) {
+ if n.Type == html.ElementNode && n.Data == "meta" {
+ var prop string
+ for _, a := range n.Attr {
+ if a.Key == "property" {
+ prop = a.Val
+ }
+ if a.Key == "content" {
+ switch prop {
+ case "og:title":
+ c.Title = a.Val
+ case "og:description":
+ c.Description = a.Val
+ }
+ }
+ }
+
+ }
+
if n.Type == html.ElementNode && n.Data == "div" {
for _, a := range n.Attr {
//if a.Key == "class" && strings.Contains(a.Val, "text_exposed_root") {
if a.Key == "class" && strings.Contains(a.Val, "userContentWrapper") {
var p post
parseNode(n, &p)
- ps = append(ps, p)
+ c.Items = append(c.Items, &p)
break
}
}
@@ -43,6 +62,7 @@ func parse(r io.Reader) (ps []post, err error) {
func parseNode(n *html.Node, p *post) {
if n.Type == html.ElementNode {
+ // fetch time stamp from header
if n.Data == "abbr" {
for _, attr := range n.Attr {
if attr.Key == "data-utime" {
@@ -52,11 +72,31 @@ func parseNode(n *html.Node, p *post) {
}
}
}
+ // skip post header with name of group and check-in
+ if n.Data == "h5" {
+ return
+ }
+ // skip "show more" and aria-hidden stuff
+ if n.Data == "span" {
+ for _, attr := range n.Attr {
+ if attr.Key == "class" && attr.Val == "text_exposed_hide" {
+ return
+ }
+ if attr.Key == "aria-hidden" && attr.Val == "true" {
+ return
+ }
+ }
+ }
}
+ // fetch all pure text elements
if n.Type == html.TextNode {
+ nl := "\n" // newlines
+ if n.Parent.Type == html.ElementNode && n.Parent.Data == "p" {
+ nl = "\n\n"
+ }
if n.Data != "..." {
- p.Content += n.Data + "\n"
+ p.Content += strings.Trim(n.Data, " ") + nl
}
}
diff --git a/templates.go b/templates.go
new file mode 100644
index 0000000..8f6453f
--- /dev/null
+++ b/templates.go
@@ -0,0 +1,78 @@
+package main
+
+const rssRoot = `
+<?xml version="1.0" encoding="utf-8" standalone="yes"?>
+<rss version="2.0" xmlns:atom="http://www.w3.org/2005/Atom">
+ <channel>
+ <title>{{title}}</title>
+ <link>{{link}}</link>
+ <description>{{description}}</description>
+ <generator>fbfeed</generator>
+ {{items}}
+ </channel>
+</rss>
+`
+
+const rssItem = `
+<item>
+ <title>{{time}}</title>
+ <pubDate>{{time}}</pubDate>
+ <link>{{link}}</link>
+ <description>{{content}}</description>
+</item>
+`
+
+const htmlRoot = `
+<!DOCTYPE html>
+<html>
+<head>
+	<title>{{title}} &middot; fbfeed</title>
+ <style type="text/css">
+ body {
+ font-size: 14pt;
+ line-height: 1.2;
+ max-width: 1200px;
+ margin: 0 auto;
+ padding: 0.5em;
+ word-wrap: break-word;
+ }
+ article {
+ margin-bottom: 1em;
+ }
+ article datetime {
+ font-weight: bold;
+ }
+ article pre {
+ margin: 0;
+ padding-left: 1em;
+ font-family: inherit;
+ word-wrap: break-word;
+ white-space: pre-line;
+ }
+ </style>
+</head>
+<body>
+ <header>
+ <h1>{{title}}</h1>
+ <a href="{{link}}">{{link}}</a>
+ <p>{{description}}</p>
+ </header>
+ <hr>
+ <main>
+ {{items}}
+ </main>
+ <hr>
+ <footer>
+ <small>Generated by fbfeed</small>
+ </footer>
+</body>
+
+</html>
+`
+
+const htmlItem = `
+<article>
+<datetime>{{time}}</datetime>
+<pre>{{content}}</pre>
+</article>
+`