diff options
-rw-r--r-- | .gitignore | 2 | ||||
-rw-r--r-- | README.md | 14 | ||||
-rw-r--r-- | main.go | 110 | ||||
-rw-r--r-- | parser.go | 46 | ||||
-rw-r--r-- | templates.go | 78 |
5 files changed, 224 insertions, 26 deletions
@@ -1,2 +1,2 @@ .DS_Store -fbrss +fbfeed @@ -1,8 +1,18 @@ -# fbrss +# fbfeed Web scraper that presents a public Facebook group's post as an RSS -feed. +feed or HTML page. This is particular good for those of you who do not use Facebook, but still end up in social networks and local communities where gatherings and events are all organized using Facebook. + +## Usage + +Compile and run. Host it somewhere or locally. + +For HTML: `/nameOfGroup` + +For RSS: `/nameOfGroup.rss` + +`nameOfGroup` is the @-name in Facebook. @@ -3,42 +3,112 @@ package main import ( "fmt" "net/http" + "strings" "time" ) +var outputMode string + +type channel struct { + Title string + Link string + Description string + Items []*post +} + +func (c *channel) String() string { + var template string + switch outputMode { + case "html": + template = htmlRoot + case "rss": + template = rssRoot + } + + var s string + s = strings.Replace(template, "{{title}}", c.Title, -1) + s = strings.Replace(s, "{{link}}", c.Link, -1) + s = strings.Replace(s, "{{description}}", c.Description, -1) + var items string + for i := range c.Items { + item := c.Items[i].String() + item = strings.Replace(item, "{{link}}", c.Link, -1) + items += item + } + s = strings.Replace(s, "{{items}}", items, 1) + return s +} + type post struct { Time time.Time + Link string Content string } -func main() { - var groups []string - groups = append(groups, "") +func (p *post) String() string { + var template string + switch outputMode { + case "html": + template = htmlItem + case "rss": + template = rssItem + } + + var s string + // time format: Mon Jan 2 15:04:05 -0700 MST 2006 + s = strings.Replace(template, "{{time}}", p.Time.Format("Mon, 2 Jan 2006 15:04:05 MST"), 2) + s = strings.Replace(s, "{{content}}", p.Content, 1) + return s +} - for i := range groups { - url := "https://www.facebook.com/pg/" + groups[i] + "/posts/" +type handler struct{} - resp, err := http.Get(url) - if err != nil { - panic(err) - } - defer resp.Body.Close() +func (h handler) ServeHTTP(w http.ResponseWriter, r *http.Request) { + group := r.URL.Path[1:] + if strings.HasSuffix(group, ".rss") { + group = strings.TrimSuffix(group, ".rss") + outputMode = "rss" + } - ps, err := parse(resp.Body) - if err != nil { - panic(err) - } + c, err := fetch(group) + if err != nil { + http.Error(w, fmt.Sprintf("error: %s", err), 400) + return + } - fmt.Println(url) - fmt.Printf("%s", present(ps)) + if c == nil || len(c.Items) < 1 { + http.Error(w, fmt.Sprintf("%s", "group not found"), 400) + return } + + fmt.Fprintf(w, "%s\n", c.String()) +} + +func main() { + outputMode = "html" + fmt.Println("Serving: http://localhost:1212") + http.ListenAndServe(":1212", handler{}) } -func present(ps []post) (s string) { - for i := range ps { - s += ps[i].Time.String() + "\n" - s += ps[i].Content + "\n\n" +func fetch(group string) (c *channel, err error) { + if group == "" { + return } + url := "https://www.facebook.com/pg/" + group + "/posts/" + + resp, err := http.Get(url) + if err != nil { + return + } + defer resp.Body.Close() + + c, err = parse(resp.Body) + if err != nil { + return + } + + c.Link = url + return } @@ -11,7 +11,8 @@ import ( // parse takes an io.Reader which is supposed to be an entire web page (like http.Response.Body) // and returns the separate posts found inside. -func parse(r io.Reader) (ps []post, err error) { +func parse(r io.Reader) (c *channel, err error) { + c = &channel{} doc, err := html.Parse(r) if err != nil { return @@ -19,13 +20,31 @@ func parse(r io.Reader) (ps []post, err error) { var f func(*html.Node) f = func(n *html.Node) { + if n.Type == html.ElementNode && n.Data == "meta" { + var prop string + for _, a := range n.Attr { + if a.Key == "property" { + prop = a.Val + } + if a.Key == "content" { + switch prop { + case "og:title": + c.Title = a.Val + case "og:description": + c.Description = a.Val + } + } + } + + } + if n.Type == html.ElementNode && n.Data == "div" { for _, a := range n.Attr { //if a.Key == "class" && strings.Contains(a.Val, "text_exposed_root") { if a.Key == "class" && strings.Contains(a.Val, "userContentWrapper") { var p post parseNode(n, &p) - ps = append(ps, p) + c.Items = append(c.Items, &p) break } } @@ -43,6 +62,7 @@ func parse(r io.Reader) (ps []post, err error) { func parseNode(n *html.Node, p *post) { if n.Type == html.ElementNode { + // fetch time stamp from header if n.Data == "abbr" { for _, attr := range n.Attr { if attr.Key == "data-utime" { @@ -52,11 +72,31 @@ func parseNode(n *html.Node, p *post) { } } } + // skip post header with name of group and check-in + if n.Data == "h5" { + return + } + // skip "show more" and aria-hidden stuff + if n.Data == "span" { + for _, attr := range n.Attr { + if attr.Key == "class" && attr.Val == "text_exposed_hide" { + return + } + if attr.Key == "aria-hidden" && attr.Val == "true" { + return + } + } + } } + // fetch all pure text elements if n.Type == html.TextNode { + nl := "\n" // newlines + if n.Parent.Type == html.ElementNode && n.Parent.Data == "p" { + nl = "\n\n" + } if n.Data != "..." { - p.Content += n.Data + "\n" + p.Content += strings.Trim(n.Data, " ") + nl } } diff --git a/templates.go b/templates.go new file mode 100644 index 0000000..8f6453f --- /dev/null +++ b/templates.go @@ -0,0 +1,78 @@ +package main + +const rssRoot = ` +hejhej<?xml version="1.0" encoding="utf-8" standalone="yes"?> +<rss version="2.0" xmlns:atom="http://www.w3.org/2005/Atom"> + <channel> + <title>{{title}}</title> + <link>{{link}}</link> + <description>{{description}}</description> + <generator>fbfeed</generator> + {{items}} + </channel> +</rss> +` + +const rssItem = ` +<item> + <title>{{time}}</title> + <pubDate>{{time}}</pubDate> + <link>{{link}}</link> + <description>{{content}}</description> +</item> +` + +const htmlRoot = ` +<!DOCTYPE html> +<html> +<head> + <title>{{title}}· fbfeed</title> + <style type="text/css"> + body { + font-size: 14pt; + line-height: 1.2; + max-width: 1200px; + margin: 0 auto; + padding: 0.5em; + word-wrap: break-word; + } + article { + margin-bottom: 1em; + } + article datetime { + font-weight: bold; + } + article pre { + margin: 0; + padding-left: 1em; + font-family: inherit; + word-wrap: break-word; + white-space: pre-line; + } + </style> +</head> +<body> + <header> + <h1>{{title}}</h1> + <a href="{{link}}">{{link}}</a> + <p>{{description}}</p> + </header> + <hr> + <main> + {{items}} + </main> + <hr> + <footer> + <small>Generated by fbfeed</small> + </footer> +</body> + +</html> +` + +const htmlItem = ` +<article> +<datetime>{{time}}</datetime> +<pre>{{content}}</pre> +</article> +` |