From 449f35d4fd50cae9b4c1fe4dd4b7003fd4e7b8e6 Mon Sep 17 00:00:00 2001 From: Petter Rodhelind Date: Mon, 22 Feb 2021 00:42:08 +0100 Subject: Parse links and get better at hiding stuff. --- parser.go | 45 +++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 43 insertions(+), 2 deletions(-) diff --git a/parser.go b/parser.go index 4d3720b..bf9e837 100644 --- a/parser.go +++ b/parser.go @@ -2,6 +2,7 @@ package main import ( "io" + "net/url" "strconv" "strings" "time" @@ -76,14 +77,20 @@ func parseNode(n *html.Node, p *post) { case "h5": return - // skip "show more" and aria-hidden stuff + // skip "show more", aria-hidden and accessible stuff case "span": fallthrough case "a": + fallthrough + case "div": for _, attr := range n.Attr { if attr.Key == "class" && attr.Val == "text_exposed_hide" { return } + if attr.Key == "class" && strings.Contains(attr.Val, "accessible_elem") { + return + } + if attr.Key == "aria-hidden" && attr.Val == "true" { return } @@ -104,7 +111,11 @@ func parseNode(n *html.Node, p *post) { // fetch all pure text elements outside proper post paragraphs if n.Type == html.TextNode { - p.Content += n.Data + "\n" + p.Content += n.Data + if n.Parent.Type == html.ElementNode && n.Parent.Data == "a" { + parseLink(n.Parent, p) + } + p.Content += "\n" } // loop deeper @@ -117,6 +128,16 @@ func parseParagraph(n *html.Node, p *post) { // fetch all pure text elements if n.Type == html.TextNode { p.Content += n.Data + if n.Parent.Type == html.ElementNode && n.Parent.Data == "a" { + parseLink(n.Parent, p) + return + } + } + + // normal linebreak within paragraphs + if n.Type == html.ElementNode && n.Data == "br" { + p.Content += "\n" + return } // loop deeper @@ -141,3 +162,23 @@ func parseImage(n *html.Node, p *post) { } p.Images = append(p.Images, img) } + +func parseLink(n *html.Node, p *post) { + for _, attr := range n.Attr { + switch attr.Key { + case "href": + urlraw, _ := url.PathUnescape(attr.Val) + url, _ := url.Parse(urlraw) + external := url.Query().Get("u") + if external != "" { + p.Content += " (" + external + ")" + } else { + base := strings.Split(urlraw, "?")[0] + if base[0] == '/' { + base = "https://www.facebook.com" + base + } + p.Content += " (" + base + ")" + } + } + } +} -- cgit v1.2.3