From 449f35d4fd50cae9b4c1fe4dd4b7003fd4e7b8e6 Mon Sep 17 00:00:00 2001
From: Petter Rodhelind <petter.rodhelind@gmail.com>
Date: Mon, 22 Feb 2021 00:42:08 +0100
Subject: Parse links and get better at hiding stuff.

---
 parser.go | 45 +++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 43 insertions(+), 2 deletions(-)

diff --git a/parser.go b/parser.go
index 4d3720b..bf9e837 100644
--- a/parser.go
+++ b/parser.go
@@ -2,6 +2,7 @@ package main
 
 import (
 	"io"
+	"net/url"
 	"strconv"
 	"strings"
 	"time"
@@ -76,14 +77,20 @@ func parseNode(n *html.Node, p *post) {
 		case "h5":
 			return
 
-		// skip "show more" and aria-hidden stuff
+		// skip "show more", aria-hidden and accessible stuff
 		case "span":
 			fallthrough
 		case "a":
+			fallthrough
+		case "div":
 			for _, attr := range n.Attr {
 				if attr.Key == "class" && attr.Val == "text_exposed_hide" {
 					return
 				}
+				if attr.Key == "class" && strings.Contains(attr.Val, "accessible_elem") {
+					return
+				}
+
 				if attr.Key == "aria-hidden" && attr.Val == "true" {
 					return
 				}
@@ -104,7 +111,11 @@ func parseNode(n *html.Node, p *post) {
 
 	// fetch all pure text elements outside proper post paragraphs
 	if n.Type == html.TextNode {
-		p.Content += n.Data + "\n"
+		p.Content += n.Data
+		if n.Parent.Type == html.ElementNode && n.Parent.Data == "a" {
+			parseLink(n.Parent, p)
+		}
+		p.Content += "\n"
 	}
 
 	// loop deeper
@@ -117,6 +128,16 @@ func parseParagraph(n *html.Node, p *post) {
 	// fetch all pure text elements
 	if n.Type == html.TextNode {
 		p.Content += n.Data
+		if n.Parent.Type == html.ElementNode && n.Parent.Data == "a" {
+			parseLink(n.Parent, p)
+			return
+		}
+	}
+
+	// normal linebreak within paragraphs
+	if n.Type == html.ElementNode && n.Data == "br" {
+		p.Content += "\n"
+		return
 	}
 
 	// loop deeper
@@ -141,3 +162,43 @@ func parseImage(n *html.Node, p *post) {
 	}
 	p.Images = append(p.Images, img)
 }
+
+// parseLink appends the target of every href attribute on n to p.Content,
+// formatted as " (target)".
+//
+// When the href's query string has a non-empty "u" parameter, its decoded
+// value is emitted (presumably the destination of an outbound redirect link).
+// Otherwise the href is emitted without its query string, and a path starting
+// with "/" is prefixed with "https://www.facebook.com". Attributes other than
+// href, and hrefs with nothing in front of the query string, add nothing.
+func parseLink(n *html.Node, p *post) {
+	for _, attr := range n.Attr {
+		if attr.Key != "href" {
+			continue
+		}
+
+		// Parse the href as written: Query() percent-decodes on its own, so
+		// unescaping first would decode the target twice and truncate it at
+		// an encoded "&". Checking err keeps Query off a nil *url.URL.
+		if u, err := url.Parse(attr.Val); err == nil {
+			if external := u.Query().Get("u"); external != "" {
+				p.Content += " (" + external + ")"
+				continue
+			}
+		}
+
+		// Drop the query string, then decode the rest for display; keep the
+		// raw text when it contains an invalid escape sequence.
+		base := strings.Split(attr.Val, "?")[0]
+		if decoded, err := url.PathUnescape(base); err == nil {
+			base = decoded
+		}
+		if base == "" {
+			continue // nothing to show; indexing base[0] here used to panic
+		}
+		if strings.HasPrefix(base, "/") {
+			base = "https://www.facebook.com" + base
+		}
+		p.Content += " (" + base + ")"
+	}
+}
-- 
cgit v1.2.3