-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathparser.go
114 lines (98 loc) · 2.62 KB
/
parser.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
package goseo
import (
"bytes"
"errors"
"io"
"strings"
strip "github.com/grokify/html-strip-tags-go"
"golang.org/x/net/html"
)
type Parser struct{}
type ParserChecks interface {
GetAllLinkHrefs()
GetFirstElement()
GetAllElements()
}
// GetAllLinkHrefs walks the parsed HTML tree and collects the href value of
// every <a> element whose href begins with "/" or "http". Hrefs with other
// schemes (mailto:, #fragments, ...) are ignored.
func (p Parser) GetAllLinkHrefs(rawHtml string) []string {
	var hrefs []string
	// html.Parse is extremely tolerant of malformed input; its error is
	// effectively unreachable for an in-memory string, so it is discarded.
	root, _ := html.Parse(strings.NewReader(rawHtml))
	var walk func(*html.Node)
	walk = func(n *html.Node) {
		if n.Type == html.ElementNode && n.Data == "a" {
			for _, attr := range n.Attr {
				if attr.Key == "href" && (strings.HasPrefix(attr.Val, "/") || strings.HasPrefix(attr.Val, "http")) {
					hrefs = append(hrefs, attr.Val)
					break // one href per anchor is enough
				}
			}
		}
		for c := n.FirstChild; c != nil; c = c.NextSibling {
			walk(c)
		}
	}
	walk(root)
	return hrefs
}
// GetFirstElement returns the first element in rawHtml whose tag name equals
// element (document order, depth-first). When raw is true the rendered HTML of
// the element is returned; otherwise tags are stripped and the text trimmed.
// An error is returned when no such element exists.
//
// Bug fix: the previous version kept walking after a match and reassigned
// foundElement on every subsequent hit, so it actually returned the LAST
// matching element, not the first. The walk now stops recording once a match
// has been found.
func (p Parser) GetFirstElement(rawHtml string, element string, raw bool) (string, error) {
	var foundElement *html.Node
	// html.Parse is tolerant of malformed input; its error is effectively
	// unreachable for an in-memory string, so it is discarded.
	doc, _ := html.Parse(strings.NewReader(rawHtml))
	// Depth-first search that stops at the first matching element.
	var crawler func(*html.Node)
	crawler = func(node *html.Node) {
		if foundElement != nil {
			// Already have the first match; do not let later matches
			// overwrite it.
			return
		}
		if node.Type == html.ElementNode && node.Data == element {
			foundElement = node
			return
		}
		for child := node.FirstChild; child != nil; child = child.NextSibling {
			crawler(child)
		}
	}
	crawler(doc)
	if foundElement == nil {
		return "", errors.New("missing <" + element + "> in the node tree")
	}
	formatted := renderNode(foundElement)
	if !raw {
		formatted = stripHtml(formatted)
	}
	return formatted, nil
}
// GetAllElements returns, in document order, every element of rawHtml whose
// tag name equals element. When raw is true each entry is the element's
// rendered HTML; otherwise tags are stripped and the text trimmed.
//
// The walk deliberately does not descend into a matched element, so matching
// elements nested inside another match are not reported separately (their
// content is already part of the outer match's rendering).
func (p Parser) GetAllElements(rawHtml string, element string, raw bool) []string {
	var results []string
	// html.Parse is tolerant of malformed input; its error is effectively
	// unreachable for an in-memory string, so it is discarded.
	root, _ := html.Parse(strings.NewReader(rawHtml))
	var walk func(*html.Node)
	walk = func(n *html.Node) {
		if n.Type == html.ElementNode && n.Data == element {
			rendered := renderNode(n)
			if !raw {
				rendered = stripHtml(rendered)
			}
			results = append(results, rendered)
			return // skip the subtree of a matched element
		}
		for c := n.FirstChild; c != nil; c = c.NextSibling {
			walk(c)
		}
	}
	walk(root)
	return results
}
// renderNode serializes n and its subtree back into an HTML string.
func renderNode(n *html.Node) string {
	var out bytes.Buffer
	var w io.Writer = &out
	// Render's error is discarded, as in the original; writes to an
	// in-memory buffer do not fail in practice.
	_ = html.Render(w, n)
	return out.String()
}
// stripHtml removes all HTML tags from the fragment and trims surrounding
// whitespace from what remains.
func stripHtml(h1 string) string {
	return strings.TrimSpace(strip.StripTags(h1))
}