generated from mrz1836/go-template
-
-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathextractor.go
144 lines (123 loc) · 3.29 KB
/
extractor.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
// Package metaextractor will extract the title, description, OG & meta tags from HTML
//
// If you have any suggestions or comments, please feel free to open an issue on
// this GitHub repository!
//
// By MrZ (https://github.com/mrz1836)
package metaextractor
import (
"io"
"golang.org/x/net/html"
)
// Extract tokenizes the HTML read from resp and collects the page title and
// the standard, Open Graph (OG) and Twitter card meta tags into tags.
//
// Scanning stops at the first tokenizer error (this includes reaching the end
// of the input) or as soon as an opening TagBody tag is seen, so nothing
// inside the document body is inspected. Whatever has been collected up to
// that point is returned; tokenizer errors are not reported to the caller.
//
// Fallbacks are applied in document order:
//   - Title is taken from the OG or Twitter title only while it is still
//     empty; the text of the title element always overwrites it.
//   - Description is taken from the OG or Twitter description only while it
//     is still empty; a plain description meta tag always overwrites it.
//   - Author is taken from the OG author only while it is still empty.
//   - OGImage is taken from the Twitter image only while it is still empty.
func Extract(resp io.Reader) (tags Tags) {
	// Tokenize the response
	z := html.NewTokenizer(resp)

	// inTitle is true after an opening title tag until its text (or, for an
	// empty title, its closing tag) has been consumed.
	inTitle := false

	// Loop elements
	for {
		switch z.Next() {
		case html.ErrorToken:
			return tags

		case html.StartTagToken, html.SelfClosingTagToken:
			t := z.Token()

			switch t.Data {
			case TagBody:
				// Everything of interest lives before the body
				return tags

			case TagTitle:
				inTitle = true

			case TagMeta:
				// Standard meta tags
				if value, ok := extractMetaProperty(t, TagMetaDescription); ok {
					tags.Description = value
				}
				if value, ok := extractMetaProperty(t, TagMetaAuthor); ok {
					tags.Author = value
				}

				// Open Graph (used as a fallback for the standard fields)
				if value, ok := extractMetaProperty(t, TagOGTitle); ok {
					tags.OGTitle = value
					if len(tags.Title) == 0 {
						tags.Title = value
					}
				}
				if value, ok := extractMetaProperty(t, TagOGDescription); ok {
					tags.OGDescription = value
					if len(tags.Description) == 0 {
						tags.Description = value
					}
				}
				if value, ok := extractMetaProperty(t, TagOGImage); ok {
					tags.OGImage = value
				}
				if value, ok := extractMetaProperty(t, TagOGSiteName); ok {
					tags.OGSiteName = value
				}
				if value, ok := extractMetaProperty(t, TagOGPublisher); ok {
					tags.OGPublisher = value
				}
				if value, ok := extractMetaProperty(t, TagOGAuthor); ok {
					tags.OGAuthor = value
					if len(tags.Author) == 0 {
						tags.Author = value
					}
				}

				// Twitter card (used only while the target field is still empty)
				if value, ok := extractMetaProperty(t, TagTwitterTitle); ok {
					tags.TwitterTitle = value
					if len(tags.Title) == 0 {
						tags.Title = value
					}
				}
				if value, ok := extractMetaProperty(t, TagTwitterDescription); ok {
					tags.TwitterDescription = value
					if len(tags.Description) == 0 {
						tags.Description = value
					}
				}
				if value, ok := extractMetaProperty(t, TagTwitterImage); ok {
					tags.TwitterImage = value
					if len(tags.OGImage) == 0 {
						tags.OGImage = value
					}
				}
				if value, ok := extractMetaProperty(t, TagTwitterCard); ok {
					tags.TwitterCard = value
				}
				if value, ok := extractMetaProperty(t, TagTwitterPlayer); ok {
					tags.TwitterPlayer = value
				}
				if value, ok := extractMetaProperty(t, TagTwitterPlayerWidth); ok {
					tags.TwitterPlayerWidth = value
				}
				if value, ok := extractMetaProperty(t, TagTwitterPlayerHeight); ok {
					tags.TwitterPlayerHeight = value
				}
			}

		case html.TextToken:
			if inTitle {
				tags.Title = z.Token().Data
				inTitle = false
			}

		case html.EndTagToken:
			// An empty <title></title> produces no text token. Clear the flag
			// here so that unrelated text later in the head (usually the
			// whitespace between tags) is not stored as the title, which
			// would also block the OG/Twitter title fallback.
			inTitle = false
		}
		// Comments, doctypes and any other token types are skipped.
	}
}
// extractMetaProperty inspects the attributes of the token t and reports
// whether it is a meta tag identified by prop.
//
// ok is true when an attribute whose key is TagProperty or TagName has the
// value prop. content is the value of the TagContent attribute (the last one
// if several are present, empty if there is none); it is filled in whether or
// not the tag matched, so callers must check ok before using it.
func extractMetaProperty(t html.Token, prop string) (content string, ok bool) {
	for i := range t.Attr {
		a := &t.Attr[i]

		// Remember the content regardless of the attribute order
		if a.Key == TagContent {
			content = a.Val
		}

		// Only attributes carrying the requested value can identify the tag
		if a.Val != prop {
			continue
		}
		if a.Key == TagProperty || a.Key == TagName {
			ok = true
		}
	}
	return content, ok
}