-
Notifications
You must be signed in to change notification settings - Fork 0
/
tweet_fetcher.go
172 lines (150 loc) · 4.86 KB
/
tweet_fetcher.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
package urlresolver
import (
"context"
"encoding/json"
"fmt"
"io"
"net/http"
"net/url"
"regexp"
"strings"
"time"
"golang.org/x/net/html"
"github.com/mccutchen/urlresolver/bufferpool"
)
// tweetFetcher is the interface for anything that can look up a tweet given
// its URL. In this file it is implemented by oembedTweetFetcher.
type tweetFetcher interface {
// Fetch returns the tweetData for the tweet at tweetURL, or a non-nil error
// if the tweet could not be retrieved. ctx is passed through to the
// implementation.
Fetch(ctx context.Context, tweetURL string) (tweetData, error)
}
// tweetData is a minimal representation of a tweet's data: just enough to
// report a URL and a short piece of text for it.
type tweetData struct {
// URL is the tweet's URL. oembedTweetFetcher.Fetch fills it from the "url"
// field of the oembed response.
URL string
// Text is the tweet's text content. oembedTweetFetcher.Fetch fills it with
// the result of extractTweetText applied to the oembed "html" field.
Text string
}
// oembedTweetFetcher knows how to fetch information about a tweet from Twitter's
// oembed endpoint.
type oembedTweetFetcher struct {
// baseURL is the oembed endpoint; Fetch appends the encoded query string
// ("?url=...") to it.
baseURL string
// httpClient is the client used to send the oembed request.
httpClient *http.Client
// pool supplies the buffer that Fetch reads the response body into; the
// buffer is handed back to the pool when Fetch returns.
pool *bufferpool.BufferPool
}
// newTweetFetcher builds an oembedTweetFetcher pointed at Twitter's public
// oembed endpoint (https://publish.twitter.com/oembed).
//
// The fetcher gets its own http.Client configured with the given transport
// and overall request timeout, and keeps pool for later use when reading
// response bodies.
func newTweetFetcher(transport http.RoundTripper, timeout time.Duration, pool *bufferpool.BufferPool) *oembedTweetFetcher {
	client := &http.Client{
		Transport: transport,
		Timeout:   timeout,
	}

	fetcher := new(oembedTweetFetcher)
	fetcher.baseURL = "https://publish.twitter.com/oembed"
	fetcher.httpClient = client
	fetcher.pool = pool
	return fetcher
}
// Fetch returns the text and URL for a tweet by fetching its metadata from
// Twitter's oembed endpoint.
//
// tweetURL is sent as the "url" query parameter of a GET request to
// f.baseURL; ctx is attached to that request. On success the returned
// tweetData carries the "url" field of the response and the text extracted
// from its "html" field.
//
// An error is returned when the request cannot be built or sent, when the
// endpoint answers with anything other than HTTP 200, when the body cannot be
// read or is not valid JSON, or when the JSON lacks a "url" or "html" value.
func (f *oembedTweetFetcher) Fetch(ctx context.Context, tweetURL string) (tweetData, error) {
	params := url.Values{
		"url": []string{tweetURL},
	}
	oembedURL := fmt.Sprintf("%s?%s", f.baseURL, params.Encode())

	// NewRequestWithContext fails on a malformed URL or a nil context; surface
	// that instead of handing a nil request to the client.
	req, err := http.NewRequestWithContext(ctx, http.MethodGet, oembedURL, nil)
	if err != nil {
		return tweetData{}, fmt.Errorf("error building twitter oembed request: %w", err)
	}

	resp, err := f.httpClient.Do(req)
	if err != nil {
		return tweetData{}, err
	}
	// The body must be closed on every return path (including the non-200
	// one below), otherwise the underlying connection is leaked.
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		return tweetData{}, fmt.Errorf("twitter oembed error: GET %s: HTTP %d", oembedURL, resp.StatusCode)
	}

	// Read the whole body into a pooled buffer; the buffer goes back to the
	// pool once we are done with it.
	buf := f.pool.Get()
	defer f.pool.Put(buf)
	if _, err := io.Copy(buf, resp.Body); err != nil {
		return tweetData{}, fmt.Errorf("error reading twitter oembed response: %w", err)
	}

	// Only the fields of the oembed payload that we care about. AuthorName is
	// decoded but not used below.
	var oembedResult struct {
		AuthorName string `json:"author_name"`
		HTML       string `json:"html"`
		URL        string `json:"url"`
	}
	if err := json.Unmarshal(buf.Bytes(), &oembedResult); err != nil {
		return tweetData{}, fmt.Errorf("invalid json in twitter oembed response: %w", err)
	}
	// Valid JSON that is missing either field is not a usable tweet.
	if oembedResult.URL == "" || oembedResult.HTML == "" {
		return tweetData{}, fmt.Errorf("unexpected json format in twitter oembed response: %q", buf.String())
	}

	return tweetData{
		URL:  oembedResult.URL,
		Text: extractTweetText(oembedResult.HTML),
	}, nil
}
// tweetRegex matches the leading, canonical part of a tweet URL, on either
// twitter.com or mobile.twitter.com, case-insensitively. The second capture
// group holds the path after the host: either "<user>/status/<id>" or
// "i/web/status/<id>".
//
// https://regex101.com/r/EBKewP/1
var tweetRegex = regexp.MustCompile(`(?i)^https://(mobile\.)?twitter\.com/([^/]+/status/\d+|i/web/status/\d+)`)

// matchTweetURL matches URLs pointing to tweets. If matched, returns the URL
// to the tweet after removing extra data (extra media paths, query params,
// etc) and true; otherwise it returns "" and false.
func matchTweetURL(s string) (string, bool) {
	m := tweetRegex.FindStringSubmatch(s)
	if m == nil {
		return "", false
	}
	match, path := m[0], m[2]

	// Twitter's oembed endpoint does not work with these odd
	// /i/web/status/XXX URLs that contain a tweet ID without a username,
	// but it does seem to work if you construct a fake tweet URL with that
	// same tweet ID.
	//
	// So, for those URLs, we pretend the tweet has a username of
	// __urlresolver__.
	//
	// The check looks at the matched path (not the raw input) and ignores
	// case, so it agrees with the case-insensitive regex above and cannot be
	// confused by "/i/web/" appearing later in the input, e.g. in a query
	// string.
	const anonPrefix = "i/web/"
	if len(path) >= len(anonPrefix) && strings.EqualFold(path[:len(anonPrefix)], anonPrefix) {
		// The path group is the tail of the match, so everything before it
		// is the scheme and host.
		host := match[:len(match)-len(path)]
		return host + "__urlresolver__/" + path[len(anonPrefix):], true
	}
	return match, true
}
// tcoRegex matches http or https URLs on the t.co host that have at least one
// character after the slash following the host, case-insensitively.
var tcoRegex = regexp.MustCompile(`(?i)^https?://t\.co/.+`)

// matchTcoURL reports whether s is a t.co URL, i.e. whether it starts with
// http://t.co/ or https://t.co/ followed by a non-empty remainder.
func matchTcoURL(s string) bool {
	return tcoRegex.MatchString(s)
}
// extractTweetText pulls a plain-text rendering of a tweet out of the html
// snippet found in the twitter oembed response.
//
// The snippet is tokenized and scanned up to the first closing </p> (or the
// end of input). Text is collected only once an opening <p> has been seen;
// every other opening tag encountered after that point contributes a single
// space instead of its markup. Finally, all runs of whitespace are collapsed
// to one space and leading/trailing whitespace is dropped.
//
// The result is not meant to reproduce the tweet exactly; it is meant to be a
// reasonable sanitized "title" for a tweet URL.
func extractTweetText(s string) string {
	var (
		z    = html.NewTokenizer(strings.NewReader(s))
		text strings.Builder
		// inPara becomes true at the opening <p>; nothing is collected
		// before that.
		inPara bool
	)

	for done := false; !done; {
		switch z.Next() {
		case html.ErrorToken:
			// End of input (or a tokenizer error): stop with what we have.
			done = true
		case html.EndTagToken:
			// A closing </p> ends the scan; other end tags are ignored.
			done = z.Token().Data == "p"
		case html.StartTagToken:
			switch name := z.Token().Data; {
			case name == "p":
				inPara = true
			case inPara:
				// Tags inside the paragraph become whitespace, which is
				// collapsed below.
				text.WriteString(" ")
			}
		case html.TextToken:
			if inPara {
				text.WriteString(z.Token().Data)
			}
		}
	}

	// Splitting into fields and re-joining with single spaces normalizes
	// every run of whitespace, including newlines.
	words := strings.Fields(text.String())
	return strings.Join(words, " ")
}