## 01-features.R
## install tidyverse if not already
if (!requireNamespace("tidyverse", quietly = TRUE)) {
  install.packages("tidyverse")
}
## install remotes if not already
if (!requireNamespace("remotes", quietly = TRUE)) {
  install.packages("remotes")
}
## install rvest if not already
if (!requireNamespace("rvest", quietly = TRUE)) {
  install.packages("rvest")
}
## install/upgrade textfeatures
if (!requireNamespace("textfeatures", quietly = TRUE) ||
    packageVersion("textfeatures") < "0.1.4") {
  remotes::install_github("mkearney/textfeatures")
}
## install/upgrade rtweet
if (!requireNamespace("rtweet", quietly = TRUE) ||
    packageVersion("rtweet") < "0.6.7.9000") {
  remotes::install_github("mkearney/rtweet")
}
## load tidyverse set of packages
## note: instead of loading rtweet, textfeatures, & rvest (and xml2) packages,
## the code below uses fully qualified namespaces (double colon).
library(tidyverse)
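## note: the rtweet calls below assume a Twitter API token has already been
## set up for this R session (e.g., via rtweet::create_token())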
## download Twitter profiles via CSPAN's cabinet list
cab_twits <- rtweet::lists_members(
  owner_user = "CSPAN", slug = "the-cabinet")
## get up to 3200 of the most recent tweets for each member
cab_tweets <- cab_twits %>%
  filter(screen_name != "realDonaldTrump") %>%
  pull(user_id) %>%
  map(rtweet::get_timeline, n = 3200) %>%
  bind_rows()
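## note: 3200 is the maximum number of tweets the Twitter API returns for a
## single timeline, and pulling that many per account may take a while due to
## rate limiting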
## scrape the source code for the op-ed
nyt <- xml2::read_html(
  "https://www.nytimes.com/2018/09/05/opinion/trump-white-house-anonymous-resistance.html")
## return just the paragraph text (keeping paragraphs 3 through 31)
nyt_text <- nyt %>%
  rvest::html_nodes("p") %>%
  rvest::html_text() %>%
  .[3:31]
## create data set with just author (id) and text
data <- tibble(
  id = c(cab_tweets$screen_name, rep("op-ed", length(nyt_text))),
  text = c(cab_tweets$text, nyt_text)
)
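## note: each tweet and each op-ed paragraph remains its own row here; text
## features are extracted per row below and then averaged for each author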
## feature extraction
tf <- textfeatures::textfeatures(data, word_dims = 80, threads = 20)
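## note: word_dims = 80 requests 80 word-embedding style dimensions in
## addition to the count and sentiment features; threads controls the amount
## of parallel processing used to estimate them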
## summarise by id
tfsum <- tf %>%
  group_by(id) %>%
  summarise_all(mean, na.rm = TRUE) %>%
  ungroup()
## vector of unique authors
authors <- unique(tfsum$id)
## create numeric vectors of equal length for each author
cols <- map(authors,
  ~ filter(tfsum, id == .x) %>% select(-id) %>% as.list() %>% unlist()
)
## combine into an authors-by-features matrix
mat <- cols %>%
  unlist() %>%
  as.numeric() %>%
  matrix(nrow = length(authors), byrow = TRUE)
## set row names to the author names
row.names(mat) <- authors
## display correlations with the op-ed, sorted from least to most similar
cor(t(mat))[, "op-ed"] %>% sort()
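## optional follow-up: a minimal sketch of one way to visualize the sorted
## correlations with ggplot2 (attached via tidyverse); the column names
## "author" and "similarity" are arbitrary labels introduced here for
## illustration only
cor(t(mat))[, "op-ed"] %>%
  enframe(name = "author", value = "similarity") %>%
  filter(author != "op-ed") %>%
  ggplot(aes(x = similarity, y = reorder(author, similarity))) +
  geom_point() +
  labs(x = "correlation with op-ed text features", y = NULL)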