-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathzotero-extract-cache-files.R
95 lines (82 loc) · 3.25 KB
/
zotero-extract-cache-files.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
### Export CSV file (no text, no notes)
### Load csv
#install.packages(c("readr", "qdapRegex", "tm", "stringr", "stringi", "qdap", "tibble", "mallet", "dplyr", "tidytext"))
library(readr)
library(qdapRegex)
library(tm)
library(stringr)
library(stringi)
library(qdap)
library(qdapRegex)
library(tibble)
library(mallet)
library(dplyr)
library(tidytext)
# Prompt the user for the CSV file exported from Zotero and read it.
mypath <- file.choose()
mydata <- read_csv(mypath)

# Keep only the columns used below; fail early with a clear message if the
# export does not contain one of them.
needed_cols <- c("Key", "Publication Year", "Author", "Title",
                 "Abstract Note", "File Attachments")
missing_cols <- setdiff(needed_cols, names(mydata))
if (length(missing_cols) > 0) {
  stop("CSV export is missing column(s): ",
       paste(missing_cols, collapse = ", "), call. = FALSE)
}
shortdata <- mydata[, needed_cols]

# Filter out the records without attachments: missing values as well as
# empty or whitespace-only strings.
attachments <- shortdata[["File Attachments"]]
has_attachment <- !is.na(attachments) & nzchar(trimws(attachments))
shortdata <- shortdata[has_attachment, ]

# Coerce all characters into ASCII, one column at a time (the transliterator
# is vectorised, so there is no need to go cell by cell through a matrix).
# Every column ends up as character; the result is a plain data.frame with
# the original column names.
shortdata[] <- lapply(shortdata, function(column) {
  stri_trans_general(as.character(column), "Latin-ASCII")
})
shortdata <- as.data.frame(shortdata, stringsAsFactors = FALSE)
## loading text

# Locate the ".zotero-ft-cache" file that sits in the same directory as an
# attachment. `attachment_field` is one value of the "File Attachments"
# column. The whole value is tried first as a single path; if that fails,
# the value is split on ";" (the field may list several attachments) and each
# part is tried in turn. Returns the first cache path that exists on disk, or
# NA_character_ when none does.
find_cache_file <- function(attachment_field) {
  if (is.na(attachment_field)) {
    return(NA_character_)
  }
  candidates <- unique(c(
    attachment_field,
    trimws(strsplit(attachment_field, ";", fixed = TRUE)[[1]])
  ))
  candidates <- candidates[nzchar(candidates)]
  # dirname() replaces the attachment's file name regardless of what the
  # name contains, unlike a textual substitution of the name inside the path.
  cache_paths <- file.path(dirname(candidates), ".zotero-ft-cache")
  found <- cache_paths[file.exists(cache_paths)]
  if (length(found) > 0) found[1] else NA_character_
}

# Read a cache file and return its text as one string, without the trailing
# references section.
read_body_text <- function(cache_path) {
  lines <- enc2utf8(readLines(cache_path, warn = FALSE))
  # Removing last section with references from text: everything from the
  # LAST line mentioning "References"/"REFERENCES" onwards is dropped.
  ref_lines <- grep("References|REFERENCES", lines)
  if (length(ref_lines) > 0) {
    cut_at <- ref_lines[length(ref_lines)]
    # A match on the very first line would leave nothing to keep, so it is
    # treated as "no references section found" and the full text is kept.
    if (cut_at > 1) {
      lines <- lines[seq_len(cut_at - 1)]
    }
  }
  body <- paste(lines, collapse = " ")
  # Delete a hyphen followed by whitespace (presumably to re-join words that
  # were hyphenated across line breaks).
  gsub("-\\s+", "", body)
}

# Strip e-mails, URLs, citations, bracketed content, digits, punctuation and
# English stop words from `text`, then collapse runs of whitespace.
clean_text <- function(text) {
  cleaned <- rm_email(text)
  cleaned <- rm_url(cleaned)
  cleaned <- rm_twitter_url(cleaned)
  cleaned <- rm_citation(cleaned)
  cleaned <- rm_citation(cleaned, pattern = "@rm_citation3")
  cleaned <- rm_citation(cleaned, pattern = "@rm_citation2")
  cleaned <- rm_round(cleaned)
  cleaned <- rm_curly(cleaned)
  cleaned <- rm_square(cleaned)
  cleaned <- rm_bracket(cleaned)
  cleaned <- strip(cleaned, char.keep = "-", digit.remove = TRUE,
                   apostrophe.remove = FALSE, lower.case = TRUE)
  cleaned <- removePunctuation(cleaned,
                               preserve_intra_word_contractions = TRUE,
                               preserve_intra_word_dashes = TRUE)
  cleaned <- removeWords(cleaned, stopwords("en"))
  gsub("\\s\\s+", " ", cleaned)
}

# One slot per row of shortdata, allocated up front so that id, lda.format
# and terms always have the same length and stay aligned with each other.
n_docs <- nrow(shortdata)
id <- seq_len(n_docs)
lda.format <- character(n_docs)
terms <- vector("list", n_docs)
attachment_paths <- shortdata[["File Attachments"]]

for (i in seq_len(n_docs)) {
  uris.name <- find_cache_file(attachment_paths[i])
  if (!is.na(uris.name)) {
    text.punct <- clean_text(read_body_text(uris.name))
    lda.format[i] <- text.punct
    # Word-frequency table (columns: word, n) for this document.
    terms[[i]] <- tibble(txt = text.punct) %>%
      unnest_tokens(word, txt) %>%
      count(word)
  } else { # cache file does not exist
    lda.format[i] <- 'NA'
    terms[[i]] <- 'NA'
  }
}
# Assemble the per-document metadata. The values are taken from shortdata
# (the rows that were actually processed), not from the unfiltered mydata,
# so every vector has one entry per element of id / terms / lda.format.
titles <- shortdata[["Title"]]
abstracts <- shortdata[["Abstract Note"]]
authors <- shortdata[["Author"]]
datetimes <- shortdata[["Publication Year"]]
collection <- list(
  id = id,
  titles = titles,
  abstracts = abstracts,
  authors = authors,
  datetimes = datetimes,
  terms = terms,
  text = lda.format
)
# Write the collection to "collection.rds" in the working directory.
# saveRDS() returns NULL invisibly, so save_collection carries no data; the
# assignment is kept only so the variable name still exists afterwards.
save_collection <- saveRDS(collection, "collection.rds")