-
Notifications
You must be signed in to change notification settings - Fork 53
/
Copy pathrepool_util.py
75 lines (60 loc) · 2.06 KB
/
repool_util.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
""" Functions: useful general utils """
import cPickle
import re
from os import startfile
def savePubs(filename, pubs_to_save):
    """
    Save a list of publications into a file using Python's pickle.

    filename: string, path of the file to write (created or overwritten)
    pubs_to_save: List of Publication objects
    returns nothing
    """
    # Text mode ('w') and the default pickle protocol are kept on purpose:
    # loadPubs reads these files back in text mode, and reader and writer
    # must stay consistent.
    # The with-block closes the handle even if dump() raises part-way.
    with open(filename, 'w') as out_file:
        cPickle.dump(pubs_to_save, out_file)
def loadPubs(filename):
    """
    Retrieve a saved list of publications.

    filename: string, path of a pickle file (such as one written by savePubs)
    returns the unpickled object; expected to be a list of dictionaries,
    each representing a Publication

    NOTE: unpickling can run arbitrary code, so only load files that come
    from a trusted source.
    """
    # Text mode ('r') is kept so files written in text mode still load.
    # The with-block closes the handle even if load() raises.
    with open(filename, 'r') as pickle_file:
        return cPickle.load(pickle_file)
def openPDFs(pdf_lst):
    """
    Open each pdf in a list by passing it to the os startfile call.

    pdf_lst: list of strings: paths (or urls) of pdfs to open
    A list with more than 10 entries is refused: a message is printed and
    nothing is opened.
    returns nothing
    """
    too_many = len(pdf_lst) > 10
    if not too_many:
        for pdf_path in pdf_lst:
            startfile(pdf_path)
    else:
        # A single parenthesised string prints the same text as the bare
        # print statement would.
        print("more than 10? that can't be right. Request denied.")
# Words that are dropped from the counts. 'cid' is some kind of artifact from
# the pdf conversion that occurs very often.
_STOPWORDS = frozenset([
    'the', 'and', 'for', 'that', 'can', 'this', 'which',
    'where', 'are', 'from', 'our', 'not', 'with', 'use',
    'then', 'than', 'but', 'have', 'was', 'were', 'these',
    'each', 'used', 'set', 'such', 'using', 'when', 'those',
    'may', 'also',
    'cid',
])

# A word is a run of ASCII letters and/or hyphens.
_WORD_PATTERN = re.compile(r'[a-zA-Z\-]+')


def stringToWordDictionary(str):
    """
    Take a string and return a dictionary that stores the frequency of
    every word. Matching is case-insensitive (the text is lower-cased
    first), words of one or two characters are ignored, and some stop
    words are removed.

    str: string (the name shadows the builtin; kept so existing callers
         that pass it by keyword keep working)
    returns dictionary of word counts for each word. Example: d['hello'] -> 5
    """
    counts = {}
    for word in _WORD_PATTERN.findall(str.lower()):
        # Skip short words and stop words up front instead of counting
        # them and deleting the entries afterwards.
        if len(word) > 2 and word not in _STOPWORDS:
            counts[word] = counts.get(word, 0) + 1
    return counts