#!/usr/bin/env python
import os.path
import nltk
from nltk.corpus import senseval
# point NLTK at the senseval-3 data in the local nltk_data folder
path = os.path.relpath('nltk_data')
nltk.data.path[0] = path
items = senseval.fileids()
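# Hedged illustration (attribute names assume NLTK's standard senseval
# reader): each fileid can be expanded into instances, e.g.
#   insts = senseval.instances(items[0])
#   insts[0].word      # the target word
#   insts[0].position  # index of the target word in the context
#   insts[0].context   # surrounding tokens
#   insts[0].senses    # annotated sense label(s)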
# the feature extractor code is below here -----------------------------------
vectorSize = 10 # number of most common co-occurring words to keep
cooccur_vect = {} # store the feature vector keys in here
# I copied and pasted the English stopwords list from the NLTK package here
# for convenience; there are also some corpus-specific stopwords mixed in
stopwords = ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves',\
'you', 'your', 'yours', 'yourself', 'yourselves', 'he', 'him',\
'his', 'himself', 'she', 'her', 'hers', 'herself', 'it', 'its',\
'itself', 'they', 'them', 'their', 'theirs', 'themselves',\
'what', 'which', 'who', 'whom', 'this', 'that', 'these', 'those',\
'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have',\
'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an',\
'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until',\
'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against',\
'between', 'into', 'through', 'during', 'before', 'after',\
'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on',\
'off', 'over', 'under', 'again', 'further', 'then', 'once',\
'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any',\
'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such',\
'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too',\
'very', 's', 't', 'can', 'will', 'just', 'don', 'should', 'now',\
'.', ',', '(', ')', '{', '}', '[', ']', '-', '?', '!', "n't", "'",\
'r1', 'r2', 'r3', 'r4', 'r5', 'r6', 'r7', 'fig.', 'ref', '20', 'b1',\
"'ve", "'d", "'s", ":", ";", 'one', 'two', 'may', 'also', 'many', '%',\
'would', 'could']
# common words feature extractor
def common_words(item, pos, context, dictionary):
    # every context word except the target word itself
    words = context[:pos] + context[pos+1:]
    words = [x.lower() for x in words]
    # drop stopwords; a list comprehension avoids chaining lazy filter()
    # objects, which would misbehave under Python 3
    words = [w for w in words if w not in stopwords]
    # count how often each remaining word co-occurs with the target
    word_counts = {}
    for word in words:
        if word in word_counts:
            word_counts[word] += 1
        else:
            word_counts[word] = 1
    # keep the vectorSize most common co-occurring words
    words = sorted(word_counts, key=word_counts.get, reverse=True)
    words = tuple(words[:vectorSize])
    # the dictionary key is "cooccurrence*<item><pos>" and the value is
    # a tuple of the most common co-occurring words
    dictionary["%s*%s%d" % ("cooccurrence", item, pos)] = words
    return dictionary
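

# A minimal usage sketch (not part of the original extractor): it assumes
# the corpus fileids yield NLTK-style SensevalInstance objects whose
# .context is a list of tokens (plain strings or (word, tag) tuples) and
# whose .position is the index of the target word within that context.
if __name__ == '__main__':
    for item in items:
        for inst in senseval.instances(item):
            # flatten (word, tag) tuples down to the word itself
            context = [t[0] if isinstance(t, tuple) else t
                       for t in inst.context]
            common_words(item, inst.position, context, cooccur_vect)
    print('%d feature vectors extracted' % len(cooccur_vect))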