-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathchunker.py
executable file
·81 lines (70 loc) · 2.55 KB
/
chunker.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
import nltk.chunk
import nltk
from nltk.corpus import conll2000
import itertools
from nltk import pos_tag, word_tokenize
class UnigramChunker(nltk.ChunkParserI):
def __init__(self, train_sents):
train_data = [[(t,c) for w,t,c in nltk.chunk.tree2conlltags(sent)]
for sent in train_sents]
self.tagger = nltk.UnigramTagger(train_data)
def parse2(self, tokens):
# split words and part of speech tags
if len(tokens) > 0:
(words, tags) = zip(*tokens)
# get IOB chunk tags
chunks = self.tagger.tag(tags)
# join words with chunk tags
wtc = itertools.izip(words, chunks)
# w = word, t = part-of-speech tag, c = chunk tag
lines = [' '.join([w, t, c]) for (w, (t, c)) in wtc if c]
# create tree from conll formatted chunk lines
else:
lines = []
return nltk.chunk.conllstr2tree('\n'.join(lines))
def clean_punctuation(text):
text = text.replace("."," .")
text = text.replace(","," ,")
text = text.replace("!"," !")
text = text.replace(":"," :")
text = text.replace("\""," \"")
return text
def run(q_id):
train_sents = conll2000.chunked_sents('train.txt')
unigram_chunker = UnigramChunker(train_sents)
import init
#get document here and tag; put into this format:
#tagged = [("the", "DT"), ("little", "JJ"), ("yellow", "JJ"),("dog", "NN"), ("barked", "VBD"), ("at", "IN"), ("the", "DT"), ("cat", "NN"),(".", ".")]
topdoc = init.get_corpus(q_id)
doc_nums = topdoc.keys()
answers= [];
for key in doc_nums:
doc_text = topdoc[key]
docnum= key
#print docnum
doc_text = clean_punctuation(doc_text)
#print doc_text
doc_text= doc_text.split()
tagged=pos_tag(doc_text)
chunked=unigram_chunker.parse2(tagged)
flatten= chunked.pos()
#print flatten
numbered= enumerate(flatten)
currentTag=''
words=[]
for i,v in numbered:
#print i,v
((word,tag),phrasetag)=v
if currentTag=='':
currentTag=phrasetag
if currentTag==phrasetag:
words.append(word)
else:
answers.append((' '.join(words),docnum,i-len(words),currentTag,q_id))
currentTag= phrasetag
words= [word]
answers.append((' '.join(words),docnum,i-len(words),currentTag,q_id))
#print answers
return answers
if __name__=="__main__":
print run(213)