# utils.py
# -*- coding: utf-8 -*-
# functions that aren't necessary, but useful when working w/ nltk-mxpost.py
from __future__ import division  # always use "real" division (3 / 2 is 1.5, not 1)
import nltk, re, pprint
try:
    import cPickle as pickle  # Python 2: C implementation of the pickle module
except ImportError:
    import pickle  # Python 3: cPickle has been folded into the standard pickle
import time
import numpy
"""
General usage examples
----------------------
Using the BracketParseCorpusReader (see below) with files instead of
dictionaries is painfully SLOW. Therefore we have converted those files
into lists/dictiories and stored them in pickles/shelves for convenient reuse.
Files:
wsj_tagged_sents.pickle: wsj_tagged_sents (list) contains all 49208 tagged
sentences from the WSJ corpus. generated with generate_tagged_sents_list()
wsj_tagged_sents.shelve: dito, but as a shelve (less memory consumption)
wsj_word_freqdist.pickle: wsj_word_freqdist is a FrequencyDistribution object
containing 49817 words and their frequencies in the WSJ corpus
wsj_tag_freqdist.pickle: wsj_tag_freqdist is a FreqDist object containing all
tags from the WSJ corpus
loading pickle files:
object_name = load_from_pickle(object_name, file_name)
"""
def generate_tagged_sents_list():
    """generate a list of all tagged sentences so that we can store it with
    pickle / use it from memory.
    input: none. output: wsj_tagged_sents (list)."""
    wsj_tagged_sents = []
    for tagged_sent in ptb.tagged_sents():
        wsj_tagged_sents.append(tagged_sent)
    return wsj_tagged_sents

def generate_tagged_words_list():
    """generate a list of all tagged words. each list item is a tuple of the
    form (word, tag)"""
    wsj_tagged_words = []
    for tagged_word in ptb.tagged_words():
        wsj_tagged_words.append(tagged_word)
    return wsj_tagged_words

def generate_freqdists():
    """generate frequency distributions for words and tags with
    BracketParseCorpusReader's tagged_words() method.
    input: none. output: word_freqdist, tag_freqdist."""
    word_freqdist = nltk.FreqDist(word for (word, tag) in ptb.tagged_words())
    tag_freqdist = nltk.FreqDist(tag for (word, tag) in ptb.tagged_words())
    return word_freqdist, tag_freqdist

def generate_freqdists_from_tagged_sents(tagged_sents):
    """generate a frequency distribution from a list of tagged sentences.
    input: tagged sentences (list). output: freqdist of words.
    equivalently, as a nested generator expression:
        word_freqdist = nltk.FreqDist(word
                                      for tagged_sent in tagged_sents
                                      for (word, tag) in tagged_sent)
    """
    word_freqdist = nltk.FreqDist()
    for tagged_sent in tagged_sents:
        for (word, tag) in tagged_sent:
            word_freqdist[word] += 1  # FreqDist.inc() was removed in NLTK 3
    return word_freqdist

def save_to_pickle(obj, file_name):
    """saves an object to a pickle file.
    input: the object to pickle, pickle file name (as STRING)"""
    with open(file_name, "wb") as pickle_file:  # pickle needs a binary file
        pickle.dump(obj, pickle_file)

def load_from_pickle(file_name):
    """loads an object from a pickle file into memory.
    input: pickle file name (as STRING). output: the unpickled object.
    """
    with open(file_name, "rb") as pickle_file:  # pickle needs a binary file
        obj = pickle.load(pickle_file)
    return obj
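
# A minimal usage sketch of the workflow described in the module docstring:
# build the tagged-sentence list once, pickle it, and in later sessions reload
# it instead of going through the (slow) corpus reader again. The file name
# matches the one listed in the docstring; the __main__ guard is an assumption
# so that importing utils from nltk-mxpost.py does not trigger the corpus run.
if __name__ == "__main__":
    wsj_tagged_sents = generate_tagged_sents_list()
    save_to_pickle(wsj_tagged_sents, "wsj_tagged_sents.pickle")

    # later / in another session: load the pickled list and work from memory
    wsj_tagged_sents = load_from_pickle("wsj_tagged_sents.pickle")
    word_freqdist = generate_freqdists_from_tagged_sents(wsj_tagged_sents)
    print("distinct words in the WSJ corpus: %d" % len(word_freqdist))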