-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathngram_example.py
64 lines (56 loc) · 2.22 KB
/
ngram_example.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
import os
import nltk
from nltk import word_tokenize
from nltk.util import ngrams
from collections import Counter
import preprocessing as pp
# Load the corpus and run every line through the project's preprocessing step
# (called with deascify=False).
with open("tokenized_data.txt", "r", encoding="UTF-8") as f:
    # Iterate the handle directly; readlines() would only build an extra
    # throwaway list of the raw lines.
    lines = [pp.apply_preprocess(line.strip(), deascify=False) for line in f]

# Counter is a dict subclass, so these remain usable anywhere the previous
# plain dicts were (lookup, .items(), iteration in first-seen order).
# Unigram keys are token strings; higher-order keys are tuples of tokens.
frequencies_unigram = Counter()
frequencies_bigram = Counter()
frequencies_trigram = Counter()
frequencies_fourgram = Counter()

for text in lines:
    # Skip lines with fewer than 4 characters after stripping whitespace;
    # this also covers empty / whitespace-only lines.
    if len(text.strip()) < 4:
        continue

    token = nltk.word_tokenize(text)
    # N-grams are built per line, so they never span a line boundary.
    frequencies_unigram.update(token)
    frequencies_bigram.update(ngrams(token, 2))
    frequencies_trigram.update(ngrams(token, 3))
    frequencies_fourgram.update(ngrams(token, 4))

# Number of most frequent entries kept for each n-gram order.
topn_count = 10

# most_common(k) yields (ngram, count) pairs ordered by descending count,
# with ties kept in first-seen order -- the same result as
# sorted(items, key=count, reverse=True)[:k].
topn_unigrams = frequencies_unigram.most_common(topn_count)
topn_bigrams = frequencies_bigram.most_common(topn_count)
topn_trigrams = frequencies_trigram.most_common(topn_count)
topn_fourgrams = frequencies_fourgram.most_common(topn_count)
def save_topn_ngrams(ngram_list, file_name: str, n: int = -1) -> None:
    """Write n-gram counts to ``<file_name>.txt`` as ``count,ngram`` rows.

    Args:
        ngram_list: Iterable of ``(ngram, count)`` entries. ``ngram`` is
            either a single token string (unigram) or a sequence of token
            strings, which is written space-separated.
        file_name: Output path without extension; ``.txt`` is appended.
        n: Pass ``1`` to state that the entries are unigrams, in which case
            each n-gram is written as-is. For any other value the n-gram is
            joined with spaces, unless it is already a plain string.

    The file is written as UTF-8, rows are separated by ``"\\n"`` and no
    trailing newline is emitted. An existing file is overwritten.
    """
    text_list = []
    for item in ngram_list:
        gram, count = item[0], item[-1]
        # A string n-gram must never go through " ".join(): that would split
        # the token into space-separated characters when the caller forgets
        # to pass n=1.
        if n == 1 or isinstance(gram, str):
            gram_text = gram
        else:
            gram_text = " ".join(gram)
        text_list.append(str(count) + "," + gram_text)

    with open(f"{file_name}.txt", "w", encoding="utf-8") as f:
        f.write("\n".join(text_list))
# Persist each top-N table to its own text file. Only the unigram table is
# saved with n=1; the others rely on the function's default for n.
for _table, _stem, _extra in (
    (topn_unigrams, "top_unigrams", {"n": 1}),
    (topn_bigrams, "top_bigrams", {}),
    (topn_trigrams, "top_trigrams", {}),
    (topn_fourgrams, "top_fourgrams", {}),
):
    save_topn_ngrams(_table, _stem, **_extra)