token_count.py
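"""Count token frequencies in a text file.

Reads a text file, applies the requested preprocessing steps (lowercasing,
stemming, stopword removal, number/symbol removal), and prints each token
with its count, most frequent first. The listing is also written to
results.txt, and the bare counts to histogram_counts.txt for easy graphing.
"""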
import argparse

import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
def read_file(args):
    """Count token frequencies in args.filename after preprocessing."""
    word_dict = {}
    if args.stem:
        nltk.download('punkt')  # tokenizer model used by word_tokenize
        stemmer = PorterStemmer()
    if args.word:
        nltk.download('stopwords')
        stop_words = set(stopwords.words('english'))  # set for fast lookup
    with open(args.filename, 'r') as myfile:
        for line in myfile:
            if args.lower:
                line = line.lower()
            # Use NLTK's tokenizer when stemming; otherwise split on whitespace.
            tokens = word_tokenize(line) if args.stem else line.split()
            for word in tokens:
                if args.number:
                    # Keep alphabetic characters only (drops digits and symbols).
                    word = ''.join(ch for ch in word if ch.isalpha())
                    if not word:  # skip tokens left empty after stripping
                        continue
                if args.word and word in stop_words:
                    continue
                if args.stem:
                    word = stemmer.stem(word)
                word_dict[word] = word_dict.get(word, 0) + 1
    return word_dict
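# A minimal sketch of driving read_file without the command line, e.g. for
# quick testing (the filename sample.txt is hypothetical):
#
#     from argparse import Namespace
#     counts = read_file(Namespace(filename='sample.txt', lower=True,
#                                  stem=False, word=False, number=True))
#     print(counts.get('the', 0))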
def main():
    #### ARGUMENT PARSER ####
    parser = argparse.ArgumentParser(
        prog='normalize_text',
        description='Takes a text file and a set of preprocessing flags, '
                    'applies the requested normalization, and outputs token counts.',
        epilog='Please give at least one of the following preprocessing '
               'procedures: lowercasing (--lower or -l), stemming (--stem or -s), '
               'stopword removal (--word or -w), or number and symbol removal '
               '(--number or -n).'
    )
    parser.add_argument('filename')
    parser.add_argument('-l', '--lower', action='store_true', help='lowercase all text')
    parser.add_argument('-s', '--stem', action='store_true', help='apply Porter stemming')
    parser.add_argument('-w', '--word', action='store_true', help='remove English stopwords')
    parser.add_argument('-n', '--number', action='store_true', help='strip digits and symbols')
    args = parser.parse_args()

    #### READ AND THEN SORT FILE ####
    sorted_dict = sorted(read_file(args).items(), key=lambda x: x[1], reverse=True)

    #### PRINT RESULTS ####
    print_list = []
    for token, count in sorted_dict:
        print(token, count)
        print_list.append(f"{token} {count}")
    # results.txt is an exact copy of the console output
    with open("results.txt", 'w') as results:
        results.writelines([word + "\n" for word in print_list])
    # histogram_counts.txt holds the counts only, for easy graphing
    with open("histogram_counts.txt", "w") as excel:
        excel.writelines([f"{count}\n" for _, count in sorted_dict])


if __name__ == "__main__":
    main()
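# Example invocation (the input filename sample.txt is hypothetical):
#
#     python token_count.py sample.txt --lower --number
#
# This prints "token count" pairs sorted by frequency, highest first, writes
# the same listing to results.txt, and writes the bare counts, one per line,
# to histogram_counts.txt.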