Parser.py
from bs4 import BeautifulSoup
import io
# import requests
import re
import glob, os
import string

CURRENT_DIR = os.getcwd()
CORPUS_PATH = os.path.join(CURRENT_DIR, "cacm")
DOC_TOKENS_MAP = {}
TOKENIZED_CORPUS_PATH = os.path.join("TokenizedFile")

# Translation table mapping every punctuation character to a space
# (str.maketrans requires both argument strings to be the same length).
ps = string.punctuation
trans = str.maketrans(ps, " " * len(ps))


def Tokenizer(filename):
    with io.open(filename, "r", encoding="utf-8") as file:
        lines = file.read()
    body_content = parse_html_doc(lines)
    # Tokenize using included function
    tokens = tokenize(body_content)
    # Using included function CASE FOLD
    tokens = case_fold(tokens)
    # Removing unnecessary punctuation
    tokens = punctuation_handler(tokens)
    # Saving tokens to file
    save_tokens_to_file(tokens, filename)


def parse_html_doc(raw_html):
    # Extracts the main body text from the raw HTML (the <pre> element),
    # truncated just after the first " AM"/" PM" timestamp, if any.
    soup = BeautifulSoup(raw_html, 'html.parser')
    body = soup.find("pre").get_text()
    match = re.search(r'\sAM|\sPM', body)
    if match:
        body = body[:match.end()]
    return body


def tokenize(text_content):
    # Converts text into a list of whitespace-separated tokens,
    # keeping only tokens that contain at least one word character.
    raw_tokens = text_content.split()
    regex = re.compile(r'\w')
    return list(filter(regex.search, raw_tokens))


def case_fold(tokens):
    # Returns a case-folded list of tokens.
    return [x.casefold() for x in tokens]


def punctuation_handler(tokens):
    # Replaces punctuation with spaces and strips surrounding whitespace.
    punct_removed = []
    for token in tokens:
        punct_removed.append(token.translate(trans).strip())
    # Remove tokens that are now empty or whitespace-only
    regex = re.compile(r'\S')
    return list(filter(regex.search, punct_removed))


def save_tokens_to_file(tokens, file):
    filename = os.path.basename(file)
    doc_id = filename[:-5]  # strip the ".html" extension
    output_file = os.path.join(TOKENIZED_CORPUS_PATH, doc_id + ".txt")
    global DOC_TOKENS_MAP
    DOC_TOKENS_MAP[doc_id] = tokens
    with io.open(output_file, "w", encoding="utf-8") as tokenized_html:
        for token in tokens:
            tokenized_html.write(token + "\n")


def main():
    # Read input CACM raw documents corpus
    input_path = os.path.join(CORPUS_PATH, r"*.html")
    files = glob.glob(input_path)
    # Create output directory for tokenized files
    os.makedirs(TOKENIZED_CORPUS_PATH, exist_ok=True)
    for filename in files:
        Tokenizer(filename)
    print("Completed parsing documents.")

if __name__ == "__main__":
    main()
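
# Usage sketch (assumptions: a ./cacm directory of *.html CACM documents sits
# next to this script; the doc_id "CACM-0001" below is hypothetical):
#
#     import Parser
#     Parser.main()                                  # writes TokenizedFile/<doc_id>.txt
#     print(len(Parser.DOC_TOKENS_MAP))              # number of documents tokenized
#     print(Parser.DOC_TOKENS_MAP.get("CACM-0001"))  # token list for one document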