-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathclean2_tag-EN.py
52 lines (42 loc) · 1.39 KB
/
clean2_tag-EN.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
# coding:utf-8
import re
import pprint
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import wordnet as wn
from nltk.corpus.reader.wordnet import WordNetError
import sys
import time
# Wall-clock reference for the "--- N seconds ---" elapsed-time reports
# printed further down in this script.
start_time = time.time()
# NOTE: (2016.6.1)
# The "clean2_tag" name is kept for consistency between the English and
# Japanese ("clean2_tag-jp") text-processing scripts. No tag cleaning
# actually happens in this script.
# NOTE(review): the original note said this step removes stop words using
# stop-word lists defined here, but no such list exists in this file -- it
# only replaces the patterns collected in `reg` with a single space. The
# note looks copied from the Japanese script; confirm and update.
#
# Python 2 only: work around ASCII encode/decode errors by switching the
# interpreter's default string encoding to UTF-8. reload(sys) is needed
# first because sys.setdefaultencoding is removed from the sys module at
# interpreter start-up.
reload(sys)
sys.setdefaultencoding('utf8')
# WordNet lemmatizer instance.
# NOTE(review): nothing else in the visible script references `wnl` --
# confirm whether it is still needed.
wnl = WordNetLemmatizer()
# Sample file (switch to the "Real File" pair below for the full data set):
input_filename="sample_cleaned_tag_en.txt"
outpu_filename="sample_cleaned2_tag_en.txt"
# # Real File:
# input_filename="cleaned_tag_en.txt"
# outpu_filename="cleaned2_tag_en.txt"

# Patterns that are each replaced by a single space, applied in list order.
# The order matters: the suffixed forms (" 's", " 're") must run before the
# bare apostrophe, otherwise " 're" is reduced to " re" and never removed.
# The \b keeps the suffixed forms from eating the start of a quoted word
# (" 'so" / " 'really" must fall through to the bare-apostrophe rule).
# Raw strings (r'') keep the backslashes literal for the regex engine.
reg = [
    r"[ ]'s\b[ ]*",
    r"[ ]'re\b[ ]*",
    r"[ ]'[ ]*",
    # NOTE(review): this is the replacement character as it appears in the
    # file; the originally intended character may have been lost in an
    # encoding round-trip -- confirm against the real input data.
    r'[ ]�[ ]*',
]
# Compiled once so the per-line loop does no pattern lookups.
_compiled_reg = [re.compile(pattern) for pattern in reg]

# Progress is reported at lines 5000, 10000, ..., 60000 (0-based index).
_PROGRESS_STEP = 5000
_PROGRESS_LAST = 60000


def clean_line(line):
    """Return *line* with every pattern in `reg` replaced by one space.

    The patterns are applied sequentially, in list order, each one seeing
    the result of the previous substitution.
    """
    for pattern in _compiled_reg:
        line = pattern.sub(' ', line)
    return line


def main():
    """Clean `input_filename` line by line into `outpu_filename`.

    Prints a progress message with the elapsed time at every
    `_PROGRESS_STEP`-th line up to `_PROGRESS_LAST`, and the total elapsed
    time once both files have been closed.
    """
    # The input is opened first so that a missing input file does not leave
    # a freshly truncated output file behind; both handles are closed by the
    # `with` statement even if processing raises.
    with open(input_filename) as data_file, open(outpu_filename, 'w') as output:
        for index, line in enumerate(data_file):
            if index % _PROGRESS_STEP == 0 and _PROGRESS_STEP <= index <= _PROGRESS_LAST:
                print("Now start the line No.:" + str(index))
                print("--- %s seconds ---" % (time.time() - start_time))
            output.write(clean_line(line))
    print("--- %s seconds ---" % (time.time() - start_time))


if __name__ == "__main__":
    main()