forked from shangjingbo1226/AutoPhrase
-
Notifications
You must be signed in to change notification settings - Fork 1
/
filter.py
45 lines (37 loc) · 1.19 KB
/
filter.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
import re
def checkNum(word):
    """Return True if *word* contains at least one ASCII digit (0-9).

    Only the ten ASCII digit characters count; other Unicode digits are
    deliberately not treated as numbers, matching the original character list.

    Args:
        word: The string to inspect, scanned character by character.

    Returns:
        True if any character of *word* is one of '0'..'9', otherwise False
        (including for an empty string).
    """
    # Iterating a str yields single characters, so substring membership in
    # the digit string is equivalent to membership in a list of digits.
    return any(ch in "0123456789" for ch in word)
def checkStopWord(word):
    """Return True if *word* is one of the filtered stop words.

    The comparison is an exact, case-sensitive match against the fixed list
    "and", "the", "for", "of", "via", "from".

    Args:
        word: A single token to test.

    Returns:
        True if *word* equals one of the stop words, otherwise False.
    """
    return word in ("and", "the", "for", "of", "via", "from")
# Filter the AutoPhrase multi-word phrase list.
#
# Each non-blank input line is parsed as "<score>\t<phrase>". The phrase is
# written to the output file (one per line) only if it passes every check in
# the loop below.

# Raw string so the "\s" escapes reach the regex engine untouched (a plain
# string literal with "\s" is an invalid escape sequence). Compiled once
# because it is applied to every line. It accepts two or more whitespace-
# separated words, each made of at least two ASCII letters.
pattern = re.compile(r"^([a-zA-Z]{2,}\s+[a-zA-Z]{2,}(?:\s+[a-zA-Z]{2,})*)$")

# The output file is opened first, so it is created/truncated even when the
# input file is missing. The `with` statement closes both files even if a
# line fails to parse part-way through.
with open("results/filtered_phrases.txt", "w") as fWriter, \
        open("results/AutoPhrase_multi-words.txt", "r") as f:
    # Iterate the file lazily instead of loading every line into memory.
    for line in f:
        line = line.strip()
        if not line:
            # A blank line has no score to parse; skip it rather than crash.
            continue

        splitted = line.split("\t")
        score = float(splitted[0])
        phrase_part = splitted[1]

        # Drop low-scoring phrases.
        if score < 0.5:
            continue

        # Drop anything that is not purely alphabetic multi-word text.
        if pattern.match(phrase_part) is None:
            continue

        # Splitting on a single space means consecutive spaces produce empty
        # tokens, which the length check below rejects.
        phrase_words = phrase_part.split(" ")
        if len(phrase_words) > 4:
            continue

        # Reject the phrase if any word is too short (two characters or
        # fewer), contains a digit, or is a stop word. The digit check is a
        # safety net: the pattern above already admits letters only.
        if any(
            len(word) <= 2 or checkNum(word) or checkStopWord(word)
            for word in phrase_words
        ):
            continue

        fWriter.write(phrase_part + "\n")