-
Notifications
You must be signed in to change notification settings - Fork 44
/
Copy pathrecursive_cut.py
52 lines (45 loc) · 1.47 KB
/
recursive_cut.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
# coding=UTF-8
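# Cut everything into words of three characters or fewer.
# NOTE(review): the code below only re-splits words longer than 4 characters,
# so 4-character words are kept as-is -- confirm the intended limit.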
import jieba
def recursive_cut(line):
    """Segment *line* with jieba and re-split over-long tokens.

    The text is first cut with ``jieba.lcut(line, HMM=False)``.  Each token
    is passed to ``get_subword_list``: a string result is kept as one word,
    a list result is flattened into the output by ``go_subword_list``.

    Returns a flat list of the resulting words, in their original order.
    """
    pieces = []
    for token in jieba.lcut(line, HMM=False):
        candidate = get_subword_list(token)
        if isinstance(candidate, str):
            # Token was left whole (short, or not starting with a CJK char).
            pieces.append(candidate)
            continue
        if isinstance(candidate, list):
            # Token was re-cut; let the helper flatten (and re-split) it.
            go_subword_list(candidate, pieces)
            continue
        # get_subword_list returned neither str nor list.
        print("error")
    return pieces
def isEN(uchar):
    """Return True if *uchar* compares within 'A'..'Z' or 'a'..'z'.

    The check is a plain string comparison against the ASCII letter
    ranges, so it is meant for single-character input.
    """
    in_upper = u'\u0041' <= uchar <= u'\u005a'  # 'A'..'Z'
    in_lower = u'\u0061' <= uchar <= u'\u007a'  # 'a'..'z'
    return in_upper or in_lower
def isZH(char):
    """Return True if *char* compares within U+4E00..U+9FA5 (inclusive).

    The check is a plain string comparison against that range, so it is
    meant for single-character input.
    """
    return '\u4e00' <= char <= '\u9fa5'
def get_subword_list(big_word):
    """Re-cut a long Chinese word into smaller jieba tokens.

    Args:
        big_word: the word (string) to inspect.

    Returns:
        ``big_word`` itself (a ``str``) when it is empty, when its first
        character is not Chinese according to ``isZH``, or when it has 4
        characters or fewer.  Otherwise a ``list`` of tokens produced by
        ``jieba.lcut(big_word, HMM=False)``.

    Side effects:
        For words longer than 4 characters, ``jieba.del_word(big_word)`` is
        called before re-cutting.
    """
    # An empty string has no first character to inspect; return it unchanged
    # instead of raising IndexError on big_word[0].
    if not big_word or not isZH(big_word[0]):
        return big_word
    if len(big_word) <= 4:
        return big_word
    # Presumably dropping the word from jieba's dictionary forces the cut
    # below to split it into smaller pieces -- confirm against jieba docs.
    jieba.del_word(big_word)
    return jieba.lcut(big_word, HMM=False)
def go_subword_list(input_list,result):
    """Flatten *input_list* into *result*, re-splitting over-long words.

    Words of 4 characters or fewer are appended to ``result`` unchanged.
    Longer words are passed to ``get_subword_list``: a string result is
    appended, a list result is processed recursively by this function.

    ``result`` is modified in place; nothing is returned.
    """
    for word in input_list:
        if len(word) <= 4:
            # Short enough already: keep as-is.
            result.append(word)
            continue
        pieces = get_subword_list(word)
        if isinstance(pieces, str):
            result.append(pieces)
        elif isinstance(pieces, list):
            go_subword_list(pieces, result)
        else:
            # get_subword_list returned neither str nor list.
            print("error")
#print(recursive_cut("一二三四五六七八九十"))
#print(recursive_cut("十九八七六五四三二一"))