# adj_finder.py
# -*- coding: utf-8 -*-
"""
Created on Sat Mar 12 09:02:26 2022
@author: Audrey
"""
from nltk.corpus.reader.bnc import BNCCorpusReader as bnc
import re
import collections
import pandas as pd
import Preprocessor as pp
from tqdm import tqdm
# Scan the BNC texts, collect every token tagged ADJ, and split the tokens by
# suffix: those ending in -al / -ary / -an / -ic go to ``rel_adj_list``, all
# others to ``qual_adj_list``. Frequency-based selections of the suffix-matched
# adjectives are then exported to 'adj_list.csv' and 'adjectives.txt'.
bnc_reader = bnc(root="BNC/Texts", fileids=r'[A-K]/\w*/\w*\.xml')

# Compiled once, outside the corpus loop. The previous pattern
# '?!.*(al|ary|an|ic)$' lacked the parentheses of its lookahead and raised
# re.error ("nothing to repeat") as soon as the first ADJ token was tested.
REL_SUFFIX_RE = re.compile(r'(?:al|ary|an|ic)$')

rel_adj_list = []
qual_adj_list = []

for fileid in tqdm(bnc_reader.fileids()):
    file_root = pp.get_root(fileid)
    for s in file_root.iter('s'):
        for w in s.iter('w'):
            # Skip non-adjectives and elements without any text content.
            if w.get('pos') != 'ADJ' or not w.text:
                continue
            # Normalise once so that the suffix test and the stored token
            # agree (previously the test ran on stripped, case-preserved text
            # while the stored token was lowercased but not stripped).
            word = w.text.strip().lower()
            if not word:
                continue
            if REL_SUFFIX_RE.search(word):
                rel_adj_list.append(word)
            else:
                qual_adj_list.append(word)

# One frequency table per class; previously ``qual_counts`` was built from the
# (always empty) ``rel_adj_list`` and ``rel_counts`` was never defined.
rel_counts = collections.Counter(rel_adj_list)
qual_counts = collections.Counter(qual_adj_list)

full_rel_adjs = list(rel_counts)                                      # every distinct form
common_rel_adjs = [adj for adj, c in rel_counts.items() if c >= 30]   # seen 30+ times
unique_rel_adjs = [adj for adj, c in rel_counts.items() if c == 1]    # seen exactly once
rel_150 = [adj for adj, _ in rel_counts.most_common(150)]             # 150 most frequent

adjs = pd.DataFrame(rel_150)
adjs.to_csv('adj_list.csv')

# NOTE(review): the original loop iterated over an undefined name ``al_list``.
# The full list of suffix-matched adjectives is written here on the assumption
# that this was the intent -- confirm which list belongs in this file.
with open('adjectives.txt', 'w', encoding='utf-8') as f:
    f.writelines(f"{item}\n" for item in full_rel_adjs)