-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathDataParser.py
102 lines (87 loc) · 3.35 KB
/
DataParser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
import re
import json
class DataParser:
    """Load a user manifest CSV and serve per-user labels, folds and tweets.

    The manifest is read once in the constructor. Each data row is split on
    commas: column 0 is the user id, column 1 is the condition (the exact
    string 'schizophrenia' maps to label 1, anything else to -1) and
    column 5 is an integer fold index that is stored incremented by one
    (presumably 0-9 in the file, giving folds 1-10 -- confirm against the
    manifest).

    Attributes:
        data_path: Directory holding one '<user_id>.<ext>' file per user.
        user_ids: User ids in manifest order.
        label: Maps user id -> 1 or -1.
        fold: Maps user id -> fold number.
        tweets: Cache of preprocessed tweets, keyed by user id.
        user_ids_from_fold: Maps i -> users in fold i and -i -> users NOT
            in fold i, for i in 1..10.
        user_ids_from_label: Maps 1 / -1 -> users carrying that label.
    """

    # Diagnosis words stripped from tweets. Longer terms come first so that
    # removing 'schizo' cannot leave fragments such as 'typal' or 'id'.
    _DIAGNOSIS_TERMS = ('schizophrenia', 'schizotypal', 'schizoid',
                        'schizo', 'skitzo', 'skitso')

    # The pattern is assembled from adjacent raw strings; a backslash-newline
    # continuation inside a raw string would leak the backslash, the newline
    # and the indentation into the regex itself.
    _URL_RE = re.compile(
        r'http[s]?:\/\/(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|'
        r'(?:%[0-9a-fA-F][0-9a-fA-F]))+')
    _MENTION_RE = re.compile(r'@[a-zA-Z0-9_]*')

    def __init__(self, data_path, csv_filename):
        """Parse the manifest at *csv_filename*.

        Args:
            data_path: Directory containing the per-user tweet files.
            csv_filename: Path of the manifest CSV; its first line is
                treated as a header and skipped.

        Raises:
            IOError/OSError: If the manifest cannot be opened.
            IndexError: If a non-blank row has fewer than six columns.
            ValueError: If column 5 of a row is not an integer.
        """
        self.data_path = data_path
        self.user_ids = []
        self.label = {}
        self.fold = {}
        self.tweets = {}
        self.user_ids_from_fold = {}
        for i in range(1, 11):
            self.user_ids_from_fold[i] = []
            self.user_ids_from_fold[-i] = []
        self.user_ids_from_label = {1: [], -1: []}

        with open(csv_filename) as manifest:
            next(manifest, None)  # skip the header row
            for line in manifest:
                if not line.strip():
                    continue  # tolerate blank lines (e.g. at end of file)
                fields = line.split(',')
                user_id = fields[0].strip()
                label = 1 if fields[1] == 'schizophrenia' else -1
                fold = int(fields[5]) + 1

                self.user_ids.append(user_id)
                self.label[user_id] = label
                self.fold[user_id] = fold
                self.user_ids_from_label[label].append(user_id)
                # Key i holds the members of fold i; key -i holds everyone
                # else, i.e. the complement of fold i.
                for i in range(1, 11):
                    key = i if i == fold else -i
                    self.user_ids_from_fold[key].append(user_id)

    def get_label(self, user_id):
        """Return the label (1 or -1) recorded for *user_id*."""
        return self.label[user_id]

    def get_fold(self, user_id):
        """Return the fold number recorded for *user_id*."""
        return self.fold[user_id]

    def get_users(self):
        """Return the list of user ids in manifest order."""
        return self.user_ids

    def get_tweets(self, user_id, ext='tweets'):
        """Return the preprocessed, non-retweet tweets of *user_id*.

        Reads '<data_path>/<user_id>.<ext>', which must contain one JSON
        object per line with a 'text' key. Tweets whose text starts with
        'RT' are dropped; the rest go through preprocess_tweet().

        The result is cached per user id only, so a later call with a
        different *ext* returns the list loaded by the first call.

        Args:
            user_id: Id of the user whose file is read.
            ext: File extension of the tweet file.

        Returns:
            The cached list of preprocessed tweet strings.
        """
        if self.tweets.get(user_id) is None:
            result = []
            path = '%s/%s.%s' % (self.data_path, user_id, ext)
            with open(path) as tweet_file:
                for raw in tweet_file:
                    if not raw.strip():
                        continue  # blank lines are not JSON documents
                    text = json.loads(raw)['text']
                    if not text.startswith('RT'):
                        result.append(self.preprocess_tweet(text))
            self.tweets[user_id] = result
        return self.tweets[user_id]

    def get_tweets_by_label(self, label):
        """Return the tweets of every user labelled *label* (1 or -1)."""
        result = []
        for user_id in self.user_ids_from_label[label]:
            result.extend(self.get_tweets(user_id))
        return result

    def get_tweets_by_fold(self, fold):
        """Return the tweets of every user filed under key *fold*.

        A positive key selects the users in that fold; a negative key
        selects the users outside fold abs(fold).
        """
        result = []
        for user_id in self.user_ids_from_fold[fold]:
            result.extend(self.get_tweets(user_id))
        return result

    @staticmethod
    def preprocess_tweet(tweet):
        """Normalise one tweet text and return it lowercased.

        Steps, in order: remove URLs, remove @mentions, turn newlines into
        spaces, spell out ':' and '|' as 'colon' and 'pipe', lowercase, and
        delete the diagnosis words listed in _DIAGNOSIS_TERMS.
        """
        tweet = DataParser._URL_RE.sub('', tweet)
        tweet = DataParser._MENTION_RE.sub('', tweet)
        tweet = tweet.replace('\n', ' ')
        tweet = tweet.replace(':', 'COLON').replace('|', 'PIPE')
        # Lowercase before stripping the diagnosis words so that capitalised
        # spellings ('Schizophrenia', 'SCHIZO') are removed as well.
        tweet = tweet.lower()
        for term in DataParser._DIAGNOSIS_TERMS:
            tweet = tweet.replace(term, '')
        return tweet
if __name__ == '__main__':
    # Script entry point: write each user's preprocessed tweets to
    # '<user_id>.txt' in the current working directory, one tweet per line.
    import io

    # ASCII whitespace only, matching what stripping the UTF-8 encoded
    # bytes removes, so non-ASCII spaces in a tweet are kept.
    ascii_whitespace = ' \t\n\r\x0b\x0c'

    dp = DataParser('../data/schizophrenia/',
                    '../data/schizophrenia/anonymized_user_manifest.csv')
    for curr_id in dp.get_users():
        # io.open yields a UTF-8 text stream on both Python 2 and 3, so the
        # tweet is written as text instead of concatenating bytes with str
        # (a TypeError on Python 3). The with-block closes the file even if
        # loading or writing a tweet fails.
        with io.open('%s.txt' % (curr_id), 'w', encoding='utf-8') as fp:
            for t in dp.get_tweets(curr_id):
                fp.write(u'%s\n' % t.strip(ascii_whitespace))