forked from freelion93/TwitterLanguageAnalyser
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathadd-sentiment.py
66 lines (50 loc) · 1.83 KB
/
add-sentiment.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
import json
import html
import indicoio
# SECURITY(review): hard-coded API key committed to the repository — this key
# should be rotated and loaded from an environment variable or a config file
# that is kept out of version control.
indicoio.config.api_key = 'c4d4e9597ef31b91c9e723ebcf363cdf'
# Load geotagged tweets from disk and bucket them by language so the
# sentiment API can be called in per-language batches.
with open('array-loc.json', 'r') as infile:
    tweets_in = json.load(infile)
tweets_out = {}   # tweet id (str) -> tweet dict (aliases entries of tweets_in)
print(len(tweets_in))
act_langs = {}    # language code -> count of geotagged tweets in that language
data = {}         # language code -> {"id": [...], "txt": [...]} parallel lists
# Single pass; the original ran two loops with the identical
# coordinates/lang filter (one to build the buckets, one to fill them).
for tweet in tweets_in:
    if not tweet['coordinates']:
        continue  # only geotagged tweets are analysed
    lang = tweet['lang']
    if not lang:
        continue
    act_langs[lang] = act_langs.get(lang, 0) + 1
    tweets_out[str(tweet['id'])] = tweet
    bucket = data.setdefault(lang, {"id": [], "txt": []})
    bucket['id'].append(str(tweet['id']))
    # Un-escape HTML entities (&amp; etc.) before sending text to the API.
    bucket['txt'].append(html.unescape(tweet['text']))
print(act_langs)
def chunks(l, n):
    """Lazily split sequence *l* into consecutive slices of at most *n* items.

    The final slice may be shorter than *n*; an empty input yields nothing.
    """
    start = 0
    total = len(l)
    while start < total:
        yield l[start:start + n]
        start += n
# For each language, query indico for sentiment and emotion scores in
# batches, then attach the scores to the matching tweet dicts.
for lang, bucket in data.items():
    print(len(bucket['txt']), len(bucket['id']))
    maxchunksize = 10000  # batch size per API request
    txtchunks = list(chunks(bucket['txt'], maxchunksize))
    idchunks = list(chunks(bucket['id'], maxchunksize))
    print(len(txtchunks), len(idchunks))
    # id/txt lists are parallel, so zip keeps each id with its text chunk.
    for cidx, (txtchunk, idchunk) in enumerate(zip(txtchunks, idchunks)):
        print("nextchunk {}".format(cidx))
        sent = indicoio.sentiment(txtchunk, language=lang)
        emo = indicoio.emotion(txtchunk, language=lang)
        for i, s in enumerate(sent):
            # tweets_out values alias the dicts inside tweets_in, so these
            # writes also show up in the tweets_in dump below.
            tweets_out[idchunk[i]]['sentiment'] = s
            tweets_out[idchunk[i]]['emotion'] = emo[i]
with open('array-sent.json', 'w') as outfile:
    json.dump(tweets_in, outfile, indent=4)  # dump tweets_in to keep the original order
#indicoio.sentiment(['indico is so easy to use!', 'Still really easy, yiss'], language='')