-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathadd-loc.py
70 lines (56 loc) · 2.12 KB
/
add-loc.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
import json
from geopy import geocoders
from geopy import exc
from urllib3 import exceptions
# GeoNames geocoder client used for every location lookup in this script.
# NOTE(review): the account name is hard-coded here; consider moving it to
# configuration if this script is shared.
gn = geocoders.GeoNames(username='glouwa')

# Persistent lookup cache, keyed by the stringified location. Loading it up
# front lets repeated runs skip locations that were already resolved.
try:
    with open('cache-loc.json', 'r') as infile:
        cache = json.load(infile)
except FileNotFoundError:
    # First run (or cache deleted): start empty instead of crashing before
    # any work is done; the file is created the first time the cache is saved.
    cache = {}
def processTweet(tweet):
    """Geocode a tweet's location and, on success, queue the tweet for output.

    The location to look up is taken from ``tweet['place']``,
    ``tweet['coordinates']`` and ``tweet['user']['location']``; when more
    than one is set, the later one in that list wins.

    Lookup results are stored in the module-level ``cache`` under
    ``str(location)``. A value of ``None`` records a lookup that returned
    nothing, so cached locations are never sent to the geocoder ``gn`` again.

    When the location resolves, ``tweet['geo']`` is set to the resolved
    address, ``tweet['coordinates']`` is overwritten with
    ``[longitude, latitude]`` and the tweet is appended to the module-level
    ``tweets_out``. Tweets without a location, or whose location does not
    resolve, are left unchanged and are not appended.

    A geocoder timeout or connection error is reported on stdout and the
    tweet is skipped; nothing is cached in that case, so the same location
    is tried again the next time it is seen.

    Args:
        tweet: Tweet record as a dict; modified in place on success.
    """
    # Last non-empty candidate wins, so the user's profile location takes
    # precedence over the tweet's own coordinates and place.
    loc = None
    if tweet.get('place'):
        loc = tweet['place']
    if tweet.get('coordinates'):
        loc = tweet['coordinates']
    user = tweet.get('user') or {}
    if user.get('location'):
        loc = user['location']

    if not loc:
        return

    loc_str = str(loc)
    if loc_str not in cache:
        # NOTE(review): loc is handed to the geocoder as-is; if place or
        # coordinates are structured objects rather than plain strings the
        # query may not be meaningful -- confirm against the input data.
        try:
            geoloc = gn.geocode(loc)
        except (exc.GeocoderTimedOut,
                exceptions.ReadTimeoutError,
                exceptions.ProtocolError):
            print("IOError")
            return

        if geoloc:
            cache[loc_str] = {
                "address": geoloc.address,
                "coord": [geoloc.longitude, geoloc.latitude],
            }
        else:
            # Remember the miss so this location is not queried again.
            cache[loc_str] = None

    entry = cache[loc_str]
    if entry is not None:
        tweet['geo'] = entry['address']
        tweet['coordinates'] = entry['coord']
        tweets_out.append(tweet)
# Tweets that were successfully geocoded; processTweet() appends to this list.
tweets_out = []

# Load the whole input up front so the file handle is released before the
# (slow, network-bound) geocoding loop starts.
with open('stream-raw-with-geo2.json', 'r') as infile:
    tweets_in = json.load(infile)
print(len(tweets_in))

# Only tweets in these languages are geocoded.
langs = ['ar', 'nl', 'en', 'fr', 'de', 'it', 'ja', 'pt', 'ru', 'es']

# Cache size at the time of the last save; used to decide when to save again.
lastcachelen = len(cache)

for idx, tweet in enumerate(tweets_in):
    # Progress and checkpoints are keyed on idx alone so they fire on
    # schedule even when the idx-th tweet is filtered out by language.
    if idx % 100 == 0:
        print(idx, len(cache), len(tweets_out))

    if str(tweet.get('lang')) in langs:
        processTweet(tweet)

        # Persist the geocoding cache every ~100 new entries so an
        # interrupted run does not have to repeat those lookups.
        if len(cache) > lastcachelen + 100:
            with open('cache-loc.json', 'w') as outfile:
                json.dump(cache, outfile)
            lastcachelen = len(cache)
            print("saving cache")

    # Periodic checkpoint of the output collected so far.
    if idx % 10000 == 0:
        with open('array-loc.json', 'w') as outfile:
            json.dump(tweets_out, outfile)

# Final flush of the cache: without it, up to 100 entries added since the
# last periodic save would be lost when the script exits.
if len(cache) != lastcachelen:
    with open('cache-loc.json', 'w') as outfile:
        json.dump(cache, outfile)
    lastcachelen = len(cache)
    print("saving cache")

# Final write of all geocoded tweets.
with open('array-loc.json', 'w') as outfile:
    json.dump(tweets_out, outfile)