get_tweets_per_nghd_word.py
#!/usr/bin/env python
# coding: utf-8
# For each neighborhood and each of the top 10 words for that
# neighborhood, compiles a list of the tweets that contain that word.
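#
# Inputs (all referenced below): a Postgres 'tweet' database with a
# tweet_pgh table, point_map.csv (rounded lat/lon bin -> neighborhood),
# and outputs/nghd_words.json (top words per neighborhood).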
from collections import defaultdict
import csv, sys
from csv import DictReader
import json, string
import psycopg2, psycopg2.extras
import util.util


def run_all():
    csv.field_size_limit(sys.maxsize)
    psql_conn = psycopg2.connect("dbname='tweet'")
    psycopg2.extras.register_hstore(psql_conn)
    pg_cur = psql_conn.cursor(cursor_factory=psycopg2.extras.DictCursor)

    # Build up the bins-to-nghds mapping so we can easily translate.
    bins_to_nghds = {}
    for line in DictReader(open('point_map.csv')):
        bins_to_nghds[(float(line['lat']), float(line['lon']))] = line['nghd']
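    # (Each point_map.csv row is assumed to carry lat, lon, and nghd
    # columns, e.g. "40.44,-79.99,Shadyside" -- illustrative values.)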

    words_per_nghd = json.load(open('outputs/nghd_words.json'))
    top10words = {}
    tweets_per_word = defaultdict(lambda: defaultdict(list))
    for nghd in words_per_nghd:
        top10words[nghd] = words_per_nghd[nghd]["top words"]
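    # (nghd_words.json is only assumed to look like
    # {"<nghd>": {"top words": ["w1", ...], ...}, ...} -- the "top words"
    # list is the one piece of structure this script relies on.)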

    pg_cur.execute("SELECT text, ST_ASGEOJSON(coordinates), user_screen_name " +
                   "FROM tweet_pgh;")

    # Strip all punctuation except characters that carry meaning in
    # tweets: hashtags, mentions, hyphens, and apostrophes. Build the
    # set once here instead of once per tweet.
    exclude = set(string.punctuation)
    exclude.remove('#')
    exclude.remove('-')
    exclude.remove("'")
    exclude.remove("@")

    counter = 0
    for row in pg_cur:
        counter += 1
        if (counter % 10000) == 0:
            print str(counter) + ' tweets processed'
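        # GeoJSON stores coordinates as [lon, lat], so swap the order
        # before rounding into the same lat/lon grid point_map.csv uses.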
        coords = json.loads(row[1])['coordinates']
        latlon_bin = util.util.round_latlon(coords[1], coords[0])
        if latlon_bin in bins_to_nghds:
            tweet_nghd = bins_to_nghds[latlon_bin]
        else:
            tweet_nghd = 'Outside Pittsburgh'
        username = row[2]
        tweet = row[0]
        unchangedTweet = row[0]
        # Normalize curly quotes and ellipses to ASCII, and drop newlines.
        tweet = tweet.replace('“', '"').replace('”', '"')
        tweet = tweet.replace('’', "'").replace('‘', "'")
        tweet = tweet.replace('…', '...')
        tweet = tweet.replace('\n', '')
        for punct in exclude:
            tweet = tweet.replace(punct, "")
        wordList = [w.lower() for w in tweet.split(" ")]
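        # e.g. "Back at Joe's #bakery!!" -> ["back", "at", "joe's", "#bakery"]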
        if tweet_nghd in top10words:
            for word in top10words[tweet_nghd]:
                # json.load yields unicode; encode to UTF-8 so the
                # comparison matches the byte strings in wordList.
                word = word.encode('utf-8')
                if word in wordList:
                    tweets_per_word[tweet_nghd][word].append(username + ": " + unchangedTweet)
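
    # tweets_per_word maps neighborhood -> word -> list of "user: tweet"
    # strings; that is the structure written out below.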
print "writing to JSON file"
with open('outputs/tweets_per_nghd_words.json','w') as outfile:
json.dump(tweets_per_word,outfile, indent=2)


if __name__ == '__main__':
    run_all()