-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathbio_guide.py
173 lines (139 loc) · 5.65 KB
/
bio_guide.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
""" Scrapes Congressional Bio Guide and saves images. """
from __future__ import print_function
import argparse
import json
import glob
import os
import shutil
from pymongo import MongoClient
import requests
from requests.packages import urllib3
urllib3.disable_warnings()
def get_blacklist():
    """ Read the blacklist of ICPSRs known to have no bio guide image.

    Returns the parsed JSON object (expected to contain a "blacklist" key,
    as used by the callers in this module).
    """
    # Context manager guarantees the file handle is closed promptly;
    # the original relied on garbage collection to close it.
    with open("config/bio_guide_results.json", "r") as blacklist_file:
        return json.load(blacklist_file)
def get_config():
    """ Read the scraper configuration file and return the parsed JSON.

    The config is expected to provide "db_host", "db_port", and
    "bio_guide_url" keys (see callers in this module).
    """
    # Context manager guarantees the file handle is closed promptly;
    # the original relied on garbage collection to close it.
    with open("config/config.json", "r") as config_file:
        return json.load(config_file)
def list_images():
    """ Return the set of zero-padded ICPSR IDs that already have images.

    Scans both the processed and raw image subdirectories and extracts
    the filename stem (everything before the first dot) from each file.
    """
    def _stems(pattern):
        # os.path.basename handles the platform path separator; the
        # original's rsplit("/") would fail on Windows glob results.
        return {os.path.basename(path).split(".")[0]
                for path in glob.glob(pattern)}

    processed = _stems("images/bio_guide/*.*")
    raw = _stems("images/raw/bio_guide/*.*")
    return processed | raw
def get_missing_mongo(min_congress):
    """ Check which ICPSRs in our query are actually missing from Mongo DB.

    Returns a list of [zero-padded ICPSR, bioguide_id] pairs for members
    at or above min_congress that have a bioguide ID in the database but
    no image on disk and are not blacklisted.
    """
    # Connect
    config = get_config()
    connection = MongoClient(config["db_host"], config["db_port"])
    cursor = connection["voteview"]
    query = {"bioguide_id": {"$exists": True},
             "congress": {"$gte": min_congress}}
    present_set = list_images()
    blacklist = get_blacklist()
    person_set = []
    # A set gives O(1) membership tests; the original used a list.
    icpsr_seen = set()
    filter_return = {"bioguide_id": 1, "bioname": 1, "congress": 1, "icpsr": 1}
    results = cursor.voteview_members.find(query, filter_return,
                                           no_cursor_timeout=True)
    try:
        for row in results:
            # Because the same ICPSR can be recycled across congresses,
            # keep a running set of already-seen ICPSRs.
            if row.get("icpsr", 0) not in icpsr_seen:
                person_set.append([str(row["icpsr"]).zfill(6),
                                   row["bioguide_id"]])
                icpsr_seen.add(row["icpsr"])
    finally:
        # no_cursor_timeout cursors are never reaped server-side, so they
        # must be closed explicitly; close the client connection too.
        results.close()
        connection.close()
    icpsr_zfill = {x[0] for x in person_set}
    missing = icpsr_zfill - present_set - set(blacklist["blacklist"])
    return [x for x in person_set if x[0] in missing]
def get_missing_flat(min_congress):
    """ Check which ICPSRs in our query are actually missing from flat file.

    Returns a list of [icpsr, bioguide_id] pairs for members at or above
    min_congress that have a bioguide ID but no image on disk and are not
    blacklisted.
    """
    # All people; context manager ensures the handle is closed (the
    # original left it to garbage collection).
    with open("config/database-raw.json", "r") as raw_file:
        people = json.load(raw_file)
    # Min match
    people = [[x["icpsr"], x["bioguide_id"]]
              for x in people
              if x["congress"] >= min_congress and "bioguide_id" in x]
    # Now exclude found results
    present_set = set(list_images()) | set(get_blacklist()["blacklist"])
    return [x for x in people if str(x[0]).zfill(6) not in present_set]
def save_image(icpsr, extension, data):
    """ Write raw image bytes from a file-like object to disk.

    Saves to images/raw/bio_guide/<icpsr>.<extension>, creating the
    directory on first use.
    """
    target_dir = os.path.dirname("images/raw/bio_guide/")
    if not os.path.exists(target_dir):
        os.makedirs(target_dir)
    target_path = "images/raw/bio_guide/%s.%s" % (icpsr, extension)
    # Stream the file-like object straight to disk in binary mode.
    with open(target_path, "wb") as out_file:
        shutil.copyfileobj(data, out_file)
def individual_lookup(icpsr, bioguide_id):
    """ Download the bio guide portrait for one member, if one exists.

    Builds the image URL from the bioguide ID (first letter is the
    directory shard), checks existence with a HEAD request, and streams
    the JPEG to disk via save_image on a 200 response.
    """
    config = get_config()
    lookup_url = config["bio_guide_url"]
    image_url = "%s/%s.jpg" % (bioguide_id[0], bioguide_id)
    full_url = lookup_url + image_url
    # HEAD first avoids downloading a 404 body. Explicit timeouts keep a
    # dead server from hanging the scrape forever — requests has no
    # default timeout.
    status = requests.head(full_url, timeout=30).status_code
    if status == 200:
        binary_download = requests.get(full_url, stream=True, timeout=30)
        save_image(icpsr, "jpg", binary_download.raw)
        print("\t OK, downloaded.")
    else:
        print("\t No image")
def main_loop(db_type, min_congress):
    """
    Get missing members and scrape a photo for each of them from the
    bio guide.

    db_type selects the "flat" file lookup; any other value uses Mongo.
    """
    if db_type == "flat":
        missing_icpsrs = get_missing_flat(min_congress)
    else:
        missing_icpsrs = get_missing_mongo(min_congress)
    # Iterate through the set. enumerate replaces the manual counter;
    # len() is hoisted out of the loop.
    total = len(missing_icpsrs)
    for position, (icpsr, bioguide_id) in enumerate(missing_icpsrs, 1):
        print("Lookup for icpsr %s (bio guide ID %s)... %d/%d" %
              (icpsr, bioguide_id, position, total))
        individual_lookup(icpsr, bioguide_id)
def single_download(db_type, icpsr):
    """ Download the image for a single ICPSR.

    db_type selects the "flat" file lookup; any other value uses Mongo.
    Prints a message and returns without error when no bioguide ID is
    found for the ICPSR.
    """
    if db_type == "flat":
        with open("config/database-raw.json", "r") as raw_file:
            people = json.load(raw_file)
        # BUG FIX: next() needs a default — without one it raises
        # StopIteration when no record matches, making the "No bioguide
        # information" branch unreachable.
        bioguide_id = next((x["bioguide_id"]
                            for x in people
                            if x["icpsr"] == icpsr and "bioguide_id" in x),
                           None)
        if bioguide_id:
            individual_lookup(str(icpsr).zfill(6), bioguide_id)
        else:
            print("No bioguide information for ICPSR %s" % icpsr)
        return
    # Connect
    config = get_config()
    connection = MongoClient(config["db_host"], config["db_port"])
    cursor = connection["voteview"]
    query = {"bioguide_id": {"$exists": True}, "icpsr": icpsr}
    result = cursor.voteview_members.find_one(query,
                                              {"bioguide_id": 1, "_id": 0})
    if result:
        individual_lookup(str(icpsr).zfill(6), result["bioguide_id"])
    else:
        print("No bioguide information for ICPSR %s" % icpsr)
    return
def process_arguments():
    """ Parse command-line arguments and dispatch the requested scrape.

    --min    lowest congress number to consider (default 105)
    --type   "mongo" (default) or "flat" data source
    --icpsr  download a single member instead of the full sweep
    """
    parser = argparse.ArgumentParser(
        description="Scrapes Congressional Bioguide for Bio Images")
    parser.add_argument("--min", type=int, nargs="?", default=105)
    parser.add_argument("--type", type=str, default="mongo", nargs="?")
    parser.add_argument("--icpsr", type=int, nargs="?")
    args = parser.parse_args()
    # A single ICPSR takes precedence over the full sweep.
    if args.icpsr:
        single_download(args.type, args.icpsr)
    else:
        main_loop(args.type, args.min)


if __name__ == "__main__":
    process_arguments()