-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathisni_matching.py
95 lines (83 loc) · 3.21 KB
/
isni_matching.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
#!/usr/bin/env python
import codecs
import csv
import os
import re
import sys
import time
import urllib

import requests
from bs4 import BeautifulSoup
def parse_isni(response):
    """Collect the <isniuri> elements contained in an ISNI search response.

    response -- object whose ``text`` attribute holds the response body
                (here, the result of ``query_isni``).

    Returns the BeautifulSoup result set of every ``isniuri`` tag found;
    the set is empty when the body contains none.
    """
    return BeautifulSoup(response.text, 'html.parser').find_all('isniuri')
def query_isni(name, timeout=None):
    """Search the ISNI SRU endpoint for ``name`` and return the HTTP response.

    name    -- text substituted into the ``pica.nw=`` search clause.
    timeout -- optional number of seconds passed through to ``requests.get``;
               the default ``None`` keeps the previous behaviour of waiting
               for the server indefinitely.

    Returns the ``requests`` response object for the search request.
    """
    root = "http://isni.oclc.nl/sru/DB=1.2/"
    params = {
        'query': 'pica.nw={0}'.format(name),
        'operation': 'searchRetrieve',
        'recordSchema': 'isni-b',
        'maximumRecords': 1000,
    }
    # Let requests build and percent-encode the query string: it yields the
    # same "<root>?<encoded params>" URL without relying on urllib.urlencode,
    # which only exists under that name on Python 2.
    return requests.get(root, params=params, timeout=timeout)
def get_record_info(uri):
    """Fetch one ISNI record as XML and print its forename and surname.

    uri -- element whose ``text`` is the record URI; ``.xml`` is appended to
           it to build the address that is downloaded.

    Prints one ``forename: ...`` line and one ``surname: ...`` line. When the
    record has no such element the value is printed empty instead of raising
    AttributeError on the missing tag.
    """
    isni_xml = requests.get(uri.text + '.xml').text
    record = BeautifulSoup(isni_xml, 'html.parser')
    for label in ('forename', 'surname'):
        element = record.find(label)
        value = element.text if element is not None else ''
        # Plain concatenation (not str.format) so a non-ASCII unicode value
        # is not forced through an ASCII encode on Python 2.
        print(label + ": " + value)
def write_csv(filename, content):
    """Write ``content`` to ``filename`` as a fully quoted CSV with a UTF-8 BOM.

    filename -- path of the file to create (an existing file is overwritten).
    content  -- iterable of rows, each row being an iterable of cell values.

    Prints a confirmation line once the file has been written.
    """
    # NOTE(review): the handle is binary because the BOM is written as raw
    # bytes and the csv writer shares the handle (Python 2 csv convention).
    with open(filename, 'wb') as csv_file:
        csv_file.write(codecs.BOM_UTF8)
        csv.writer(csv_file, quotechar='"', quoting=csv.QUOTE_ALL).writerows(content)
    print("%s has been created" % (filename,))
# Zero-based positions of the columns read from the input CSV; the order in
# which they are emitted matches the header rows defined below.
NAME_COL = 3
TITLE_COL = 27
YEAR_COL = 22
ITEM_COL = 147

# Maximum number of ISNI URIs written out for a single matched name.
MAX_URIS = 5

# Output tables; the first row of each one is its header.
matches = [['Name', 'Title', 'Year of Release', 'Item number',
            'Number of Matching Records (Total)',
            'ISNI Record URIs (top {0} only)'.format(MAX_URIS)]]
non_matches = [['Name', 'Original Titles', 'Year of Release', 'Item number']]

# Values of the name column that are never sent to ISNI.
exclude_names = ["Director", "[uncredited]", "[unknown]"]
# Unique names in the order they were queried.
name_list = []
# Names that must not be queried (again): the exclusions plus every name
# already handled. A set keeps the per-row check constant-time.
handled_names = set(exclude_names)

# Input filename
if len(sys.argv) > 1:
    input_file = sys.argv[1]
else:
    print("Please provide a CSV file.\n")
    # Stop here: carrying on would only fail with a NameError on input_file.
    sys.exit(1)

# Derived files are created next to the input file. The prefix goes on the
# base name only, so an input such as "data/names.csv" produces
# "data/clean_names.csv" rather than the non-existent "clean_data/names.csv".
input_dir, input_name = os.path.split(input_file)
clean_file = os.path.join(input_dir, 'clean_' + input_name)

# Clean up any null bytes in csv
with open(input_file, 'rb') as fi:
    data = fi.read()
with open(clean_file, 'wb') as fo:
    fo.write(data.replace(b'\x00', b''))

# Read the file
with open(clean_file, 'rb') as name_csv:
    for row in csv.reader(name_csv):
        # Blank or truncated rows have no name column to look at.
        if len(row) <= NAME_COL:
            continue
        name = row[NAME_COL].strip()
        if name in handled_names:
            continue
        handled_names.add(name)
        name_list.append(name)

        ### For testing ###
        # if len(name_list) > 15:
        #     break

        print("\nQuerying ISNI for '{0}'...".format(name))
        uri_result_set = parse_isni(query_isni(name))
        record = [name, row[TITLE_COL], row[YEAR_COL], row[ITEM_COL]]

        if not uri_result_set:
            print('-> Zero records found')
            non_matches.append(record)
            continue

        total = len(uri_result_set)
        print("-> {0} records found".format(total))
        uri_list = [str(uri.text) for uri in uri_result_set[:MAX_URIS]]
        if total > MAX_URIS:
            # Cap the number of URIs output and flag the truncation.
            print("-> Too many records! Outputting the first {0} URIs; "
                  "search for the rest manually.\n".format(MAX_URIS))
            uri_list.append("...")
        matches.append(record + [total, ' , '.join(uri_list)])

# Write non-matches to csv
write_csv(os.path.join(input_dir, "non-matches_" + input_name), non_matches)

# Write matches to CSV
write_csv(os.path.join(input_dir, "matches_" + input_name), matches)