-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathonix3.py
448 lines (390 loc) · 19.3 KB
/
onix3.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
"""Parse an ONIX 3.0 Product"""
import re
import logging
from onix.book.v3_0.reference.strict import Product, Contributor, NamesBeforeKey, KeyNames, \
PersonName, ProfessionalAffiliation, ProfessionalPosition, Affiliation, TitleElement, \
TitleText, TitlePrefix, TitleWithoutPrefix, Subtitle, EditionNumber, Collection, Publisher
from bookloader import BookLoader
class Onix3Record:
    """Generic logic to extract data from an ONIX 3.0 product record.

    Wraps a parsed ``Product`` and exposes one accessor per piece of
    metadata. Accessors for optional data generally return ``None`` (or an
    empty list) when nothing matches; accessors for data the loader cannot
    do without let the underlying ``IndexError``/``AttributeError`` escape.
    The two-character strings compared against throughout are ONIX code list
    values.
    """

    def __init__(self, product: Product):
        """Store the parsed ONIX ``Product`` that every accessor reads from."""
        self._product = product

    # ------------------------------------------------------------------
    # Private helpers shared by several accessors
    # ------------------------------------------------------------------

    def _text_content(self, text_type):
        """Return the first ``<TextContent>`` body whose text type is `text_type`.

        Returns ``None`` when no entry has that type (or the matching entry
        has no text/content).
        """
        try:
            return [text.text[0].content[0]
                    for text in self._product.collateral_detail.text_content
                    if text.text_type.value.value == text_type][0]
        except IndexError:
            return None

    def _subject_values(self, scheme_identifiers):
        """Return the first code/heading of each subject in the given schemes.

        `scheme_identifiers` is a collection of subject scheme identifier
        codes; subjects are returned in document order.
        """
        subjects = self._product.descriptive_detail.subject
        return [subject.subject_code_or_subject_heading_text[0].value
                for subject in subjects
                if subject.subject_scheme_identifier.value.value in scheme_identifiers]

    # ------------------------------------------------------------------
    # Work-level accessors
    # ------------------------------------------------------------------

    def title(self):
        """Return the sanitised title built from the first title element.

        The value is whatever ``BookLoader.sanitise_title`` returns for the
        extracted (title, subtitle) pair.
        """
        title_element = self._product.descriptive_detail.title_detail[0].title_element[0]
        (title, subtitle) = self.get_title_and_subtitle(title_element)
        return BookLoader.sanitise_title(title, subtitle)

    def doi(self):
        """Return the DOI (product identifier type "06") as an https://doi.org/ URL.

        Raises:
            IndexError: if the product has no identifier of type "06"
                (an error is logged first).
        """
        dois = [ident.idvalue.value for ident in self._product.product_identifier
                if ident.product_idtype.value.value == "06"]
        try:
            return f"https://doi.org/{dois[0]}"
        except IndexError:
            logging.error("No DOI found: %s", self._product.record_reference)
            raise

    def isbn(self):
        """Return the sanitised ISBN (product identifier type "15").

        Raises:
            IndexError: if the product has no identifier of type "15"
                (an error is logged first).
        """
        isbns = [ident.idvalue.value for ident in self._product.product_identifier
                 if ident.product_idtype.value.value == "15"]
        try:
            return BookLoader.sanitise_isbn(isbns[0])
        except IndexError:
            logging.error("No ISBN found")
            raise

    def work_type(self):
        """Return "EDITED_BOOK" if the first contributor role is "B01", else "MONOGRAPH".

        Raises:
            IndexError: if no contributor carries any role.
        """
        contributors = self._product.descriptive_detail.contributor_or_contributor_statement_or_no_contributor
        # Entries without a contributor_role attribute (e.g. statements)
        # contribute no roles.
        roles = [role.value.value
                 for contributor in contributors
                 for role in getattr(contributor, 'contributor_role', [])]
        if roles[0] == "B01":
            return "EDITED_BOOK"
        return "MONOGRAPH"

    def short_abstract(self):
        """Return the first text content of type "02", or None if absent."""
        return self._text_content("02")

    def long_abstract(self):
        """Return the first text content of type "03", or None if absent."""
        return self._text_content("03")

    def toc(self):
        """Return the first text content of type "04", or None if absent."""
        return self._text_content("04")

    def reference(self):
        """Return the value of the product's record reference."""
        return self._product.record_reference.value

    def license(self):
        """Return the first licence expression link of type "01" or "02".

        Returns ``None`` both when the product has no e-publication licence
        at all and when the licence has no expression of those types.
        """
        try:
            return [cc.epub_license_expression_link.value
                    for cc in self._product.descriptive_detail.epub_license.epub_license_expression
                    if cc.epub_license_expression_type.value.value in ["01", "02"]][0]
        except (AttributeError, IndexError):
            # AttributeError: epub_license is missing (None).
            # IndexError: a licence exists but none of its expressions match.
            return None

    def cover_url(self):
        """Return the first link of the first supporting resource with content type "01".

        Returns ``None`` when no such resource (or link) exists.
        """
        resources = self._product.collateral_detail.supporting_resource
        try:
            return [resource.resource_version[0].resource_link[0].value
                    for resource in resources
                    if resource.resource_content_type.value.value == "01"][0]
        except IndexError:
            return None

    def publication_place(self):
        """Return "City, Country", or whichever of the two is present.

        Returns ``None`` when the record has neither a city nor a country of
        publication.
        """
        city = self._product.publishing_detail.city_of_publication
        country = self._product.publishing_detail.country_of_publication
        if not city:
            # No city: fall back to the country alone, if there is one.
            try:
                return country.value.value
            except AttributeError:
                return None
        try:
            return "{}, {}".format(city[0].value, country.value.value)
        except AttributeError:
            # No country: the city alone is the best we have.
            return city[0].value

    def publication_date(self):
        """Return the sanitised publication date.

        Uses the date with publishing date role "01" (publication date) and
        only falls back to role "19" (publication date of print counterpart)
        when no "01" date is present, regardless of the order in which the
        dates appear in the record.

        Raises:
            IndexError: if the record has neither a "01" nor a "19" date.
        """
        publishing_dates = self._product.publishing_detail.publishing_date
        for preferred_role in ("01", "19"):
            for pub_date in publishing_dates:
                if pub_date.publishing_date_role.value.value == preferred_role:
                    return BookLoader.sanitise_date(pub_date.date.value)
        raise IndexError("No publication date with role 01 or 19 found")

    def copyright_holder(self):
        """Return the first person/corporate name of any copyright owner, or None."""
        try:
            return [name.value
                    for statement in self._product.publishing_detail.copyright_statement
                    for owner in statement.copyright_year_or_copyright_owner
                    # Copyright-year entries carry no names and are skipped.
                    for name in getattr(owner, 'person_name_or_corporate_name', [])][0]
        except IndexError:
            return None

    def work_status(self):
        """Return the raw publishing status code."""
        return self._product.publishing_detail.publishing_status.value.value

    def oapen_url(self):
        """Return the first website link of the supplier named "DOAB Library".

        Only the first ``<ProductSupply>`` is inspected.

        Raises:
            IndexError: if no supply detail has that supplier.
        """
        locations = self._product.product_supply[0].supply_detail
        oapen = [location.supplier.website[0].website_link[0].value
                 for location in locations
                 if location.supplier.supplier_identifier_or_supplier_name[0].value == "DOAB Library"]
        return oapen[0]

    def page_count(self):
        """Return the first page-count extent as an int, or None if absent.

        Only extent types "00", "06", "07", "08", "10" and "11" are considered.
        """
        page_count = [extent.extent_value.value
                      for extent in self._product.descriptive_detail.extent
                      if extent.extent_type.value.value in ["00", "06", "07", "08", "10", "11"]]
        try:
            return int(page_count[0])
        except IndexError:
            return None

    def illustration_count(self):
        """Return the number of illustrations, or None if it cannot be determined.

        Prefers the explicit number-of-illustrations value; otherwise sums
        every integer found in the first illustrations note.
        """
        number_of_illustrations = self._product.descriptive_detail.number_of_illustrations
        if number_of_illustrations is not None:
            return number_of_illustrations.value
        try:
            # Get the total number of illustrations from <IllustrationsNote>,
            # which is of the form e.g. "10 bw illus".
            illustrations_note = self._product.descriptive_detail.illustrations_note[0].content[0]
        except IndexError:
            return None
        return sum(int(number) for number in re.findall(r'\d+', illustrations_note))

    def edition_number(self):
        """Return the first ``EditionNumber`` value as an int, or None if absent."""
        edition_number = [e.value for e in self._product.descriptive_detail.choice
                          if type(e) is EditionNumber]
        try:
            return int(edition_number[0])
        except IndexError:
            return None

    def contributors(self):
        """Return the ``Contributor`` entries, skipping any other entry kinds."""
        return [c for c in self._product.descriptive_detail.contributor_or_contributor_statement_or_no_contributor
                if type(c) is Contributor]

    def serieses(self):
        """Return the ``Collection`` entries, skipping any other entry kinds."""
        return [c for c in self._product.descriptive_detail.collection_or_no_collection
                if type(c) is Collection]

    def fundings(self):
        """Return the ``Publisher`` entries whose publishing role is "14", "15" or "16"."""
        return [p for p in self._product.publishing_detail.imprint_or_publisher
                if type(p) is Publisher and p.publishing_role.value.value in ["14", "15", "16"]]

    def language_code(self):
        """Return the upper-cased language code of the first language entry."""
        return self._product.descriptive_detail.language[0].language_code.value.value.upper()

    def language_codes_and_roles(self):
        """Return ``(LANGUAGE_CODE, relation)`` tuples for every language entry.

        If any entry has role "02", the work is treated as a translation:
        role-"02" entries come first as "TRANSLATED_FROM", followed by
        role-"01" entries as "TRANSLATED_INTO". Otherwise every entry is
        reported as "ORIGINAL".

        Raises:
            KeyError: if any entry has a role other than "01" or "02".
        """
        languages = self._product.descriptive_detail.language
        for language in languages:
            role = language.language_role.value.value
            if role not in ["01", "02"]:
                raise KeyError("Unsupported language role: %s" % role)

        translated_from = [(language.language_code.value.value.upper(), "TRANSLATED_FROM")
                           for language in languages
                           if language.language_role.value.value == "02"]
        if translated_from:
            translated_into = [(language.language_code.value.value.upper(), "TRANSLATED_INTO")
                               for language in languages
                               if language.language_role.value.value == "01"]
            return translated_from + translated_into
        return [(language.language_code.value.value.upper(), "ORIGINAL")
                for language in languages]

    # ------------------------------------------------------------------
    # Subject accessors
    # ------------------------------------------------------------------

    def bic_codes(self):
        """Return subject values for scheme identifiers "12"-"15"."""
        return self._subject_values(["12", "13", "14", "15"])

    def bisac_codes(self):
        """Return subject values for scheme identifiers "10", "11" and "22"."""
        return self._subject_values(["10", "11", "22"])

    def custom_codes(self):
        """Return subject values for scheme identifier "23"."""
        return self._subject_values(["23"])

    def keywords(self):
        """Return subject values for scheme identifier "20"."""
        return self._subject_values(["20"])

    def keywords_from_text(self):
        """Used on subjects where SubjectHeadingText is used instead of SubjectCode.

        Splits each keyword string on commas and semicolons and returns the
        individual keywords as one flat list.
        """
        return [keyword
                for all_keywords in self.keywords()
                for keyword in all_keywords.replace(',', ';').replace('; ', ';').split(';')]

    def thema_codes(self):
        """Return subject values for scheme identifiers "93"-"99"."""
        return self._subject_values(["93", "94", "95", "96", "97", "98", "99"])

    # ------------------------------------------------------------------
    # Publication-level accessors
    # ------------------------------------------------------------------

    def prices(self):
        """Return ``(currency_code, amount)`` tuples for every non-zero price.

        Amounts are returned as strings; unpriced items and prices equal to
        "0.00" are skipped.
        """
        return [(price.currency_code.value.value, str(price.price_amount.value))
                for product_supply in self._product.product_supply
                for supply_detail in product_supply.supply_detail
                for price in supply_detail.unpriced_item_type_or_price
                if hasattr(price, 'price_amount') and price.price_amount is not None
                and str(price.price_amount.value) != "0.00"]

    def dimensions(self):
        """Return ``(measure_type, unit_code, measurement)`` tuples, measurement as str."""
        return [(m.measure_type.value.value, m.measure_unit_code.value.value, str(m.measurement.value))
                for m in self._product.descriptive_detail.measure]

    def related_biblio_work_id(self):
        """Return the "Biblio Work ID" identifier of the first related work.

        Identifiers without an ID type name are skipped.

        Raises:
            IndexError: if the first related work has no such identifier.
        """
        related = [ident.idvalue.value
                   for ident in self._product.related_material.related_work[0].work_identifier
                   if ident.idtype_name is not None and ident.idtype_name.value == "Biblio Work ID"]
        return related[0]

    def related_system_internal_identifier(self):
        """Return the first "system-internal-identifier" among all related works.

        Raises:
            IndexError: if no related work has such an identifier.
        """
        return [ident.idvalue.value
                for work in self._product.related_material.related_work
                for ident in work.work_identifier
                if ident.idtype_name is not None
                and ident.idtype_name.value == "system-internal-identifier"][0]

    def alternative_formats(self):
        """Return ``(product_type, isbn)`` tuples for related alternative formats.

        Considers related products whose first relation code is "06" or "13".
        A related product with a missing or invalid ISBN is skipped with a
        warning rather than failing the whole record.
        """
        related_products = [n for n in self._product.related_material.related_product
                            if n.product_relation_code[0].value.value in ["06", "13"]]
        alternative_formats = []
        for product in related_products:
            try:
                isbn = BookLoader.sanitise_isbn(
                    [ident.idvalue.value for ident in product.product_identifier
                     if ident.product_idtype.value.value == "15"][0])
            except Exception:
                # Deliberately best-effort: any failure here just drops this format.
                logging.warning("Invalid ISBN for alternative format")
                continue
            if not isbn:
                continue
            try:
                product_type = product.product_form_detail[0].value.value
                # Check that this ONIX code is one we can unambiguously convert
                # to a Thoth publication type
                BookLoader.publication_types[product_type]
            except (IndexError, KeyError):
                product_type = product.product_form.value.value
            alternative_formats.append((product_type, isbn))
        return alternative_formats

    def product_type(self):
        """Return the most specific product type code known to ``BookLoader``.

        Tries the first product form detail, then the first product form
        description, and finally falls back to the product form code.
        """
        descriptive_detail = self._product.descriptive_detail
        try:
            product_type = descriptive_detail.product_form_detail[0].value.value
            # Check that this ONIX code is one we can unambiguously convert
            # to a Thoth publication type
            BookLoader.publication_types[product_type]
            return product_type
        except (IndexError, KeyError):
            pass
        try:
            product_type = descriptive_detail.product_form_description[0].value
            # Same check for the free-text description
            BookLoader.publication_types[product_type]
            return product_type
        except (IndexError, KeyError):
            pass
        return descriptive_detail.product_form.value.value

    def available_content_url(self):
        """Return the first publisher website link with website role "02", or None."""
        try:
            return [website_link.value
                    for publisher in self._product.publishing_detail.imprint_or_publisher
                    # Imprints have no website attribute and are skipped.
                    for website in getattr(publisher, 'website', [])
                    for website_link in website.website_link
                    if website.website_role.value.value == "02"][0]
        except IndexError:
            return None

    def full_text_urls(self):
        """Return the distinct full-text URLs, in first-seen order.

        Collects publisher website links with website role "29", followed by
        every supplier website link found in the supply details (no role
        filter is applied to those).
        """
        urls = [website_link.value
                for publisher in self._product.publishing_detail.imprint_or_publisher
                for website in getattr(publisher, 'website', [])
                for website_link in website.website_link
                if website.website_role.value.value == "29"]
        # May also be provided within SupplyDetail
        urls.extend(website_link.value
                    for supply in self._product.product_supply
                    for detail in supply.supply_detail
                    for website in getattr(detail.supplier, 'website', [])
                    for website_link in website.website_link)
        # Deduplicate while keeping a deterministic (first-seen) order
        return list(dict.fromkeys(urls))

    # ------------------------------------------------------------------
    # Contributor helpers
    # ------------------------------------------------------------------

    @staticmethod
    def get_key_names(contributor: Contributor):
        """Return the contributor's first ``KeyNames`` value.

        Raises:
            IndexError: if the contributor has no ``KeyNames`` element.
        """
        return [name.value for name in contributor.choice
                if type(name) is KeyNames][0]

    @staticmethod
    def get_names_before_key(contributor: Contributor):
        """Return the contributor's first ``NamesBeforeKey`` value.

        Raises:
            IndexError: if the contributor has no ``NamesBeforeKey`` element.
        """
        return [name.value for name in contributor.choice
                if type(name) is NamesBeforeKey][0]

    @staticmethod
    def get_person_name(contributor: Contributor):
        """Return the contributor's first ``PersonName`` value.

        Raises:
            IndexError: if the contributor has no ``PersonName`` element.
        """
        return [name.value for name in contributor.choice
                if type(name) is PersonName][0]

    @staticmethod
    def get_affiliation(contributor: Contributor):
        """Return every position/affiliation value of the contributor as a flat list.

        Logs the contributor's ``choice_1`` content at INFO level first.
        """
        logging.info(contributor.choice_1)
        return [affiliation.value
                for professional_affiliation in contributor.choice_1
                for affiliation in getattr(professional_affiliation,
                                           'professional_position_or_affiliation', [])]

    @staticmethod
    def get_affiliations_with_positions(contributor: Contributor):
        """Return one ``(position, institution)`` tuple per professional affiliation.

        Either member is ``None`` when the affiliation does not provide it;
        only the first position and first institution of each are used.
        """
        affiliations = [affiliation
                        for affiliation in contributor.choice_1
                        if type(affiliation) is ProfessionalAffiliation]
        affiliations_with_positions = []
        for affiliation in affiliations:
            details = getattr(affiliation, 'professional_position_or_affiliation', [])
            positions = [detail.value for detail in details
                         if type(detail) is ProfessionalPosition]
            institutions = [detail.value for detail in details
                            if type(detail) is Affiliation]
            affiliations_with_positions.append(
                (positions[0] if positions else None,
                 institutions[0] if institutions else None))
        return affiliations_with_positions

    @staticmethod
    def get_biography(contributor: Contributor):
        """Return the first content item of the contributor's biographical note, or None."""
        try:
            return [content
                    for biographical_note in contributor.choice_1
                    # Entries without content (non-biography elements) are skipped.
                    for content in getattr(biographical_note, 'content', [])][0]
        except IndexError:
            return None

    @staticmethod
    def get_website(contributor: Contributor):
        """Return the contributor's first website link, or None."""
        try:
            return [link.value
                    for website in contributor.choice_1
                    for link in getattr(website, 'website_link', [])][0]
        except IndexError:
            return None

    @staticmethod
    def get_orcid(contributor: Contributor):
        """Return the contributor's sanitised ORCID (name identifier type "21"), or None.

        The identifier is regrouped into hyphen-separated blocks of four
        characters before being passed to ``BookLoader.sanitise_orcid``.
        """
        try:
            orcid_digits = [name_identifier.idvalue.value
                            for name_identifier in contributor.name_identifier
                            if name_identifier.name_idtype.value.value == "21"][0]
        except IndexError:
            return None
        orcid_hyphenated = '-'.join(orcid_digits[i:i + 4]
                                    for i in range(0, len(orcid_digits), 4))
        return BookLoader.sanitise_orcid(orcid_hyphenated)

    # ------------------------------------------------------------------
    # Series (collection) helpers
    # ------------------------------------------------------------------

    @staticmethod
    def get_series_name(series: Collection):
        """Return the series title, as "Title: Subtitle" when a subtitle exists."""
        title_element = series.title_detail[0].title_element[0]
        (title, subtitle) = Onix3Record.get_title_and_subtitle(title_element)
        if subtitle is not None:
            return ': '.join([title, subtitle])
        return title

    @staticmethod
    def get_issn(series: Collection):
        """Return the value of the series' first collection identifier, or None."""
        try:
            return [identifier.idvalue.value
                    for identifier in getattr(series, 'collection_identifier', [])][0]
        except IndexError:
            return None

    @staticmethod
    def get_issue_ordinal(series: Collection):
        """Return the position of the work within the series as an int, or None.

        Uses the first collection sequence number when one exists; otherwise
        the first run of digits in the first title element part number.

        Raises:
            ValueError: if a sequence number is present but not numeric.
        """
        sequence_numbers = [seq.collection_sequence_number.value
                            for seq in getattr(series, 'collection_sequence', [])]
        if sequence_numbers:
            try:
                return int(sequence_numbers[0])
            except ValueError:
                # Sequences may be of the format "2.1"
                return int(float(sequence_numbers[0]))
        try:
            part_number = [element.part_number.value
                           for detail in series.title_detail
                           for element in detail.title_element
                           if element.part_number is not None][0]
            return int(re.findall(r'\d+', part_number)[0])
        except IndexError:
            # No part number at all, or a part number containing no digits.
            return None

    # ------------------------------------------------------------------
    # Title helper
    # ------------------------------------------------------------------

    @staticmethod
    def get_title_and_subtitle(element: TitleElement):
        """Return ``(title, subtitle)`` extracted from a title element.

        The title is the ``TitleText`` if present, else the
        ``TitleWithoutPrefix``, else the value of the element's first child;
        any ``TitlePrefix`` is then prepended with a space. ``subtitle`` is
        ``None`` when the element has no ``Subtitle``.
        """
        prefix = None
        title_without_prefix = None
        title_text = None
        subtitle = None
        # When an element kind occurs more than once, the last one wins.
        for title_part in element.choice:
            if type(title_part) is TitlePrefix:
                prefix = title_part.value
            if type(title_part) is TitleWithoutPrefix:
                title_without_prefix = title_part.value
            if type(title_part) is TitleText:
                title_text = title_part.value
            if type(title_part) is Subtitle:
                subtitle = title_part.value
        if title_text is not None:
            # Assume this is the canonical title
            title = title_text
        elif title_without_prefix is not None:
            # Likewise
            title = title_without_prefix
        else:
            # Pick first element and hope for the best
            title = element.choice[0].value
        if prefix is not None:
            title = ' '.join([prefix, title])
        return (title, subtitle)