Skip to content

Commit

Permalink
Merge branch 'release/v0.6.0'
Browse files Browse the repository at this point in the history
  • Loading branch information
ja573 committed Jun 21, 2023
2 parents 7a737b1 + 1427b45 commit b80eb02
Show file tree
Hide file tree
Showing 8 changed files with 721 additions and 2 deletions.
9 changes: 9 additions & 0 deletions bookloader.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,16 +51,23 @@ class BookLoader():
}
contribution_types = {
"Author": "AUTHOR",
"author": "AUTHOR",
"AUTHOR": "AUTHOR",
"AUHTOR": "AUTHOR",
"A01": "AUTHOR",
"Editor": "EDITOR",
"EDITOR": "EDITOR",
"B01": "EDITOR",
"B02": "EDITOR",
"C99": "EDITOR",
"Translator": "TRANSLATOR",
"B06": "TRANSLATOR",
"Foreword": "FOREWORD_BY",
"Introduction": "INTRODUCTION_BY",
"Preface": "PREFACE_BY",
"Music editor": "MUSIC_EDITOR"
}

main_contributions = ["AUTHOR", "EDITOR", "TRANSLATOR"]
orcid_regex = re.compile(
r'0000-000(1-[5-9]|2-[0-9]|3-[0-4])\d{3}-\d{3}[\dX]')
Expand Down Expand Up @@ -183,6 +190,8 @@ def sanitise_isbn(isbn):
if "-" in str(isbn):
return str(isbn)
return isbn_hyphenate.hyphenate(str(int(isbn)))
except ValueError:
return None
except isbn_hyphenate.IsbnMalformedError:
print(isbn)
raise
Expand Down
26 changes: 26 additions & 0 deletions crossref.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
"""Expose Crossref API"""
import sys
import requests
import logging


class CrossrefClient:
    """Minimal client for the Crossref REST API.

    ``retry_count`` counts consecutive failed attempts. It is reset to zero
    whenever a request completes and its response has been processed.
    """
    endpoint = "https://api.crossref.org"
    retry_count = 0

    def get_doi(self, doi):
        """Fetch the Crossref metadata record of a DOI.

        Args:
            doi: DOI to look up; it is appended to ``<endpoint>/works/``.

        Returns:
            The ``message`` member of the JSON response, or ``False`` when
            Crossref answers with HTTP 404.

        A ``requests`` failure is logged and the lookup is retried. Once the
        retry limit is exceeded the process exits with status 1.
        """
        url = "%s/works/%s" % (self.endpoint, doi)
        try:
            res = requests.get(url)
            if res.status_code == 404:
                self.retry_count = 0
                return False
            message = res.json()['message']
            # Reset only after the body has been decoded: a response that
            # cannot be decoded must keep counting towards the retry limit,
            # otherwise the retries would never be exhausted.
            self.retry_count = 0
            return message
        except requests.exceptions.RequestException as error:
            # Log the exception itself: when requests.get() raises there is
            # no response object, so a status code cannot be read here.
            logging.error('Error (%s) querying %s' % (error, doi))
            if self.retry_count <= 5:
                self.retry_count += 1
                logging.error('Retrying (%s) querying %s' % (self.retry_count, doi))
                return self.get_doi(doi)
            sys.exit(1)
135 changes: 135 additions & 0 deletions crossrefchapterloader.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,135 @@
"""Load chapter metadata from Crossref into Thoth"""

import re
import sys

import logging
import thothlibrary
from crossref import CrossrefClient


class CrossrefChapterLoader:
    """Generic logic to ingest chapter metadata into Thoth.

    The class attributes below are defaults. ``publisher_name`` is ``None``
    here and is used by ``__init__`` to look the publisher up in Thoth.
    """
    single_imprint = True
    publisher_name = None
    # Class-level defaults only; __init__ gives every instance its own copy.
    all_contributors = {}
    all_institutions = {}
    all_imprints = {}
    encoding = "utf-8"
    header = 0
    separation = ","
    orcid_regex = re.compile(
        r'0000-000(1-[5-9]|2-[0-9]|3-[0-4])\d{3}-\d{3}[\dX]')

    def __init__(self, metadata_file, client_url, email, password):
        """Log in to Thoth and build the publisher, imprint and lookup caches.

        Args:
            metadata_file: accepted for interface compatibility; it is not
                used by this initialiser.
            client_url: URL handed to ``thothlibrary.ThothClient``.
            email: Thoth account e-mail used to log in.
            password: Thoth account password used to log in.

        Exits the process with status 1 when the publisher cannot be found
        or has no imprints.
        """
        # The class-level dicts are shared by every instance and subclass.
        # Copy them so that filling the caches below cannot leak entries
        # from one loader into another.
        self.all_contributors = dict(self.all_contributors)
        self.all_institutions = dict(self.all_institutions)
        self.all_imprints = dict(self.all_imprints)

        self.thoth = thothlibrary.ThothClient(client_url)
        self.thoth.login(email, password)
        self.crossref = CrossrefClient()

        publishers = self.thoth.publishers(search=self.publisher_name)
        try:
            self.publisher_id = publishers[0].publisherId
        except (IndexError, AttributeError):
            logging.error('Publisher not found: %s' % self.publisher_name)
            sys.exit(1)
        try:
            for imprint in publishers[0].imprints:
                self.all_imprints[imprint.imprintName] = imprint.imprintId
            if self.single_imprint:
                self.imprint_id = publishers[0].imprints[0].imprintId
        except (IndexError, AttributeError):
            logging.error('No imprints associated with publisher: %s' % self.publisher_name)
            sys.exit(1)

        # create cache of all existing contributors, keyed by name and ORCID
        for contributor in self.thoth.contributors(limit=99999):
            self.all_contributors[contributor.fullName] = contributor.contributorId
            if contributor.orcid:
                self.all_contributors[contributor.orcid] = contributor.contributorId
        # create cache of all existing institutions, keyed by name and ROR
        for institution in self.thoth.institutions(limit=99999):
            self.all_institutions[institution.institutionName] = institution.institutionId
            if institution.ror:
                self.all_institutions[institution.ror] = institution.institutionId

    def get_crossref_metadata(self, doi):
        """Return whatever the Crossref client returns for the given DOI."""
        return self.crossref.get_doi(doi)

    def doi_in_thoth(self, doi):
        """Return True when Thoth finds a work for the DOI, False on ThothError."""
        try:
            self.thoth.work_by_doi(doi=doi)
            return True
        except thothlibrary.errors.ThothError:
            return False

    def get_book_by_title(self, title):
        """Query Thoth to find a book given its title.

        Returns the first search result; exits with status 1 when the search
        yields nothing.
        """
        try:
            # Double quotes are escaped before the title is sent as search term.
            books = self.thoth.books(search=title.replace('"', '\\"'),
                                     publishers='"%s"' % self.publisher_id)
            return books[0]
        except (IndexError, AttributeError):
            logging.error('Book not found: \'%s\'' % title)
            sys.exit(1)

    def all_books(self):
        """Return the publisher's books that have work status ACTIVE."""
        return self.thoth.books(limit=99999, publishers='"%s"' % self.publisher_id,
                                work_status="ACTIVE")

    def create_chapter_relation(self, book_work_id, chapter_work_id, relation_ordinal):
        """Create a work relation of type HAS_CHILD from a book to a chapter."""
        work_relation = {
            "relatorWorkId": book_work_id,
            "relatedWorkId": chapter_work_id,
            "relationType": "HAS_CHILD",
            "relationOrdinal": relation_ordinal
        }
        return self.thoth.create_work_relation(work_relation)

    @staticmethod
    def simple_doi(doi):
        """Strip the DOI resolver URL prefix, leaving the bare DOI.

        Both the ``doi.org`` and ``dx.doi.org`` hosts are recognised, over
        http and https alike.
        """
        # The first two prefixes are the historically supported ones and are
        # applied first, in their original order.
        prefixes = (
            "https://doi.org/",
            "http://dx.doi.org/",
            "http://doi.org/",
            "https://dx.doi.org/",
        )
        for prefix in prefixes:
            doi = doi.replace(prefix, "")
        return doi

    @staticmethod
    def full_doi(doi):
        """Return the DOI as a ``https://doi.org/`` URL."""
        return "https://doi.org/%s" % CrossrefChapterLoader.simple_doi(doi)

    @staticmethod
    def roman_to_decimal(roman):
        """Convert a roman numeral, or a decimal string, to an integer.

        A value that ``int()`` accepts is returned as that integer. Otherwise
        the characters are read as roman digits, case-insensitively. Any
        character that is not a roman digit counts as -1, so malformed input
        gives a meaningless number rather than an error.
        """
        try:
            return int(roman)
        except ValueError:
            pass

        values = {'I': 1, 'V': 5, 'X': 10, 'L': 50, 'C': 100, 'D': 500, 'M': 1000}
        digits = [values.get(char.upper(), -1) for char in roman]

        total = 0
        pos = 0
        while pos < len(digits):
            current = digits[pos]
            has_next = pos + 1 < len(digits)
            if has_next and current < digits[pos + 1]:
                # Subtractive pair such as IV or XC: consume both digits.
                total += digits[pos + 1] - current
                pos += 2
            else:
                total += current
                pos += 1
        return total
6 changes: 6 additions & 0 deletions loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,15 +7,21 @@
import argparse
import logging
from obploader import OBPBookLoader
from obpchapterloader import ObpChapterLoader
from obpchapterabstractloader import ObpChapterAbstractLoader
from punctumloader import PunctumBookLoader
from punctumchapterloader import PunctumChapterLoader
from africanmindsloader import AfricanMindsBookLoader
from whploader import WHPLoader

# Registry of the available loaders: each key is a loader identifier and each
# value is the loader class imported above.
# NOTE(review): presumably the key is what the command-line arguments select
# — confirm against the argument parsing code.
LOADERS = {
"OBP": OBPBookLoader,
"OBP-chapters": ObpChapterLoader,
"OBP-chapter-abstracts": ObpChapterAbstractLoader,
"punctum": PunctumBookLoader,
"punctum-chapters": PunctumChapterLoader,
"AM": AfricanMindsBookLoader,
"WHP": WHPLoader,
}

ARGS = [
Expand Down
88 changes: 88 additions & 0 deletions obpchapterabstractloader.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
"""Load a CSV file containing chapter abstracts into Thoth"""

import sys

import numpy as np
import pandas as pd
import logging

import thothlibrary
from thothlibrary import ThothClient

from crossrefchapterloader import CrossrefChapterLoader


class Deduper:  # pylint: disable=too-few-public-methods
    """Callable that renames duplicate column headers.

    The first occurrence of a header is returned unchanged. Every later
    occurrence gets an increasing counter appended ("Name 1", "Name 2", ...).
    """

    def __init__(self):
        # Counters are kept per instance. A class-level dict would be shared
        # by every Deduper, so a second file would have its first occurrences
        # renamed as if they were duplicates.
        self.headers = {}

    def __call__(self, header):
        """Return the header, suffixed with a counter when it was seen before."""
        if header not in self.headers:
            self.headers[header] = 0
            return header
        self.headers[header] += 1
        return "%s %d" % (header, self.headers[header])


class ObpChapterAbstractLoader:
    """Logic to ingest OBP chapter abstracts from CSV into Thoth"""
    publisher_name = "Open Book Publishers"
    encoding = "utf-8"
    header = 0
    separation = ","

    def __init__(self, metadata_file, client_url, email, password):
        """Log in to Thoth, read the CSV file and resolve publisher and imprint.

        Args:
            metadata_file: CSV file read by ``prepare_file``.
            client_url: URL handed to ``ThothClient``.
            email: Thoth account e-mail used to log in.
            password: Thoth account password used to log in.

        Exits the process with status 1 when the publisher cannot be found
        or has no imprints.
        """
        self.metadata_file = metadata_file
        self.thoth = ThothClient(client_url)
        self.thoth.login(email, password)

        self.data = self.prepare_file()
        publishers = self.thoth.publishers(search=self.publisher_name)
        try:
            self.publisher_id = publishers[0].publisherId
        except (IndexError, AttributeError):
            logging.error('Publisher not found: %s' % self.publisher_name)
            sys.exit(1)
        try:
            self.imprint_id = publishers[0].imprints[0].imprintId
        except (IndexError, AttributeError):
            logging.error('No imprints associated with publisher: %s' % self.publisher_name)
            sys.exit(1)

    def run(self):
        """Store each row's abstract as the long abstract of its chapter.

        Rows lacking a DOI or an abstract are skipped. So are DOIs unknown to
        Thoth, works that are not chapters, and chapters that already have a
        long abstract.
        """
        for row in self.data.index:
            doi = self.data.at[row, "DOI"]
            abstract = self.data.at[row, "Content"]
            if not doi or not abstract:
                continue
            simple_doi = CrossrefChapterLoader.simple_doi(doi).strip()
            full_doi = CrossrefChapterLoader.full_doi(simple_doi)

            try:
                work = self.thoth.work_by_doi(doi=full_doi)
            except thothlibrary.errors.ThothError:
                logging.warning('DOI not in Thoth: %s' % full_doi)
                continue
            if work['workType'] != "BOOK_CHAPTER":
                logging.warning('Not a chapter: %s' % simple_doi)
                continue
            if work['longAbstract']:
                logging.info('Abstract already in Thoth: %s' % simple_doi)
                continue

            abstract = abstract.strip()
            # Some abstracts contain multiple lines by mistake
            if len(abstract.split("\n")) > 6:
                abstract = abstract.replace("\n", " ")
                # Joining lines can leave runs of spaces behind (e.g. where a
                # line ended in a space or was blank); collapse them.
                while "  " in abstract:
                    abstract = abstract.replace("  ", " ")
            work['longAbstract'] = abstract
            self.thoth.update_work(work)

    def prepare_file(self):
        """Read CSV, convert empties to None and rename duplicate columns"""
        frame = pd.read_csv(self.metadata_file, encoding=self.encoding,
                            header=self.header, sep=self.separation)
        frame = frame.where(pd.notnull(frame), None)
        frame = frame.replace({np.nan: None})
        frame = frame.rename(columns=Deduper())
        return frame
Loading

0 comments on commit b80eb02

Please sign in to comment.