Merge branch 'release/v0.6.0'

thoth-pub · Jun 21, 2023 · b80eb02 · b80eb02
2 parents 7a737b1 + 1427b45
commit b80eb02
Show file tree

Hide file tree

Showing 8 changed files with 721 additions and 2 deletions.
diff --git a/bookloader.py b/bookloader.py
@@ -51,16 +51,23 @@ class BookLoader():
     }
     contribution_types = {
         "Author": "AUTHOR",
+        "author": "AUTHOR",
         "AUTHOR": "AUTHOR",
         "AUHTOR": "AUTHOR",
+        "A01": "AUTHOR",
         "Editor": "EDITOR",
         "EDITOR": "EDITOR",
+        "B01": "EDITOR",
+        "B02": "EDITOR",
+        "C99": "EDITOR",
         "Translator": "TRANSLATOR",
+        "B06": "TRANSLATOR",
         "Foreword": "FOREWORD_BY",
         "Introduction": "INTRODUCTION_BY",
         "Preface": "PREFACE_BY",
         "Music editor": "MUSIC_EDITOR"
     }
+
     main_contributions = ["AUTHOR", "EDITOR", "TRANSLATOR"]
     orcid_regex = re.compile(
         r'0000-000(1-[5-9]|2-[0-9]|3-[0-4])\d{3}-\d{3}[\dX]')
@@ -183,6 +190,8 @@ def sanitise_isbn(isbn):
             if "-" in str(isbn):
                 return str(isbn)
             return isbn_hyphenate.hyphenate(str(int(isbn)))
+        except ValueError:
+            return None
         except isbn_hyphenate.IsbnMalformedError:
             print(isbn)
             raise

diff --git a/crossref.py b/crossref.py
@@ -0,0 +1,26 @@
+"""Expose Crossref API"""
+import sys
+import requests
+import logging
+
+
+class CrossrefClient:
+    """Minimal client for the Crossref REST API.
+
+    ``retry_count`` tracks consecutive failed attempts; it returns to zero
+    once a request has been answered with a usable response.
+    """
+    endpoint = "https://api.crossref.org"
+    retry_count = 0
+
+    def get_doi(self, doi):
+        """Return the Crossref metadata registered for a DOI.
+
+        Args:
+            doi: DOI appended as-is to ``<endpoint>/works/``.
+
+        Returns:
+            The ``message`` member of the JSON response, or ``False`` when
+            Crossref replies with HTTP 404.
+
+        A failing request is logged and retried; once the retries are
+        used up the process exits with status 1.
+        """
+        url = "%s/works/%s" % (self.endpoint, doi)
+        # Stays None when the request fails before any response exists.
+        res = None
+        try:
+            res = requests.get(url)
+            if res.status_code == 404:
+                self.retry_count = 0
+                return False
+            message = res.json()['message']
+        except requests.exceptions.RequestException as error:
+            # Report the status code when there is a response, otherwise
+            # the exception itself.
+            reason = error if res is None else res.status_code
+            logging.error('Error (%s) querying %s', reason, doi)
+            if self.retry_count <= 5:
+                self.retry_count += 1
+                logging.error('Retrying (%s) querying %s',
+                              self.retry_count, doi)
+                return self.get_doi(doi)
+            sys.exit(1)
+        # Only a fully decoded response counts as success, so a reply that
+        # keeps failing to decode cannot restart the retries forever.
+        self.retry_count = 0
+        return message
diff --git a/crossrefchapterloader.py b/crossrefchapterloader.py
@@ -0,0 +1,135 @@
+"""Load chapter metadata from Crossref into Thoth"""
+
+import re
+import sys
+
+import logging
+import thothlibrary
+from crossref import CrossrefClient
+
+
+class CrossrefChapterLoader:
+    """Shared logic to ingest chapter metadata from Crossref into Thoth"""
+    # When true, the publisher's first imprint is stored as `imprint_id`
+    single_imprint = True
+    # Name used to look the publisher up in Thoth
+    # (presumably overridden by subclasses -- confirm against callers)
+    publisher_name = None
+    # Class-level defaults for the lookup caches; every instance works on
+    # its own copy (see __init__)
+    all_contributors = {}
+    all_institutions = {}
+    all_imprints = {}
+    encoding = "utf-8"
+    header = 0
+    separation = ","
+    orcid_regex = re.compile(
+        r'0000-000(1-[5-9]|2-[0-9]|3-[0-4])\d{3}-\d{3}[\dX]')
+
+    def __init__(self, metadata_file, client_url, email, password):
+        """Log in to Thoth and cache publisher, contributor and institution ids
+
+        Args:
+            metadata_file: Not read by this class.
+            client_url: URL handed to ``thothlibrary.ThothClient``.
+            email: Email used to log in to Thoth.
+            password: Password used to log in to Thoth.
+
+        Exits with status 1 when the publisher or its imprints are not found.
+        """
+        # Copy the class-level defaults so that instances do not write
+        # into dictionaries shared with every other loader.
+        self.all_contributors = dict(self.all_contributors)
+        self.all_institutions = dict(self.all_institutions)
+        self.all_imprints = dict(self.all_imprints)
+
+        self.thoth = thothlibrary.ThothClient(client_url)
+        self.thoth.login(email, password)
+        self.crossref = CrossrefClient()
+
+        publishers = self.thoth.publishers(search=self.publisher_name)
+        try:
+            publisher = publishers[0]
+            self.publisher_id = publisher.publisherId
+        except (IndexError, AttributeError):
+            logging.error('Publisher not found: %s', self.publisher_name)
+            sys.exit(1)
+        try:
+            for imprint in publisher.imprints:
+                self.all_imprints[imprint.imprintName] = imprint.imprintId
+            if self.single_imprint:
+                self.imprint_id = publisher.imprints[0].imprintId
+        except (IndexError, AttributeError):
+            logging.error('No imprints associated with publisher: %s',
+                          self.publisher_name)
+            sys.exit(1)
+
+        # create cache of all existing contributors, keyed by full name
+        # and, when present, by ORCID
+        for contributor in self.thoth.contributors(limit=99999):
+            contributor_id = contributor.contributorId
+            self.all_contributors[contributor.fullName] = contributor_id
+            if contributor.orcid:
+                self.all_contributors[contributor.orcid] = contributor_id
+        # create cache of all existing institutions, keyed by name and,
+        # when present, by ROR
+        for institution in self.thoth.institutions(limit=99999):
+            institution_id = institution.institutionId
+            self.all_institutions[institution.institutionName] = institution_id
+            if institution.ror:
+                self.all_institutions[institution.ror] = institution_id
+
+    def get_crossref_metadata(self, doi):
+        """Return what the Crossref client reports for the given DOI"""
+        return self.crossref.get_doi(doi)
+
+    def doi_in_thoth(self, doi):
+        """Tell whether Thoth answers a work-by-DOI query without error"""
+        try:
+            self.thoth.work_by_doi(doi=doi)
+        except thothlibrary.errors.ThothError:
+            return False
+        return True
+
+    def get_book_by_title(self, title):
+        """Query Thoth to find a book given its title
+
+        Returns the first match; exits with status 1 when there is none.
+        """
+        # double quotes are backslash-escaped before the title is sent
+        escaped_title = title.replace('"', '\\"')
+        try:
+            books = self.thoth.books(search=escaped_title,
+                                     publishers='"%s"' % self.publisher_id)
+            return books[0]
+        except (IndexError, AttributeError):
+            logging.error('Book not found: \'%s\'', title)
+            sys.exit(1)
+
+    def all_books(self):
+        """Query Thoth for the publisher's books with ACTIVE work status"""
+        return self.thoth.books(limit=99999,
+                                publishers='"%s"' % self.publisher_id,
+                                work_status="ACTIVE")
+
+    def create_chapter_relation(self, book_work_id, chapter_work_id,
+                                relation_ordinal):
+        """Create a work relation of type HAS_CHILD"""
+        work_relation = {
+            "relatorWorkId": book_work_id,
+            "relatedWorkId": chapter_work_id,
+            "relationType": "HAS_CHILD",
+            "relationOrdinal": relation_ordinal
+        }
+        return self.thoth.create_work_relation(work_relation)
+
+    @staticmethod
+    def simple_doi(doi):
+        """Strip the doi.org / dx.doi.org resolver URL from a DOI"""
+        resolvers = (
+            "https://doi.org/",
+            "http://doi.org/",
+            "https://dx.doi.org/",
+            "http://dx.doi.org/",
+        )
+        for resolver in resolvers:
+            doi = doi.replace(resolver, "")
+        return doi
+
+    @staticmethod
+    def full_doi(doi):
+        """Return the DOI as a https://doi.org/ URL"""
+        return "https://doi.org/%s" % CrossrefChapterLoader.simple_doi(doi)
+
+    @staticmethod
+    def roman_to_decimal(roman):
+        """Convert a roman numeral, or a string of digits, to an integer
+
+        Values that ``int()`` accepts are returned through ``int()``.
+        Otherwise the characters are read as case-insensitive roman digits;
+        a character that is not a roman digit counts as -1, so malformed
+        input yields a meaningless number rather than an error.
+        """
+        try:
+            return int(roman)
+        except ValueError:
+            pass
+        values = {'I': 1, 'V': 5, 'X': 10, 'L': 50,
+                  'C': 100, 'D': 500, 'M': 1000}
+        digits = [values.get(char.upper(), -1) for char in roman]
+        res = 0
+        i = 0
+        while i < len(digits):
+            if i + 1 < len(digits) and digits[i] < digits[i + 1]:
+                # a smaller digit before a larger one is subtracted (IV, XC)
+                res += digits[i + 1] - digits[i]
+                i += 2
+            else:
+                res += digits[i]
+                i += 1
+        return res
diff --git a/loader.py b/loader.py
@@ -7,15 +7,21 @@
 import argparse
 import logging
 from obploader import OBPBookLoader
+from obpchapterloader import ObpChapterLoader
+from obpchapterabstractloader import ObpChapterAbstractLoader
 from punctumloader import PunctumBookLoader
 from punctumchapterloader import PunctumChapterLoader
 from africanmindsloader import AfricanMindsBookLoader
+from whploader import WHPLoader
 
 LOADERS = {
     "OBP": OBPBookLoader,
+    "OBP-chapters": ObpChapterLoader,
+    "OBP-chapter-abstracts": ObpChapterAbstractLoader,
     "punctum": PunctumBookLoader,
     "punctum-chapters": PunctumChapterLoader,
     "AM": AfricanMindsBookLoader,
+    "WHP": WHPLoader,
 }
 
 ARGS = [

diff --git a/obpchapterabstractloader.py b/obpchapterabstractloader.py
@@ -0,0 +1,88 @@
+"""Load a CSV file containing chapter abstracts into Thoth"""
+
+import sys
+
+import numpy as np
+import pandas as pd
+import logging
+
+import thothlibrary
+from thothlibrary import ThothClient
+
+from crossrefchapterloader import CrossrefChapterLoader
+
+
+class Deduper:  # pylint: disable=too-few-public-methods
+    """Callable that renames duplicate columns in a CSV file
+
+    Each instance keeps its own tally of the headers it has seen.
+    """
+
+    def __init__(self):
+        # Per-instance state: a class-level dict would be shared by every
+        # Deduper, carrying counts over from one file to the next.
+        self.headers = {}
+
+    def __call__(self, header):
+        """Append an increasing counter to columns that repeat its header
+
+        The first occurrence is returned unchanged; later ones become
+        "<header> 1", "<header> 2" and so on.
+        """
+        if header not in self.headers:
+            self.headers[header] = 0
+            return header
+        self.headers[header] += 1
+        return "%s %d" % (header, self.headers[header])
+
+
+class ObpChapterAbstractLoader:
+    """Ingest OBP chapter abstracts from a CSV file into Thoth"""
+    publisher_name = "Open Book Publishers"
+    encoding = "utf-8"
+    header = 0
+    separation = ","
+
+    def __init__(self, metadata_file, client_url, email, password):
+        """Log in to Thoth, read the CSV and resolve publisher and imprint
+
+        Exits with status 1 when the publisher or its imprint is not found.
+        """
+        self.metadata_file = metadata_file
+        self.thoth = ThothClient(client_url)
+        self.thoth.login(email, password)
+
+        self.data = self.prepare_file()
+        found = self.thoth.publishers(search=self.publisher_name)
+        try:
+            self.publisher_id = found[0].publisherId
+        except (IndexError, AttributeError):
+            logging.error('Publisher not found: %s' % self.publisher_name)
+            sys.exit(1)
+        try:
+            self.imprint_id = found[0].imprints[0].imprintId
+        except (IndexError, AttributeError):
+            logging.error('No imprints associated with publisher: %s' % self.publisher_name)
+            sys.exit(1)
+
+    def run(self):
+        """Store the abstract of every CSV row on its chapter in Thoth
+
+        A row is skipped when it lacks a DOI or an abstract, when Thoth
+        does not know the DOI, when the work is not a chapter, or when the
+        chapter already has a long abstract.
+        """
+        for idx in self.data.index:
+            doi = self.data.at[idx, "DOI"]
+            abstract = self.data.at[idx, "Content"]
+            if not (doi and abstract):
+                continue
+            short_doi = CrossrefChapterLoader.simple_doi(doi).strip()
+            url_doi = CrossrefChapterLoader.full_doi(short_doi)
+
+            try:
+                work = self.thoth.work_by_doi(doi=url_doi)
+            except thothlibrary.errors.ThothError:
+                logging.warning('DOI not in Thoth: %s' % url_doi)
+                continue
+            if work['workType'] != "BOOK_CHAPTER":
+                logging.warning('Not a chapter: %s' % short_doi)
+                continue
+            text = abstract.strip()
+            # Some abstracts contain multiple lines by mistake: beyond six
+            # lines the line breaks are turned into spaces
+            if text.count("\n") > 5:
+                text = text.replace("\n", " ").replace("  ", " ")
+            if work['longAbstract']:
+                logging.info('Abstract already in Thoth: %s' % short_doi)
+                continue
+            work['longAbstract'] = text
+            self.thoth.update_work(work)
+
+    def prepare_file(self):
+        """Read CSV, convert empties to None and rename duplicate columns"""
+        raw = pd.read_csv(self.metadata_file, encoding=self.encoding,
+                          header=self.header, sep=self.separation)
+        return (raw.where(pd.notnull(raw), None)
+                .replace({np.nan: None})
+                .rename(columns=Deduper()))