From 14ec92a7e90629500339828b16773dde33ec8c24 Mon Sep 17 00:00:00 2001 From: Nils Herrmann Date: Tue, 10 Sep 2024 13:53:09 +0200 Subject: [PATCH] #353 Second draft with ObjectRetrieval API --- .../sciencedirect/object_metadata.py | 77 +++++++++++++++++++ .../sciencedirect/object_retrieval.py | 39 +++++++--- pybliometrics/utils/get_content.py | 6 +- 3 files changed, 110 insertions(+), 12 deletions(-) create mode 100644 pybliometrics/sciencedirect/object_metadata.py diff --git a/pybliometrics/sciencedirect/object_metadata.py b/pybliometrics/sciencedirect/object_metadata.py new file mode 100644 index 0000000..9b6e533 --- /dev/null +++ b/pybliometrics/sciencedirect/object_metadata.py @@ -0,0 +1,77 @@ +from typing import Optional, Union + +from pybliometrics.superclasses import Retrieval +from pybliometrics.utils import ( + chained_get, + check_parameter_value, + detect_id_type, + VIEWS, +) + +class ObjectMetadata(Retrieval): + """Class to retrieve the metadata of all objects of a document""" + @property + def results(self) -> list[str]: + """List with metadata of objects in a document. The metadata includes the `url`, `eid`, + `ref`, `filename`, `mimetype`, `size`, `height`, `width`, and `type` of the object. + """ + refs = chained_get(self._json, ['attachment-metadata-response', 'attachment']) + out = [] + for ref in refs: + out.append({'url': ref.get('prism:url'), + 'eid': ref.get('eid'), + 'ref': ref.get('ref'), + 'filename': ref.get('filename'), + 'mimetype': ref.get('mimetype'), + 'size': ref.get('size'), + 'height': ref.get('height'), + 'width': ref.get('width'), + 'type': ref.get('type')}) + return out + + + def __init__(self, + identifier: Union[int, str], + view: str = 'META', + id_type: Optional[str] = None, + refresh: Union[bool, int] = False, + **kwds: str + ): + """Class to retrieve the metadata of all objects of a document. + + :param identifier: The identifier of an article. + :param view: The view of the object. Allowed values: META. 
+ :param id_type: The type of identifier supplied. Allowed values: doi, pii, scopus_id, + pubmed_id, eid. + :param refresh: Whether to refresh the cached file if it exists. Default: False. + """ + self.identifier = str(identifier) + check_parameter_value(view, VIEWS['ObjectRetrieval'], "view") + + self.id_type = id_type + if id_type is None: + self.id_type = detect_id_type(identifier) + else: + allowed_id_types = ('doi', 'pii', 'scopus_id', 'pubmed_id', 'eid') + check_parameter_value(id_type, allowed_id_types, "id_type") + + self._view = view + self._refresh = refresh + + super().__init__(self.identifier, 'ObjectRetrieval', self.id_type) + + + def _find_mime_type(self, reference: str) -> str: + """Auxiliary function to find the MIME type of a specific object""" + for ref in self.object_references: + if ref['ref'] == reference: + return ref['mimetype'] + raise ValueError(f"Reference {reference} not found in object references.") + + + def _find_url(self, reference: str) -> str: + """Auxiliary function to find the URL of a specific object""" + for ref in self.object_references: + if ref['ref'] == reference: + return ref['url'] + raise ValueError(f"Url {reference} not found in object references.") diff --git a/pybliometrics/sciencedirect/object_retrieval.py b/pybliometrics/sciencedirect/object_retrieval.py index 4980726..6585b00 100644 --- a/pybliometrics/sciencedirect/object_retrieval.py +++ b/pybliometrics/sciencedirect/object_retrieval.py @@ -10,12 +10,19 @@ ) class SpecificObjectRetrieval(Retrieval): - def __init__(self, + """Class to retrieve a specific object of a document.""" + def __init__(self, identifier: str, mime_type: str, id_type: str, refresh: Union[bool, int] = False): - """Class to retrieve a specific object of a document.""" + """Class to retrieve a specific object of a document. + + :param identifier: The identifier of the object. + :param mime_type: The MIME type of the object. + :param id_type: The type of identifier supplied. 
Allowed values: `doi`, `pii`, `scopus_id`, `pubmed_id`, `eid`. + :param refresh: Whether to refresh the cached file if it exists. Default: False. + """ self._view = '' self._refresh = refresh super().__init__(identifier, @@ -26,6 +33,7 @@ def __init__(self, class ObjectRetrieval(Retrieval): + """Class to retrieve the object metadata of a document.""" def get_specific_object(self, ref: str, mime_type: Optional[str] = None, @@ -34,17 +42,19 @@ def get_specific_object(self, """Retrieves a specific object of a document. :param ref: The reference of the object. This is the `ref` field in the object_references. - :param mime_type: The MIME type of the object. If not supplied, it will be determined automatically. - :param img_type: Can be used ff the object is an image. The type of image to retrieve. Allowed values: `thumbnail`, `standard`, `high`. + :param mime_type: The MIME type of the object. If not supplied, it will be + determined automatically. + :param img_type: Can be used if the object is an image. The type of image to retrieve. + Allowed values: `thumbnail`, `standard`, `high`. """ if not mime_type: mime_type = self._find_mime_type(ref) - + if not img_type: - identifier = f'{self.identifier}/ref/{ref}/standard' + identifier = f'{self.identifier}/ref/{ref}' else: - identifier = f'{self.identifier}/ref/{ref}/high' - + identifier = f'{self.identifier}/ref/{ref}/{img_type}' + spe_obj = SpecificObjectRetrieval(identifier=identifier, mime_type=mime_type, id_type=self.id_type, @@ -53,7 +63,7 @@ def get_specific_object(self, @property - def object_references(self): + def object_references(self) -> list[str]: """List with metadata of objects in a document. The metadata includes the `url`, `eid`, `ref`, `filename`, `mimetype`, `size`, `height`, `width`, and `type` of the object. 
""" @@ -102,11 +112,18 @@ def __init__(self, super().__init__(self.identifier, 'ObjectRetrieval', self.id_type) - def _find_mime_type(self, reference): + def _find_mime_type(self, reference: str) -> str: """Auxiliary function to find the MIME type of a specific object""" for ref in self.object_references: if ref['ref'] == reference: return ref['mimetype'] raise ValueError(f"Reference {reference} not found in object references.") - + + def _find_url(self, reference: str) -> str: + """Auxiliary function to find the URL of a specific object""" + for ref in self.object_references: + if ref['ref'] == reference: + return ref['url'] + raise ValueError(f"Url {reference} not found in object references.") + diff --git a/pybliometrics/utils/get_content.py b/pybliometrics/utils/get_content.py index e754a8d..9d71c0d 100644 --- a/pybliometrics/utils/get_content.py +++ b/pybliometrics/utils/get_content.py @@ -1,7 +1,8 @@ from typing import Type -from requests import Session +from requests import exceptions, Session from requests.adapters import HTTPAdapter from urllib3.util import Retry +import xml.etree.ElementTree as ET from pybliometrics import __version__ from pybliometrics import exception @@ -122,6 +123,9 @@ def get_content(url, api, params=None, **kwds): reason = resp.json()['message'] except: reason = "" + except exceptions.JSONDecodeError: + root = ET.fromstring(resp.content) + reason = root.find('.//statusText').text raise errors[resp.status_code](reason) except KeyError: resp.raise_for_status()