pybliometrics-dev#353 Second draft with ObjectRetrieval API

nils-herrmann · Sep 10, 2024 · 14ec92a · 14ec92a
1 parent 57ee8c5
commit 14ec92a
Show file tree

Hide file tree

Showing 3 changed files with 110 additions and 12 deletions.
diff --git a/pybliometrics/sciencedirect/object_metadata.py b/pybliometrics/sciencedirect/object_metadata.py
@@ -0,0 +1,77 @@
+from typing import Optional, Union
+
+from pybliometrics.superclasses import Retrieval
+from pybliometrics.utils import (
+    chained_get,
+    check_parameter_value,
+    detect_id_type,
+    VIEWS,
+)
+
+class ObjectMetadata(Retrieval):
+    """Class to retrieve the metadata of all objects of a document."""
+    @property
+    def results(self) -> list[str]:
+        """List with metadata of objects in a document. The metadata includes the `url`, `eid`,
+        `ref`, `filename`, `mimetype`, `size`, `height`, `width`, and `type` of the object.
+        """
+        refs = chained_get(self._json, ['attachment-metadata-response', 'attachment'])
+        out = []
+        for ref in refs:
+            out.append({'url': ref.get('prism:url'),
+                        'eid': ref.get('eid'),
+                        'ref': ref.get('ref'),
+                        'filename': ref.get('filename'),
+                        'mimetype': ref.get('mimetype'),
+                        'size': ref.get('size'),
+                        'height': ref.get('height'),
+                        'width': ref.get('width'),
+                        'type': ref.get('type')})
+        return out
+
+
+    def __init__(self,
+                 identifier: Union[int, str],
+                 view: str = 'META',
+                 id_type: Optional[str] = None,
+                 refresh: Union[bool, int] = False,
+                 **kwds: str
+                 ):
+        """Class to retrieve the metadata of all objects of a document.
+
+        :param identifier: The identifier of an article.
+        :param view: The view of the object. Allowed values: META.
+        :param id_type: The type of identifier supplied. Allowed values: doi, pii, scopus_id,
+        pubmed_id, eid.
+        :param refresh: Whether to refresh the cached file if it exists. Default: False.
+        """
+        self.identifier = str(identifier)
+        check_parameter_value(view, VIEWS['ObjectRetrieval'], "view")
+
+        self.id_type = id_type
+        if id_type is None:
+            self.id_type = detect_id_type(identifier)
+        else:
+            allowed_id_types = ('doi', 'pii', 'scopus_id', 'pubmed_id', 'eid')
+            check_parameter_value(id_type, allowed_id_types, "id_type")
+
+        self._view = view
+        self._refresh = refresh
+
+        super().__init__(self.identifier, 'ObjectRetrieval', self.id_type)
+
+
+    def _find_mime_type(self, reference: str) -> str:
+        """Auxiliary function to find the MIME type of a specific object"""
+        for ref in self.object_references:
+            if ref['ref'] == reference:
+                return ref['mimetype']
+        raise ValueError(f"Reference {reference} not found in object references.")
+
+
+    def _find_url(self, reference: str) -> str:
+        """Auxiliary function to find the URL of a specific object"""
+        for ref in self.object_references:
+            if ref['ref'] == reference:
+                return ref['url']
+        raise ValueError(f"Url {reference} not found in object references.")
diff --git a/pybliometrics/sciencedirect/object_retrieval.py b/pybliometrics/sciencedirect/object_retrieval.py
@@ -10,12 +10,19 @@
 )
 
 class SpecificObjectRetrieval(Retrieval):
-    def __init__(self, 
+    """Class to retrieve a specific object of a document."""
+    def __init__(self,
                  identifier: str,
                  mime_type: str,
                  id_type: str,
                  refresh: Union[bool, int] = False):
-        """Class to retrieve a specific object of a document."""
+        """Class to retrieve a specific object of a document.
+
+        :param identifier: The identifier of the object.
+        :param mime_type: The MIME type of the object.
+        :param id_type: The type of identifier supplied. Allowed values: `doi`, `pii`, `scopus_id`, `pubmed_id`, `eid`.
+        :param refresh: Whether to refresh the cached file if it exists. Default: False.
+        """
         self._view = ''
         self._refresh = refresh
         super().__init__(identifier,
@@ -26,6 +33,7 @@ def __init__(self,
 
 
 class ObjectRetrieval(Retrieval):
+    """Class to retrieve the object metadata of a document."""
     def get_specific_object(self,
                             ref: str,
                             mime_type: Optional[str] = None,
@@ -34,17 +42,19 @@ def get_specific_object(self,
         """Retrieves a specific object of a document.
 
         :param ref: The reference of the object. This is the `ref` field in the object_references.
-        :param mime_type: The MIME type of the object. If not supplied, it will be determined automatically.
-        :param img_type: Can be used ff the object is an image. The type of image to retrieve. Allowed values: `thumbnail`, `standard`, `high`.
+        :param mime_type: The MIME type of the object. If not supplied, it will be
+        determined automatically.
+        :param img_type: Can be used if the object is an image. The type of image to retrieve.
+        Allowed values: `thumbnail`, `standard`, `high`.
         """
         if not mime_type:
             mime_type = self._find_mime_type(ref)
-        
+
         if not img_type:
-            identifier = f'{self.identifier}/ref/{ref}/standard'
+            identifier = f'{self.identifier}/ref/{ref}'
         else:
-            identifier = f'{self.identifier}/ref/{ref}/high'
-        
+            identifier = f'{self.identifier}/ref/{ref}/{img_type}'
+
         spe_obj = SpecificObjectRetrieval(identifier=identifier,
                                           mime_type=mime_type,
                                           id_type=self.id_type,
@@ -53,7 +63,7 @@ def get_specific_object(self,
 
 
     @property
-    def object_references(self):
+    def object_references(self) -> list[str]:
         """List with metadata of objects in a document. The metadata includes the `url`, `eid`,
         `ref`, `filename`, `mimetype`, `size`, `height`, `width`, and `type` of the object.
         """
@@ -102,11 +112,18 @@ def __init__(self,
         super().__init__(self.identifier, 'ObjectRetrieval', self.id_type)
 
 
-    def _find_mime_type(self, reference):
+    def _find_mime_type(self, reference: str) -> str:
         """Auxiliary function to find the MIME type of a specific object"""
         for ref in self.object_references:
             if ref['ref'] == reference:
                 return ref['mimetype']
         raise ValueError(f"Reference {reference} not found in object references.")
 
-
+
+    def _find_url(self, reference: str) -> str:
+        """Auxiliary function to find the URL of a specific object"""
+        for ref in self.object_references:
+            if ref['ref'] == reference:
+                return ref['url']
+        raise ValueError(f"Url {reference} not found in object references.")
+
diff --git a/pybliometrics/utils/get_content.py b/pybliometrics/utils/get_content.py
@@ -1,7 +1,8 @@
 from typing import Type
-from requests import Session
+from requests import exceptions, Session
 from requests.adapters import HTTPAdapter
 from urllib3.util import Retry
+import xml.etree.ElementTree as ET
 
 from pybliometrics import __version__
 from pybliometrics import exception
@@ -122,6 +123,9 @@ def get_content(url, api, params=None, **kwds):
                 reason = resp.json()['message']
             except:
                 reason = ""
+        except exceptions.JSONDecodeError:
+            root = ET.fromstring(resp.content)
+            reason = root.find('.//statusText').text
         raise errors[resp.status_code](reason)
     except KeyError:
         resp.raise_for_status()