Skip to content

Commit

Permalink
pybliometrics-dev#353 Second draft with ObjectRetrieval API
Browse files Browse the repository at this point in the history
  • Loading branch information
nils-herrmann committed Sep 10, 2024
1 parent 57ee8c5 commit 14ec92a
Show file tree
Hide file tree
Showing 3 changed files with 110 additions and 12 deletions.
77 changes: 77 additions & 0 deletions pybliometrics/sciencedirect/object_metadata.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
from typing import Optional, Union

from pybliometrics.superclasses import Retrieval
from pybliometrics.utils import (
chained_get,
check_parameter_value,
detect_id_type,
VIEWS,
)

class ObjectMetadata(Retrieval):
    """Class to retrieve the metadata of all objects of a document."""

    @property
    def results(self) -> list[dict]:
        """List with metadata of objects in a document.

        Each entry is a dict with the keys `url`, `eid`, `ref`, `filename`,
        `mimetype`, `size`, `height`, `width`, and `type`.  A key whose
        field is absent from the response maps to None.  The list is empty
        when the response holds no attachments.
        """
        path = ['attachment-metadata-response', 'attachment']
        # Fall back to an empty list so that a response without attachments
        # does not fail on iteration (presumably chained_get yields None
        # for a missing path -- verify against pybliometrics.utils).
        attachments = chained_get(self._json, path) or []
        # Fields whose output key equals the key used in the response.
        plain_keys = ('eid', 'ref', 'filename', 'mimetype', 'size',
                      'height', 'width', 'type')
        out = []
        for attachment in attachments:
            # Only the URL is stored under a namespaced key in the response.
            entry = {'url': attachment.get('prism:url')}
            entry.update((key, attachment.get(key)) for key in plain_keys)
            out.append(entry)
        return out

    def __init__(self,
                 identifier: Union[int, str],
                 view: str = 'META',
                 id_type: Optional[str] = None,
                 refresh: Union[bool, int] = False,
                 **kwds: str
                 ):
        """Class to retrieve the metadata of all objects of a document.

        :param identifier: The identifier of an article.
        :param view: The view of the object.  Allowed values: META.
        :param id_type: The type of identifier supplied.  Allowed values:
                        doi, pii, scopus_id, pubmed_id, eid.  If None, the
                        type is inferred from the identifier.
        :param refresh: Whether to refresh the cached file if it exists.
                        Default: False.
        :param kwds: Additional keyword arguments.
        """
        self.identifier = str(identifier)
        check_parameter_value(view, VIEWS['ObjectRetrieval'], "view")

        self.id_type = id_type
        if id_type is None:
            self.id_type = detect_id_type(identifier)
        else:
            allowed_id_types = ('doi', 'pii', 'scopus_id', 'pubmed_id', 'eid')
            check_parameter_value(id_type, allowed_id_types, "id_type")

        # Both attributes are set before the superclass is initialised.
        self._view = view
        self._refresh = refresh

        # NOTE(review): kwds is accepted but not forwarded to the
        # superclass -- confirm whether this is intentional.
        super().__init__(self.identifier, 'ObjectRetrieval', self.id_type)

    def _find_mime_type(self, reference: str) -> str:
        """Auxiliary function to find the MIME type of a specific object.

        :param reference: The `ref` value of the object to look up.
        :raises ValueError: If no object with this reference exists.
        """
        # The metadata lives in `results`; this class defines no
        # `object_references` attribute.
        for entry in self.results:
            if entry['ref'] == reference:
                return entry['mimetype']
        raise ValueError(f"Reference {reference} not found in object references.")

    def _find_url(self, reference: str) -> str:
        """Auxiliary function to find the URL of a specific object.

        :param reference: The `ref` value of the object to look up.
        :raises ValueError: If no object with this reference exists.
        """
        for entry in self.results:
            if entry['ref'] == reference:
                return entry['url']
        raise ValueError(f"Url {reference} not found in object references.")
39 changes: 28 additions & 11 deletions pybliometrics/sciencedirect/object_retrieval.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,12 +10,19 @@
)

class SpecificObjectRetrieval(Retrieval):
def __init__(self,
"""Class to retrieve a specific object of a document."""
def __init__(self,
identifier: str,
mime_type: str,
id_type: str,
refresh: Union[bool, int] = False):
"""Class to retrieve a specific object of a document."""
"""Class to retrieve a specific object of a document.
:param identifier: The identifier of the object.
:param mime_type: The MIME type of the object.
:param id_type: The type of identifier supplied. Allowed values: `doi`, `pii`, `scopus_id`, `pubmed_id`, `eid`.
:param refresh: Whether to refresh the cached file if it exists. Default: False.
"""
self._view = ''
self._refresh = refresh
super().__init__(identifier,
Expand All @@ -26,6 +33,7 @@ def __init__(self,


class ObjectRetrieval(Retrieval):
"""Class to retrieve the object metadata of a document."""
def get_specific_object(self,
ref: str,
mime_type: Optional[str] = None,
Expand All @@ -34,17 +42,19 @@ def get_specific_object(self,
"""Retrieves a specific object of a document.
:param ref: The reference of the object. This is the `ref` field in the object_references.
:param mime_type: The MIME type of the object. If not supplied, it will be determined automatically.
:param img_type: Can be used ff the object is an image. The type of image to retrieve. Allowed values: `thumbnail`, `standard`, `high`.
:param mime_type: The MIME type of the object. If not supplied, it will be
determined automatically.
:param img_type: Can be used if the object is an image. The type of image to retrieve.
Allowed values: `thumbnail`, `standard`, `high`.
"""
if not mime_type:
mime_type = self._find_mime_type(ref)

if not img_type:
identifier = f'{self.identifier}/ref/{ref}/standard'
identifier = f'{self.identifier}/ref/{ref}'
else:
identifier = f'{self.identifier}/ref/{ref}/high'
identifier = f'{self.identifier}/ref/{ref}/{img_type}'

spe_obj = SpecificObjectRetrieval(identifier=identifier,
mime_type=mime_type,
id_type=self.id_type,
Expand All @@ -53,7 +63,7 @@ def get_specific_object(self,


@property
def object_references(self):
def object_references(self) -> list[str]:
"""List with metadata of objects in a document. The metadata includes the `url`, `eid`,
`ref`, `filename`, `mimetype`, `size`, `height`, `width`, and `type` of the object.
"""
Expand Down Expand Up @@ -102,11 +112,18 @@ def __init__(self,
super().__init__(self.identifier, 'ObjectRetrieval', self.id_type)


def _find_mime_type(self, reference):
def _find_mime_type(self, reference: str) -> str:
"""Auxiliary function to find the MIME type of a specific object"""
for ref in self.object_references:
if ref['ref'] == reference:
return ref['mimetype']
raise ValueError(f"Reference {reference} not found in object references.")



def _find_url(self, reference: str) -> str:
"""Auxiliary function to find the URL of a specific object"""
for ref in self.object_references:
if ref['ref'] == reference:
return ref['url']
raise ValueError(f"Url {reference} not found in object references.")

6 changes: 5 additions & 1 deletion pybliometrics/utils/get_content.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
from typing import Type
from requests import Session
from requests import exceptions, Session
from requests.adapters import HTTPAdapter
from urllib3.util import Retry
import xml.etree.ElementTree as ET

from pybliometrics import __version__
from pybliometrics import exception
Expand Down Expand Up @@ -122,6 +123,9 @@ def get_content(url, api, params=None, **kwds):
reason = resp.json()['message']
except:
reason = ""
except exceptions.JSONDecodeError:
root = ET.fromstring(resp.content)
reason = root.find('.//statusText').text
raise errors[resp.status_code](reason)
except KeyError:
resp.raise_for_status()
Expand Down

0 comments on commit 14ec92a

Please sign in to comment.