Skip to content

Commit

Permalink
pybliometrics-dev#360 ScienceDirect Object Retrieval with eid
Browse files Browse the repository at this point in the history
  • Loading branch information
nils-herrmann committed Oct 17, 2024
1 parent 1817501 commit af361f1
Show file tree
Hide file tree
Showing 7 changed files with 116 additions and 5 deletions.
1 change: 1 addition & 0 deletions pybliometrics/sciencedirect/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,4 +3,5 @@
from pybliometrics.sciencedirect.article_retrieval import *
from pybliometrics.sciencedirect.article_metadata import *
from pybliometrics.sciencedirect.object_metadata import *
from pybliometrics.sciencedirect.object_retrieval import *
from pybliometrics.sciencedirect.sciencedirect_search import *
6 changes: 4 additions & 2 deletions pybliometrics/sciencedirect/article_metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,8 +44,10 @@ def results(self) -> Optional[List[NamedTuple]]:
authors_list = [author.get('$') for author in chained_get(item, ['authors', 'author'], [])]
authors_list = deduplicate(authors_list)
authors = ';'.join(authors_list)
first_author = item.get('dc:creator')[0].get('$')
link = item.get('link')[0].get('@href')
first_author = item.get('dc:creator')
first_author = first_author[0].get('$') if first_author else None
link = item.get('link')
link = link[0].get('@href') if link else None
doi = item.get("prism:doi") or item.get("dc:identifier")[4:] if item.get("dc:identifier") else None
new = doc(authorKeywords=item.get('authkeywords'),
authors=authors,
Expand Down
56 changes: 56 additions & 0 deletions pybliometrics/sciencedirect/object_retrieval.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
from io import BytesIO
from typing import Literal, Optional, Union

from pybliometrics.sciencedirect import ArticleRetrieval
from pybliometrics.superclasses import Retrieval
from pybliometrics.utils import (
chained_get,
check_parameter_value,
detect_id_type,
VIEWS,
)

class ObjectRetrieval(Retrieval):
    """Retrieve a single object (e.g. an image file) belonging to a document.

    The raw bytes delivered by the superclass are exposed as a `BytesIO`
    stream through the `object` property.
    """

    @property
    def object(self) -> BytesIO:
        """The object retrieved."""
        return self._object

    def __init__(self,
                 identifier: Union[int, str],
                 filename: str,
                 id_type: Optional[str] = None,
                 refresh: Union[bool, int] = False,
                 **kwds: str
                 ):
        """Class to retrieve a specific object of a document.

        :param identifier: The identifier of the document.
        :param filename: Filename of the object to be retrieved.  To get a
                         list of all available objects of a document (and
                         their corresponding filenames) use the class
                         `ObjectMetadata`.
        :param id_type: The type of identifier supplied.  Allowed values:
                        doi, pii, scopus_id, pubmed_id, eid.  If the value
                        is None, the type is detected from `identifier`.
        :param refresh: Whether to refresh the cached file if it exists.
                        Default: False.
        :param kwds: Keywords passed on to the superclass.
        """
        identifier = str(identifier)

        if id_type is None:
            id_type = detect_id_type(identifier)
        else:
            allowed_id_types = ('doi', 'pii', 'scopus_id', 'pubmed_id', 'eid')
            check_parameter_value(id_type, allowed_id_types, "id_type")

        # The request below is always issued with id_type 'eid', so any
        # other kind of identifier is translated into an EID first.  The
        # (validated or detected) id_type is forwarded so that the lookup
        # does not have to guess the type a second time.
        if id_type != 'eid':
            identifier = self._get_eid(identifier, id_type)
        file_identifier = f'{identifier}-{filename}'

        # Both attributes are set before the superclass initialiser runs.
        self._view = ''
        self._refresh = refresh

        super().__init__(file_identifier, 'ObjectRetrieval', 'eid', **kwds)

        # The superclass leaves the raw bytes in self._object; wrap them in
        # a file-like stream for consumers such as image or XML parsers.
        self._object = BytesIO(self._object)

    def _get_eid(self, identifier: str, id_type: Optional[str] = None) -> str:
        """Get the EID of a document.

        :param identifier: The identifier of the document.
        :param id_type: The type of `identifier`.  If None, the type is left
                        to `ArticleRetrieval` to determine.
        :return: The EID reported by `ArticleRetrieval`.
        """
        am = ArticleRetrieval(identifier, field='eid', id_type=id_type)
        return am.eid
38 changes: 38 additions & 0 deletions pybliometrics/sciencedirect/tests/test_ObjectRetrieval.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
"""Test class ObjectRetrieval"""
import xml.etree.ElementTree as ET

from io import BytesIO
from PIL import Image

from pybliometrics.sciencedirect import init, ObjectRetrieval

# Initialise pybliometrics before any retrieval class is instantiated
# (presumably this loads the user configuration -- see `init`).
init()

# Object requested without an id_type, so the type is auto-detected.
or_1 = ObjectRetrieval(
    identifier='S156984322300331X',
    filename='gr10.jpg',
    refresh=30,
)
# Object requested through a DOI with the identifier type given explicitly.
or_2 = ObjectRetrieval(
    identifier='10.1016/j.rcim.2020.102086',
    filename='si92.svg',
    id_type='doi',
    refresh=30,
)

def test_object():
    """Checks that every retrieved object is exposed as a BytesIO stream."""
    for retrieved in (or_1, or_2):
        assert isinstance(retrieved.object, BytesIO)


def test_is_jpg():
    """Checks that the first retrieved object decodes as a JPEG image."""
    with Image.open(or_1.object) as picture:
        detected_format = picture.format.lower()
        assert detected_format == 'jpeg'


def test_is_svg():
    """Checks that the second retrieved object parses as an SVG document."""
    expected_tag = '{http://www.w3.org/2000/svg}svg'
    svg_root = ET.parse(or_2.object).getroot()
    assert svg_root.tag == expected_tag
14 changes: 12 additions & 2 deletions pybliometrics/superclasses/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,13 +53,17 @@ def __init__(self,
search_request = "query" in params
# Check if ref retrieval for abstract
ab_ref_retrieval = (api == 'AbstractRetrieval') and (params['view'] == 'REF')
# Check if object retrieval
obj_retrieval = (api == 'ObjectRetrieval')

if fname.exists() and not self._refresh:
self._mdate = mod_ts
if search_request:
self._json = [loads(line) for line in
fname.read_text().split("\n") if line]
self._n = len(self._json)
elif obj_retrieval:
self._object = fname.read_bytes()
else:
self._json = loads(fname.read_text())
else:
Expand Down Expand Up @@ -111,6 +115,9 @@ def __init__(self,
self._json = data
else:
data = None
elif obj_retrieval:
self._object = resp.content
data = []
else:
data = loads(resp.text)
self._json = data
Expand All @@ -120,8 +127,11 @@ def __init__(self,
self._header = header
# Finally write data unless download=False
if download:
text = [dumps(item, separators=(',', ':')) for item in data]
fname.write_text("\n".join(text))
if obj_retrieval:
fname.write_bytes(self._object)
else:
text = [dumps(item, separators=(',', ':')) for item in data]
fname.write_text("\n".join(text))

def get_cache_file_age(self) -> int:
"""Return the age of the cached file in days."""
Expand Down
2 changes: 1 addition & 1 deletion pybliometrics/superclasses/retrieval.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ def __init__(self,
"""
# Construct URL and cache file name
url = URLS[api]
if api in ("AbstractRetrieval", "PlumXMetrics", "ArticleRetrieval", "ObjectMetadata"):
if api in ("AbstractRetrieval", "PlumXMetrics", "ArticleRetrieval", "ObjectMetadata", "ObjectRetrieval"):
url += id_type + "/"
if api == 'CitationOverview':
stem = identifier.replace("/", "")
Expand Down
4 changes: 4 additions & 0 deletions pybliometrics/utils/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
'ArticleMetadata': BASE_PATH_SCIENCEDIRECT/'article_metadata/',
'ArticleRetrieval': BASE_PATH_SCIENCEDIRECT/'article_retrieval',
'ObjectMetadata': BASE_PATH_SCIENCEDIRECT/'object_metadata',
'ObjectRetrieval': BASE_PATH_SCIENCEDIRECT/'object_retrieval',
'ScienceDirectSearch': BASE_PATH_SCIENCEDIRECT/'science_direct_search'
}

Expand Down Expand Up @@ -59,6 +60,7 @@
'ArticleMetadata': RETRIEVAL_BASE + 'metadata/article/',
'ArticleRetrieval': RETRIEVAL_BASE + 'article/',
'ObjectMetadata': RETRIEVAL_BASE + 'object/',
'ObjectRetrieval': RETRIEVAL_BASE + 'object/',
'ScienceDirectSearch': SEARCH_BASE + 'sciencedirect/'
}

Expand All @@ -78,6 +80,7 @@
"ArticleRetrieval": ["META", "META_ABS", "META_ABS_REF", "FULL", "ENTITLED"],
"ArticleMetadata": ["STANDARD", "COMPLETE"],
"ObjectMetadata": ["META"],
"ObjectRetrieval": [""],
"ScienceDirectSearch": ["STANDARD"]
}

Expand All @@ -97,6 +100,7 @@
'ArticleMetadata': 6,
'ArticleRetrieval': 10,
'ObjectMetadata': 0,
'ObjectRetrieval': 0,
'ScienceDirectSearch': 2
}

Expand Down

0 comments on commit af361f1

Please sign in to comment.