Skip to content

Commit

Permalink
Merge pull request #443 from MITLibraries/rdi-176-add-list-method-option
Browse files Browse the repository at this point in the history
Add method option to harvest command
  • Loading branch information
hakbailey authored Jul 12, 2022
2 parents f4df5d3 + 7a91b16 commit 5c9d22a
Show file tree
Hide file tree
Showing 9 changed files with 4,868 additions and 3,536 deletions.
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -57,4 +57,5 @@ If you have your credentials stored locally, you can omit the passed params like

## Required ENV
`SENTRY_DSN` = If set to a valid Sentry DSN, enables Sentry exception monitoring. This is not needed for local development.
`STATUS_UPDATE_INTERVAL` = The transform process logs the # of records transformed every nth record (1000 by default). Set this env variable to any integer to change the frequency of logging status updates. Can be useful for development/debugging.
`WORKSPACE` = Set to `dev` for local development, this will be set to `stage` and `prod` in those environments by Terraform.
32 changes: 21 additions & 11 deletions harvester/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,15 @@ def main(ctx, host, output_file, verbose):


@main.command()
@click.option(
"--method",
default="list",
show_default=True,
help="Record retrieval method to use. Default 'list' method is faster and should "
"be used in most cases; 'get' method should be used for ArchivesSpace due to "
"errors retrieving a full record set with the 'list' method.",
type=click.Choice(["get", "list"], case_sensitive=False),
)
@click.option(
"-m",
"--metadata-format",
Expand Down Expand Up @@ -78,36 +87,37 @@ def main(ctx, host, output_file, verbose):
is_flag=True,
)
@click.pass_context
def harvest(ctx, metadata_format, from_date, until_date, set_spec, exclude_deleted):
def harvest(
ctx, method, metadata_format, from_date, until_date, set_spec, exclude_deleted
):
"""Harvest records from an OAI-PMH compliant source and write to an output file."""
logger.info(
"OAI-PMH harvesting from source %s with parameters: metadata_format=%s, "
"from_date=%s, until_date=%s, set=%s, exclude_deleted=%s",
"OAI-PMH harvesting from source %s with parameters: method=%s, "
"metadata_format=%s, from_date=%s, until_date=%s, set=%s, exclude_deleted=%s",
ctx.obj["HOST"],
method,
metadata_format,
from_date,
until_date,
set_spec,
exclude_deleted,
)

oai_client = OAIClient(
ctx.obj["HOST"], metadata_format, from_date, until_date, set_spec
)
records = oai_client.retrieve_records(method, exclude_deleted=exclude_deleted)

logger.info("Writing records to output file: %s", ctx.obj["OUTPUT_FILE"])
try:
identifiers = oai_client.get_identifiers()
count = write_records(records, ctx.obj["OUTPUT_FILE"])
except NoRecordsMatch:
logger.error(
"No records harvested: the combination of the provided options results in "
"an empty list."
)
sys.exit()
logger.info(
"Number of records to harvest (including deleted records): %d",
len(identifiers),
)
records = oai_client.get_records(identifiers, exclude_deleted)
logger.info("Writing records to output file: %s", ctx.obj["OUTPUT_FILE"])
count = write_records(records, ctx.obj["OUTPUT_FILE"])

logger.info(
"Harvest completed. Total records harvested (%sincluding deleted records): %d",
"not " if exclude_deleted else "",
Expand Down
62 changes: 41 additions & 21 deletions harvester/oai.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,13 @@
"""oai.py module."""
import json
import logging
from typing import Iterator, Optional
import os
from typing import Iterator, Literal, Optional

import smart_open
from sickle import Sickle
from sickle.models import Record
from sickle.oaiexceptions import IdDoesNotExist

from harvester.config import DEFAULT_RETRY_AFTER, MAX_RETRIES, RETRY_STATUS_CODES

Expand Down Expand Up @@ -49,43 +51,61 @@ def _set_params(
params["set"] = set_spec
self.params = params

def get_identifiers(self) -> list[str]:
responses = self.client.ListIdentifiers(**self.params)
return [record.identifier for record in responses]
def get_identifiers(self, exclude_deleted: bool) -> Iterator[str]:
responses = self.client.ListIdentifiers(
ignore_deleted=exclude_deleted, **self.params
)
for record in responses:
yield record.identifier

def get_records(
self, identifiers: list[str], exclude_deleted: bool
) -> Iterator[Record]:
def get_records(self, identifiers: Iterator[str]) -> Iterator[Record]:
for identifier in identifiers:
record = self.client.GetRecord(
identifier=identifier, metadataPrefix=self.metadata_format
)
logger.debug(
"Record retrieved:\n Deleted:%s\n Header:%s\n Raw:%s\n",
record.deleted,
record.header,
record.raw,
)
if exclude_deleted is True and record.deleted is True:
continue
try:
record = self.client.GetRecord(
identifier=identifier, metadataPrefix=self.metadata_format
)
logger.debug("Record retrieved: %s", identifier)
except IdDoesNotExist:
logger.warning(
"Identifier %s retrieved in identifiers list returned 'id does not "
"exist' during getRecord request",
identifier,
)
yield record

def get_sets(self):
responses = self.client.ListSets()
sets = [{"Set name": set.setName, "Set spec": set.setSpec} for set in responses]
return sets

def list_records(self, exclude_deleted: bool) -> Iterator[Record]:
return self.client.ListRecords(ignore_deleted=exclude_deleted, **self.params)

def retrieve_records(
self, method: Literal["get", "list"], exclude_deleted: bool
) -> Iterator[Record]:
if method == "get":
identifiers = self.get_identifiers(exclude_deleted)
return self.get_records(identifiers)
elif method == "list":
return self.list_records(exclude_deleted)
else:
raise ValueError(
'Method must be either "get" or "list", method provided was "{method}"'
)


def write_records(records: Iterator, filepath: str) -> int:
count = 0
with smart_open.open(filepath, "wb") as file:
file.write("<records>\n".encode())
file.write('<?xml version="1.0" encoding="UTF-8"?>\n<records>\n'.encode())
for record in records:
file.write(" ".encode() + record.raw.encode() + "\n".encode())
count += 1
if count % 1000 == 0:
if count % int(os.getenv("STATUS_UPDATE_INTERVAL", "1000")) == 0:
logger.info(
"Status update: %s records written to output file so far!", count
"Status update: %s records written to output file so far!",
count,
)
file.write("</records>".encode())
return count
Expand Down
Loading

0 comments on commit 5c9d22a

Please sign in to comment.