Skip to content

Commit

Permalink
Merge pull request #54 from NASA-PDS/registry-api-349
Browse files Browse the repository at this point in the history
New repairkit sweeper for updating metadata loaded with older versions of Harvest to use lists instead of single-value strings
  • Loading branch information
jordanpadams authored Aug 16, 2023
2 parents 2d80cef + 5c19915 commit b88f8f0
Show file tree
Hide file tree
Showing 6 changed files with 120 additions and 1 deletion.
4 changes: 3 additions & 1 deletion docker/sweepers_driver.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@
from datetime import datetime
from typing import Callable

from pds.registrysweepers import provenance, ancestry
from pds.registrysweepers import provenance, ancestry, repairkit
from pds.registrysweepers.utils import configure_logging, get_human_readable_elapsed_since, parse_log_level

configure_logging(filepath=None, log_level=logging.INFO)
Expand Down Expand Up @@ -108,10 +108,12 @@ def run_factory(sweeper_f: Callable) -> Callable:

run_provenance = run_factory(provenance.run)
run_ancestry = run_factory(ancestry.run)
run_repairkit = run_factory(repairkit.run)

log.info('Running sweepers')
execution_begin = datetime.now()

run_repairkit()
run_provenance()
run_ancestry()

Expand Down
72 changes: 72 additions & 0 deletions src/pds/registrysweepers/repairkit/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
"""repairkit is an executable package
The reason repairkit is an executable package is for extension as new repairs
are needed in the future. They can be added by updating the REPAIR_TOOLS mapping
with the new field name and functional requirements. All the additions can then
be modules with this executable package.
"""
import logging
import re
from typing import Union

from pds.registrysweepers.utils import configure_logging
from pds.registrysweepers.utils import Host
from pds.registrysweepers.utils import query_registry_db
from pds.registrysweepers.utils import write_updated_docs

from . import allarrays

"""
dictionary repair tools is {field_name:[funcs]} where field_name can be:
1: re.compile().fullmatch for the equivalent of "fred" == "fred"
2: re.compile().match for more complex matching of subparts of the string
and funcs are:
def function_name (document:{}, fieldname:str)->{}
and the return an empty {} if no changes and {fieldname:new_value} for repairs
Examples
re.compile("^ops:Info/.+").match("ops:Info/ops:filesize")->match object
re.compile("^ops:Info/.+").fullmatch("ops:Info/ops:filesize")->match object
re.compile("^ops:Info/").match("ops:Info/ops:filesize")->match object
re.compile("^ops:Info/").fullmatch("ops:Info/ops:filesize")->None
To get str_a == str_b, re.compile(str_a).fullmatch
"""

REPAIR_TOOLS = {
re.compile("^ops:Data_File_Info/").match: [allarrays.repair],
re.compile("^ops:Label_File_Info/").match: [allarrays.repair],
}

log = logging.getLogger(__name__)


def run(
base_url: str,
username: str,
password: str,
verify_host_certs: bool = True,
log_filepath: Union[str, None] = None,
log_level: int = logging.INFO,
):
configure_logging(filepath=log_filepath, log_level=log_level)
log.info("starting CLI processing")
host = Host(password, base_url, username, verify_host_certs)
for document in query_registry_db(host, {"match_all": {}}, {}):
id = document["_id"]
src = document["_source"]
repairs = {}
log.debug(f"working on document: {id}")
for fieldname, data in src.items():
for regex, funcs in REPAIR_TOOLS.items():
if regex(fieldname):
for func in funcs:
repairs.update(func(src, fieldname))
if repairs:
log.info(f"Writing repairs to document: {id}")
write_updated_docs(host, {id: repairs})
return
13 changes: 13 additions & 0 deletions src/pds/registrysweepers/repairkit/__main__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
from pds.registrysweepers.repairkit import run
from pds.registrysweepers.utils import parse_args

args = parse_args(description="sweep through the registry documents and fix common errors")

run(
base_url=args.base_URL,
username=args.username,
password=args.password,
verify_host_certs=not args.insecure,
log_level=args.log_level,
log_filepath=args.log_file,
)
13 changes: 13 additions & 0 deletions src/pds/registrysweepers/repairkit/allarrays.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
"""change single strings to array of strings"""
import logging
from typing import Dict

log = logging.getLogger(__name__)


def repair(document: Dict, fieldname: str) -> Dict:
log.debug(f"checking {fieldname}")
if isinstance(document[fieldname], str):
log.info(f"found string for {fieldname} where it should be an array")
return {fieldname: [document[fieldname]]}
return {}
Empty file.
19 changes: 19 additions & 0 deletions tests/pds/registrysweepers/repairkit/test_allarrays.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
import unittest

from pds.registrysweepers.repairkit import allarrays


class AllArrays(unittest.TestCase):
def test_valid_field(self):
src = {"apple": ["orange"]}
repair = allarrays.repair(src, "apple")
self.assertEqual({}, repair)

def test_invalid_field(self):
src = {"apple": "orange"}
repair = allarrays.repair(src, "apple")
self.assertEqual({"apple": ["orange"]}, repair)


if __name__ == "__main__":
unittest.main()

0 comments on commit b88f8f0

Please sign in to comment.