Skip to content

Commit

Permalink
Get a default resolver, always a db
Browse files Browse the repository at this point in the history
  • Loading branch information
jbothma committed Jan 14, 2025
1 parent 1514583 commit 04be164
Showing 1 changed file with 34 additions and 35 deletions.
69 changes: 34 additions & 35 deletions nomenklatura/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,18 +47,13 @@ def _load_enricher(path: Path) -> Tuple[Dataset, Enricher[Dataset]]:
return dataset, enricher


def get_resolver() -> Resolver[Entity]:
return Resolver[Entity].make_default()


@click.group(help="Nomenklatura data integration")
def cli() -> None:
logging.basicConfig(level=logging.INFO)


@cli.command("xref", help="Generate dedupe candidates")
@click.argument("path", type=InPath)
@click.option("-r", "--resolver", type=ResPath)
@click.option("-a", "--auto-threshold", type=click.FLOAT, default=None)
@click.option("-l", "--limit", type=click.INT, default=5000)
@click.option("--algorithm", default=DefaultAlgorithm.NAME)
Expand All @@ -72,15 +67,14 @@ def cli() -> None:
)
def xref_file(
path: Path,
resolver: Optional[Path] = None,
auto_threshold: Optional[float] = None,
algorithm: str = DefaultAlgorithm.NAME,
limit: int = 5000,
scored: bool = True,
index: str = Index.name,
clear: bool = False,
) -> None:
resolver_ = _get_resolver(path, resolver)
resolver_ = Resolver[Entity].make_default()
store = load_entity_file_store(path, resolver=resolver_)
algorithm_type = get_algorithm(algorithm)
if algorithm_type is None:
Expand All @@ -103,15 +97,15 @@ def xref_file(
limit=limit,
)
resolver_.commit()
log.info("Xref complete in: %s", resolver_.path)
log.info("Xref complete in: %r", resolver_)


@cli.command("prune", help="Remove dedupe candidates")
@click.argument("resolver", type=ResPath)
def xref_prune(resolver: Path) -> None:
resolver_ = _get_resolver(resolver, resolver)
def xref_prune() -> None:
resolver_ = Resolver[Entity].make_default()
resolver_.begin()
resolver_.prune()
resolver_.save()
resolver_.commit()


@cli.command("apply", help="Apply resolver to an entity stream")
Expand All @@ -124,14 +118,16 @@ def xref_prune(resolver: Path) -> None:
default=None,
help="Add a dataset to the entity metadata",
)
@click.option("-r", "--resolver", required=True, type=ResPath)
def apply(
path: Path, outpath: Path, resolver: Optional[Path], dataset: Optional[str] = None
path: Path, outpath: Path, dataset: Optional[str] = None
) -> None:
resolver_ = _get_resolver(path, resolver)
resolver_ = Resolver[Entity].make_default()
resolver_.begin()
linker = resolver_.get_linker()
resolver_.rollback()
with path_writer(outpath) as outfh:
for proxy in path_entities(path, StreamEntity):
proxy = resolver_.apply_stream(proxy)
proxy = linker.apply_stream(proxy)
if dataset is not None:
proxy.datasets.add(dataset)
write_entity(outfh, proxy)
Expand All @@ -156,26 +152,27 @@ def make_sortable(path: Path, outpath: Path) -> None:
@cli.command("dedupe", help="Interactively judge xref candidates")
@click.argument("path", type=InPath)
@click.option("-x", "--xref", is_flag=True, default=False)
@click.option("-r", "--resolver", type=ResPath)
def dedupe(path: Path, xref: bool = False, resolver: Optional[Path] = None) -> None:
resolver_ = _get_resolver(path, resolver)
def dedupe(path: Path, xref: bool = False) -> None:
resolver_ = Resolver[Entity].make_default()
resolver_.begin()
store = load_entity_file_store(path, resolver=resolver_)
if xref:
index_dir = path.parent / INDEX_SEGMENT
run_xref(resolver_, store, index_dir)
resolver_.commit()

dedupe_ui(resolver_, store)
resolver_.save()


@cli.command("merge-resolver", help="Merge resolver configs")
@click.argument("outpath", type=OutPath)
@click.option("-i", "--inputs", type=InPath, multiple=True)
def merge_resolver(outpath: Path, inputs: Iterable[Path]) -> None:
resolver = Resolver[Entity].load(outpath)
resolver = Resolver[Entity].make_default()
resolver.begin()
for path in inputs:
resolver.merge(path)
resolver.save()
resolver.commit()


@cli.command("train-v1-matcher", help="Train a matching model from judgement pairs")
Expand All @@ -194,44 +191,44 @@ def train_v2_matcher_(pairs_file: Path) -> None:
@click.argument("config", type=InPath)
@click.argument("entities", type=InPath)
@click.option("-o", "--outpath", type=OutPath, default="-")
@click.option("-r", "--resolver", type=ResPath)
def match_command(
config: Path,
entities: Path,
outpath: Path,
resolver: Optional[Path],
) -> None:
resolver_ = _get_resolver(entities, resolver)
resolver_ = Resolver[Entity].make_default()
_, enricher = _load_enricher(config)

try:
resolver_.begin()
with path_writer(outpath) as fh:
stream = path_entities(entities, Entity)
for proxy in match(enricher, resolver_, stream):
write_entity(fh, proxy)
finally:
resolver_.save()
resolver_.commit()
enricher.close()


@cli.command("enrich", help="Fetch extra info from an enrichment source")
@click.argument("config", type=InPath)
@click.argument("entities", type=InPath)
@click.option("-o", "--outpath", type=OutPath, default="-") # noqa
@click.option("-r", "--resolver", type=ResPath)
def enrich_command(
config: Path,
entities: Path,
outpath: Path,
resolver: Optional[Path],
) -> None:
resolver_ = _get_resolver(entities, resolver)
resolver_ = Resolver[Entity].make_default()
_, enricher = _load_enricher(config)
try:
resolver_.begin()
with path_writer(outpath) as fh:
stream = path_entities(entities, Entity)
for proxy in enrich(enricher, resolver_, stream):
write_entity(fh, proxy)
finally:
resolver_.commit()
enricher.close()


Expand All @@ -253,15 +250,17 @@ def make_statements() -> Generator[Statement, None, None]:
@click.option("-i", "--infile", type=InPath, default="-")
@click.option("-o", "--outpath", type=OutPath, default="-")
@click.option("-f", "--format", type=click.Choice(FORMATS), default=CSV)
@click.option("-r", "--resolver", required=True, type=ResPath)
def statements_apply(infile: Path, outpath: Path, format: str, resolver: Path) -> None:
resolver_ = _get_resolver(infile, resolver)
def statements_apply(infile: Path, outpath: Path, format: str) -> None:
resolver_ = Resolver[Entity].make_default()
resolver_.begin()
linker = resolver_.get_linker()
resolver_.rollback()

def _generate() -> Generator[Statement, None, None]:
for stmt in read_path_statements(
infile, format=format, statement_type=Statement
):
yield resolver_.apply_statement(stmt)
yield linker.apply_statement(stmt)

with path_writer(outpath) as outfh:
write_statements(outfh, format, _generate())
Expand Down Expand Up @@ -309,7 +308,7 @@ def statements_aggregate(
@cli.command("load-resolver", help="Load file-based resolver info to database")
@click.argument("source", type=InPath)
def load_resolver(source: Path) -> None:
resolver = get_resolver()
resolver = Resolver[Entity].make_default()
resolver.begin()
resolver.load(source)
resolver.commit()
Expand All @@ -318,7 +317,7 @@ def load_resolver(source: Path) -> None:
@cli.command("dump-resolver", help="Load file-based resolver info to database")
@click.argument("target", type=OutPath)
def dump_resolver(target: Path) -> None:
resolver = get_resolver()
resolver = Resolver[Entity].make_default()
resolver.begin()
resolver.save(target)
resolver.commit()
Expand Down

0 comments on commit 04be164

Please sign in to comment.