Skip to content

Commit

Permalink
Introduce some simple filters, refs #189
Browse files Browse the repository at this point in the history
  • Loading branch information
pudo committed Jan 23, 2025
1 parent 2fbb8c9 commit a84fc2d
Show file tree
Hide file tree
Showing 3 changed files with 48 additions and 9 deletions.
32 changes: 25 additions & 7 deletions nomenklatura/enrich/wikidata/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
from typing import cast, Generator, Any, Dict, Optional, Set
from followthemoney.helpers import check_person_cutoff
from rigour.ids.wikidata import is_qid
from fingerprints import clean_brackets

from nomenklatura.entity import CE
from nomenklatura.dataset import DS
Expand All @@ -18,17 +17,14 @@
PROPS_TOPICS,
)
from nomenklatura.enrich.wikidata.model import Claim, Item
from nomenklatura.enrich.wikidata.value import is_alias_strong, clean_name
from nomenklatura.enrich.common import Enricher, EnricherConfig

WD_API = "https://www.wikidata.org/w/api.php"
LABEL_PREFIX = "wd:lb:"
log = logging.getLogger(__name__)


def clean_name(name: str) -> str:
    """Run `clean_brackets` over the name, then strip surrounding whitespace."""
    cleaned = clean_brackets(name)
    return cleaned.strip()


class WikidataEnricher(Enricher[DS]):
def __init__(self, dataset: DS, cache: Cache, config: EnricherConfig):
super().__init__(dataset, cache, config)
Expand Down Expand Up @@ -244,18 +240,27 @@ def item_proxy(self, ref: CE, item: Item, schema: str = "Person") -> Optional[CE
proxy.id = item.id
if item.modified is None:
return None
proxy.add("modifiedAt", item.modified)
# proxy.add("modifiedAt", item.modified)
proxy.add("wikidataId", item.id)
names: Set[str] = set()
for label in item.labels:
label.apply(proxy, "name", clean=clean_name)
names.add(label.text.lower())
item.description.apply(proxy, "notes")
for alias in item.aliases:
alias.apply(proxy, "alias", clean=clean_name)
if alias.text is None or alias.text.lower() in names:
continue
_strong = is_alias_strong(alias.text, names)
prop = "alias" if _strong else "weakAlias"
alias.apply(proxy, prop, clean=clean_name)
if _strong:
names.add(alias.text.lower())

if proxy.schema.is_a("Person") and not item.is_instance("Q5"):
log.debug("Person is not a Q5 [%s]: %s", item.id, item.labels)
return None

names_concat = " ".join(names)
for claim in item.claims:
if claim.property is None:
continue
Expand All @@ -266,6 +271,19 @@ def item_proxy(self, ref: CE, item: Item, schema: str = "Person") -> Optional[CE
log.info("Entity %s does not have property: %s", proxy.id, ftm_prop)
continue
value = claim.text(self)

# Sanity check that the name parts are in any of the full names:
if ftm_prop in ("firstName", "lastName", "fatherName"):
if value.text.lower() not in names_concat:
continue

# Make sure the aliases look like the main name, otherwise mark them as weak:
if ftm_prop == "alias":
if value.text is None or value.text.lower() in names:
continue
_strong = is_alias_strong(value.text, names)
ftm_prop = "alias" if _strong else "weakAlias"

if ftm_prop in PROPS_QUALIFIED:
value = qualify_value(self, value, claim)
if ftm_prop == "topics":
Expand Down
19 changes: 18 additions & 1 deletion nomenklatura/enrich/wikidata/value.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
import logging
from prefixdate import Precision
from typing import TYPE_CHECKING, cast, Any, Dict, Optional
from typing import TYPE_CHECKING, Set, cast, Any, Dict, Optional
from fingerprints import clean_brackets
from rigour.ids.wikidata import is_qid
# from rigour.text.distance import is_levenshtein_plausible

from nomenklatura.dataset import DS
from nomenklatura.enrich.wikidata.lang import LangText
Expand Down Expand Up @@ -62,3 +64,18 @@ def snak_value_to_string(
else:
log.warning("Unhandled value [%s]: %s", value_type, value)
return LangText(None)


def clean_name(name: str) -> str:
    """Normalise a name for use as a property value.

    The name is passed through `clean_brackets` and the result has its
    leading and trailing whitespace removed.
    """
    without_brackets = clean_brackets(name)
    trimmed = without_brackets.strip()
    return trimmed


def is_alias_strong(alias: str, names: Set[str]) -> bool:
    """Decide whether an alias should be treated as a strong alias.

    The only criterion applied at the moment is that the alias is made up of
    at least two whitespace-separated words; single-word aliases are
    considered weak.

    Args:
        alias: The alias text to check.
        names: Names already known for the entity. Not consulted yet; the
            parameter is kept so that a similarity check against the names
            can be added without changing callers.

    Returns:
        True if the alias contains two or more words, False otherwise.
    """
    # Count tokens instead of testing for a literal " ": padding such as
    # " Putin " must not make a single word look like a full name, and other
    # whitespace separators (tab, non-breaking space) should count as well.
    words = alias.split()
    if len(words) < 2:
        return False
    # TODO: also require some similarity between the alias and one of `names`
    # (e.g. a Levenshtein-based plausibility check) before calling it strong.
    return True
6 changes: 5 additions & 1 deletion tests/enrich/test_wikidata.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,11 @@ def test_wikidata_match():
ent = CompositeEntity.from_data(dataset, data)
results = list(enricher.match(ent))
assert len(results) == 1, results
assert results[0].id == "Q7747", results[0]
res0 = results[0]
assert res0.id == "Q7747", res0
assert "Putin" in res0.get("weakAlias")
assert "Vladimir Vladimirovich Putin" in res0.get("alias")
assert "Vladimir" in res0.get("firstName")


def test_wikidata_enrich():
Expand Down

0 comments on commit a84fc2d

Please sign in to comment.