Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

HDXDSYS-872 Avoid calling normalise multiple times #48

Merged
merged 1 commit into from
Jul 8, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ hdx-python-utilities==3.7.2
# via hdx-python-country (pyproject.toml)
humanize==4.9.0
# via frictionless
-identify==2.5.36
+identify==2.6.0
# via pre-commit
idna==3.7
# via requests
Expand Down
33 changes: 19 additions & 14 deletions src/hdx/location/adminlevel.py
Original file line number Diff line number Diff line change
Expand Up @@ -144,8 +144,9 @@ def setup_row(
self.pcodes.append(pcode)
self.pcode_to_name[pcode] = adm_name

+adm_name = normalise(adm_name)
name_to_pcode = self.name_to_pcode.get(countryiso3, {})
-name_to_pcode[normalise(adm_name)] = pcode
+name_to_pcode[adm_name] = pcode
self.name_to_pcode[countryiso3] = name_to_pcode
self.pcode_to_iso3[pcode] = countryiso3
self.pcode_to_iso3[pcode] = countryiso3
Expand All @@ -155,7 +156,7 @@ def setup_row(
countryiso3, {}
)
name_to_pcode = name_parent_to_pcode.get(parent, {})
-name_to_pcode[normalise(adm_name)] = pcode
+name_to_pcode[adm_name] = pcode
name_parent_to_pcode[parent] = name_to_pcode
self.name_parent_to_pcode[countryiso3] = name_parent_to_pcode
self.pcode_to_parent[pcode] = parent
Expand Down Expand Up @@ -554,13 +555,15 @@ def fuzzy_pcode(
self,
countryiso3: str,
name: str,
+normalised_name: str,
**kwargs: Any,
) -> Optional[str]:
"""Fuzzy match name to pcode

Args:
countryiso3 (str): ISO3 country code
name (str): Name to match
+normalised_name (str): Normalised name
**kwargs:
parent (Optional[str]): Parent admin code
logname (str): Log using this identifying name. Defaults to not logging.
Expand Down Expand Up @@ -597,21 +600,20 @@ def fuzzy_pcode(
if logname:
self.errors.add((logname, countryiso3, parent))
return None
-adm_name_lookup = normalise(name)
-adm_name_lookup2 = multiple_replace(
-adm_name_lookup,
+alt_normalised_name = multiple_replace(
+normalised_name,
self.get_admin_name_replacements(countryiso3, parent),
)
pcode = name_to_pcode.get(
-adm_name_lookup, name_to_pcode.get(adm_name_lookup2)
+normalised_name, name_to_pcode.get(alt_normalised_name)
)
if not pcode and name.lower() in self.admin_fuzzy_dont:
if logname:
self.ignored.add((logname, countryiso3, name))
return None
if not pcode:
for map_name in name_to_pcode:
-if adm_name_lookup in map_name:
+if normalised_name in map_name:
pcode = name_to_pcode[map_name]
if logname:
self.matches.add(
Expand All @@ -625,7 +627,7 @@ def fuzzy_pcode(
)
break
for map_name in name_to_pcode:
-if adm_name_lookup2 in map_name:
+if alt_normalised_name in map_name:
pcode = name_to_pcode[map_name]
if logname:
self.matches.add(
Expand Down Expand Up @@ -659,8 +661,8 @@ def al_transform_2(name):

matching_index = self.phonetics.match(
map_names,
-adm_name_lookup,
-alternative_name=adm_name_lookup2,
+normalised_name,
+alternative_name=alt_normalised_name,
transform_possible_names=[al_transform_1, al_transform_2],
)

Expand Down Expand Up @@ -754,25 +756,28 @@ def get_pcode(
)
return pcode, True
else:
+normalised_name = normalise(name)
if parent:
name_parent_to_pcode = self.name_parent_to_pcode.get(
countryiso3
)
if name_parent_to_pcode:
name_to_pcode = name_parent_to_pcode.get(parent)
if name_to_pcode is not None:
-pcode = name_to_pcode.get(name.lower())
+pcode = name_to_pcode.get(normalised_name)
if pcode:
return pcode, True
else:
name_to_pcode = self.name_to_pcode.get(countryiso3)
if name_to_pcode is not None:
-pcode = name_to_pcode.get(name.lower())
+pcode = name_to_pcode.get(normalised_name)
if pcode:
return pcode, True
-if not fuzzy_match or len(name) < fuzzy_length:
+if not fuzzy_match or len(normalised_name) < fuzzy_length:
return None, True
-pcode = self.fuzzy_pcode(countryiso3, name, **kwargs)
+pcode = self.fuzzy_pcode(
+countryiso3, name, normalised_name, **kwargs
+)
return pcode, False

def output_matches(self) -> List[str]:
Expand Down