Revert "Get rid of trying to sync preferred label algorithm."
This reverts commit 67a46c9.
gaurav committed Nov 7, 2024
1 parent 67a46c9 commit d09c560
Showing 1 changed file with 27 additions and 21 deletions.
48 changes: 27 additions & 21 deletions node_normalizer/normalizer.py
@@ -708,27 +708,33 @@ async def create_node(canonical_id, equivalent_ids, types, info_contents, includ
# identifier _except_ where one of the types is in preferred_name_boost_prefixes, in which case
# we prefer the prefixes listed there.
#
# HOWEVER, there are three reasons not to do that here:
# 1. For NameRes, it makes sense that we're trying to come up with the best label for a clique
# so we can autocomplete to it. But for NodeNorm, users would be expecting the label that
# goes with the identifier we've normalized to, so we should probably go with that label
# unless that would be annoying (e.g. if it's very long).
# 2. It will be impossible to keep this in sync with NameRes for conflated names, since NameRes
# conflation in Babel doesn't pick the preferred label across all possible labels within the
# conflated clique, but instead picks the preferred label for each subclique, and then chooses
# the first preferred label in order of conflation. Which is what we should be doing, but by
# this point we've lost track of each subclique that went into this conflated clique.
# 3. Even in a best case scenario, we'd just be trying to replicate some pretty complicated code
# in Babel -- the ideal solution here would be to use the preferred_name being generated by
# Babel, but that will require some large changes to NodeNorm.
#
# For these reasons, I'm going to try to replace this with a simplified algorithm:
# - Order labels in clique identifier order.
# - Filter out blank or suspicious labels (e.g. `CHEMBL...` identifiers).
# - Filter out labels longer than demote_labels_longer_than unless there are no labels under that size.
#
# Step 1. Get all possible labels.
possible_labels = map(lambda eid: eid.get('l', ''), eids)
# Note that types[canonical_id] goes from most specific to least specific, so we
# need to reverse it in order to apply preferred_name_boost_prefixes for the most
# specific type.
possible_labels = []
for typ in types[canonical_id][::-1]:
if typ in config['preferred_name_boost_prefixes']:
# This is the most specific matching type, so we use this and then break.
possible_labels = list(map(lambda identifier: identifier.get('l', ''),
sort_identifiers_with_boosted_prefixes(
eids,
config['preferred_name_boost_prefixes'][typ]
)))

# Add in all the other labels -- we'd still like to consider them, but at a lower priority.
for eid in eids:
label = eid.get('l', '')
if label not in possible_labels:
possible_labels.append(label)

# Since this is the most specific matching type, we shouldn't do other (presumably higher-level)
# categories: so let's break here.
break

# Step 1.2. If we didn't have a preferred_name_boost_prefixes, just use the identifiers in their
# Biolink prefix order.
if not possible_labels:
possible_labels = map(lambda eid: eid.get('l', ''), eids)

# Step 2. Filter out any suspicious labels.
filtered_possible_labels = [l for l in possible_labels if
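The comments in the diff above describe the boosted-prefix label selection that this revert restores. A minimal standalone sketch of that flow follows; the `eid` dict shape with `'i'`/`'l'` keys matches the diff, but the body of `sort_identifiers_with_boosted_prefixes` is an assumption here, stood in by a simple prefix-priority sort, and may differ from the real helper:

```python
# Sketch of the label-selection flow restored by this revert.
# Assumptions (not from the diff): eids are dicts with 'i' (a CURIE) and
# 'l' (a label), and sort_identifiers_with_boosted_prefixes moves
# identifiers with a boosted prefix to the front of the clique order.

def sort_identifiers_with_boosted_prefixes(eids, boosted_prefixes):
    # Stand-in: identifiers whose CURIE prefix is boosted come first,
    # ordered by the boost list; sorted() is stable, so everything else
    # keeps its original clique order.
    def rank(eid):
        prefix = eid.get('i', '').split(':', 1)[0]
        if prefix in boosted_prefixes:
            return boosted_prefixes.index(prefix)
        return len(boosted_prefixes)
    return sorted(eids, key=rank)

def choose_possible_labels(eids, type_list, config):
    """Mirror the diff: walk the type list in reverse, apply the boost for
    the first matching type, then fall back to plain clique order."""
    possible_labels = []
    for typ in reversed(type_list):
        if typ in config['preferred_name_boost_prefixes']:
            possible_labels = [
                eid.get('l', '')
                for eid in sort_identifiers_with_boosted_prefixes(
                    eids, config['preferred_name_boost_prefixes'][typ])
            ]
            # Lower-priority labels: anything not already present,
            # appended in clique order.
            for eid in eids:
                label = eid.get('l', '')
                if label not in possible_labels:
                    possible_labels.append(label)
            break
    if not possible_labels:
        possible_labels = [eid.get('l', '') for eid in eids]
    # Step 2 in the diff filters out suspicious labels; for brevity this
    # sketch only drops blanks.
    return [label for label in possible_labels if label]

eids = [{'i': 'MESH:D000001', 'l': 'mesh label'},
        {'i': 'CHEBI:1234', 'l': 'chebi label'}]
config = {'preferred_name_boost_prefixes': {'biolink:ChemicalEntity': ['CHEBI']}}
print(choose_possible_labels(eids, ['biolink:ChemicalEntity', 'biolink:NamedThing'], config))
```

With a matching type, the CHEBI label is boosted to the front; with no matching type, labels stay in clique identifier order, matching the fallback branch in the diff.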
