diff --git a/node_normalizer/normalizer.py b/node_normalizer/normalizer.py
index ff70603..a00a0b5 100644
--- a/node_normalizer/normalizer.py
+++ b/node_normalizer/normalizer.py
@@ -708,27 +708,33 @@ async def create_node(canonical_id, equivalent_ids, types, info_contents, includ
     # identifier _except_ where one of the types is in preferred_name_boost_prefixes, in which case
     # we prefer the prefixes listed there.
     #
-    # HOWEVER, there are three reasons not to do that here:
-    # 1. For NameRes, it makes sense that we're trying to come up with the best label for a clique
-    #    so we can autocomplete to it. But for NodeNorm, users would be expecting the label that
-    #    goes with the identifier we've normalized to, so we should probably go with that label
-    #    unless that would be annoying (e.g. if it's very long).
-    # 2. It will be impossible to keep this in sync with NameRes for conflated names, since NameRes
-    #    conflation in Babel doesn't pick the preferred label across all possible labels within the
-    #    conflated clique, but instead picks the preferred label for each subclique, and then chooses
-    #    the first preferred label in order of conflation. Which is what we should be doing, but by
-    #    this point we've lost track of each subclique that went into this conflated clique.
-    # 3. Even in a best case scenario, we'd just be trying to replicate some pretty complicated code
-    #    in Babel -- the ideal solution here would be to use the preferred_name being generated by
-    #    Babel, but that will require some large changes to NodeNorm.
-    #
-    # For these reasons, I'm going to try to replace this with a simplified algorithm:
-    # - Order labels in clique identifier order.
-    # - Filter out blank or suspicious identifiers (e.g. `CHEMBL...`) identifiers.
-    # - Filter out labels longer than demote_labels_longer_than unless there are no labels under that size.
-    #
-    # Step 1. Get all possible labels.
-    possible_labels = map(lambda eid: eid.get('l', ''), eids)
+    # Step 1.1. Look for a type with boosted prefixes. Note that types[canonical_id] goes
+    # from most specific to least specific, so the first type that appears in
+    # preferred_name_boost_prefixes is the most specific matching type.
+    possible_labels = []
+    for typ in types[canonical_id]:
+        if typ in config['preferred_name_boost_prefixes']:
+            # This is the most specific matching type, so we use it and then break.
+            possible_labels = list(map(lambda identifier: identifier.get('l', ''),
+                sort_identifiers_with_boosted_prefixes(
+                    eids,
+                    config['preferred_name_boost_prefixes'][typ]
+                )))
+
+            # Add in all the other labels -- we'd still like to consider them, but at a lower priority.
+            for eid in eids:
+                label = eid.get('l', '')
+                if label not in possible_labels:
+                    possible_labels.append(label)
+
+            # Since this is the most specific matching type, we shouldn't consider other
+            # (presumably higher-level) categories, so we break here.
+            break
+
+    # Step 1.2. If none of the types have a preferred_name_boost_prefixes entry, just use
+    # the labels in their Biolink prefix order.
+    if not possible_labels:
+        possible_labels = map(lambda eid: eid.get('l', ''), eids)
 
     # Step 2. Filter out any suspicious labels.
     filtered_possible_labels = [l for l in possible_labels if
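
To make the new Step 1 easier to trace outside the service, here is a minimal, self-contained sketch. `sort_identifiers_with_boosted_prefixes` is not shown in this hunk, so the version below is a hypothetical stand-in that stably moves identifiers with a boosted CURIE prefix to the front; the `'i'` key and all the toy `config`, `eids`, and `types` values are likewise assumptions invented for illustration (only the `'l'` key appears in the diff).

```python
# Sketch of the new Step 1 label selection. Assumptions (not from the diff):
# the helper's behavior, the 'i' key for CURIEs, and all toy data below.

def sort_identifiers_with_boosted_prefixes(identifiers, boosted_prefixes):
    """Hypothetical stand-in: stable sort that moves identifiers whose CURIE
    prefix appears in boosted_prefixes to the front, in boosted_prefixes order;
    everything else keeps its original (Biolink prefix) order."""
    def rank(identifier):
        prefix = identifier.get('i', '').split(':', 1)[0]
        if prefix in boosted_prefixes:
            return boosted_prefixes.index(prefix)
        return len(boosted_prefixes)
    return sorted(identifiers, key=rank)  # sorted() is stable on ties

# Toy clique: eids in Biolink prefix order, types from most to least specific.
config = {'preferred_name_boost_prefixes': {'biolink:ChemicalEntity': ['DRUGBANK']}}
eids = [
    {'i': 'CHEBI:15365', 'l': 'acetylsalicylate'},
    {'i': 'CHEMBL.COMPOUND:CHEMBL25', 'l': 'CHEMBL25'},
    {'i': 'DRUGBANK:DB00945', 'l': 'Aspirin'},
]
canonical_id = 'CHEBI:15365'
types = {canonical_id: ['biolink:SmallMolecule', 'biolink:ChemicalEntity',
                        'biolink:NamedThing']}

# The loop from the diff: the first type with a boost entry is the most
# specific matching type, so its boosted ordering wins and we break.
possible_labels = []
for typ in types[canonical_id]:
    if typ in config['preferred_name_boost_prefixes']:
        possible_labels = [identifier.get('l', '') for identifier in
                           sort_identifiers_with_boosted_prefixes(
                               eids, config['preferred_name_boost_prefixes'][typ])]
        for eid in eids:
            label = eid.get('l', '')
            if label not in possible_labels:
                possible_labels.append(label)
        break
if not possible_labels:
    possible_labels = [eid.get('l', '') for eid in eids]

print(possible_labels)  # ['Aspirin', 'acetylsalicylate', 'CHEMBL25']
```

Note the fallback ordering: when no type matches, labels simply follow the clique's Biolink prefix order, so the behavior only diverges from the old one-liner when a boost entry applies.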
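
The hunk ends just as Step 2 begins, so the actual filter condition isn't visible here. Purely as a hedged illustration of what the removed comments describe (drop blank or suspicious labels such as bare `CHEMBL...` identifiers, and prefer labels no longer than demote_labels_longer_than unless nothing shorter survives), one way such a filter could look is sketched below; the cutoff value, the regex, and the helper name pick_preferred_label are all invented for this example.

```python
import re

# Hypothetical settings: demote_labels_longer_than is named in the removed
# comments, but its value and the "suspicious label" test are assumptions.
demote_labels_longer_than = 40
SUSPICIOUS_LABEL = re.compile(r'^CHEMBL\d+$')  # bare ChEMBL IDs reused as labels

def pick_preferred_label(possible_labels):
    """Sketch of the filtering the removed comments describe."""
    # Step 2. Filter out blank or suspicious labels.
    filtered = [l for l in possible_labels if l and not SUSPICIOUS_LABEL.match(l)]
    # Prefer labels at or under the length cap, falling back to the full
    # filtered list if nothing qualifies.
    short_enough = [l for l in filtered if len(l) <= demote_labels_longer_than]
    candidates = short_enough or filtered
    return candidates[0] if candidates else ''

print(pick_preferred_label(['Aspirin', 'acetylsalicylate', 'CHEMBL25']))  # Aspirin
```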