From d84fa3a795b73012de67a4268967a31a25cdb381 Mon Sep 17 00:00:00 2001 From: Afonso Pinto Date: Wed, 7 Feb 2024 16:48:59 +0000 Subject: [PATCH] Feature/sckan 226 (#202) * SCKAN-226 feat: Update ingestion script * SCKAN-226 chore: Refactor add_destination to match with add_vias * SCKAN-226 feat: Enforce reference_uri uniqueness * SCKAN-226 feat: Update ingestion logic * SCKAN-226 feat: WIP - Update neurondm script * SCKAN-226 feat: WIP - Undo validation neurondm script * SCKAN-226 feat: Use predicates in partial order processing * SCKAN-226 feat: Merge vias that differ only on from_entities * SCKAN-226 feat: Merge destinations * SCKAN-226 feat: Merge destinations by from_entities * SCKAN-226 feat: Merge origins * SCKAN-226 feat: Update ingestion script * SCKAN-226 feat: Test both region and layer on entity identification * SCKAN-226 feat: Update ingestion script * SCKAN-226 feat: Update default state on ingestion creation * SCKAN-226 feat: Add forward connection ingestion * SCKAN-226 feat: WIP - Update validation and add logging service * SCKAN-226 feat: Make ingestion atomic; Update logging service * SCKAN-226 fix: Update forward connection check * SCKAN-226 refactor: Use update_or_create instead of custom method * SCKAN-226 feat: Add new transition draft to exported for ingestion * SCKAN-226 feat: Remove unnecessary transition * SCKAN-248 fix: Update journey tests * SCKAN-248 feat: Add tests to neurondm processing * SCKAN-248 feat: WIP - Ignore partial order mismatches * Revert "SCKAN-226 feat: Remove unnecessary transition" This reverts commit d29ac5a366d3bfb5db285b2c7b52ded9a2e46e02. * SCKAN-226 feat: Use .update and .create instead of .update_or_create * SCKAN-226 refactor: Use .get_or_create instead of filter * SCKAN-226 refactor: Remove __exact from filter * SCKAN-226 refactor: Add has_changes check * SCKAN-226 feat: Ingest inconsistent populations but flag them with warnings * SCKAN-248 feat: Add invalid state; Update ingestion accordingly * SCKAN-248 feat: Optimize upstream algorithm * SCKAN-248 chore: Remove TODO annotation * SCKAN-248 feat: Add note on step 3 even if state was invalid * SCKAN-248 feat: Allow transition to invalid from any state * SCKAN-248 fix: Update process connection algorithm and jump test * SCKAN-248 feat: Update process connection algorithm and multiple axioms test * SCKAN-248 feat: Add partial order vs axioms validation * SCKAN-248 fix: Update has_changes logic * SCKAN-248 feat: Update notes * SCKAN-248 feat: Remove deprecated method --- backend/composer/enums.py | 1 + .../management/commands/ingest_statements.py | 10 +- ...ter_connectivitystatement_reference_uri.py | 17 + backend/composer/models.py | 27 +- .../services/cs_ingestion/__init__.py | 0 .../cs_ingestion/cs_ingestion_services.py | 728 ++++++++++++------ .../services/cs_ingestion/exceptions.py | 6 + .../composer/services/cs_ingestion/helpers.py | 47 ++ .../services/cs_ingestion/logging_service.py | 40 + .../composer/services/cs_ingestion/models.py | 73 ++ .../services/cs_ingestion/neurondm_script.py | 412 +++++++--- backend/composer/services/state_services.py | 5 + backend/run_ingest.sh | 31 + backend/tests/test_neurondm_processing.py | 160 ++++ .../backend/.openapi-generator/FILES | 1 - 15 files changed, 1198 insertions(+), 360 deletions(-) create mode 100644 backend/composer/migrations/0033_alter_connectivitystatement_reference_uri.py create mode 100644 backend/composer/services/cs_ingestion/__init__.py create mode 100644 backend/composer/services/cs_ingestion/exceptions.py create mode 100644 
backend/composer/services/cs_ingestion/helpers.py create mode 100644 backend/composer/services/cs_ingestion/logging_service.py create mode 100644 backend/composer/services/cs_ingestion/models.py create mode 100755 backend/run_ingest.sh create mode 100644 backend/tests/test_neurondm_processing.py diff --git a/backend/composer/enums.py b/backend/composer/enums.py index 5ea1b830..829b787c 100644 --- a/backend/composer/enums.py +++ b/backend/composer/enums.py @@ -61,6 +61,7 @@ class CSState(models.TextChoices): CONNECTION_MISSING = "connection_missing" NPO_APPROVED = "npo_approved" EXPORTED = "exported" + INVALID = "invalid" class NoteType(models.TextChoices): diff --git a/backend/composer/management/commands/ingest_statements.py b/backend/composer/management/commands/ingest_statements.py index 866598df..685e3816 100644 --- a/backend/composer/management/commands/ingest_statements.py +++ b/backend/composer/management/commands/ingest_statements.py @@ -5,5 +5,13 @@ class Command(BaseCommand): help = "Ingests Statements from neurondm pyp package" + def add_arguments(self, parser): + parser.add_argument( + '--update_upstream', + action='store_true', + help='Set this flag to update upstream statements.', + ) + def handle(self, *args, **options): - ingest_statements() + update_upstream = options['update_upstream'] + ingest_statements(update_upstream) diff --git a/backend/composer/migrations/0033_alter_connectivitystatement_reference_uri.py b/backend/composer/migrations/0033_alter_connectivitystatement_reference_uri.py new file mode 100644 index 00000000..079ca035 --- /dev/null +++ b/backend/composer/migrations/0033_alter_connectivitystatement_reference_uri.py @@ -0,0 +1,17 @@ +# Generated by Django 4.1.4 on 2024-01-05 15:46 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + dependencies = [ + ("composer", "0032_alter_connectivitystatement_projection"), + ] + + operations = [ + migrations.AlterField( + model_name="connectivitystatement", + name="reference_uri", + field=models.URLField(blank=True, null=True, unique=True), + ), + ] diff --git a/backend/composer/models.py b/backend/composer/models.py index 424fcf9a..410e7cd8 100644 --- a/backend/composer/models.py +++ b/backend/composer/models.py @@ -413,7 +413,7 @@ class ConnectivityStatement(models.Model): ) apinatomy_model = models.CharField(max_length=200, null=True, blank=True) additional_information = models.TextField(null=True, blank=True) - reference_uri = models.URLField(null=True, blank=True) + reference_uri = models.URLField(null=True, blank=True, unique=True) functional_circuit_role = models.ForeignKey( FunctionalCircuitRole, on_delete=models.DO_NOTHING, @@ -443,6 +443,7 @@ def __str__(self): CSState.REJECTED, CSState.NPO_APPROVED, CSState.EXPORTED, + CSState.INVALID ], permission=lambda instance, user: ConnectivityStatementService.has_permission_to_transition_to_compose_now( instance, user @@ -500,6 +501,26 @@ def npo_approved(self, *args, **kwargs): def exported(self, *args, **kwargs): ... + @transition( + field=state, + source=[ + CSState.DRAFT, + CSState.COMPOSE_NOW, + CSState.CURATED, + CSState.EXCLUDED, + CSState.REJECTED, + CSState.TO_BE_REVIEWED, + CSState.CONNECTION_MISSING, + CSState.NPO_APPROVED, + CSState.EXPORTED, + ], + target=CSState.INVALID, + permission=lambda instance, user: ConnectivityStatementService.has_permission_to_transition_to_invalid( + instance, user + ), ) + def invalid(self, *args, **kwargs): + ... 
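Illustration (not part of the patch): the new `invalid` transition above is restricted to the system user (see `has_permission_to_transition_to_invalid` in state_services.py further down) and is invoked programmatically during ingestion rather than through the UI. A minimal usage sketch mirroring `do_transition_to_invalid` in cs_ingestion_services.py below; the reference_uri lookup is a hypothetical placeholder:

    # Sketch only — mirrors how the ingestion service drives the transition.
    from composer.models import ConnectivityStatement, User

    system_user = User.objects.get(username="system")
    cs = ConnectivityStatement.objects.get(reference_uri="http://example.org/statement")  # hypothetical URI
    cs.invalid(by=system_user)  # django-fsm transition defined in the hunk above
    cs.save()
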
+ @property def export_id(self): return f"CPR:{self.id:06d}" @@ -590,7 +611,7 @@ class Destination(AbstractConnectionLayer): choices=DestinationType.choices, default=DestinationType.UNKNOWN ) - + objects = DestinationManager() class Meta: @@ -604,7 +625,7 @@ class Meta: class Via(AbstractConnectionLayer): anatomical_entities = models.ManyToManyField(AnatomicalEntity, blank=True, related_name='via_connection_layers') - + objects = ViaManager() type = models.CharField( diff --git a/backend/composer/services/cs_ingestion/__init__.py b/backend/composer/services/cs_ingestion/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/backend/composer/services/cs_ingestion/cs_ingestion_services.py b/backend/composer/services/cs_ingestion/cs_ingestion_services.py index 288c301a..5f5a7de6 100644 --- a/backend/composer/services/cs_ingestion/cs_ingestion_services.py +++ b/backend/composer/services/cs_ingestion/cs_ingestion_services.py @@ -1,292 +1,544 @@ -from composer.models import AnatomicalEntity, Sentence, ConnectivityStatement, Sex, FunctionalCircuitRole, \ - ProjectionPhenotype, Phenotype, Specie, Provenance, Via, Note, User, Destination import logging from datetime import datetime +from typing import List, Dict, Optional, Tuple, Set, Any + +from django.db import transaction + +from composer.models import AnatomicalEntity, Sentence, ConnectivityStatement, Sex, FunctionalCircuitRole, \ + ProjectionPhenotype, Phenotype, Specie, Provenance, Via, Note, User, Destination +from .helpers import get_value_or_none, found_entity, \ + ORIGINS, DESTINATIONS, VIAS, LABEL, SEX, SPECIES, ID, FORWARD_CONNECTION, SENTENCE_NUMBER, \ + FUNCTIONAL_CIRCUIT_ROLE, CIRCUIT_TYPE, CIRCUIT_TYPE_MAPPING, PHENOTYPE, OTHER_PHENOTYPE, NOTE_ALERT, PROVENANCE, \ + VALIDATION_ERRORS +from .logging_service import LoggerService, STATEMENT_INCORRECT_STATE, SENTENCE_INCORRECT_STATE +from .models import LoggableEvent, ValidationErrors from .neurondm_script import main as get_statements_from_neurondm from ...enums import ( CircuitType, NoteType, - DestinationType, - SentenceState, - CSState + CSState, SentenceState ) -from ..state_services import SentenceService, ConnectivityStatementService - -ID = "id" -ORIGIN = "origin" -DESTINATION = "dest" -VIAS = "path" -LABEL = "label" -SENTENCE_NUMBER = 'sentence_number' -ENTITY_URI = 'loc' -TYPE = 'type' -CIRCUIT_TYPE = 'circuit_type' -SEX = 'sex' -FUNCTIONAL_CIRCUIT_ROLE = 'dont_know_fcrp' -PHENOTYPE = 'phenotype' -OTHER_PHENOTYPE = 'other_phenotype' -SPECIES = 'species' -PROVENANCE = 'provenance' -NOTE_ALERT = 'note_alert' -CIRCUIT_TYPE_MAPPING = { - "http://uri.interlex.org/tgbugs/uris/readable/IntrinsicPhenotype": CircuitType.INTRINSIC, - "http://uri.interlex.org/tgbugs/uris/readable/ProjectionPhenotype": CircuitType.PROJECTION, - "http://uri.interlex.org/tgbugs/uris/readable/MotorPhenotype": CircuitType.MOTOR, - "http://uri.interlex.org/tgbugs/uris/readable/SensoryPhenotype": CircuitType.SENSORY, - "": CircuitType.UNKNOWN -} - NOW = datetime.now().strftime("%Y%m%d%H%M%S") +logger_service = LoggerService() -def not_found_entity(entity): - if len(AnatomicalEntity.objects.filter(ontology_uri=entity)) == 0: - return True - return False +def ingest_statements(update_upstream=False): + statements_list = get_statements_from_neurondm(logger_service_param=logger_service) + overridable_statements = get_overwritable_statements(statements_list) + statements = validate_statements(overridable_statements) + + successful_transaction = True + try: + with transaction.atomic(): + for statement 
in statements: + sentence, _ = get_or_create_sentence(statement) + create_or_update_connectivity_statement(statement, sentence) -def has_invalid_entities(statement): - invalid_entities = [ + update_forward_connections(statements) + except Exception as e: + logger_service.add_error(LoggableEvent(statement_id=None, entity_id=None, message=str(e))) + successful_transaction = False + logging.error(f"Ingestion aborted due to {e}") + + logger_service.write_errors_to_file() + + if successful_transaction: + if update_upstream: + update_upstream_statements() + logger_service.write_ingested_statements_to_file(statements) + + +def get_overwritable_statements(statements_list: List[Dict[str, Any]]) -> List[Dict[str, Any]]: + overwritable_statements = [ + statement for statement in statements_list + if not has_invalid_sentence(statement) and not has_invalid_statement(statement) ] - if has_prop(statement[ORIGIN]) and not_found_entity(statement[ORIGIN][0]): - invalid_entities.append({"entity": statement[ORIGIN][0], "prop": ORIGIN}) - if has_prop(statement[DESTINATION]) and not_found_entity(statement[DESTINATION][0][ENTITY_URI]): - invalid_entities.append({"entity": statement[DESTINATION][0][ENTITY_URI], "prop": DESTINATION}) - if has_prop(statement[VIAS]): - for entity in statement[VIAS]: - if not_found_entity(entity[ENTITY_URI]): - invalid_entities.append({"entity": entity[ENTITY_URI], "prop": VIAS}) - - if len(invalid_entities) > 0: - logging.warning( - f'Skip statement {statement[LABEL]} due to the following entities not found in composer db: {invalid_entities}') - return True - return False + return overwritable_statements -def has_multiple_destination(statement): - if len(statement[DESTINATION]) > 1: - logging.warning(f'Skip statement {statement[LABEL]} due to multiple destinations') - return True - else: +def has_invalid_sentence(statement: Dict) -> bool: + try: + sentence = Sentence.objects.get(doi__iexact=statement[ID]) + except Sentence.DoesNotExist: return False + return not can_sentence_be_overwritten(sentence, statement) -def has_invalid_sex(statement): - if has_prop(statement[SEX]) and len(Sex.objects.filter(ontology_uri=statement[SEX][0])) == 0: - logging.warning(f'Skip statement {statement[LABEL]} due to sex {statement[SEX][0]} not found in composer db') - return True - else: +def has_invalid_statement(statement: Dict) -> bool: + try: + connectivity_statement = ConnectivityStatement.objects.get(reference_uri=statement[ID]) + except ConnectivityStatement.DoesNotExist: return False + return not can_statement_be_overwritten(connectivity_statement, statement) -def has_invalid_species(statement): - if has_prop(statement[SPECIES]): - for s in statement[SPECIES]: - try: - Specie.objects.get(ontology_uri=s) - return False - except: - logging.warning(f'Skip statement {statement[LABEL]} due to specie {s} not found in composer db') - return True - return False +def can_statement_be_overwritten(connectivity_statement: ConnectivityStatement, statement) -> bool: + if connectivity_statement.state != CSState.EXPORTED and connectivity_statement.state != CSState.INVALID: + logger_service.add_warning(LoggableEvent(statement[ID], None, STATEMENT_INCORRECT_STATE)) + return False + return True -def get_valid_phenotypes(statement): - valid_phenotypes = [] - if has_prop(statement[PHENOTYPE]): - for p in statement[PHENOTYPE]: - try: - phenotype = Phenotype.objects.get(ontology_uri=p) - valid_phenotypes.append(phenotype) - except: - logging.warning(f'Skip phenotype {p} at statement {statement[LABEL]} not found in 
composer db ') - return valid_phenotypes +def can_sentence_be_overwritten(sentence: Sentence, statement: Dict) -> bool: + if sentence.state != SentenceState.COMPOSE_NOW: + logger_service.add_warning(LoggableEvent(statement[ID], None, SENTENCE_INCORRECT_STATE)) + return False + return True -def pick_first_phenotype(phenotypes_list, statement): - if len(phenotypes_list) == 0: - return None - elif len(phenotypes_list) > 1: - logging.warning( - f'Multiple phenotypes found, ignore the following {phenotypes_list[1:]} at statement {statement[LABEL]}') - return phenotypes_list[0] +def validate_statements(statements: List[Dict[str, Any]]) -> List[Dict[str, Any]]: + db_reference_uris = set(ConnectivityStatement.objects.values_list('reference_uri', flat=True)) + input_statement_ids = {statement[ID] for statement in statements} + statement_ids = input_statement_ids.union(db_reference_uris) -def validate_statements(statement_list): - valid_statements = [] - for statement in statement_list: - if has_multiple_destination(statement): - continue - if has_invalid_entities(statement): - continue - if has_invalid_sex(statement): - continue - if has_invalid_species(statement): - continue + for statement in statements: + # Initialize validation_errors if not already present + if VALIDATION_ERRORS not in statement: + statement[VALIDATION_ERRORS] = ValidationErrors() + + # Validate entities, sex, and species, updating validation_errors accordingly + annotate_invalid_entities(statement) + annotate_invalid_sex(statement) + annotate_invalid_species(statement) + + # Validate forward connection + annotate_invalid_forward_connections(statement, statement_ids) + + return statements + + +def annotate_invalid_entities(statement: Dict) -> bool: + has_invalid_entities = False + + # Consolidate all URIs to check + uris_to_check = list(statement[ORIGINS].anatomical_entities) + uris_to_check.extend(uri for dest in statement[DESTINATIONS] for uri in dest.anatomical_entities) + uris_to_check.extend(uri for via in statement[VIAS] for uri in via.anatomical_entities) + + # Check all URIs and log if not found + for uri in uris_to_check: + if not found_entity(uri): + statement[VALIDATION_ERRORS].entities.add(uri) + has_invalid_entities = True - valid_statements.append(statement) + return has_invalid_entities - return valid_statements +def annotate_invalid_sex(statement: Dict) -> bool: + if statement[SEX]: + if len(statement[SEX]) > 1: + logger_service.add_warning( + LoggableEvent(statement[ID], None, f'Multiple sexes found in statement.')) -def create_artifact_sentence(statement): + first_sex_uri = statement[SEX][0] + if not Sex.objects.filter(ontology_uri=first_sex_uri).exists(): + statement[VALIDATION_ERRORS].sex.add(first_sex_uri) + return True + return False + + +def annotate_invalid_species(statement: Dict) -> bool: + has_invalid_species = False + for species_uri in statement[SPECIES]: + if not Specie.objects.filter(ontology_uri=species_uri).exists(): + statement[VALIDATION_ERRORS].species.add(species_uri) + has_invalid_species = True + return has_invalid_species + + +def annotate_invalid_forward_connections(statement: Dict, statement_ids: Set[str]) -> bool: + has_invalid_forward_connection = False + for reference_uri in statement[FORWARD_CONNECTION]: + if reference_uri not in statement_ids: + statement[VALIDATION_ERRORS].forward_connection.add(reference_uri) + has_invalid_forward_connection = True + return has_invalid_forward_connection + + +def get_or_create_sentence(statement: Dict) -> Tuple[Sentence, bool]: text = 
f'{statement[LABEL]} created from neurondm on {NOW}' has_sentence_reference = len(statement[SENTENCE_NUMBER]) > 0 + + if len(statement[SENTENCE_NUMBER]) > 1: + logger_service.add_warning( + LoggableEvent(statement[ID], None, f'Multiple sentence numbers found.')) + sentence, created = Sentence.objects.get_or_create( doi__iexact=statement[ID], - defaults={"title": text[0:185], "text": text, + defaults={"title": text[0:185], + "text": text, "doi": statement[ID], "external_ref": statement[SENTENCE_NUMBER][0] if has_sentence_reference else None, - "batch_name": f"neurondm-{NOW}" if has_sentence_reference else None + "batch_name": f"neurondm-{NOW}" if has_sentence_reference else None, + "state": SentenceState.COMPOSE_NOW }, ) if created: logging.info(f"Sentence for neuron {statement[LABEL]} created.") - sentence.save() - return sentence + return sentence, created + + +def create_or_update_connectivity_statement(statement: Dict, sentence: Sentence) -> Tuple[ConnectivityStatement, bool]: + reference_uri = statement[ID] + defaults = { + "sentence": sentence, + "knowledge_statement": statement[LABEL], + "sex": get_sex(statement), + "circuit_type": get_circuit_type(statement), + "functional_circuit_role": get_functional_circuit_role(statement), + "phenotype": get_phenotype(statement), + "projection_phenotype": get_projection_phenotype(statement), + "reference_uri": statement[ID], + "state": CSState.EXPORTED, + } + + connectivity_statement, created = ConnectivityStatement.objects.get_or_create( + reference_uri=reference_uri, + defaults=defaults + ) + if not created: + if has_changes(connectivity_statement, statement, defaults): + ConnectivityStatement.objects.filter(reference_uri=reference_uri).update(**defaults) + fields_to_refresh = [field for field in defaults.keys() if field != 'state'] + connectivity_statement.refresh_from_db(fields=fields_to_refresh) + add_ingestion_system_note(connectivity_statement) -def has_prop(prop): - return True if len(prop) > 0 else False + validation_errors = statement.get(VALIDATION_ERRORS, ValidationErrors()) + if validation_errors.has_errors() and connectivity_statement.state != CSState.INVALID: + error_message = validation_errors.to_string() + do_transition_to_invalid(connectivity_statement, error_message) -def get_value_or_none(model, prop): - if has_prop(prop): - if model == AnatomicalEntity: - return model.objects.filter(ontology_uri=prop[0])[0] + update_many_to_many_fields(connectivity_statement, statement) + + return connectivity_statement, created + + +def has_changes(connectivity_statement, statement, defaults): + validation_errors = statement.get(VALIDATION_ERRORS, ValidationErrors()) + + for field, value in defaults.items(): + if field == 'state': + continue + + if field in ['sex', 'functional_circuit_role', 'phenotype', 'projection_phenotype']: + current_fk_id = getattr(connectivity_statement, f'{field}_id') + new_fk_id = value.id if value is not None else None + if current_fk_id != new_fk_id: + return True else: - return model.objects.get(ontology_uri=prop[0]) - else: - return None + # For simple fields, directly compare the values + if getattr(connectivity_statement, field) != value: + return True + # Check for changes in species + current_species = set(species.ontology_uri for species in connectivity_statement.species.all()) + new_species = set(uri for uri in statement.get(SPECIES, []) if uri not in validation_errors.species) + if current_species != new_species: + return True + + # Check for changes in provenance + current_provenance = 
set(provenance.uri for provenance in connectivity_statement.provenance_set.all()) + new_provenance = set(statement.get(PROVENANCE) or [statement[ID]]) + if current_provenance != new_provenance: + return True + + # Check for changes in forward_connection + current_forward_connections = set( + connection.reference_uri for connection in connectivity_statement.forward_connection.all()) + new_forward_connections = set( + uri for uri in statement.get(FORWARD_CONNECTION, []) if uri not in validation_errors.forward_connection) + if current_forward_connections != new_forward_connections: + return True -def do_transition_to_compose_now(sentence: Sentence, user: User): - available_transitions = [ - available_state.target - for available_state in sentence.get_available_user_state_transitions( - user - ) + # Check for changes in origins + current_origins = set(origin.ontology_uri for origin in connectivity_statement.origins.all()) + new_origins = set(uri for uri in statement[ORIGINS].anatomical_entities if uri not in validation_errors.entities) + if current_origins != new_origins: + return True + + # Check for changes in vias + current_vias = [ + { + 'anatomical_entities': set(via.anatomical_entities.all().values_list('ontology_uri', flat=True)), + 'from_entities': set(via.from_entities.all().values_list('ontology_uri', flat=True)) + } + for via in connectivity_statement.via_set.order_by('order').all() ] - if SentenceState.COMPOSE_NOW in available_transitions: - # we need to update the state to compose_now when the system user has the permission to do so - sentence = SentenceService(sentence).do_transition( - SentenceState.COMPOSE_NOW, user - ) - sentence.save() - return sentence - - -def do_transition_to_exported(statement: ConnectivityStatement, user: User): - cs = ConnectivityStatementService(statement).do_transition( - CSState.EXPORTED, user + new_vias = statement[VIAS] + + if len(current_vias) != len(new_vias): + return True + + for current_via, new_via in zip(current_vias, new_vias): + new_via_anatomical_entities = set( + uri for uri in new_via.anatomical_entities if uri not in validation_errors.entities) + + new_via_from_entities = set(uri for uri in new_via.from_entities if uri not in validation_errors.entities) + + if (new_via_anatomical_entities != current_via['anatomical_entities'] or + new_via_from_entities != current_via['from_entities']): + return True + + # Check for changes in destinations + current_destinations = connectivity_statement.destinations.all() + new_destinations = statement[DESTINATIONS] + + if len(current_destinations) != len(new_destinations): + return True + + # We may need to change this algorithm when multi-destination is supported by neurondm + + current_destinations_anatomical_entities = set( + uri for destination in current_destinations + for uri in destination.anatomical_entities.all().values_list('ontology_uri', flat=True) + ) + current_destinations_from_entities = set( + uri for destination in current_destinations + for uri in destination.from_entities.all().values_list('ontology_uri', flat=True) ) - cs.save() + new_destinations_anatomical_entities = {uri for new_dest in statement[DESTINATIONS] for uri in + new_dest.anatomical_entities if uri not in validation_errors.entities} -def do_state_transitions(sentence: Sentence): - system_user = User.objects.get(username="system") - s = do_transition_to_compose_now(sentence, system_user) - for statement in s.connectivitystatement_set.all(): - available_transitions = [ - available_state.target - for available_state in 
statement.get_available_user_state_transitions( - system_user - ) - ] - # after the sentence and statement transitioned to compose_now, we need to update the statement state to exported - if CSState.EXPORTED in available_transitions: - do_transition_to_exported(statement, system_user) - - -def ingest_statements(): - statements_list = get_statements_from_neurondm() - - # validation - # skip statements with more than one destinations (dest field is now an array of ditcs including entity and type). Path is also now an array of dicts including entity and type. - # skip statements with entities not found in db (log the ones skipped) - # skip statements with sex not found in db - # skip statements with species not found in db - valid_statements = validate_statements(statements_list) - - # create an artifact sentence to relate to the statements - for statement in valid_statements: - sentence = create_artifact_sentence(statement) - - reference_uri = statement[ID] - knowledge_statement = statement[LABEL] - origin = get_value_or_none(AnatomicalEntity, statement[ORIGIN]) - destination = AnatomicalEntity.objects.filter(ontology_uri=statement[DESTINATION][0][ENTITY_URI])[ - 0] if has_prop(statement[DESTINATION]) else None - destination_type = statement[DESTINATION][0][TYPE] if has_prop( - statement[DESTINATION]) else DestinationType.UNKNOWN - circuit_type_uri = statement[CIRCUIT_TYPE][0] if has_prop(statement[CIRCUIT_TYPE]) else "" - circuit_type = CIRCUIT_TYPE_MAPPING[circuit_type_uri] - sex = get_value_or_none(Sex, statement[SEX]) - functional_circuit_role = get_value_or_none(FunctionalCircuitRole, statement[FUNCTIONAL_CIRCUIT_ROLE]) - # some values from neurondm's phenotype field are not mapped in composer db. Add only the first one founded in composer db, if any - phenotypes_list = get_valid_phenotypes(statement) - phenotype = pick_first_phenotype(phenotypes_list, statement) - # other phenotypes includes 3 predicates, we will only store hasProjectionPhenotype which belongs to the last element of the other_phenotypes list from neurondm + new_destinations_from_entities = {uri for new_dest in statement[DESTINATIONS] for uri in new_dest.from_entities if + uri not in validation_errors.entities} + + if (current_destinations_anatomical_entities != new_destinations_anatomical_entities or + current_destinations_from_entities != new_destinations_from_entities): + return True + + # Not checking the Notes because they are kept + + return False + + +def get_sex(statement: Dict) -> Sex: + return get_value_or_none(Sex, statement[SEX][0] if statement[SEX] else None) + + +def get_functional_circuit_role(statement: Dict) -> Optional[FunctionalCircuitRole]: + if len(statement[FUNCTIONAL_CIRCUIT_ROLE]) > 1: + logger_service.add_warning( + LoggableEvent(statement[ID], None, f'Multiple functional circuit roles found.')) + + return get_value_or_none( + FunctionalCircuitRole, statement[FUNCTIONAL_CIRCUIT_ROLE][0]) if statement[FUNCTIONAL_CIRCUIT_ROLE] else None + + +def get_circuit_type(statement: Dict): + if statement[CIRCUIT_TYPE]: + if len(statement[CIRCUIT_TYPE]) > 1: + logger_service.add_warning(LoggableEvent(statement[ID], None, f'Multiple circuit types found')) + return CIRCUIT_TYPE_MAPPING.get(statement[CIRCUIT_TYPE][0], CircuitType.UNKNOWN) + else: + logger_service.add_warning(LoggableEvent(statement[ID], None, f'No circuit type found.')) + return CircuitType.UNKNOWN + + +def get_phenotype(statement: Dict) -> Optional[Phenotype]: + if statement[PHENOTYPE]: + if len(statement[PHENOTYPE]) > 1: + 
logger_service.add_warning(LoggableEvent(statement[ID], None, f'Multiple phenotypes found.')) + + for p in statement[PHENOTYPE]: + try: + phenotype = Phenotype.objects.get(ontology_uri=p) + return phenotype + except Phenotype.DoesNotExist: + pass + + logger_service.add_warning(LoggableEvent(statement[ID], None, f'No valid phenotype found.')) + + return None + + +def get_projection_phenotype(statement: Dict) -> Optional[ProjectionPhenotype]: + if statement[OTHER_PHENOTYPE]: + last_phenotype_uri = statement[OTHER_PHENOTYPE][-1] try: - projection_phenotype = ProjectionPhenotype.objects.get(ontology_uri=statement[OTHER_PHENOTYPE][-1]) - except: - projection_phenotype = None - - # create the statement - - connectivity_statement, created = ConnectivityStatement.objects.get_or_create( - reference_uri__exact=reference_uri, - defaults={ - "sentence": sentence, "knowledge_statement": knowledge_statement, "reference_uri": reference_uri, - "circuit_type": circuit_type, "sex": sex, "functional_circuit_role": functional_circuit_role, - "phenotype": phenotype, "projection_phenotype": projection_phenotype - } - ) - # add the many to many fields: path, species, provenances, notes - if created: - species = Specie.objects.filter(ontology_uri__in=statement[SPECIES]) - connectivity_statement.origins.add(origin) - connectivity_statement.species.add(*species) - - for dest in statement[DESTINATION]: - destination_entity = AnatomicalEntity.objects.filter(ontology_uri=dest[ENTITY_URI]).first() - destination_type = dest.get(TYPE, DestinationType.UNKNOWN) - - if destination_entity: - destination_instance, _ = Destination.objects.get_or_create( - connectivity_statement=connectivity_statement, - defaults={ - "type": destination_type - } - ) - destination_instance.anatomical_entities.add(destination_entity) - - vias_data = [ - Via(connectivity_statement=connectivity_statement, type=via[TYPE], order=index) - for index, via in enumerate(statement[VIAS]) - ] - - created_vias = Via.objects.bulk_create(vias_data) - - for via_instance, via_data in zip(created_vias, statement[VIAS]): - anatomical_entities = AnatomicalEntity.objects.filter( - ontology_uri__in=via_data[ENTITY_URI] - ) - via_instance.anatomical_entities.set(anatomical_entities) - - provenances_list = statement[PROVENANCE][0].split(", ") if has_prop(statement[PROVENANCE]) else [ - statement[ID]] - provenances = (Provenance(connectivity_statement=connectivity_statement, uri=provenance) for provenance in - provenances_list) - Provenance.objects.bulk_create(provenances) - # only 5 statements have a note_alert but they were all filtered out since at least one of their anatomical entities was not found in composer db - if has_prop(statement[NOTE_ALERT]): - Note.objects.create(connectivity_statement=connectivity_statement, - user=User.objects.get(username="system"), type=NoteType.ALERT, - note=statement[NOTE_ALERT][0]) - connectivity_statement.save() - - # transitions sentence from open --> compose_now, and statement from draft --> compose_now --> exported - do_state_transitions(sentence) + projection_phenotype = ProjectionPhenotype.objects.get(ontology_uri=last_phenotype_uri) + return projection_phenotype + except ProjectionPhenotype.DoesNotExist: + pass + else: + logger_service.add_warning(LoggableEvent(statement[ID], None, f'No projection phenotypes found.')) + return None + + +def do_transition_to_invalid(connectivity_statement: ConnectivityStatement, note: str): + system_user = User.objects.get(username="system") + connectivity_statement.invalid(by=system_user) + 
connectivity_statement.save() + + create_invalid_note(connectivity_statement, note) + + +def create_invalid_note(connectivity_statement: ConnectivityStatement, note: str): + Note.objects.create( + connectivity_statement=connectivity_statement, + user=User.objects.get(username="system"), + type=NoteType.ALERT, + note=f"Invalidated due to the following reason(s): {note}" + ) + + +def update_many_to_many_fields(connectivity_statement: ConnectivityStatement, statement: Dict): + connectivity_statement.origins.clear() + connectivity_statement.species.clear() + # Notes are not cleared because they should be kept + + for provenance in connectivity_statement.provenance_set.all(): + provenance.delete() + + for destination in connectivity_statement.destinations.all(): + destination.delete() + + for via in connectivity_statement.via_set.all(): + via.delete() + + add_origins(connectivity_statement, statement) + add_vias(connectivity_statement, statement) + add_destinations(connectivity_statement, statement) + add_species(connectivity_statement, statement) + add_provenances(connectivity_statement, statement) + add_notes(connectivity_statement, statement) + + +def add_origins(connectivity_statement: ConnectivityStatement, statement: Dict): + origin_uris = statement[ORIGINS].anatomical_entities + origins = [] + for uri in origin_uris: + anatomical_entities = AnatomicalEntity.objects.filter(ontology_uri=uri) + origins.append(anatomical_entities.first()) + + if origins: + connectivity_statement.origins.add(*origins) + + +def add_vias(connectivity_statement: ConnectivityStatement, statement: Dict): + vias_data = [ + Via(connectivity_statement=connectivity_statement, type=via.type, order=via.order) + for via in statement[VIAS] + ] + created_vias = Via.objects.bulk_create(vias_data) + + for via_instance, via_data in zip(created_vias, statement[VIAS]): + for uri in via_data.anatomical_entities: + anatomical_entities = AnatomicalEntity.objects.filter(ontology_uri=uri) + via_instance.anatomical_entities.add(anatomical_entities.first()) + for uri in via_data.from_entities: + from_entity = AnatomicalEntity.objects.filter(ontology_uri=uri).first() + via_instance.from_entities.add(from_entity) + + +def add_destinations(connectivity_statement: ConnectivityStatement, statement: Dict): + destinations_data = [ + Destination(connectivity_statement=connectivity_statement, type=dest.type) + for dest in statement[DESTINATIONS] + ] + + created_destinations = Destination.objects.bulk_create(destinations_data) + + for destination_instance, dest_data in zip(created_destinations, statement[DESTINATIONS]): + for uri in dest_data.anatomical_entities: + anatomical_entity = AnatomicalEntity.objects.filter(ontology_uri=uri).first() + destination_instance.anatomical_entities.add(anatomical_entity) + + for uri in dest_data.from_entities: + from_entity = AnatomicalEntity.objects.filter(ontology_uri=uri).first() + destination_instance.from_entities.add(from_entity) + + +def add_notes(connectivity_statement: ConnectivityStatement, statement: Dict): + for note in statement[NOTE_ALERT]: + Note.objects.create(connectivity_statement=connectivity_statement, + user=User.objects.get(username="system"), + type=NoteType.ALERT, + note=note) + + +def add_provenances(connectivity_statement: ConnectivityStatement, statement: Dict): + # todo: check if it's fine to add all provenances, in the past we were only adding the first + provenances_list = statement[PROVENANCE] if statement[PROVENANCE] else [statement[ID]] + provenances = 
(Provenance(connectivity_statement=connectivity_statement, uri=provenance) for provenance in + provenances_list) + Provenance.objects.bulk_create(provenances) + + +def add_species(connectivity_statement: ConnectivityStatement, statement: Dict): + species = Specie.objects.filter(ontology_uri__in=statement[SPECIES]) + connectivity_statement.species.add(*species) + + +def add_ingestion_system_note(connectivity_statement: ConnectivityStatement): + Note.objects.create(connectivity_statement=connectivity_statement, + user=User.objects.get(username="system"), + type=NoteType.ALERT, + note=f"Overwritten by manual ingestion") + + +def update_forward_connections(statements: List): + for statement in statements: + connectivity_statement = ConnectivityStatement.objects.get(reference_uri=statement[ID]) + connectivity_statement.forward_connection.clear() + for uri in statement[FORWARD_CONNECTION]: + try: + forward_statement = ConnectivityStatement.objects.get(reference_uri=uri) + except ConnectivityStatement.DoesNotExist: + assert connectivity_statement.state == CSState.INVALID + continue + connectivity_statement.forward_connection.add(forward_statement) + + +def update_upstream_statements(): + invalid_visited = set() + connectivity_statements_invalid_reasons = {} + + initial_invalid_statements = ConnectivityStatement.objects.filter(state=CSState.INVALID) + + for statement in initial_invalid_statements: + propagate_invalid_state(statement, invalid_visited, connectivity_statements_invalid_reasons) + + for statement_uri, (connectivity_statement, reasons) in connectivity_statements_invalid_reasons.items(): + all_reasons = '; '.join(reasons) + + # Perform transition and create a note only if not already invalid + if connectivity_statement.state != CSState.INVALID: + do_transition_to_invalid(connectivity_statement, all_reasons) + else: + create_invalid_note(connectivity_statement, all_reasons) + + +def propagate_invalid_state(connectivity_statement: ConnectivityStatement, invalid_visited: Set, + connectivity_statements_invalid_reasons: Dict, previous_reason: str = ''): + statement_uri = connectivity_statement.reference_uri + + if statement_uri in invalid_visited: + return + + invalid_visited.add(statement_uri) + + # Fetch backward connections directly from the database + backward_connections = ConnectivityStatement.objects.filter( + forward_connection=connectivity_statement + ) + + for backward_cs in backward_connections: + # Build the reason string + current_reason = (f"statement with id {backward_cs.id} is invalid because its " + f"forward connection with id {connectivity_statement.id} is invalid") + if previous_reason: + current_reason += f" because {previous_reason}" + + # Accumulate reasons in connectivity_statements_invalid_reasons, store ConnectivityStatement object with reasons + if backward_cs.reference_uri not in connectivity_statements_invalid_reasons: + connectivity_statements_invalid_reasons[backward_cs.reference_uri] = (backward_cs, []) + connectivity_statements_invalid_reasons[backward_cs.reference_uri][1].append(current_reason) + + # Recursively propagate invalid state + propagate_invalid_state(backward_cs, invalid_visited, connectivity_statements_invalid_reasons, current_reason) diff --git a/backend/composer/services/cs_ingestion/exceptions.py b/backend/composer/services/cs_ingestion/exceptions.py new file mode 100644 index 00000000..40d875b8 --- /dev/null +++ b/backend/composer/services/cs_ingestion/exceptions.py @@ -0,0 +1,6 @@ +class NeuronDMInconsistency(Exception): + def __init__(self, 
statement_id, entity_id, message):
+        self.statement_id = statement_id
+        self.entity_id = entity_id
+        self.message = message
+        super().__init__(f"StatementID: {statement_id}, EntityID: {entity_id}, Error: {message}")
diff --git a/backend/composer/services/cs_ingestion/helpers.py b/backend/composer/services/cs_ingestion/helpers.py
new file mode 100644
index 00000000..aeac4ba7
--- /dev/null
+++ b/backend/composer/services/cs_ingestion/helpers.py
@@ -0,0 +1,47 @@
+import logging
+from typing import Optional, Dict
+
+from composer.enums import CircuitType
+from composer.models import AnatomicalEntity, ConnectivityStatement
+
+ID = "id"
+ORIGINS = "origins"
+DESTINATIONS = "destinations"
+VIAS = "vias"
+LABEL = "label"
+SENTENCE_NUMBER = 'sentence_number'
+ENTITY_URI = 'loc'
+TYPE = 'type'
+CIRCUIT_TYPE = 'circuit_type'
+FUNCTIONAL_CIRCUIT_ROLE = 'circuit_role'
+SEX = 'sex'
+PHENOTYPE = 'phenotype'
+OTHER_PHENOTYPE = 'other_phenotypes'
+SPECIES = 'species'
+PROVENANCE = 'provenance'
+NOTE_ALERT = 'note_alert'
+FORWARD_CONNECTION = "forward_connection"
+CIRCUIT_TYPE_MAPPING = {
+    "http://uri.interlex.org/tgbugs/uris/readable/IntrinsicPhenotype": CircuitType.INTRINSIC,
+    "http://uri.interlex.org/tgbugs/uris/readable/ProjectionPhenotype": CircuitType.PROJECTION,
+    "http://uri.interlex.org/tgbugs/uris/readable/MotorPhenotype": CircuitType.MOTOR,
+    "http://uri.interlex.org/tgbugs/uris/readable/SensoryPhenotype": CircuitType.SENSORY,
+    "": CircuitType.UNKNOWN
+}
+
+VALIDATION_ERRORS = "validation_errors"
+
+
+def get_value_or_none(model, prop: str):
+    if prop:
+        # .first() returns None rather than raising model.DoesNotExist
+        instance = model.objects.filter(ontology_uri=prop).first()
+        if instance is None:
+            logging.warning(f'{model.__name__} with uri {prop} not found in the database')
+        return instance
+    else:
+        return None
+
+
+def found_entity(uri: str) -> bool:
+    return AnatomicalEntity.objects.filter(ontology_uri=uri).exists()
diff --git a/backend/composer/services/cs_ingestion/logging_service.py b/backend/composer/services/cs_ingestion/logging_service.py
new file mode 100644
index 00000000..34076f2f
--- /dev/null
+++ b/backend/composer/services/cs_ingestion/logging_service.py
@@ -0,0 +1,40 @@
+import csv
+from typing import List, Dict
+
+from composer.enums import CSState, SentenceState
+from composer.services.cs_ingestion.helpers import ID, LABEL
+from composer.services.cs_ingestion.models import LoggableEvent
+
+AXIOM_NOT_FOUND = "Entity not found in any axiom"
+SENTENCE_INCORRECT_STATE = f"Sentence already found and is not in {SentenceState.COMPOSE_NOW} state"
+STATEMENT_INCORRECT_STATE = f"Statement already found and is not in {CSState.EXPORTED} or {CSState.INVALID} state"
+
+INCONSISTENT_AXIOMS = "Region and layer found in different axioms"
+
+
+class LoggerService:
+    def __init__(self, error_log_path='error_log.csv', success_log_path='success_log.csv'):
+        self.error_log_path = error_log_path
+        self.success_log_path = success_log_path
+        self.errors = []
+        self.warnings = []
+
+    def add_error(self, error: LoggableEvent):
+        self.errors.append(error)
+
+    def add_warning(self, error: LoggableEvent):
+        self.warnings.append(error)
+
+    def write_errors_to_file(self):
+        with open(self.error_log_path, 'w', newline='') as file:
+            writer = csv.writer(file)
+            for error in self.errors:
+                writer.writerow(['Error', error.statement_id, error.entity_id, error.message])
+            for warning in self.warnings:
+                writer.writerow(['Warning', warning.statement_id, warning.entity_id, warning.message])
+
+    def write_ingested_statements_to_file(self, statements: List[Dict]):
+        with
open(self.success_log_path, 'w', newline='') as file: + writer = csv.writer(file) + for statement in statements: + writer.writerow([statement[ID], statement[LABEL]]) diff --git a/backend/composer/services/cs_ingestion/models.py b/backend/composer/services/cs_ingestion/models.py new file mode 100644 index 00000000..f66d2ecb --- /dev/null +++ b/backend/composer/services/cs_ingestion/models.py @@ -0,0 +1,73 @@ +from enum import Enum +from typing import Set + + +class NeuronDMOrigin: + def __init__(self, anatomical_entities_uri: Set): + self.anatomical_entities = anatomical_entities_uri + + +class NeuronDMVia: + def __init__(self, anatomical_entities_uri: Set, from_entities: Set, order: int, type: str): + self.anatomical_entities = anatomical_entities_uri + self.from_entities = from_entities + self.order = order + self.type = type + + +class NeuronDMDestination: + def __init__(self, anatomical_entities_uri: Set, from_entities: Set, type: str): + self.anatomical_entities = anatomical_entities_uri + self.from_entities = from_entities + self.type = type + + +class LoggableEvent: + def __init__(self, statement_id, entity_id, message): + self.statement_id = statement_id + self.entity_id = entity_id + self.message = message + + +class AxiomType(Enum): + ORIGIN = 'origin' + VIA = 'via' + DESTINATION = 'destination' + + +class ValidationErrors: + def __init__(self): + self.entities = set() + self.sex = set() + self.species = set() + self.forward_connection = set() + self.axiom_not_found = set() + self.non_specified = [] + + def to_string(self) -> str: + error_messages = [] + if self.entities: + error_messages.append(f"Entities not found: {', '.join(self.entities)}") + if self.sex: + error_messages.append(f"Sex information not found: {', '.join(self.sex)}") + if self.species: + error_messages.append(f"Species not found: {', '.join(self.species)}") + if self.forward_connection: + error_messages.append( + f"Forward connections not found: {', '.join(self.forward_connection)}") + if self.axiom_not_found: + error_messages.append(f"Axioms not found for: {', '.join(self.axiom_not_found)}") + if self.non_specified: + error_messages.extend(self.non_specified) + + return '; '.join(error_messages) if error_messages else "No validation errors." 
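Illustration (not part of the patch): `create_or_update_connectivity_statement` in cs_ingestion_services.py consumes this class by checking `has_errors()` (defined just below) and passing `to_string()` to `do_transition_to_invalid`. A minimal sketch; the URI is a hypothetical placeholder:

    # Sketch only — how the ingestion service uses ValidationErrors.
    errors = ValidationErrors()
    errors.entities.add("http://example.org/entity")  # hypothetical missing-entity URI
    if errors.has_errors():
        reason = errors.to_string()  # "Entities not found: http://example.org/entity"
        # do_transition_to_invalid(connectivity_statement, reason)
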
+ + def has_errors(self) -> bool: + return bool( + self.entities or + self.sex or + self.species or + self.forward_connection or + self.axiom_not_found or + self.non_specified + ) \ No newline at end of file diff --git a/backend/composer/services/cs_ingestion/neurondm_script.py b/backend/composer/services/cs_ingestion/neurondm_script.py index e2c84471..b6b169b7 100644 --- a/backend/composer/services/cs_ingestion/neurondm_script.py +++ b/backend/composer/services/cs_ingestion/neurondm_script.py @@ -1,21 +1,24 @@ import os -from pyontutils.core import OntGraph, OntResIri, OntResPath -from pyontutils.namespaces import rdfs, ilxtr +from typing import Optional, Tuple, List, Set, Dict + +import rdflib +from neurondm import orders from neurondm.core import Config, graphBase, log from neurondm.core import OntTerm, OntId, RDFL -import csv +from pyontutils.core import OntGraph, OntResIri, OntResPath +from pyontutils.namespaces import rdfs, ilxtr +from composer.services.cs_ingestion.exceptions import NeuronDMInconsistency +from composer.services.cs_ingestion.logging_service import LoggerService, AXIOM_NOT_FOUND, INCONSISTENT_AXIOMS +from composer.services.cs_ingestion.models import NeuronDMVia, NeuronDMOrigin, NeuronDMDestination, LoggableEvent, \ + AxiomType, ValidationErrors -def multi_orig_dest(neuron): - for dim in neuron.edges: - if 'hasAxonPre' in dim or 'hasAxonSens' in dim or 'hasSoma' in dim: - objs = list(neuron.getObjects(dim)) - if len(objs) > 1: - return True +logger_service: Optional[LoggerService] = None def makelpesrdf(): collect = [] + def lpes(neuron, predicate): """ get predicates from python bags """ # TODO could add expected cardinality here if needed @@ -31,98 +34,292 @@ def lrdf(neuron, predicate): return lpes, lrdf, collect -def for_composer(n, cull=False): +def for_composer(n): lpes, lrdf, collect = makelpesrdf() + + try: + origins, vias, destinations, validation_errors = get_connections(n, lambda predicate: lpes(n, predicate)) + except NeuronDMInconsistency as e: + if logger_service: + logger_service.add_error(LoggableEvent(e.statement_id, e.entity_id, e.message)) + return None + fc = dict( - id = str(n.id_), - label = str(n.origLabel), - origin = lpes(n, ilxtr.hasSomaLocatedIn), - dest = ( - # XXX looking at this there seems to be a fault assumption that - # there is only a single destination type per statement, this is - # not the case, there is destination type per destination - [dict(loc=l, type='AXON-T') for l in lpes(n, ilxtr.hasAxonPresynapticElementIn)] + - # XXX I strongly reccoment renaming this to SENSORY-T so that the - # short forms are harder to confuse A-T and S-T - [dict(loc=l, type='AFFERENT-T') for l in lpes(n, ilxtr.hasAxonSensorySubcellularElementIn)] - ), - path = ( # TODO pull ordering from partial orders (not implemented in core atm) - [dict(loc=l, type='AXON') for l in lpes(n, ilxtr.hasAxonLocatedIn)] + - # XXX dendrites don't really ... via ... they are all both terminal and via at the same time ... - [dict(loc=l, type='DENDRITE') for l in lpes(n, ilxtr.hasDendriteLocatedIn)] - ), - #laterality = lpes(n, ilxtr.hasLaterality), # left/rigth tricky ? - #projection_laterality = lpes(n, ilxtr.???), # axon located in contra ? 
- species = lpes(n, ilxtr.hasInstanceInTaxon), - sex = lpes(n, ilxtr.hasBiologicalSex), - circuit_type = lpes(n, ilxtr.hasCircuitRolePhenotype), - phenotype = lpes(n, ilxtr.hasAnatomicalSystemPhenotype), # current meaning of composer phenotype - anatomical_system = lpes(n, ilxtr.hasAnatomicalSystemPhenotype), - # there are a number of dimensions that we aren't converting right now - dont_know_fcrp = lpes(n, ilxtr.hasFunctionalCircuitRolePhenotype), - other_phenotype = ( lpes(n, ilxtr.hasPhenotype) - + lpes(n, ilxtr.hasMolecularPhenotype) - + lpes(n, ilxtr.hasProjectionPhenotype)), - forward_connection = lpes(n, ilxtr.hasForwardConnectionPhenotype), - - # direct references from individual individual neurons - provenance = lrdf(n, ilxtr.literatureCitation), - sentence_number = lrdf(n, ilxtr.sentenceNumber), - note_alert = lrdf(n, ilxtr.alertNote), - # XXX provenance from ApiNATOMY models as a whole is not ingested - # right now because composer lacks support for 1:n from neuron to - # prov, (or rather lacks prov collections) and because it attaches - # prov to the sentece, which does not exist for all neurons - - # TODO more ... - # notes = ? - - # for _ignore, hasClassificationPhenotype is used for ApiNATOMY - # unlikely to be encountered for real neurons any time soon - _ignore = lpes(n, ilxtr.hasClassificationPhenotype), # used to ensure we account for all phenotypes + id=str(n.id_), + label=str(n.origLabel), + origins=origins, + destinations=destinations, + vias=vias, + species=lpes(n, ilxtr.hasInstanceInTaxon), + sex=lpes(n, ilxtr.hasBiologicalSex), + circuit_type=lpes(n, ilxtr.hasCircuitRolePhenotype), + circuit_role=lpes(n, ilxtr.hasFunctionalCircuitRolePhenotype), + phenotype=lpes(n, ilxtr.hasAnatomicalSystemPhenotype), + # classification_phenotype=lpes(n, ilxtr.hasClassificationPhenotype), + other_phenotypes=(lpes(n, ilxtr.hasPhenotype) + + lpes(n, ilxtr.hasMolecularPhenotype) + + lpes(n, ilxtr.hasProjectionPhenotype)), + forward_connection=lpes(n, ilxtr.hasForwardConnectionPhenotype), + provenance=lrdf(n, ilxtr.literatureCitation), + sentence_number=lrdf(n, ilxtr.sentenceNumber), + note_alert=lrdf(n, ilxtr.alertNote), + validation_errors=validation_errors, ) - npo = set((p.e, p.p) for p in n.pes) - cpo = set(collect) - unaccounted_pos = npo - cpo - if unaccounted_pos: - log.warning( - (n.id_, [[n.in_graph.namespace_manager.qname(e) for e in pos] - for pos in unaccounted_pos])) - return {k:v for k, v in fc.items() if v} if cull else fc - - -def location_summary(neurons, services, anatent_simple=False): - import csv - OntTerm.query._services = services - locations = sorted(set( - OntTerm(pe.p) for n in neurons for pe in n.pes - if pe.e in n._location_predicates)) - [_.fetch() for _ in locations] - def key(t): - return (t.prefix, t.label[0].lower() - if isinstance(t, tuple) - else t.lower()) - - if anatent_simple: - header = 'label', 'curie', 'iri' - rows = ( - [header] + - [(_.label, _.curie, _.iri) for _ in sorted(locations, key=key)]) - with open('/tmp/npo-nlp-apinat-location-summary.csv', 'wt') as f: - csv.writer(f, lineterminator='\n').writerows(rows) + return fc + + +def get_connections(n, lpes): + partial_order = n.partialOrder() + + if partial_order is None or len(partial_order) == 0: + raise NeuronDMInconsistency(n.identifier, None, "No partial order found") + + origins_from_axioms = lpes(ilxtr.hasSomaLocatedIn) + destinations_from_axioms = create_uri_type_dict(lpes, {ilxtr.hasAxonPresynapticElementIn: 'AXON-T', + ilxtr.hasAxonSensorySubcellularElementIn: 'AFFERENT-T'}) + 
vias_from_axioms = create_uri_type_dict(lpes, + {ilxtr.hasAxonLocatedIn: 'AXON', ilxtr.hasDendriteLocatedIn: 'DENDRITE'}) + + tmp_origins, tmp_vias, tmp_destinations, validation_errors = process_connections(partial_order, + set(origins_from_axioms), + vias_from_axioms, + destinations_from_axioms + ) + + validation_errors = validate_partial_order_and_axioms(origins_from_axioms, vias_from_axioms, + destinations_from_axioms, tmp_origins, + tmp_vias, tmp_destinations, validation_errors) + + origins = merge_origins(tmp_origins) + vias = merge_vias(tmp_vias) + destinations = merge_destinations(tmp_destinations) + return origins, vias, destinations, validation_errors + + +def create_uri_type_dict(lpes_func, predicate_type_map): + uri_type_dict = {} + for predicate, type_name in predicate_type_map.items(): + for uri in lpes_func(predicate): + uri_type_dict[uri] = type_name + return uri_type_dict + + +def process_connections(path, origins_from_axioms: Set[str], vias_from_axioms: Dict[str, str], + destinations_from_axioms: Dict[str, str], from_entities: Optional[Set[str]] = None, + depth: int = 0, result: Optional[Dict] = None) -> Tuple[ + List[NeuronDMOrigin], List[NeuronDMVia], List[NeuronDMDestination], ValidationErrors]: + if result is None: + result = {'origins': [], 'destinations': [], 'vias': [], 'validation_errors': ValidationErrors()} + + if isinstance(path, tuple): + if path[0] == rdflib.term.Literal('blank'): + for remaining_path in path[1:]: + process_connections(remaining_path, origins_from_axioms, vias_from_axioms, destinations_from_axioms, + from_entities, depth=depth, result=result) + else: + current_entity = path[0] + + current_entity_uri, current_entity_axiom_types = get_current_entity_metadata(current_entity, + origins_from_axioms, + vias_from_axioms, + destinations_from_axioms) + + if not current_entity_uri or len(current_entity_axiom_types) == 0: + result['validation_errors'].axiom_not_found.add(str(current_entity)) + if logger_service: + logger_service.add_warning(LoggableEvent(None, current_entity, AXIOM_NOT_FOUND)) + else: + from_entities = from_entities or set() + + axiom_type = get_axiom_type(current_entity_axiom_types, path, depth) + + update_result(current_entity_uri, axiom_type, from_entities, depth, result, vias_from_axioms, + destinations_from_axioms) + + depth += 1 + + next_from_entities = {current_entity_uri} if current_entity_uri else from_entities + # Process the next level structures, carrying over from_entities as a set + for remaining_path in path[1:]: + process_connections(remaining_path, origins_from_axioms, vias_from_axioms, destinations_from_axioms, + next_from_entities, depth, result) + + return result['origins'], result['vias'], result['destinations'], result['validation_errors'] + + +def get_current_entity_metadata(current_entity, origins_from_axioms: Set[str], vias_from_axioms: Dict[str, str], + destinations_from_axioms: Dict[str, str]) -> Tuple[Optional[str], List[AxiomType]]: + primary_uri = current_entity.toPython() if not isinstance(current_entity, + orders.rl) else current_entity.region.toPython() + secondary_uri = current_entity.layer.toPython() if isinstance(current_entity, orders.rl) else None + + uris_in_axioms = [ + (origins_from_axioms, AxiomType.ORIGIN), + (vias_from_axioms, AxiomType.VIA), + (destinations_from_axioms, AxiomType.DESTINATION), + ] + + uris_found = {} + for uri_set, node_type in uris_in_axioms: + # Check if the URIs are in the current set of axioms + if primary_uri in uri_set or secondary_uri in uri_set: + # Prefer layer if 
both region and layer URIs are found + matched_uri = secondary_uri if secondary_uri in uri_set else primary_uri + uris_found.setdefault(matched_uri, []).append(node_type) + + if not uris_found: + return None, [] + + matched_uri, matched_types = next(iter(uris_found.items()), (None, [])) + return matched_uri, matched_types + + +def get_axiom_type(current_entity_axiom_types: List[AxiomType], path, depth: int) -> Optional[AxiomType]: + # Determine the most likely axiom type based on the path context + if not path[1:]: + # If there's nothing after the current entity, it's most likely a Destination + most_likely_type = AxiomType.DESTINATION + elif depth == 0: + # If there's nothing before the current entity, it's most likely an Origin + most_likely_type = AxiomType.ORIGIN else: - header = 'o', 'o_label', 'o_synonym' - rows = ( - [header] + - [(_.iri, _.label, syn) for _ in sorted(locations, key=key) - for syn in _.synonyms]) - with open('/tmp/anatomical_entities.csv', 'wt') as f: - csv.writer(f, lineterminator='\n').writerows(rows) + # Otherwise, it's most likely a Via + most_likely_type = AxiomType.VIA + + # Check if the most likely type is possible + if most_likely_type in current_entity_axiom_types: + return most_likely_type + + # If the most likely type is not possible, choose the first possible one in order of Origin, Via, Destination + for axiom_type in [AxiomType.ORIGIN, AxiomType.VIA, AxiomType.DESTINATION]: + if axiom_type in current_entity_axiom_types: + return axiom_type + + # If no possible type is found, return None + return None + + +def update_result(current_entity_uri: str, axiom_type: AxiomType, from_entities: Set[str], depth: int, result: Dict, + vias_from_axioms: Dict[str, str], + destinations_from_axioms: Dict[str, str]) -> Dict: + if axiom_type == AxiomType.ORIGIN: + result['origins'].append(NeuronDMOrigin({current_entity_uri})) + elif axiom_type == AxiomType.VIA: + result['vias'].append( + NeuronDMVia({current_entity_uri}, from_entities, depth, vias_from_axioms.get(current_entity_uri))) + elif axiom_type == AxiomType.DESTINATION: + result['destinations'].append( + NeuronDMDestination({current_entity_uri}, from_entities, destinations_from_axioms.get(current_entity_uri))) + return result + + +def validate_partial_order_and_axioms(origins_from_axioms, vias_from_axioms, destinations_from_axioms, tmp_origins, + tmp_vias, tmp_destinations, + validation_errors: ValidationErrors) -> ValidationErrors: + anatomical_uris_origins = extract_anatomical_uris(tmp_origins) + anatomical_uris_vias = extract_anatomical_uris(tmp_vias) + anatomical_uris_destinations = extract_anatomical_uris(tmp_destinations) + + # Validate that all axioms were used + if anatomical_uris_origins != set(origins_from_axioms): + validation_errors.non_specified.append( + f"Mismatch in anatomical URIs for origins: expected {origins_from_axioms}, found {anatomical_uris_origins}") + if anatomical_uris_vias != set(vias_from_axioms.keys()): + validation_errors.non_specified.append( + f"Mismatch in anatomical URIs for vias: expected {vias_from_axioms.keys()}, found {anatomical_uris_vias}") + if anatomical_uris_destinations != set(destinations_from_axioms.keys()): + validation_errors.non_specified.append( + f"Mismatch in anatomical URIs for destinations: expected {destinations_from_axioms.keys()}, found {anatomical_uris_destinations}") + + return validation_errors -def main(local=False, anatomical_entities=False, anatent_simple=False): - # if (local := True, anatomical_entities := True, anatent_simple := False): +def 
extract_anatomical_uris(entities_list): + return set(uri for entity in entities_list for uri in entity.anatomical_entities) + + +def merge_origins(origins: List[NeuronDMOrigin]) -> NeuronDMOrigin: + merged_anatomical_entities = set() + for origin in origins: + merged_anatomical_entities.update(origin.anatomical_entities) + + return NeuronDMOrigin(merged_anatomical_entities) + + +def merge_vias(vias: List[NeuronDMVia]) -> List[NeuronDMVia]: + vias = merge_vias_by_from_entities(vias) + vias = merge_vias_by_anatomical_entities(vias) + return assign_unique_order_to_vias(vias) + + +def merge_vias_by_from_entities(vias: List[NeuronDMVia]) -> List[NeuronDMVia]: + merged_vias = {} + for via in vias: + key = (frozenset(via.anatomical_entities), via.type) + if key not in merged_vias: + merged_vias[key] = NeuronDMVia(via.anatomical_entities, set(), via.order, via.type) + merged_vias[key].from_entities.update(via.from_entities) + merged_vias[key].order = max(merged_vias[key].order, via.order) + + return list(merged_vias.values()) + + +def merge_vias_by_anatomical_entities(vias: List[NeuronDMVia]) -> List[NeuronDMVia]: + merged_vias = {} + for via in vias: + key = (via.type, frozenset(via.from_entities)) + if key not in merged_vias: + merged_vias[key] = NeuronDMVia(set(), via.from_entities, via.order, via.type) + merged_vias[key].anatomical_entities.update(via.anatomical_entities) + merged_vias[key].order = max(merged_vias[key].order, via.order) + + return list(merged_vias.values()) + + +def assign_unique_order_to_vias(vias: List[NeuronDMVia]) -> List[NeuronDMVia]: + # Sort vias by their original order + sorted_vias = sorted(vias, key=lambda x: x.order) + + # Assign new orders to maintain uniqueness and relative order + for new_order, via in enumerate(sorted_vias): + via.order = new_order + + return sorted_vias + + +def merge_destinations(destinations: List[NeuronDMDestination]) -> List[NeuronDMDestination]: + destinations = merge_destinations_by_from_entities(destinations) + return merge_destinations_by_anatomical_entities(destinations) + + +def merge_destinations_by_anatomical_entities(destinations: List[NeuronDMDestination]) -> List[NeuronDMDestination]: + merged_destinations = {} + for destination in destinations: + key = (frozenset(destination.anatomical_entities), destination.type) + if key not in merged_destinations: + merged_destinations[key] = NeuronDMDestination(destination.anatomical_entities, set(), destination.type) + merged_destinations[key].from_entities.update(destination.from_entities) + + return list(merged_destinations.values()) + + +def merge_destinations_by_from_entities(destinations: List[NeuronDMDestination]) -> List[NeuronDMDestination]: + merged_destinations = {} + for destination in destinations: + key = frozenset(destination.from_entities) + if key not in merged_destinations: + merged_destinations[key] = NeuronDMDestination(set(), destination.from_entities, destination.type) + merged_destinations[key].anatomical_entities.update(destination.anatomical_entities) + + return list(merged_destinations.values()) + + +## Based on: +## https://github.com/tgbugs/pyontutils/blob/30c415207b11644808f70c8caecc0c75bd6acb0a/neurondm/docs/composer.py#L668-L698 +def main(local=False, logger_service_param: Optional[LoggerService] = None): + global logger_service + logger_service = logger_service_param config = Config('random-merge') g = OntGraph() # load and query graph @@ -171,31 +368,12 @@ def main(local=False, anatomical_entities=False, anatent_simple=False): [g.add((s, rdfs.label, o)) for s, o in 
ori.graph[:rdfs.label:]] config.load_existing(g) - neurons = config.neurons() # scigraph required here if deps not removed above - - # ingest to composer starts here - mvp_ingest = [n for n in neurons if not multi_orig_dest(n)] - - dims = set(p for n in neurons for p in n.edges) # for reference - fcs = [for_composer(n) for n in mvp_ingest] - _fcne = [for_composer(n, cull=True) for n in mvp_ingest] # exclude empties for easier manual review - - # example neuron - n = mvp_ingest[0] - fc = for_composer(n) - - if anatomical_entities: - location_summary(neurons, _noloc_query_services, anatent_simple) - + neurons = config.neurons() - myFile = open('./composer/services/cs_ingestion/neurons.csv', 'w') - writer = csv.DictWriter(myFile, fieldnames=['id','label', 'origin','dest', 'path', 'species', 'sex', 'circuit_type', 'phenotype', 'anatomical_system', 'dont_know_fcrp', 'other_phenotype','forward_connection', 'provenance', 'sentence_number', 'note_alert', '_ignore']) - writer.writeheader() - writer.writerows(fcs) - myFile.close() + fcs = [for_composer(n) for n in neurons] + composer_statements = [item for item in fcs if item is not None] - # breakpoint() - return fcs + return composer_statements if __name__ == '__main__': diff --git a/backend/composer/services/state_services.py b/backend/composer/services/state_services.py index b8f8b568..a7b9990d 100644 --- a/backend/composer/services/state_services.py +++ b/backend/composer/services/state_services.py @@ -173,6 +173,11 @@ def has_permission_to_transition_to_exported(connectivity_statement, user): # only system users can transition to EXPORTED return user.username == 'system' + @staticmethod + def has_permission_to_transition_to_invalid(connectivity_statement, user): + # only system users can transition to INVALID + return user.username == 'system' + @staticmethod def add_important_tag(connectivity_statement): # when a ConnectivityStatement record goes to compose_now state and the previous diff --git a/backend/run_ingest.sh b/backend/run_ingest.sh new file mode 100755 index 00000000..26429d50 --- /dev/null +++ b/backend/run_ingest.sh @@ -0,0 +1,31 @@ +#!/bin/bash + +# Usage: +# ./run_ingest.sh <version> [update_upstream] + +# Check if the version argument is provided +if [ -z "$1" ]; then + echo "Error: Version argument is required." + exit 1 +fi + +version=$1 +update_upstream_flag=$2 # Optional second argument + +# Check if the version is at least 0.1.8 (sort -V accepts 0.1.8 itself) +if [[ "$(printf '%s\n' "0.1.8" "$version" | sort -V | head -n1)" = "0.1.8" ]]; then + echo "Installing neurondm version $version..." + pip install "neurondm==$version" +else + echo "Error: Version must be 0.1.8 or newer." + exit 1 +fi + +# Update the ingest_statements command based on the update_upstream_flag +if [ "$update_upstream_flag" == "update_upstream" ]; then + echo "Running ingest_statements with update_upstream flag..." + python manage.py ingest_statements --update_upstream +else + echo "Running ingest_statements without update_upstream flag..." 
+ python manage.py ingest_statements +fi diff --git a/backend/tests/test_neurondm_processing.py b/backend/tests/test_neurondm_processing.py new file mode 100644 index 00000000..08693e7f --- /dev/null +++ b/backend/tests/test_neurondm_processing.py @@ -0,0 +1,160 @@ +import unittest +import rdflib + +from composer.services.cs_ingestion.neurondm_script import process_connections, merge_origins, merge_vias, \ + merge_destinations + + +class TestProcessConnections(unittest.TestCase): + def test_process_connections_basic(self): + origins_from_axioms = {'Oa'} + vias_from_axioms = {'V1a': 'AXON'} + destinations_from_axioms = {'Da': 'AXON-T'} + + mock_path = ( + rdflib.term.URIRef('Oa'), + ( + rdflib.term.URIRef('V1a'), + ( + rdflib.term.URIRef('Da'), + ) + ) + ) + + tmp_origins, tmp_vias, tmp_destinations, validation_errors = process_connections( + mock_path, + origins_from_axioms, + vias_from_axioms, + destinations_from_axioms + ) + + origins = merge_origins(tmp_origins) + vias = merge_vias(tmp_vias) + destinations = merge_destinations(tmp_destinations) + + self.assertEqual(len(origins.anatomical_entities), 1) + self.assertEqual(len(vias), 1) + self.assertEqual(len(destinations), 1) + + def test_process_connections_jump(self): + origins_from_axioms = {'Oa', 'Ob'} + vias_from_axioms = { + 'V1a': 'AXON', + 'V2a': 'AXON', + 'V3a': 'AXON' + } + destinations_from_axioms = {'Da': 'AXON-T'} + + mock_path_complex = ( + rdflib.term.Literal('blank'), + # Path from the first origin + ( + rdflib.term.URIRef('Oa'), + ( + rdflib.term.URIRef('V1a'), + ( + rdflib.term.URIRef('V2a'), + ( + rdflib.term.URIRef('V3a'), + ( + rdflib.term.URIRef('Da'), + ) + ) + ) + + ) + ), + # Path from the second origin + ( + rdflib.term.URIRef('Ob'), + ( + rdflib.term.Literal('blank'), + ( + rdflib.term.URIRef('V1a'), + ( + rdflib.term.URIRef('V2a'), + ( + rdflib.term.URIRef('V3a'), + ( + rdflib.term.URIRef('Da'), + ) + ) + ) + ), + ( + rdflib.term.URIRef('V3a'), + ( + rdflib.term.URIRef('Da'), + ) + ) + + ) + ), + ) + + tmp_origins, tmp_vias, tmp_destinations, validation_errors = process_connections( + mock_path_complex, + origins_from_axioms, + vias_from_axioms, + destinations_from_axioms + ) + + origins = merge_origins(tmp_origins) + vias = merge_vias(tmp_vias) + destinations = merge_destinations(tmp_destinations) + + self.assertEqual(len(origins.anatomical_entities), 2) + self.assertEqual(len(vias), 3) + self.assertEqual(len(destinations), 1) + via_orders = [via.order for via in vias] + self.assertEqual(len(via_orders), len(set(via_orders)), "Via orders are not unique") + + def test_process_connections_multiple_predicates(self): + origins_from_axioms = {'Oa', 'Ob'} + vias_from_axioms = { + 'V1a': 'AXON', + 'Ob': 'AXON' + } + destinations_from_axioms = {'Da': 'AXON-T'} + + mock_path = ( + rdflib.term.Literal('blank'), + # Path from the first origin + ( + rdflib.term.URIRef('Oa'), + ( + rdflib.term.URIRef('V1a'), + ( + rdflib.term.URIRef('Ob'), + ( + rdflib.term.URIRef('Da'), + ) + ) + ) + ), + # Path from the second origin + ( + rdflib.term.URIRef('Ob'), + ( + rdflib.term.URIRef('Da'), + ) + ), + ) + tmp_origins, tmp_vias, tmp_destinations, validation_errors = process_connections( + mock_path, + origins_from_axioms, + vias_from_axioms, + destinations_from_axioms + ) + + origins = merge_origins(tmp_origins) + vias = merge_vias(tmp_vias) + destinations = merge_destinations(tmp_destinations) + + self.assertEqual(len(origins.anatomical_entities), 2) + self.assertEqual(len(vias), 2) + self.assertEqual(len(destinations), 1) + + +if 
__name__ == '__main__': + unittest.main() diff --git a/frontend/src/apiclient/backend/.openapi-generator/FILES b/frontend/src/apiclient/backend/.openapi-generator/FILES index 16b445ee..a80cd4f0 100644 --- a/frontend/src/apiclient/backend/.openapi-generator/FILES +++ b/frontend/src/apiclient/backend/.openapi-generator/FILES @@ -1,6 +1,5 @@ .gitignore .npmignore -.openapi-generator-ignore api.ts base.ts common.ts
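---

Reviewer notes (not part of the patch; illustrative sketches only):

The nested tuples consumed by `process_connections` come from neurondm's partial orders: each node is `(entity, *continuations)`, with a literal `'blank'` marking a fan-out that consumes no depth level. A minimal standalone sketch of that depth-first walk, using plain strings in place of `rdflib` terms and of the Composer dataclasses:

```python
# Simplified walk over a neurondm-style partial-order tuple. This is a
# sketch for review, not the shipped process_connections: plain strings
# stand in for rdflib terms, and no axiom lookup or merging happens.
BLANK = 'blank'  # neurondm marks branch points with a 'blank' literal

def walk(path, from_entities=frozenset(), depth=0, out=None):
    if out is None:
        out = []
    if path[0] == BLANK:
        # A blank node only fans out; it consumes no depth level.
        for rest in path[1:]:
            walk(rest, from_entities, depth, out)
    else:
        entity = path[0]
        out.append((entity, set(from_entities), depth))
        for rest in path[1:]:
            walk(rest, frozenset({entity}), depth + 1, out)
    return out

# Same shape as the mock paths in test_neurondm_processing.py:
for row in walk(('Oa', ('V1a', ('Da',)))):
    print(row)
# ('Oa', set(), 0)
# ('V1a', {'Oa'}, 1)
# ('Da', {'V1a'}, 2)
```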
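The heuristic in `get_axiom_type` is positional: leaves look like destinations, the root looks like an origin, everything in between looks like a via, with a fixed Origin/Via/Destination fallback when the axioms disagree. A standalone restatement for reviewers; note the real function inspects the remaining path rather than taking a `has_children` flag:

```python
# Restatement of the get_axiom_type heuristic; illustrative only.
from enum import Enum, auto
from typing import List, Optional

class AxiomType(Enum):
    ORIGIN = auto()
    VIA = auto()
    DESTINATION = auto()

def guess_axiom_type(possible: List[AxiomType], has_children: bool,
                     depth: int) -> Optional[AxiomType]:
    # Position in the path suggests the most likely type.
    if not has_children:
        likely = AxiomType.DESTINATION   # nothing after the entity
    elif depth == 0:
        likely = AxiomType.ORIGIN        # nothing before the entity
    else:
        likely = AxiomType.VIA
    if likely in possible:
        return likely
    # Otherwise take the first type the axioms actually allow.
    for fallback in (AxiomType.ORIGIN, AxiomType.VIA, AxiomType.DESTINATION):
        if fallback in possible:
            return fallback
    return None

# An entity declared as both via and destination, seen mid-path:
print(guess_axiom_type([AxiomType.VIA, AxiomType.DESTINATION],
                       has_children=True, depth=3))  # AxiomType.VIA
```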
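The two-pass via merge is easiest to see on data. Assuming `NeuronDMVia` is roughly `(anatomical_entities, from_entities, order, type)` as the diff suggests, pass 1 (`merge_vias_by_from_entities`) collapses vias that differ only in `from_entities`, pass 2 collapses vias that differ only in `anatomical_entities`, and `assign_unique_order_to_vias` then renumbers orders `0..n-1`:

```python
# Toy stand-in for NeuronDMVia to show pass 1 of the merge; field names
# follow the diff, but this is not the shipped class.
from dataclasses import dataclass
from typing import Set

@dataclass
class Via:
    anatomical_entities: Set[str]
    from_entities: Set[str]
    order: int
    type: str

# V3a reached both through V2a (depth 3) and directly from Ob (depth 1),
# as in test_process_connections_jump:
a = Via({'V3a'}, {'V2a'}, 3, 'AXON')
b = Via({'V3a'}, {'Ob'}, 1, 'AXON')

# Pass 1: same (anatomical_entities, type) -> union the from_entities and
# keep the deepest order, so the via sorts after everything feeding it.
merged = Via(set(a.anatomical_entities), set(), 0, a.type)
for v in (a, b):
    merged.from_entities |= v.from_entities
    merged.order = max(merged.order, v.order)

print(merged)
# Via(anatomical_entities={'V3a'}, from_entities={'V2a', 'Ob'},
#     order=3, type='AXON')   (set ordering may vary)
```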
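Finally, `create_uri_type_dict` simply inverts a predicate-to-type map over whatever the `lpes` callable yields per predicate (in the upstream composer.py that is the set of URIs asserted for the predicate). A self-contained sketch with placeholder data standing in for the real ilxtr predicates and the neurondm query:

```python
# create_uri_type_dict as in the diff; `axioms` and `lpes` here are
# placeholder stand-ins for the neurondm graph query.
axioms = {
    'hasAxonLocatedIn': ['http://x.org/V1a', 'http://x.org/V2a'],
    'hasDendriteLocatedIn': ['http://x.org/V3a'],
}

def lpes(predicate):
    # Returns the URIs asserted for a given predicate.
    return axioms.get(predicate, [])

def create_uri_type_dict(lpes_func, predicate_type_map):
    uri_type_dict = {}
    for predicate, type_name in predicate_type_map.items():
        for uri in lpes_func(predicate):
            uri_type_dict[uri] = type_name
    return uri_type_dict

print(create_uri_type_dict(
    lpes, {'hasAxonLocatedIn': 'AXON', 'hasDendriteLocatedIn': 'DENDRITE'}))
# {'http://x.org/V1a': 'AXON', 'http://x.org/V2a': 'AXON',
#  'http://x.org/V3a': 'DENDRITE'}
```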