diff --git a/scripts/run_bone.py b/scripts/run_bone.py index d65f71f..52fd4bd 100644 --- a/scripts/run_bone.py +++ b/scripts/run_bone.py @@ -1,14 +1,17 @@ import os from google.protobuf.json_format import MessageToJson -from cdapython import Q from oncoexporter.cda import CdaTableImporter, configure_cda_table_importer -table_importer: CdaTableImporter = configure_cda_table_importer() +###### Input parameters ######## +table_importer: CdaTableImporter = configure_cda_table_importer(use_cache=True) -Tsite = Q('primary_diagnosis_site = "%bone%" OR primary_diagnosis_site = "%osseous%"', ) +Query = {'match_any': ['primary_diagnosis_site = *bone*', + 'primary_diagnosis_site = *osseous*'], + 'data_source': 'GDC'} cohort_name = 'Bone' -p = table_importer.get_ga4gh_phenopackets(Tsite, cohort_name=cohort_name) + +p = table_importer.get_ga4gh_phenopackets(Query, cohort_name=cohort_name) result_dir = os.path.abspath(os.path.join('phenopackets', cohort_name)) os.makedirs(result_dir, exist_ok=True) @@ -19,3 +22,4 @@ with open(file_path, 'w') as fh: json = MessageToJson(pp) fh.write(json) + diff --git a/scripts/run_cervix.py b/scripts/run_cervix.py index 45bec1b..8ac2cac 100644 --- a/scripts/run_cervix.py +++ b/scripts/run_cervix.py @@ -1,15 +1,17 @@ import os from google.protobuf.json_format import MessageToJson -from cdapython import Q from oncoexporter.cda import CdaTableImporter, configure_cda_table_importer -table_importer: CdaTableImporter = configure_cda_table_importer() +###### Input parameters ######## +table_importer: CdaTableImporter = configure_cda_table_importer(use_cache=True) -Tsite = Q('primary_diagnosis_site = "%uter%" OR primary_diagnosis_site = "%cerv%"', ) +Query = {'match_any': ['primary_diagnosis_site = *uter*', + 'primary_diagnosis_site = *cerv*'], + 'data_source': 'GDC'} cohort_name = 'Cervix' -p = table_importer.get_ga4gh_phenopackets(Tsite, cohort_name=cohort_name) +p = table_importer.get_ga4gh_phenopackets(Query, cohort_name=cohort_name) result_dir = os.path.abspath(os.path.join('phenopackets', cohort_name)) os.makedirs(result_dir, exist_ok=True) @@ -20,3 +22,4 @@ with open(file_path, 'w') as fh: json = MessageToJson(pp) fh.write(json) + diff --git a/src/oncoexporter/cda/_gdc.py b/src/oncoexporter/cda/_gdc.py index a5648d1..33b095c 100644 --- a/src/oncoexporter/cda/_gdc.py +++ b/src/oncoexporter/cda/_gdc.py @@ -43,6 +43,8 @@ def __init__( )) self._case_fields = ','.join(( "demographic.vital_status", + "diagnoses.days_to_last_follow_up", + "diagnoses.age_at_diagnosis", )) def _fetch_data_from_gdc(self, url: str, subject_id: str, fields: typing.List[str]=None) -> typing.Any: @@ -53,7 +55,7 @@ def _fetch_data_from_gdc(self, url: str, subject_id: str, fields: typing.List[st return data else: raise ValueError(f'Failed to fetch data from {url} due to {response.status_code}: {response.reason}') - + def _prepare_query_params(self, subject_id: str, fields: typing.List[str]=None) -> typing.Dict: filters = { "op": "in", @@ -80,7 +82,23 @@ def fetch_variants(self, subject_id: str) -> typing.Sequence[pp.VariantInterpret mutation_details.append(vi) return mutation_details - + + def _calculate_survival_time_when_alive(self, subject_id: str) -> typing.Optional[int]: + diagnosis_data = self._fetch_data_from_gdc(self._cases, subject_id, self._case_fields) + hits = diagnosis_data.get("data", {}).get("hits", []) + if len(hits) > 1: + self._logger.warning(f"Multiple diagnoses found for subject {subject_id}. Using the first one.") + if hits: + diagnoses = hits[0].get("diagnoses", []) + if diagnoses: + last_follow_up = diagnoses[0].get("days_to_last_follow_up") + # Via these docs: + # https://docs.gdc.cancer.gov/Data_Dictionary/viewer/#?view=table-definition-view&id=diagnosis&anchor=days_to_last_follow_up + # last_follow_up is days since diagnosis (not days since birth)^^ + if last_follow_up is None: + self._logger.info(f"Cannot calculate survival time for subject {subject_id} due to missing data") + return last_follow_up + def fetch_vital_status(self, subject_id: str) -> pp.VitalStatus: survival_data = self._fetch_data_from_gdc(self._survival_url, subject_id) vital_status_data = self._fetch_data_from_gdc(self._cases, subject_id, self._case_fields) @@ -100,14 +118,21 @@ def fetch_vital_status(self, subject_id: str) -> pp.VitalStatus: vital_status = demographic.get("vital_status") vital_status_obj = pp.VitalStatus() - vital_status_obj.survival_time_in_days = int(survival_time) if survival_time is not None else 0 if vital_status == "Dead": vital_status_obj.status = pp.VitalStatus.Status.DECEASED + if survival_time is not None: + vital_status_obj.survival_time_in_days = int(survival_time) elif vital_status == "Alive": vital_status_obj.status = pp.VitalStatus.Status.ALIVE + survival_time = self._calculate_survival_time_when_alive(subject_id) + if survival_time is not None: + if survival_time < 0: + self._logger.warning(f"Survival time for subject {subject_id} is negative: {survival_time}") + else: + vital_status_obj.survival_time_in_days = int(survival_time) else: vital_status_obj.status = pp.VitalStatus.Status.UNKNOWN_STATUS - + return vital_status_obj def _map_mutation_to_variant_interpretation(self, mutation) -> pp.VariantInterpretation: @@ -161,4 +186,4 @@ def _map_consequence_to_expression(csq) -> typing.Optional[pp.Expression]: ann = tx['annotation']['hgvsc'] expression.value = f'{tx_id}:{ann}' - return expression \ No newline at end of file + return expression