Survival time calculation for subjects that are still "Alive" #88

Draft: wants to merge 3 commits into base: develop
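In short, this pull request changes how survival time is written into the GA4GH VitalStatus block: deceased subjects keep the value from the GDC survival endpoint, while subjects who are still "Alive" now fall back to days_to_last_follow_up (days since diagnosis) as a lower bound. A minimal sketch of that decision rule is shown below; it uses a hypothetical standalone helper rather than the actual method in src/oncoexporter/cda/_gdc.py, which writes into a pp.VitalStatus message instead of returning a value.

from typing import Optional

def pick_survival_time(vital_status: str,
                       survival_endpoint_days: Optional[float],
                       days_to_last_follow_up: Optional[float]) -> Optional[int]:
    """Hypothetical helper mirroring the rule this PR adds to fetch_vital_status."""
    if vital_status == "Dead":
        # Deceased: use the GDC survival endpoint value, if any was reported.
        return int(survival_endpoint_days) if survival_endpoint_days is not None else None
    if vital_status == "Alive":
        # Alive: days from diagnosis to last follow-up is a lower bound on survival.
        if days_to_last_follow_up is not None and days_to_last_follow_up >= 0:
            return int(days_to_last_follow_up)
    # Unknown status, missing data, or a negative follow-up value: leave unset.
    return None

Compared with the previous behaviour, a missing value no longer collapses to 0; the survival_time_in_days field is simply left unset.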
12 changes: 8 additions & 4 deletions scripts/run_bone.py
@@ -1,14 +1,17 @@
 import os
 from google.protobuf.json_format import MessageToJson
 
-from cdapython import Q
 from oncoexporter.cda import CdaTableImporter, configure_cda_table_importer
 
-table_importer: CdaTableImporter = configure_cda_table_importer()
+###### Input parameters ########
+table_importer: CdaTableImporter = configure_cda_table_importer(use_cache=True)
 
-Tsite = Q('primary_diagnosis_site = "%bone%" OR primary_diagnosis_site = "%osseous%"', )
+Query = {'match_any': ['primary_diagnosis_site = *bone*',
+                       'primary_diagnosis_site = *osseous*'],
+         'data_source': 'GDC'}
 cohort_name = 'Bone'
-p = table_importer.get_ga4gh_phenopackets(Tsite, cohort_name=cohort_name)
+
+p = table_importer.get_ga4gh_phenopackets(Query, cohort_name=cohort_name)
 
 result_dir = os.path.abspath(os.path.join('phenopackets', cohort_name))
 os.makedirs(result_dir, exist_ok=True)
@@ -19,3 +22,4 @@
 with open(file_path, 'w') as fh:
     json = MessageToJson(pp)
     fh.write(json)
+
11 changes: 7 additions & 4 deletions scripts/run_cervix.py
@@ -1,15 +1,17 @@
 import os
 from google.protobuf.json_format import MessageToJson
 
-from cdapython import Q
 from oncoexporter.cda import CdaTableImporter, configure_cda_table_importer
 
-table_importer: CdaTableImporter = configure_cda_table_importer()
+###### Input parameters ########
+table_importer: CdaTableImporter = configure_cda_table_importer(use_cache=True)
 
-Tsite = Q('primary_diagnosis_site = "%uter%" OR primary_diagnosis_site = "%cerv%"', )
+Query = {'match_any': ['primary_diagnosis_site = *uter*',
+                       'primary_diagnosis_site = *cerv*'],
+         'data_source': 'GDC'}
 cohort_name = 'Cervix'
 
-p = table_importer.get_ga4gh_phenopackets(Tsite, cohort_name=cohort_name)
+p = table_importer.get_ga4gh_phenopackets(Query, cohort_name=cohort_name)
 
 result_dir = os.path.abspath(os.path.join('phenopackets', cohort_name))
 os.makedirs(result_dir, exist_ok=True)
@@ -20,3 +22,4 @@
 with open(file_path, 'w') as fh:
     json = MessageToJson(pp)
     fh.write(json)
+
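The two cohort scripts above differ only in the cohort name and the primary_diagnosis_site patterns, so the shared flow could be factored into one helper. The sketch below is illustrative only: it assumes that get_ga4gh_phenopackets returns an iterable of phenopacket messages and that each message's id field is usable as a file name, neither of which is shown in this diff.

import os
from google.protobuf.json_format import MessageToJson
from oncoexporter.cda import CdaTableImporter, configure_cda_table_importer

def export_cohort(cohort_name: str, site_patterns: list) -> None:
    """Run one cohort export following the pattern used in run_bone.py / run_cervix.py."""
    table_importer: CdaTableImporter = configure_cda_table_importer(use_cache=True)
    query = {'match_any': [f'primary_diagnosis_site = {pattern}' for pattern in site_patterns],
             'data_source': 'GDC'}
    phenopackets = table_importer.get_ga4gh_phenopackets(query, cohort_name=cohort_name)

    result_dir = os.path.abspath(os.path.join('phenopackets', cohort_name))
    os.makedirs(result_dir, exist_ok=True)
    for pp in phenopackets:
        file_path = os.path.join(result_dir, f'{pp.id}.json')  # assumed naming scheme
        with open(file_path, 'w') as fh:
            fh.write(MessageToJson(pp))

export_cohort('Bone', ['*bone*', '*osseous*'])
export_cohort('Cervix', ['*uter*', '*cerv*'])

With a helper like this, adding a new cohort becomes a one-line call.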

35 changes: 30 additions & 5 deletions src/oncoexporter/cda/_gdc.py
@@ -43,6 +43,8 @@ def __init__(
         ))
         self._case_fields = ','.join((
             "demographic.vital_status",
+            "diagnoses.days_to_last_follow_up",
+            "diagnoses.age_at_diagnosis",
         ))
 
     def _fetch_data_from_gdc(self, url: str, subject_id: str, fields: typing.List[str]=None) -> typing.Any:
@@ -53,7 +55,7 @@ def _fetch_data_from_gdc(self, url: str, subject_id: str, fields: typing.List[st
             return data
         else:
             raise ValueError(f'Failed to fetch data from {url} due to {response.status_code}: {response.reason}')
-
+
     def _prepare_query_params(self, subject_id: str, fields: typing.List[str]=None) -> typing.Dict:
         filters = {
             "op": "in",
@@ -80,7 +82,23 @@ def fetch_variants(self, subject_id: str) -> typing.Sequence[pp.VariantInterpret
             mutation_details.append(vi)
 
         return mutation_details
 
+    def _calculate_survival_time_when_alive(self, subject_id: str) -> typing.Optional[int]:
+        diagnosis_data = self._fetch_data_from_gdc(self._cases, subject_id, self._case_fields)
+        hits = diagnosis_data.get("data", {}).get("hits", [])
+        if len(hits) > 1:
+            self._logger.warning(f"Multiple diagnoses found for subject {subject_id}. Using the first one.")
+        if hits:
+            diagnoses = hits[0].get("diagnoses", [])
+            if diagnoses:
+                last_follow_up = diagnoses[0].get("days_to_last_follow_up")
+                # Per the GDC data dictionary, days_to_last_follow_up counts days
+                # since diagnosis, not days since birth:
+                # https://docs.gdc.cancer.gov/Data_Dictionary/viewer/#?view=table-definition-view&id=diagnosis&anchor=days_to_last_follow_up
+                if last_follow_up is None:
+                    self._logger.info(f"Cannot calculate survival time for subject {subject_id} due to missing data")
+                return last_follow_up
+
     def fetch_vital_status(self, subject_id: str) -> pp.VitalStatus:
         survival_data = self._fetch_data_from_gdc(self._survival_url, subject_id)
         vital_status_data = self._fetch_data_from_gdc(self._cases, subject_id, self._case_fields)
@@ -100,14 +118,21 @@ def fetch_vital_status(self, subject_id: str) -> pp.VitalStatus:
         vital_status = demographic.get("vital_status")
 
         vital_status_obj = pp.VitalStatus()
-        vital_status_obj.survival_time_in_days = int(survival_time) if survival_time is not None else 0
         if vital_status == "Dead":
             vital_status_obj.status = pp.VitalStatus.Status.DECEASED
+            if survival_time is not None:
+                vital_status_obj.survival_time_in_days = int(survival_time)
         elif vital_status == "Alive":
             vital_status_obj.status = pp.VitalStatus.Status.ALIVE
+            survival_time = self._calculate_survival_time_when_alive(subject_id)
+            if survival_time is not None:
+                if survival_time < 0:
+                    self._logger.warning(f"Survival time for subject {subject_id} is negative: {survival_time}")
+                else:
+                    vital_status_obj.survival_time_in_days = int(survival_time)
         else:
             vital_status_obj.status = pp.VitalStatus.Status.UNKNOWN_STATUS
 
         return vital_status_obj
 
     def _map_mutation_to_variant_interpretation(self, mutation) -> pp.VariantInterpretation:
@@ -161,4 +186,4 @@ def _map_consequence_to_expression(csq) -> typing.Optional[pp.Expression]:
             ann = tx['annotation']['hgvsc']
             expression.value = f'{tx_id}:{ann}'
 
-    return expression
\ No newline at end of file
+    return expression
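For reference, the parsing in _calculate_survival_time_when_alive implies a GDC /cases response shaped roughly like the sketch below. The numeric values are made up for illustration; only the data → hits → diagnoses nesting and the days_to_last_follow_up key are taken from the code above, and the demographic block mirrors the demographic.vital_status field requested in _case_fields.

# Illustrative response shape (not real GDC data):
sample_response = {
    "data": {
        "hits": [
            {
                "demographic": {"vital_status": "Alive"},
                "diagnoses": [
                    {"days_to_last_follow_up": 455.0, "age_at_diagnosis": 21550}
                ]
            }
        ]
    }
}

# Same extraction steps as the new method:
hits = sample_response.get("data", {}).get("hits", [])
diagnoses = hits[0].get("diagnoses", []) if hits else []
last_follow_up = diagnoses[0].get("days_to_last_follow_up") if diagnoses else None
print(last_follow_up)  # 455.0 -> would be stored as int(455) in VitalStatus.survival_time_in_days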