Merge pull request #100 from mmcdermott/dev

Dev
mmcdermott · Nov 10, 2024 · 97d5f5a · 97d5f5a
2 parents 739f338 + 1443717
commit 97d5f5a
Show file tree

Hide file tree

Showing 16 changed files with 424 additions and 8 deletions.
diff --git a/MIMICIV_TUTORIAL/README.MD b/MIMICIV_TUTORIAL/README.MD
@@ -4,15 +4,20 @@ This is an example of how to extract a MEDS dataset from MIMIC-IV. All scripts i
 be run **not** from this directory but from the root directory of this entire repository (e.g., one directory
 up from this one).
 
+For this tutorial make sure you are cd'd into the root directory of the repository.
+
 ## Extract MIMIC-IV MEDS Data
 
-### Download pre-extracted data from gpc
+To get MEDS data, we can either download pre-extracted data and tasks from gcp (Option 1) or extract the data from scratch (Option 2).
+
+### (Option 1) Download pre-extracted data from gcp
 
 Install the [gcloud client](https://cloud.google.com/sdk/docs/install) and then run the following command to download the MEDS data from the gcp bucket:
 
 ```console
-export MIMICIV_MEDS_DIR=??? # set to the directory in which you want to store the raw MIMIC-IV data
-export OUTPUT_TABULARIZATION_DIR=??? # set to the output directory for the tabularized data
+export ROOT_DIR=??? # set to the directory in which you want to store all data
+export MIMICIV_MEDS_DIR=${ROOT_DIR}/meds/ # set to the directory in which you want to store the raw MIMIC-IV data
+export OUTPUT_TABULARIZATION_DIR=${ROOT_DIR}/meds_tab/ # set to the output directory for the tabularized data
 export OUTPUT_MODEL_DIR=${OUTPUT_TABULARIZATION_DIR}/results/ # set to the base results directory
 
 cd $MIMICIV_MEDS_DIR
@@ -29,7 +34,7 @@ pip install "meds-tab==0.0.5"
 
 Next we need to get some labels for our tasks. We will use the `long_los` and `icu_mortality` tasks as examples.
 
-### Download pre-extracted labels from gcp:
+### (Option 1) Download pre-extracted labels from gcp:
 
 ```console
 TASKS=("long_los" "icu_mortality")
@@ -43,13 +48,48 @@ do
 done
 ```
 
+### (Option 2) Extract MEDS data from scratch
+
+Follow the instructions in the [MEDS_transforms MIMICIV TUTORIAL](https://github.com/mmcdermott/MEDS_transforms/blob/main/MIMIC-IV_Example/README.md)
+
+### (Option 2) Use ACES to extract labels using a task config definition:
+
+We can manually extract the supervised task labels from our MEDS dataset using [ACES](https://github.com/justin13601/ACES/tree/main). First, install ACES:
+
+```console
+conda create -n aces python=3.12
+conda activate aces
+pip install es-aces==0.5.0
+pip install hydra-joblib-launcher
+```
+
+Second, run the following command to extract the supervised task labels:
+
+```console
+TASKS=(
+    "mortality/in_hospital/first_24h"
+    "mortality/in_icu/first_24h"
+    "mortality/post_hospital_discharge/1y"
+    "readmission/30d"
+)
+TASKS_DIR="$MIMICIV_MEDS_DIR/tasks/" # set to the directory in which you want to store all tasks
+
+for TASK_NAME in "${TASKS[@]}"; do
+    SINGLE_TASK_DIR="${MIMICIV_MEDS_DIR}/tasks/${TASK_NAME}"
+    mkdir -p $SINGLE_TASK_DIR # create a directory for the task
+    cp MIMICIV_TUTORIAL/tasks/${TASK_NAME}.yaml "${SINGLE_TASK_DIR}.yaml"
+    aces-cli --multirun hydra/launcher=joblib data=sharded data.standard=meds data.root="$MIMICIV_MEDS_DIR/data" "data.shard=$(expand_shards $MIMICIV_MEDS_DIR/data)" cohort_dir="$TASKS_DIR" cohort_name="$TASK_NAME"
+done
+```
+
 ## Run Tabularization and XGBoost Baseline
 
 ```console
 export N_PARALLEL_WORKERS=48 # Set number of workers
-export RESHARD_DIR=??? # set to directory to output reshareded meds data
+export RESHARD_DIR=${ROOT_DIR}/reshareded_meds/ # set to directory to output resharded meds data
+TASKS_STR=$(echo ${TASKS[@]} | tr ' ' ',')
 bash MIMICIV_TUTORIAL/tabularize_meds.sh "${MIMICIV_MEDS_DIR}" "$RESHARD_DIR" $OUTPUT_TABULARIZATION_DIR \
-    "long_los,icu_mortality" $TASKS_DIR $OUTPUT_MODEL_DIR $N_PARALLEL_WORKERS \
+    "${TASKS_STR}" $TASKS_DIR $OUTPUT_MODEL_DIR $N_PARALLEL_WORKERS \
     "tabularization.aggs=[static/present,code/count,value/count,value/sum,value/sum_sqd,value/min,value/max]" \
     "tabularization.window_sizes=[2h,12h,1d,7d,30d,365d,full]"
 ```
diff --git a/MIMICIV_TUTORIAL/tasks/README.md b/MIMICIV_TUTORIAL/tasks/README.md
@@ -0,0 +1,14 @@
+# Task Criteria Files
+
+This folder contains the task configuration files used to test MEDS-TAB on various tasks over MIMIC-IV.
+
+Each directory in this structure should contain a `README.md` file that describes that sub-collection of
+tasks.
+
+All task criteria files are [ACES](https://github.com/justin13601/ACES) task-configuration `yaml` files.
+Currently, all tasks should be interpreted as _binary classification_ tasks, where the output label (indicated
+in the configuration file) should be interpreted as a `False` or `0` label if the ACES derived task dataframe
+has a `label` column with a value of `0`, and a `True` or `1` label if the ACES derived task dataframe has a
+label column with any value greater than `0`.
+
+Task criteria files should each contain a free-text `description` key describing the task.
diff --git a/MIMICIV_TUTORIAL/tasks/los/in_hospital/first_48h.yaml b/MIMICIV_TUTORIAL/tasks/los/in_hospital/first_48h.yaml
@@ -0,0 +1,39 @@
+description: >-
+  This file specifies the base configuration for the prediction of a hospital los being greater than 3 days,
+  leveraging only the first 48 hours of data after admission, with a 24 hour gap between the input window
+  and the target window. Patients who die or are discharged in the gap window are excluded. Note that this
+  task is in-**hospital** los, not in-**ICU** los which is a different task.
+
+predicates:
+  hospital_admission:
+    code: { regex: "HOSPITAL_ADMISSION//.*" }
+  hospital_discharge:
+    code: { regex: "HOSPITAL_DISCHARGE//.*" }
+  death:
+    code: MEDS_DEATH
+  discharge_or_death:
+    expr: or(hospital_discharge, death)
+
+trigger: hospital_admission
+
+windows:
+  input:
+    start: NULL
+    end: trigger + 48h
+    start_inclusive: True
+    end_inclusive: True
+    index_timestamp: end
+  gap:
+    start: input.end
+    end: start + 24h
+    start_inclusive: False
+    end_inclusive: True
+    has:
+      hospital_admission: (None, 0)
+      discharge_or_death: (None, 0)
+  target:
+    start: trigger
+    end: start + 3d
+    start_inclusive: False
+    end_inclusive: True
+    label: discharge_or_death
diff --git a/MIMICIV_TUTORIAL/tasks/los/in_icu/first_48h.yaml b/MIMICIV_TUTORIAL/tasks/los/in_icu/first_48h.yaml
@@ -0,0 +1,39 @@
+description: >-
+  This file specifies the base configuration for the prediction of an ICU los being greater than 3 days,
+  leveraging only the first 48 hours of data after admission, with a 24 hour gap between the input window
+  and the target window. Patients who die or are discharged in the gap window are excluded. Note that this
+  task is in-**ICU** los, not in-**HOSPITAL** los which is a different task.
+
+predicates:
+  icu_admission:
+    code: { regex: "ICU_ADMISSION//.*" }
+  icu_discharge:
+    code: { regex: "ICU_DISCHARGE//.*" }
+  death:
+    code: MEDS_DEATH
+  discharge_or_death:
+    expr: or(icu_discharge, death)
+
+trigger: icu_admission
+
+windows:
+  input:
+    start: NULL
+    end: trigger + 48h
+    start_inclusive: True
+    end_inclusive: True
+    index_timestamp: end
+  gap:
+    start: input.end
+    end: start + 24h
+    start_inclusive: False
+    end_inclusive: True
+    has:
+      icu_admission: (None, 0)
+      discharge_or_death: (None, 0)
+  target:
+    start: trigger
+    end: start + 3d
+    start_inclusive: False
+    end_inclusive: True
+    label: discharge_or_death
diff --git a/MIMICIV_TUTORIAL/tasks/mortality/README.md b/MIMICIV_TUTORIAL/tasks/mortality/README.md
@@ -0,0 +1,23 @@
+# Mortality Prediction
+
+This folder contains tasks for predicting mortality in patients in a variety of clinical contexts. Mortality
+is a common prediction target because
+
+1. The "death" label is unambiguous, clearly important, and (often) easy to collect.
+2. Mortality is a common outcome in clinical research, and many studies have collected data that can be used
+    to predict mortality.
+3. Mortality prediction within specific time-frames can be used as a proxy for the need for greater clinical
+    attention on select patients, and mortality has accordingly been used as a training proxy for the
+    development of clinical risk scores in various settings. We do not advocate this use of mortality in all
+    contexts, but it is a common use-case for mortality prediction tasks.
+
+We break down mortality into a variety of categories, all of which are contained in the sub-folders of this task
+collection, and described therein.
+
+Missing:
+
+1. Mortality for patients with specific diseases:
+    \- \[ \] Sepsis:
+    - https://translational-medicine.biomedcentral.com/articles/10.1186/s12967-020-02620-5
+
+Some common references for the prediction of mortality in various settings include: TODO
diff --git a/MIMICIV_TUTORIAL/tasks/mortality/in_hospital/README.md b/MIMICIV_TUTORIAL/tasks/mortality/in_hospital/README.md
@@ -0,0 +1,7 @@
+# In-hospital Mortality Prediction
+
+Like in-ICU mortality prediction, this can be used as a signal of a patient's overall level of wellness or a
+proxy signal for the patient's need to be transferred to higher monitoring or care levels. It is often a good
+idea to separate this from post-discharge mortality prediction due to the vastly different intervention
+surfaces that could be applied in either an inpatient or outpatient setting as well as the differing likely
+clinical causes of risk of mortality in those settings.
diff --git a/MIMICIV_TUTORIAL/tasks/mortality/in_hospital/first_24h.yaml b/MIMICIV_TUTORIAL/tasks/mortality/in_hospital/first_24h.yaml
@@ -0,0 +1,43 @@
+description: >-
+  This file specifies the base configuration for the prediction of in hospital mortality, leveraging only the
+  first 24 hours of data after admission, with a 24 hour gap between the input window and the target window.
+  Patients who die or are discharged in the gap window are excluded. Note that this task is in-**hospital**
+  mortality, not in-**ICU** mortality or **30-day** mortality, which are different tasks.
+  Note that this task is predicting the outcome for a patient's entire hospital stay, not just the first 24
+  hours or the subsequent 24 hours after the trigger event. Imminent mortality (on various time-scales) are
+  different tasks.
+
+predicates:
+  hospital_admission:
+    code: { regex: "HOSPITAL_ADMISSION//.*" }
+  hospital_discharge:
+    code: { regex: "HOSPITAL_DISCHARGE//.*" }
+  death:
+    code: MEDS_DEATH
+  discharge_or_death:
+    expr: or(hospital_discharge, death)
+
+trigger: hospital_admission
+
+windows:
+  input:
+    start: NULL
+    end: trigger + 24h
+    start_inclusive: True
+    end_inclusive: True
+    index_timestamp: end
+  gap:
+    start: input.end
+    end: start + 24h
+    start_inclusive: False
+    end_inclusive: True
+    has:
+      hospital_admission: (None, 0)
+      hospital_discharge: (None, 0)
+      death: (None, 0)
+  target:
+    start: gap.end
+    end: start -> discharge_or_death
+    start_inclusive: False
+    end_inclusive: True
+    label: death
diff --git a/MIMICIV_TUTORIAL/tasks/mortality/in_hospital/first_48h.yaml b/MIMICIV_TUTORIAL/tasks/mortality/in_hospital/first_48h.yaml
@@ -0,0 +1,43 @@
+description: >-
+  This file specifies the base configuration for the prediction of in hospital mortality, leveraging only the
+  first 48 hours of data after admission, with a 24 hour gap between the input window and the target window.
+  Patients who die or are discharged in the gap window are excluded. Note that this task is in-**hospital**
+  mortality, not in-**ICU** mortality or **30-day** mortality, which are different tasks.
+  Note that this task is predicting the outcome for a patient's entire hospital stay, not just the first 48
+  hours or the subsequent 48 hours after the trigger event. Imminent mortality (on various time-scales) are
+  different tasks.
+
+predicates:
+  hospital_admission:
+    code: { regex: "HOSPITAL_ADMISSION//.*" }
+  hospital_discharge:
+    code: { regex: "HOSPITAL_DISCHARGE//.*" }
+  death:
+    code: MEDS_DEATH
+  discharge_or_death:
+    expr: or(hospital_discharge, death)
+
+trigger: hospital_admission
+
+windows:
+  input:
+    start: NULL
+    end: trigger + 48h
+    start_inclusive: True
+    end_inclusive: True
+    index_timestamp: end
+  gap:
+    start: input.end
+    end: start + 24h
+    start_inclusive: False
+    end_inclusive: True
+    has:
+      hospital_admission: (None, 0)
+      hospital_discharge: (None, 0)
+      death: (None, 0)
+  target:
+    start: gap.end
+    end: start -> discharge_or_death
+    start_inclusive: False
+    end_inclusive: True
+    label: death
diff --git a/MIMICIV_TUTORIAL/tasks/mortality/in_icu/README.md b/MIMICIV_TUTORIAL/tasks/mortality/in_icu/README.md
@@ -0,0 +1,6 @@
+# In-ICU Mortality Prediction
+
+In-ICU mortality prediction can be used as a proxy signal for the patient's overall level of wellness and thus
+effective monitoring needs. Likely due to the availability of this target, rather than the true possible
+intervention surface offered by this task, it has been a widely used prediction target in the ML for
+healthcare literature to date.
diff --git a/MIMICIV_TUTORIAL/tasks/mortality/in_icu/first_24h.yaml b/MIMICIV_TUTORIAL/tasks/mortality/in_icu/first_24h.yaml
@@ -0,0 +1,65 @@
+# metadata:
+#  name: 24h ICU Mortality
+#  version: ...
+#  author: ...
+#  tags:
+#    - mortality_prediction
+#    - critical_care
+#    - post_admission_prediction
+#  description: ...
+
+description: >-
+  This file specifies the base configuration for the prediction of in ICU mortality, leveraging only the first
+  24 hours of data after ICU admission, with a 24 hour gap between the input window and the target window.
+  Patients who die or are discharged from the ICU in the gap window are excluded. Patients who die in the same
+  hospital stay but in a subsequent ICU stay are **not** positive labels for this task.
+  Note that this task is in-**ICU** mortality, not in-**hospital** mortality or **30-day** mortality, which
+  are different tasks.
+  Note that this task is predicting the outcome for a patient's entire ICU admission, not just the first 24
+  hours or the subsequent 24 hours after the trigger event. Imminent mortality (on various time-scales) are
+  different tasks.
+
+predicates:
+  hospital_admission:
+    code: { regex: "^HOSPITAL_ADMISSION//.*" }
+  hospital_discharge:
+    code: { regex: "^HOSPITAL_DISCHARGE//.*" }
+
+  ED_registration:
+    code: { regex: "^ED_REGISTRATION//.*" }
+  ED_discharge:
+    code: { regex: "^ED_OUT//.*" }
+
+  icu_admission:
+    code: { regex: "^ICU_ADMISSION//.*" }
+  icu_discharge:
+    code: { regex: "^ICU_DISCHARGE//.*" }
+
+  death:
+    code: MEDS_DEATH
+  discharge_or_death:
+    expr: or(icu_discharge, death, hospital_discharge)
+
+trigger: icu_admission
+
+windows:
+  input:
+    start: null
+    end: trigger + 24h
+    start_inclusive: True
+    end_inclusive: True
+    index_timestamp: end
+  gap:
+    start: trigger
+    end: start + 48h
+    start_inclusive: False
+    end_inclusive: True
+    has:
+      icu_admission: (None, 0)
+      discharge_or_death: (None, 0)
+  target:
+    start: gap.end
+    end: start -> discharge_or_death
+    start_inclusive: False
+    end_inclusive: True
+    label: death
diff --git a/MIMICIV_TUTORIAL/tasks/mortality/post_hospital_discharge/1y.yaml b/MIMICIV_TUTORIAL/tasks/mortality/post_hospital_discharge/1y.yaml
@@ -0,0 +1,24 @@
+predicates:
+  hospital_discharge:
+    code: { regex: "HOSPITAL_DISCHARGE//.*" }
+
+  death:
+    code: MEDS_DEATH
+
+trigger: hospital_discharge
+
+windows:
+  input:
+    start: NULL
+    end: trigger
+    start_inclusive: True
+    end_inclusive: True
+    index_timestamp: end
+    has:
+      death: (None, 0) # They must be alive at the time of hospital discharge
+  target:
+    start: trigger
+    end: start + 365d
+    start_inclusive: False
+    end_inclusive: True
+    label: death