Initial

Sage-Bionetworks · May 23, 2023 · 40a476c · 40a476c
commit 40a476c
Show file tree

Hide file tree

Showing 5 changed files with 373 additions and 0 deletions.
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,89 @@
+# History files
+.Rhistory
+.Rapp.history
+.Rprofile
+
+# Session Data files
+.RData
+
+# User-specific files
+.Ruserdata
+
+# Example code in package build process
+*-Ex.R
+
+# Output files from R CMD build
+/*.tar.gz
+
+# Output files from R CMD check
+/*.Rcheck/
+
+# RStudio files
+.Rproj.user/
+
+# produced vignettes
+vignettes/*.html
+vignettes/*.pdf
+
+# OAuth2 token, see https://github.com/hadley/httr/releases/tag/v0.3
+.httr-oauth
+
+# knitr and R markdown default cache directories
+*_cache/
+/cache/
+
+# Temporary files created by R markdown
+*.utf8.md
+*.knit.md
+
+# R Environment Variables
+.Renviron
+
+
+
+
+# Copied the below items from https://github.com/Sage-Bionetworks/GENIE_BPC/blob/main/.gitignore
+
+# ignore other types of data files
+*.xlsx
+*.xls
+*.csv
+*.dta
+*.sas7bdat
+*.sav
+*.txt
+
+# OAuth2 token, see https://github.com/hadley/httr/releases/tag/v0.3
+.httr-oauth
+
+# ignore non-code files
+*.xlsx
+*.csv
+*.html
+*.docx
+*.pptx
+*.rdata
+*.pdf
+
+# ignore misc files
+~$*.docx
+~$*.xlsx
+~$*.pptx
+~$*.rtf
+~$*.tmp
+Thumbs.db
+.DS_Store
+*.png
+
+# Packrat
+packrat/lib*/
+packrat/src/
+
+# renv
+renv/library/
+renv/python/
+renv/staging/
+
+# folders to ignore
+data
+data-raw
diff --git a/R/format_ptlevel_naaccr_race_code_primary.R b/R/format_ptlevel_naaccr_race_code_primary.R
@@ -0,0 +1,42 @@
+# Turns the NAACCR primary race label into a factor for easier processing.
+#
+# Args:
+#   col: Vector of race labels.  It is coerced to character and
+#     leading/trailing whitespace is trimmed before matching, so padded and
+#     unpadded labels are treated alike.  Labels not listed below become NA.
+#   drop_unused: If TRUE (the default), levels that do not occur in `col`
+#     are removed with forcats::fct_drop().
+#
+# Returns:
+#   A factor with the same length as `col`, with levels in the order below.
+format_ptlevel_naaccr_race_code_primary <- function(col, drop_unused = TRUE) {
+  # Level order pulled straight from the AACR data guide.  The pasted label
+  # for "American Indian, Aleutian, or Eskimo" carried two trailing spaces;
+  # it is stored trimmed here (and the input is trimmed below) so that
+  # whitespace-trimmed data does not silently turn into NA.
+  race_levels <- c(
+    "White",
+    "Black",
+    "American Indian, Aleutian, or Eskimo",
+    "Chinese",
+    "Japanese",
+    "Filipino",
+    "Hawaiian",
+    "Korean",
+    "Vietnamese",
+    "Laotian",
+    "Hmong",
+    "Kampuchean (Cambodian)",
+    "Thai",
+    "Asian Indian or Pakistani NOS",
+    "Asian Indian",
+    "Pakistani",
+    "Micronesian NOS",
+    "Chamorro/Chamoru",
+    "Guamanian NOS",
+    "Polynesian NOS",
+    "Tahitian",
+    "Samoan",
+    "Tongan",
+    "Melanesian NOS",
+    "Fiji Islander",
+    "New Guinean",
+    "Other Asian",
+    "Pacific Islander NOS",
+    "Other",
+    "Unknown"
+  )
+
+  f <- factor(trimws(as.character(col)), levels = race_levels)
+
+  if (drop_unused) {
+    f <- forcats::fct_drop(f)
+  }
+
+  f
+}
diff --git a/analysis/reports/bpc_bladder.Rmd b/analysis/reports/bpc_bladder.Rmd
@@ -0,0 +1,194 @@
+---
+title: "GENIE BPC Bladder"
+author: "Alex Paynter"
+date: "May 2023"
+output: 
+  html_document:
+    toc: true
+    toc_float: true
+    theme: paper
+editor_options: 
+  chunk_output_type: console
+---
+
+```{r setup, include=FALSE}
+# Chunk defaults for the whole report: code and output are hidden unless a
+# chunk opts back in, warnings and messages are silenced, and figures default
+# to a 7 x 5 size.  TRUE/FALSE are spelled out because T/F can be reassigned.
+knitr::opts_chunk$set(
+  echo = FALSE, include = FALSE,
+  warning = FALSE, message = FALSE,
+  fig.width = 7, fig.height = 5
+)
+# NOTE(review): k_dpi is not referenced in the visible part of this report -
+#   presumably the resolution intended for saved figures; confirm.
+k_dpi <- 150
+```
+
+
+<style type="text/css">
+.main-container {
+  max-width: 1000px !important;
+  margin: auto;
+}
+</style>
+
+
+
+```{r}
+library(cli)
+library(readr)
+library(here)
+library(dplyr)
+library(tidyr)
+library(purrr)
+library(rlang)
+library(magrittr)
+library(janitor)
+library(stringr)
+library(vctrs)
+library(glue)
+
+library(ggplot2)
+library(ggrepel)
+
+library(gt)
+library(gtsummary)
+library(sunburstR)
+library(huxtable)
+
+
+# Source everything in the "R" folder - just functions, no scripts.
+# Only files ending in .R/.r are sourced, so a stray non-code file in that
+# folder cannot break the report.  (A leftover call that only listed these
+# paths and discarded the result was removed.)
+purrr::walk(
+  .x = list.files(here("R"), pattern = "\\.[Rr]$", full.names = TRUE),
+  .f = source
+)
+```
+
+
+```{r, set_gtsummary_theme}
+# Switch gtsummary to its compact theme; called once here, before any tables
+# are built in this report.
+theme_gtsummary_compact()
+```
+
+
+```{r, warning = T}
+# Reads one CSV file from the project's "data-raw" folder.
+#   p: file name (or path relative to "data-raw").
+# The column type printout from read_csv() is switched off.
+read_wrap <- function(p) {
+  csv_path <- here("data-raw", p)
+  read_csv(file = csv_path, show_col_types = FALSE)
+}
+
+# Load each dataset from "data-raw" into its own data frame.
+dft_pt <- read_wrap("patient_level_dataset.csv")
+dft_ca_ind <- read_wrap("cancer_level_dataset_index.csv")
+dft_img <- read_wrap("imaging_level_dataset.csv")
+dft_med_onc <- read_wrap("med_onc_note_level_dataset.csv")
+dft_path <- read_wrap("pathology_report_level_dataset.csv")
+dft_regimens <- read_wrap("regimen_cancer_level_dataset.csv")
+# NOTE(review): the tumor marker read below is commented out, yet dft_tm is
+#   used further down in this report - confirm whether it should be restored
+#   or the downstream tumor marker code removed.
+# dft_tm <- read_wrap("tm_level_dataset.csv")
+dft_cpt <- read_wrap("cancer_panel_test_level_dataset.csv")
+
+# A few sanity checks on the data:
+# Each record_id may appear only once in the patient level dataset.
+if (anyDuplicated(dft_pt$record_id) > 0) {
+  stop("Duplicated records in patient level dataset.")
+}
+# At the time we started working there was only one index cancer per
+#  record id.  Double checking this as we go.  The check looks at record_id
+#  alone: counting by record_id AND ca_seq puts every distinct cancer in its
+#  own group, so a patient with two index cancers would never be flagged.
+if (anyDuplicated(dft_ca_ind$record_id) > 0) {
+  stop("Some patients have >1 index cancer - adjust as needed.")
+}
+```
+
+```{r, demo_data_manipulation, include = F}
+# Patient-level baseline characteristics with display-ready column names.
+# Race and ethnicity are recoded with the format_ptlevel_*() helpers; sex is
+# carried over as-is.
+dft_pt_baseline_sub <- dft_pt %>%
+  mutate(
+    `Race (primary)` = format_ptlevel_naaccr_race_code_primary(
+      naaccr_race_code_primary
+    ),
+    `Ethnicity` = format_ptlevel_naaccr_ethnicity_code(
+      naaccr_ethnicity_code
+    ),
+    # This assignment must be an argument of mutate().  It previously sat
+    # inside the ethnicity call, so the column was never created and the
+    # select() below could not find it.
+    `Sex at birth` = naaccr_sex_code
+  ) %>%
+  select(record_id, 
+         Institution = institution,
+         `Race (primary)`,
+         `Ethnicity`,
+         `Sex at birth`)
+
+
+# Index-cancer baseline characteristics with display-ready column names.
+dft_ca_ind_baseline_sub <- local({
+  # Step 1: recode the categorical cancer variables with the format_*() helpers.
+  ca_formatted <- mutate(
+    dft_ca_ind,
+    ca_dx_how = format_ca_dx_how(ca_dx_how),
+    ca_hist_adeno_squamous = format_ca_hist_adeno_squamous(ca_hist_adeno_squamous),
+    stage_dx = format_stage_dx(stage_dx),
+    ca_path_n_stage = format_ca_path_n_stage(ca_path_n_stage)
+  )
+
+  # Step 2: keep and rename only the columns that are reported.
+  select(
+    ca_formatted,
+    record_id,
+    `Age at dx (years)` = dob_ca_dx_yrs,
+    `Stage at dx` = stage_dx,
+    `Source of dx` = ca_dx_how,
+    # Update May 5:  Removed histology (unimportant)
+    # `Histology` = ca_hist_adeno_squamous, # at dx?
+    `Pathologic N Stage` = ca_path_n_stage # describes spread to lymph nodes
+  )
+})
+
+# On histology alternatives:
+# - naaccr_histology_cd has mostly the same info according to https://seer.cancer.gov/icd-o-3/.  Most of these levels are singletons.
+
+
+
+# Baseline tumor marker values: for each record_id and tm_type, keep a single
+# converted result measured near diagnosis.
+# NOTE(review): dft_tm is not created in the visible part of this report (its
+#   read_wrap("tm_level_dataset.csv") call is commented out in the loading
+#   chunk) - confirm where it should come from.
+# NOTE(review): psa_test_unit_convert() is not defined in the visible code;
+#   presumably it adds tm_num_result_conv and tm_result_units_conv - confirm.
+dft_tm_baseline <- dft_tm %>%
+  psa_test_unit_convert() %>%
+  filter(!is.na(tm_num_result_conv)) %>%
+  # Keep results with dx_tm_days between -180 and 30, i.e. a window around
+  #   diagnosis (assuming dx_tm_days counts days from diagnosis to the test -
+  #   TODO confirm).
+  # Importantly there are psa/test values available before diagnosis for
+  #   many participants.
+  filter(dx_tm_days <= 30 & dx_tm_days >= -180) %>% 
+  group_by(record_id, tm_type) %>%
+  # Within each group keep the row with the smallest absolute dx_tm_days;
+  #   values before and after diagnosis are ranked alike.
+  mutate(abs_dx_tm_days = abs(dx_tm_days)) %>%
+  arrange(abs_dx_tm_days) %>%
+  slice(1) %>%
+  ungroup() %>% 
+  select(record_id, 
+         tm_type, 
+         tm_num_result_conv, 
+         tm_result_units_conv) 
+
+# Visual check:
+# ggplot(dft_tm_baseline, aes(x = dx_tm_days)) + stat_ecdf()
+
+
+# kind of a hacky check:  the resulting tibble will be 2 rows and 3 columns if there are 2 analytes with one unit each.
+tm_unit_conv_check <- dft_tm_baseline %>% 
+  tabyl(tm_type, tm_result_units_conv) %>% 
+  dim
+# all.equal() returns a character description (not FALSE) when the values
+# differ, so it has to be wrapped in isTRUE() before negating.  The error is
+# raised with cli::cli_abort(); cli exports no abort() function.
+if (!isTRUE(all.equal(tm_unit_conv_check, c(2, 3)))) {
+  cli::cli_abort("Unit conversion problem.")
+}
+
+# Drop the unit column, spread the tumor markers into one column per tm_type,
+# and give the marker columns display names that carry their units.
+dft_tm_baseline <- dft_tm_baseline %>%
+  select(-tm_result_units_conv) %>%
+  tidyr::pivot_wider(names_from = "tm_type", values_from = "tm_num_result_conv") %>%
+  rename(`PSA (ng/mL)` = PSA, `Testosterone (ng/dL)` = Testosterone)
+
+
+
+
+
+
+
+
+
+
+# Combine the patient, index-cancer and tumor marker baseline tables on
+# record_id.  Full joins keep every record_id found in any of the three.
+dft_demo <- dft_pt_baseline_sub %>%
+  full_join(dft_ca_ind_baseline_sub, by = "record_id") %>%
+  full_join(dft_tm_baseline, by = "record_id")
+
+```
+
diff --git a/analysis/scripts/get_raw_data.R b/analysis/scripts/get_raw_data.R
@@ -0,0 +1,35 @@
+# Description: Grabs the raw data from Synapse and stores it in the data-raw folder.
+# Author: Alex Paynter
+
+# The Synapse folder containing the clinical data files.
+# NOTE(review): the ID in the trailing comment is an alternative that is not
+#   currently used - presumably another folder or release; confirm before
+#   switching.
+synid_clin_data <- "syn28495599"  # "syn50612196"
+
+library(cli)
+library(synapser)
+library(purrr)
+library(dplyr)
+library(here)
+library(stringr)
+library(magrittr)
+
+# Log in to Synapse before any entities are requested.
+synLogin()
+
+# Make sure the "data" and "data-raw" folders exist at the project root;
+# warnings (e.g. on reruns when they already exist) are suppressed.
+purrr::walk(
+  .x = c("data", "data-raw"),
+  .f = function(folder) dir.create(here(folder), showWarnings = FALSE)
+)
+
+# List the children of the Synapse folder and bind them into one tibble
+# (one row per child entity).
+df_clin_children <- synGetChildren(synid_clin_data) %>%
+  as.list %>%
+  purrr::map_dfr(.x = ., .f = as_tibble)
+
+# Warn if any child's name does not end in ".csv".  The earlier pattern
+# (".csv^") could never match and the test was not negated, so the warning
+# never fired; the folder ID is now pasted into the message instead of being
+# left as a literal "{...}" placeholder.
+if (any(!stringr::str_detect(df_clin_children$name, "\\.csv$"), na.rm = TRUE)) {
+  warning(
+    "Non-CSV files unexpectedly contained in ", synid_clin_data, ".",
+    call. = FALSE
+  )
+}
+
+# Downloads one Synapse entity into the project's "data-raw" folder.
+#   sid: Synapse ID of the entity to fetch.
+# Returns whatever synGet() returns.
+syn_store_in_dataraw <- function(sid) {
+  dest_dir <- here("data-raw")
+  synGet(entity = sid, downloadLocation = dest_dir)
+}
+
+# Fetch every child of the clinical data folder; only the download side
+# effect matters, so the results are discarded.
+invisible(lapply(df_clin_children$id, syn_store_in_dataraw))
diff --git a/genie-bpc-bladder-landscape-manu.Rproj b/genie-bpc-bladder-landscape-manu.Rproj
@@ -0,0 +1,13 @@
+Version: 1.0
+
+RestoreWorkspace: Default
+SaveWorkspace: Default
+AlwaysSaveHistory: Default
+
+EnableCodeIndexing: Yes
+UseSpacesForTab: Yes
+NumSpacesForTab: 2
+Encoding: UTF-8
+
+RnwWeave: Sweave
+LaTeX: pdfLaTeX