Skip to content

Commit

Permalink
Initial
Browse files Browse the repository at this point in the history
  • Loading branch information
alexpaynter committed May 23, 2023
0 parents commit 40a476c
Show file tree
Hide file tree
Showing 5 changed files with 373 additions and 0 deletions.
89 changes: 89 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
# History files
.Rhistory
.Rapp.history
.Rprofile

# Session Data files
.RData

# User-specific files
.Ruserdata

# Example code in package build process
*-Ex.R

# Output files from R CMD build
/*.tar.gz

# Output files from R CMD check
/*.Rcheck/

# RStudio files
.Rproj.user/

# produced vignettes
vignettes/*.html
vignettes/*.pdf

# OAuth2 token, see https://github.com/hadley/httr/releases/tag/v0.3
.httr-oauth

# knitr and R markdown default cache directories
*_cache/
/cache/

# Temporary files created by R markdown
*.utf8.md
*.knit.md

# R Environment Variables
.Renviron




# Copied the below items from https://github.com/Sage-Bionetworks/GENIE_BPC/blob/main/.gitignore

# ignore other types of data files
*.xlsx
*.xls
*.csv
*.dta
*.sas7bdat
*.sav
*.txt

# OAuth2 token, see https://github.com/hadley/httr/releases/tag/v0.3
.httr-oauth

# ignore non-code files
*.xlsx
*.csv
*.html
*.docx
*.pptx
*.rdata
*.pdf

# ignore misc files
~$*.docx
~$*.xlsx
~$*.pptx
~$*.rtf
~$*.tmp
Thumbs.db
.DS_Store
*.png

# Packrat
packrat/lib*/
packrat/src/

# renv
renv/library/
renv/python/
renv/staging/

# folders to ignore
data
data-raw
42 changes: 42 additions & 0 deletions R/format_ptlevel_naaccr_race_code_primary.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
# Turns the NAACCR primary race variable into a factor for easier processing.
#
# Arguments:
#   col         Vector of race labels (e.g. the `naaccr_race_code_primary`
#               column).  Values that are not one of the levels listed below
#               become NA, as with any call to factor().
#   drop_unused If TRUE (the default), levels that do not occur in `col` are
#               dropped from the returned factor.
#
# Returns: a factor whose levels follow the order listed below.
format_ptlevel_naaccr_race_code_primary <- function(col, drop_unused = TRUE) {
  # pulled straight from AACR data guide:
  # NOTE(review): the third level ends in a trailing space.  It is kept
  # exactly as written because removing it would change which raw values
  # match - confirm against the raw data whether the space is intentional.
  race_levels <- c(
    "White",
    "Black",
    "American Indian, Aleutian, or Eskimo ",
    "Chinese",
    "Japanese",
    "Filipino",
    "Hawaiian",
    "Korean",
    "Vietnamese",
    "Laotian",
    "Hmong",
    "Kampuchean (Cambodian)",
    "Thai",
    "Asian Indian or Pakistani NOS",
    "Asian Indian",
    "Pakistani",
    "Micronesian NOS",
    "Chamorro/Chamoru",
    "Guamanian NOS",
    "Polynesian NOS",
    "Tahitian",
    "Samoan",
    "Tongan",
    "Melanesian NOS",
    "Fiji Islander",
    "New Guinean",
    "Other Asian",
    "Pacific Islander NOS",
    "Other",
    "Unknown"
  )

  f <- factor(col, levels = race_levels)

  if (drop_unused) {
    f <- forcats::fct_drop(f)
  }

  f
}
194 changes: 194 additions & 0 deletions analysis/reports/bpc_bladder.Rmd
Original file line number Diff line number Diff line change
@@ -0,0 +1,194 @@
---
title: "GENIE BPC Bladder"
author: "Alex Paynter"
date: "May 2023"
output:
  html_document:
    toc: true
    toc_float: true
    theme: paper
editor_options:
  chunk_output_type: console
---

```{r setup, include=FALSE}
# Global chunk defaults: code, output, warnings and messages are all hidden
# unless an individual chunk overrides the option.  TRUE/FALSE are spelled out
# because the T/F shorthands are ordinary variables that can be reassigned.
knitr::opts_chunk$set(
  echo = FALSE, include = FALSE,
  warning = FALSE, message = FALSE,
  fig.width = 7, fig.height = 5
)
# NOTE(review): presumably the resolution (dots per inch) for saved figures;
# it is not used in the visible part of this report - confirm.
k_dpi <- 150
```


<style type="text/css">
.main-container {
max-width: 1000px !important;
margin: auto;
}
</style>



```{r}
library(cli)
library(readr)
library(here)
library(dplyr)
library(tidyr)
library(purrr)
library(rlang)
library(magrittr)
library(janitor)
library(stringr)
library(vctrs)
library(glue)
library(ggplot2)
library(ggrepel)
library(gt)
library(gtsummary)
library(sunburstR)
library(huxtable)
# Source everything in the "R" folder - just functions, no scripts.
# Only files ending in .R/.r are sourced, so a stray subfolder or non-script
# file inside R/ cannot break the knit.  (A leftover call that only built the
# list of paths and discarded the result has been removed.)
purrr::walk(
  .x = list.files(here("R"), pattern = "\\.[Rr]$", full.names = TRUE),
  .f = source
)
```


```{r, set_gtsummary_theme}
# Activate gtsummary's compact theme for the tables built after this point.
theme_gtsummary_compact()
```


```{r, warning = T}
# Read one CSV from the project's data-raw folder.
#
# Arguments:
#   p  File name (relative to data-raw), e.g. "patient_level_dataset.csv".
#
# Returns: the result of readr::read_csv(), with the column-type message
# suppressed.
read_wrap <- function(p) {
  read_csv(file = here("data-raw", p), show_col_types = FALSE)
}

# One object per raw dataset in data-raw.
dft_pt <- read_wrap("patient_level_dataset.csv")
dft_ca_ind <- read_wrap("cancer_level_dataset_index.csv")
dft_img <- read_wrap("imaging_level_dataset.csv")
dft_med_onc <- read_wrap("med_onc_note_level_dataset.csv")
dft_path <- read_wrap("pathology_report_level_dataset.csv")
dft_regimens <- read_wrap("regimen_cancer_level_dataset.csv")
# NOTE(review): the tumor marker read is disabled, but `dft_tm` is still
# referenced in the demographics chunk of this report; that code fails unless
# this line is re-enabled or the tumor marker section is dropped.
# dft_tm <- read_wrap("tm_level_dataset.csv")
dft_cpt <- read_wrap("cancer_panel_test_level_dataset.csv")
# A few sanity checks on the data:
if (any(duplicated(dft_pt$record_id))) {
  stop("Duplicated records in patient level dataset.")
}
# At the time we started working there was only one index cancer per
# record id. Double checking this as we go.
# The check is on record_id alone: counting (record_id, ca_seq) pairs would
# not flag a participant with two index cancers that have different ca_seq
# values, which is exactly the situation this guard is meant to catch.
if (any(duplicated(dft_ca_ind$record_id))) {
  stop("Some patients have >1 index cancer - adjust as needed.")
}
```

```{r, demo_data_manipulation, include = F}
# Patient-level baseline characteristics: recode race and ethnicity with the
# project formatters, copy sex at birth, and keep only the columns used in
# the demographics table.
# NOTE(review): format_ptlevel_naaccr_ethnicity_code() is not defined in the
# visible part of the repository - confirm it exists under R/.
dft_pt_baseline_sub <- dft_pt %>%
  mutate(
    `Race (primary)` = format_ptlevel_naaccr_race_code_primary(
      naaccr_race_code_primary
    ),
    `Ethnicity` = format_ptlevel_naaccr_ethnicity_code(
      naaccr_ethnicity_code
    ),
    # This must be an argument of mutate(), not of the ethnicity formatter;
    # otherwise the column is never created and the select() below fails.
    `Sex at birth` = naaccr_sex_code
  ) %>%
  select(
    record_id,
    Institution = institution,
    `Race (primary)`,
    `Ethnicity`,
    `Sex at birth`
  )
# Cancer-level baseline characteristics for the index cancer: recode selected
# columns with the project's format_* helpers, then keep and rename the
# columns used in the demographics table.
# NOTE(review): the format_* helpers called here are not visible in this part
# of the repository - confirm they are defined under R/.
# NOTE(review): ca_hist_adeno_squamous is formatted in the mutate() but is not
# kept by the select() (the Histology line is commented out).
dft_ca_ind_baseline_sub <- dft_ca_ind %>%
mutate(
ca_dx_how = format_ca_dx_how(ca_dx_how),
ca_hist_adeno_squamous = format_ca_hist_adeno_squamous(ca_hist_adeno_squamous),
stage_dx = format_stage_dx(stage_dx),
ca_path_n_stage = format_ca_path_n_stage(ca_path_n_stage)
) %>%
select(
record_id,
`Age at dx (years)` = dob_ca_dx_yrs,
`Stage at dx` = stage_dx,
`Source of dx` = ca_dx_how,
# Update May 5: Removed histology (unimportant)
# `Histology` = ca_hist_adeno_squamous, # at dx?
`Pathologic N Stage` = ca_path_n_stage # describes spread to lymph nodes
)
# On histology alternatives:
# - naaccr_histology_cd has mostly the same info according to https://seer.cancer.gov/icd-o-3/. Most of these levels are singletons.
# Baseline tumor marker values: one row per participant with one column per
# marker (PSA, Testosterone).
#
# The tumor marker dataset is only processed when `dft_tm` has been loaded.
# When it has not, `dft_tm_baseline` falls back to a zero-row table holding
# only `record_id`, so joining it by record_id adds no rows and no columns.
if (exists("dft_tm")) {
  dft_tm_baseline <- dft_tm %>%
    psa_test_unit_convert() %>%
    filter(!is.na(tm_num_result_conv)) %>%
    # filter down to results at or around diagnosis (180 days before to 30
    # days after).  Importantly there are psa/test values available before
    # diagnosis for many participants.
    filter(dx_tm_days <= 30 & dx_tm_days >= -180) %>%
    group_by(record_id, tm_type) %>%
    # Keep the single result closest in time to diagnosis for each
    # participant and marker.
    mutate(abs_dx_tm_days = abs(dx_tm_days)) %>%
    arrange(abs_dx_tm_days) %>%
    slice(1) %>%
    ungroup() %>%
    select(
      record_id,
      tm_type,
      tm_num_result_conv,
      tm_result_units_conv
    )

  # Visual check:
  # ggplot(dft_tm_baseline, aes(x = dx_tm_days)) + stat_ecdf()

  # kind of a hacky check: the resulting tibble will be 2 rows and 3 columns
  # if there are 2 analytes with one unit each.
  tm_unit_conv_check <- dft_tm_baseline %>%
    tabyl(tm_type, tm_result_units_conv) %>%
    dim

  # all.equal() returns a character description (not FALSE) on a mismatch,
  # so it has to be wrapped in isTRUE() before negating.
  if (!isTRUE(all.equal(tm_unit_conv_check, c(2, 3)))) {
    cli::cli_abort("Unit conversion problem.")
  }

  dft_tm_baseline %<>%
    select(-tm_result_units_conv) %>%
    tidyr::pivot_wider(
      names_from = "tm_type",
      values_from = "tm_num_result_conv"
    ) %>%
    rename(
      `PSA (ng/mL)` = PSA,
      `Testosterone (ng/dL)` = Testosterone
    )
} else {
  dft_tm_baseline <- dft_pt %>%
    select(record_id) %>%
    filter(FALSE)
}
# Demographics table input: patient-level, cancer-level and tumor marker
# baseline data combined by record_id, keeping participants that appear in
# any of the three.
dft_demo <- dft_pt_baseline_sub %>%
  full_join(dft_ca_ind_baseline_sub, by = "record_id") %>%
  full_join(dft_tm_baseline, by = "record_id")
```

35 changes: 35 additions & 0 deletions analysis/scripts/get_raw_data.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
# Description: Grabs the raw data from Synapse and stores it in the data-raw folder.
# Author: Alex Paynter

# The Synapse folder containing the clinical data files.
synid_clin_data <- "syn28495599" # "syn50612196"

library(cli)
library(synapser)
library(purrr)
library(dplyr)
library(here)
library(stringr)
library(magrittr)

# Authenticate with Synapse before any entities are requested.
synLogin()

# create directories for data and data-raw
# (showWarnings = FALSE keeps re-runs quiet when the folders already exist;
# FALSE is spelled out because the F shorthand can be reassigned.)
dir.create(here("data"), showWarnings = FALSE)
dir.create(here("data-raw"), showWarnings = FALSE)

# One row per child entity of the clinical data folder on Synapse.
df_clin_children <- purrr::map_dfr(
  as.list(synGetChildren(synid_clin_data)),
  as_tibble
)

# Warn when the folder holds anything whose name does not end in ".csv".
# The pattern is anchored at the end of the name with an escaped dot, and the
# folder id is pasted into the message because warning() does not interpolate
# "{...}" placeholders.
if (!all(stringr::str_detect(df_clin_children$name, "\\.csv$"))) {
  warning(
    "Non-CSV files unexpectedly contained in ", synid_clin_data, ".",
    call. = FALSE
  )
}

# Download a single Synapse entity (by id) into the data-raw folder.
syn_store_in_dataraw <- function(sid) {
  raw_dir <- here("data-raw")
  synGet(entity = sid, downloadLocation = raw_dir)
}

# Fetch every child of the clinical data folder.
purrr::walk(df_clin_children$id, syn_store_in_dataraw)
13 changes: 13 additions & 0 deletions genie-bpc-bladder-landscape-manu.Rproj
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
Version: 1.0

RestoreWorkspace: Default
SaveWorkspace: Default
AlwaysSaveHistory: Default

EnableCodeIndexing: Yes
UseSpacesForTab: Yes
NumSpacesForTab: 2
Encoding: UTF-8

RnwWeave: Sweave
LaTeX: pdfLaTeX

0 comments on commit 40a476c

Please sign in to comment.