From 1a5b8bdaf577fa34a503a2fa0a806e45b0839606 Mon Sep 17 00:00:00 2001 From: Damon Bayer Date: Mon, 6 Jan 2025 20:42:46 -0600 Subject: [PATCH] Create combined epiweekly data (#274) --- pipelines/generate_epiweekly.R | 93 ++++++++++++++++++++++++++++++++-- pipelines/prep_data.py | 3 +- pipelines/prep_eval_data.py | 2 +- 3 files changed, 92 insertions(+), 6 deletions(-) diff --git a/pipelines/generate_epiweekly.R b/pipelines/generate_epiweekly.R index 5953b6d3..66c788f4 100644 --- a/pipelines/generate_epiweekly.R +++ b/pipelines/generate_epiweekly.R @@ -4,7 +4,8 @@ script_packages <- c( "dplyr", "forecasttools", "fs", - "readr" + "readr", + "lubridate" ) ## load in packages without messages @@ -81,9 +82,95 @@ convert_daily_to_epiweekly <- function( write_delim(epiweekly_data, output_file, delim = delim) } +convert_comb_daily_to_ewkly <- function( + model_run_dir, dataname = "combined_training_data.tsv", + strict = FALSE, day_of_week = 7) { + message(glue::glue("Generating epi-weekly data {model_run_dir}...")) + + data_basename <- path_ext_remove(dataname) + data_path <- path(model_run_dir, "data", dataname) + + raw_data <- read_tsv( + data_path, + col_types = cols( + date = col_date(), + geo_value = col_character(), + disease = col_character(), + data_type = col_character(), + value_type = col_character(), + value = col_double() + ) + ) + + daily_ed_visit_data <- raw_data |> + filter(value_type == "ed_visits") + + ewkly_hospital_admission_data <- raw_data |> + filter(value_type == "hospital_admissions") |> + mutate( + epiweek = epiweek(date), + epiyear = epiyear(date) + ) + + # Verify hospital admissions dates are epiweekly + invalid_dates <- + ewkly_hospital_admission_data |> + mutate(implied_date = epiweek_to_date(epiweek, + epiyear, + day_of_week = day_of_week + )) |> + filter(date != implied_date) |> + pull(date) + + if (length(invalid_dates) > 0) { + stop(glue::glue( + "Invalid dates found in hospital admissions data: ", + "{paste0(invalid_dates, collapse = ', ')}" + )) + } + + epiweekly_ed_visit_data <- daily_ed_visit_data |> + forecasttools::daily_to_epiweekly( + value_col = "value", + weekly_value_name = "value", + id_cols = c("disease", "geo_value", "data_type", "value_type"), + strict = strict + ) |> + mutate(date = epiweek_to_date(epiweek, + epiyear, + day_of_week = day_of_week + )) + + epiweekly_data <- bind_rows( + epiweekly_ed_visit_data, + ewkly_hospital_admission_data + ) |> + arrange(date, value_type, disease) |> + select(date, everything()) + + output_file <- path( + model_run_dir, "data", + glue::glue("epiweekly_{data_basename}"), + ext = "tsv" + ) + + write_tsv(epiweekly_data, output_file) +} + + main <- function(model_run_dir) { - convert_daily_to_epiweekly(model_run_dir, dataname = "data.tsv") - convert_daily_to_epiweekly(model_run_dir, dataname = "eval_data.tsv") + convert_daily_to_epiweekly(model_run_dir, + dataname = "data.tsv" + ) + convert_daily_to_epiweekly(model_run_dir, + dataname = "eval_data.tsv" + ) + convert_comb_daily_to_ewkly(model_run_dir, + dataname = "combined_training_data.tsv" + ) + convert_comb_daily_to_ewkly(model_run_dir, + dataname = "combined_eval_data.tsv" + ) } # Create a parser diff --git a/pipelines/prep_data.py b/pipelines/prep_data.py index 352ec82b..5900adf0 100644 --- a/pipelines/prep_data.py +++ b/pipelines/prep_data.py @@ -94,7 +94,6 @@ def combine_nssp_and_nhsn( ) .with_columns( pl.lit(disease).alias("disease"), - pl.lit("train").alias("data_type"), ) ) @@ -379,7 +378,7 @@ def process_and_save_state( end_date=last_training_date, disease=disease, state_abb=state_abb, - ) + ).with_columns(pl.lit("train").alias("data_type")) nssp_training_dates = ( nssp_training_data.get_column("date").unique().to_list() diff --git a/pipelines/prep_eval_data.py b/pipelines/prep_eval_data.py index f8eb0919..39673729 100644 --- a/pipelines/prep_eval_data.py +++ b/pipelines/prep_eval_data.py @@ -49,7 +49,7 @@ def save_eval_data( end_date=last_training_date, disease=disease, state_abb=state, - ) + ).with_columns(data_type=pl.lit("eval")) combined_eval_dat = combine_nssp_and_nhsn( nssp_data=nssp_data,