diff --git a/DESCRIPTION b/DESCRIPTION index 017084322..1b52822a5 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: epipredict Title: Basic epidemiology forecasting methods -Version: 0.1.6 +Version: 0.1.7 Authors@R: c( person("Daniel J.", "McDonald", , "daniel@stat.ubc.ca", role = c("aut", "cre")), person("Ryan", "Tibshirani", , "ryantibs@cmu.edu", role = "aut"), diff --git a/DEVELOPMENT.md b/DEVELOPMENT.md index f710c2842..27ed2bf9f 100644 --- a/DEVELOPMENT.md +++ b/DEVELOPMENT.md @@ -35,6 +35,8 @@ R -e 'devtools::document()' R -e 'pkgdown::build_site()' ``` +Note that sometimes the caches from either `pkgdown` or `knitr` can cause difficulties. To clear those, run `make`, with either `clean_knitr`, `clean_site`, or `clean` (which does both). + If you work without R Studio and want to iterate on documentation, you might find [this script](https://gist.github.com/gadenbuie/d22e149e65591b91419e41ea5b2e0621) diff --git a/Makefile b/Makefile new file mode 100644 index 000000000..9f5790aca --- /dev/null +++ b/Makefile @@ -0,0 +1,14 @@ +## +# epipredict docs build +# + +# knitr doesn't actually clean it's own cache properly; this just deletes any of +# the article knitr caches in vignettes or the base +clean_knitr: + rm -r *_cache; rm -r vignettes/*_cache +clean_site: + Rscript -e "pkgdown::clean_cache(); pkgdown::clean_site()" +# this combines +clean: clean_knitr clean_site + +# end diff --git a/R/autoplot.R b/R/autoplot.R index 870dcb8d8..4164d6fdb 100644 --- a/R/autoplot.R +++ b/R/autoplot.R @@ -16,6 +16,8 @@ ggplot2::autoplot #' @param object An `epi_workflow` #' @param predictions A data frame with predictions. If `NULL`, only the #' original data is shown. +#' @param plot_data An epi_df of the data to plot against. This is for the case +#' where you have the actual results to compare the forecast against. #' @param .levels A numeric vector of levels to plot for any prediction bands. #' More than 3 levels begins to be difficult to see. #' @param ... Ignored @@ -84,7 +86,9 @@ NULL #' @export #' @rdname autoplot-epipred autoplot.epi_workflow <- function( - object, predictions = NULL, + object, + predictions = NULL, + plot_data = NULL, .levels = c(.5, .8, .95), ..., .color_by = c("all_keys", "geo_value", "other_keys", ".response", "all", "none"), .facet_by = c(".response", "other_keys", "all_keys", "geo_value", "all", "none"), @@ -111,30 +115,32 @@ autoplot.epi_workflow <- function( } keys <- c("geo_value", "time_value", "key") mold_roles <- names(mold$extras$roles) - edf <- bind_cols(mold$extras$roles[mold_roles %in% keys], y) - if (starts_with_impl("ahead_", names(y))) { - old_name_y <- unlist(strsplit(names(y), "_")) - shift <- as.numeric(old_name_y[2]) - new_name_y <- paste(old_name_y[-c(1:2)], collapse = "_") - edf <- rename(edf, !!new_name_y := !!names(y)) - } else if (starts_with_impl("lag_", names(y))) { - old_name_y <- unlist(strsplit(names(y), "_")) - shift <- -as.numeric(old_name_y[2]) - new_name_y <- paste(old_name_y[-c(1:2)], collapse = "_") - edf <- rename(edf, !!new_name_y := !!names(y)) - } - - if (!is.null(shift)) { - edf <- mutate(edf, time_value = time_value + shift) + # extract the relevant column names for plotting + old_name_y <- unlist(strsplit(names(y), "_")) + new_name_y <- paste(old_name_y[-c(1:2)], collapse = "_") + if (is.null(plot_data)) { + # the outcome has shifted, so we need to shift it forward (or back) + # by the corresponding amount + plot_data <- bind_cols(mold$extras$roles[mold_roles %in% keys], y) + if (starts_with_impl("ahead_", names(y))) { + shift <- as.numeric(old_name_y[2]) + } else if (starts_with_impl("lag_", names(y))) { + old_name_y <- unlist(strsplit(names(y), "_")) + shift <- -as.numeric(old_name_y[2]) + } + plot_data <- rename(plot_data, !!new_name_y := !!names(y)) + if (!is.null(shift)) { + plot_data <- mutate(plot_data, time_value = time_value + shift) + } + other_keys <- setdiff(key_colnames(object), c("geo_value", "time_value")) + plot_data <- as_epi_df(plot_data, + as_of = object$fit$meta$as_of, + other_keys = other_keys + ) } - other_keys <- setdiff(key_colnames(object), c("geo_value", "time_value")) - edf <- as_epi_df(edf, - as_of = object$fit$meta$as_of, - other_keys = other_keys - ) if (is.null(predictions)) { return(autoplot( - edf, new_name_y, + plot_data, new_name_y, .color_by = .color_by, .facet_by = .facet_by, .base_color = .base_color, .max_facets = .max_facets )) @@ -146,27 +152,27 @@ autoplot.epi_workflow <- function( } predictions <- rename(predictions, time_value = target_date) } - pred_cols_ok <- hardhat::check_column_names(predictions, key_colnames(edf)) + pred_cols_ok <- hardhat::check_column_names(predictions, key_colnames(plot_data)) if (!pred_cols_ok$ok) { cli_warn(c( "`predictions` is missing required variables: {.var {pred_cols_ok$missing_names}}.", i = "Plotting the original data." )) return(autoplot( - edf, !!new_name_y, + plot_data, !!new_name_y, .color_by = .color_by, .facet_by = .facet_by, .base_color = .base_color, .max_facets = .max_facets )) } # First we plot the history, always faceted by everything - bp <- autoplot(edf, !!new_name_y, + bp <- autoplot(plot_data, !!new_name_y, .color_by = "none", .facet_by = "all_keys", .base_color = "black", .max_facets = .max_facets ) # Now, prepare matching facets in the predictions - ek <- epi_keys_only(edf) + ek <- epi_keys_only(plot_data) predictions <- predictions %>% mutate( .facets = interaction(!!!rlang::syms(as.list(ek)), sep = "/"), @@ -204,7 +210,7 @@ autoplot.epi_workflow <- function( #' @export #' @rdname autoplot-epipred autoplot.canned_epipred <- function( - object, ..., + object, plot_data = NULL, ..., .color_by = c("all_keys", "geo_value", "other_keys", ".response", "all", "none"), .facet_by = c(".response", "other_keys", "all_keys", "geo_value", "all", "none"), .base_color = "dodgerblue4", @@ -218,7 +224,7 @@ autoplot.canned_epipred <- function( predictions <- object$predictions %>% rename(time_value = target_date) - autoplot(ewf, predictions, + autoplot(ewf, predictions, plot_data, ..., .color_by = .color_by, .facet_by = .facet_by, .base_color = .base_color, .max_facets = .max_facets ) diff --git a/R/flatline_forecaster.R b/R/flatline_forecaster.R index 7efda3efd..15d276393 100644 --- a/R/flatline_forecaster.R +++ b/R/flatline_forecaster.R @@ -16,7 +16,7 @@ #' #' @param epi_data An [epiprocess::epi_df][epiprocess::as_epi_df] #' @param outcome A scalar character for the column name we wish to predict. -#' @param args_list A list of dditional arguments as created by the +#' @param args_list A list of additional arguments as created by the #' [flatline_args_list()] constructor function. #' #' @return A data frame of point (and optionally interval) forecasts at a single diff --git a/README.Rmd b/README.Rmd index 73cedbeaa..9f27ce668 100644 --- a/README.Rmd +++ b/README.Rmd @@ -7,24 +7,96 @@ output: github_document ```{r, include = FALSE} options(width = 76) knitr::opts_chunk$set( - collapse = TRUE, - comment = "#>", fig.path = "man/figures/README-", - out.width = "100%" + digits = 3, + comment = "#>", + collapse = TRUE, + cache = TRUE, + dev.args = list(bg = "transparent"), + dpi = 300, + cache.lazy = FALSE, + out.width = "90%", + fig.align = "center", + fig.width = 9, + fig.height = 6 +) +ggplot2::theme_set(ggplot2::theme_bw()) +options( + dplyr.print_min = 6, + dplyr.print_max = 6, + pillar.max_footer_lines = 2, + pillar.min_chars = 15, + stringr.view_n = 6, + pillar.bold = TRUE, + width = 77 ) ``` +```{r pkgs, include=FALSE, echo=FALSE} +library(epipredict) +library(epidatr) +library(data.table) +library(dplyr) +library(tidyr) +library(ggplot2) +library(magrittr) +library(purrr) +library(scales) +``` -# epipredict +```{r coloration, include=FALSE, echo=FALSE} +base <- "#002676" +primary <- "#941120" +secondary <- "#f9c80e" +tertiary <- "#177245" +fourth_colour <- "#A393BF" +fifth_colour <- "#2e8edd" +colvec <- c( + base = base, primary = primary, secondary = secondary, + tertiary = tertiary, fourth_colour = fourth_colour, + fifth_colour = fifth_colour +) +library(epiprocess) +suppressMessages(library(tidyverse)) +theme_update(legend.position = "bottom", legend.title = element_blank()) +delphi_pal <- function(n) { + if (n > 6L) warning("Not enough colors in this palette!") + unname(colvec)[1:n] +} +scale_fill_delphi <- function(..., aesthetics = "fill") { + discrete_scale(aesthetics = aesthetics, palette = delphi_pal, ...) +} +scale_color_delphi <- function(..., aesthetics = "color") { + discrete_scale(aesthetics = aesthetics, palette = delphi_pal, ...) +} +scale_colour_delphi <- scale_color_delphi +``` + +# Epipredict [![R-CMD-check](https://github.com/cmu-delphi/epipredict/actions/workflows/R-CMD-check.yaml/badge.svg)](https://github.com/cmu-delphi/epipredict/actions/workflows/R-CMD-check.yaml) -**Note:** This package is currently in development and may not work as expected. Please file bug reports as issues in this repo, and we will do our best to address them quickly. +Epipredict is a framework for building transformation and forecasting pipelines +for epidemiological and other panel time-series datasets. +In addition to tools for building forecasting pipelines, it contains a number of +"canned" forecasters meant to run with little modification as an easy way to get +started forecasting. + +It is designed to work well with +[`epiprocess`](https://cmu-delphi.github.io/epiprocess/), a utility for handling +various time series and geographic processing tools in an epidemiological +context. +Both of the packages are meant to work well with the panel data provided by +[`epidatr`](https://cmu-delphi.github.io/epidatr/). + +If you are looking for more detail beyond the package documentation, see our +[forecasting book](https://cmu-delphi.github.io/delphi-tooling-book/). ## Installation -To install (unless you're making changes to the package, use the stable version): +To install (unless you're planning on contributing to package development, we +suggest using the stable version): ```r # Stable version @@ -34,93 +106,255 @@ pak::pkg_install("cmu-delphi/epipredict@main") pak::pkg_install("cmu-delphi/epipredict@dev") ``` -## Documentation +The documentation for the stable version is at +, while the development version is at +. -You can view documentation for the `main` branch at . -## Goals for `epipredict` +## Motivating example -**We hope to provide:** +To demonstrate the kind of forecast epipredict can make, say we're predicting +COVID deaths per 100k for each state on -1. A set of basic, easy-to-use forecasters that work out of the box. You should be able to do a reasonably limited amount of customization on them. For the basic forecasters, we currently provide: - * Baseline flatline forecaster - * Autoregressive forecaster - * Autoregressive classifier - * CDC FluSight flatline forecaster -2. A framework for creating custom forecasters out of modular components. There are four types of components: - * Preprocessor: do things to the data before model training - * Trainer: train a model on data, resulting in a fitted model object - * Predictor: make predictions, using a fitted model object - * Postprocessor: do things to the predictions before returning +```{r fc_date} +forecast_date <- as.Date("2021-08-01") +``` -**Target audiences:** +Below the fold, we construct this dataset as an `epiprocess::epi_df` from JHU +data. -* Basic. Has data, calls forecaster with default arguments. -* Intermediate. Wants to examine changes to the arguments, take advantage of -built in flexibility. -* Advanced. Wants to write their own forecasters. Maybe willing to build up -from some components. +
+ Creating the dataset using `{epidatr}` and `{epiprocess}` -The Advanced user should find their task to be relatively easy. Examples of -these tasks are illustrated in the [vignettes and articles](https://cmu-delphi.github.io/epipredict). +This dataset can be found in the package as `covid_case_death_rates`; we +demonstrate some of the typically ubiquitous cleaning operations needed to be +able to forecast. +First we pull both jhu-csse cases and deaths from +[`{epidatr}`](https://cmu-delphi.github.io/epidatr/) package: -See also the (in progress) [Forecasting Book](https://cmu-delphi.github.io/delphi-tooling-book/). +```{r case_death, warning = FALSE} +cases <- pub_covidcast( + source = "jhu-csse", + signals = "confirmed_incidence_prop", + time_type = "day", + geo_type = "state", + time_values = epirange(20200601, 20211231), + geo_values = "*" +) |> + select(geo_value, time_value, case_rate = value) -## Intermediate example +deaths <- pub_covidcast( + source = "jhu-csse", + signals = "deaths_incidence_prop", + time_type = "day", + geo_type = "state", + time_values = epirange(20200601, 20211231), + geo_values = "*" +) |> + select(geo_value, time_value, death_rate = value) +``` -The package comes with some built-in historical data for illustration, but -up-to-date versions of this could be downloaded with the -[`{epidatr}` package](https://cmu-delphi.github.io/epidatr/) -and processed using -[`{epiprocess}`](https://cmu-delphi.github.io/epiprocess/).[^1] +Since visualizing the results on every geography is somewhat overwhelming, +we'll only train on a subset of 5. +```{r date, warning = FALSE} +used_locations <- c("ca", "ma", "ny", "tx") +cases_deaths <- + full_join(cases, deaths, by = c("time_value", "geo_value")) |> + filter(geo_value %in% used_locations) |> + as_epi_df(as_of = as.Date("2022-01-01")) +# plotting the data as it was downloaded +cases_deaths |> + autoplot( + case_rate, + death_rate, + .color_by = "none" + ) + + facet_grid( + rows = vars(.response_name), + cols = vars(geo_value), + scale = "free" + ) + + scale_x_date(date_breaks = "3 months", date_labels = "%Y %b") + + theme(axis.text.x = element_text(angle = 90, hjust = 1)) +``` -[^1]: Other epidemiological signals for non-Covid related illnesses are also -available with [`{epidatr}`](https://github.com/cmu-delphi/epidatr) which -interfaces directly to Delphi's -[Epidata API](https://cmu-delphi.github.io/delphi-epidata/) +As with basically any dataset, there is some cleaning that we will need to do to +make it actually usable; we'll use some utilities from +[`{epiprocess}`](https://cmu-delphi.github.io/epiprocess/) for this. -```{r epidf, message=FALSE} -library(epipredict) -covid_case_death_rates +First, to eliminate some of the noise coming from daily reporting, we do 7 day +averaging over a trailing window[^1]: + +[^1]: This makes it so that any given day of the processed time-series only + depends on the previous week, which means that we avoid leaking future + values when making a forecast. + +```{r smooth} +cases_deaths <- + cases_deaths |> + group_by(geo_value) |> + epi_slide( + cases_7dav = mean(case_rate, na.rm = TRUE), + death_rate_7dav = mean(death_rate, na.rm = TRUE), + .window_size = 7 + ) |> + ungroup() |> + mutate(case_rate = NULL, death_rate = NULL) |> + rename(case_rate = cases_7dav, death_rate = death_rate_7dav) +``` + +Then trimming outliers, most especially negative values: + +```{r outlier} +cases_deaths <- + cases_deaths |> + group_by(geo_value) |> + mutate( + outlr_death_rate = detect_outlr_rm( + time_value, death_rate, + detect_negatives = TRUE + ), + outlr_case_rate = detect_outlr_rm( + time_value, case_rate, + detect_negatives = TRUE + ) + ) |> + unnest(cols = starts_with("outlr"), names_sep = "_") |> + ungroup() |> + mutate( + death_rate = outlr_death_rate_replacement, + case_rate = outlr_case_rate_replacement + ) |> + select(geo_value, time_value, case_rate, death_rate) +``` +
+ +After having downloaded and cleaned the data in `cases_deaths`, we plot a subset +of the states, noting the actual forecast date: + +
+ Plot +```{r plot_locs} +forecast_date_label <- + tibble( + geo_value = rep(used_locations, 2), + .response_name = c(rep("case_rate", 4), rep("death_rate", 4)), + dates = rep(forecast_date - 7 * 2, 2 * length(used_locations)), + heights = c(rep(150, 4), rep(0.75, 4)) + ) +processed_data_plot <- + covid_case_death_rates |> + filter(geo_value %in% used_locations) |> + autoplot( + case_rate, + death_rate, + .color_by = "none" + ) + + facet_grid( + rows = vars(.response_name), + cols = vars(geo_value), + scale = "free" + ) + + geom_vline(aes(xintercept = forecast_date)) + + geom_text( + data = forecast_date_label, + aes(x = dates, label = "forecast\ndate", y = heights), + size = 3, hjust = "right" + ) + + scale_x_date(date_breaks = "3 months", date_labels = "%Y %b") + + theme(axis.text.x = element_text(angle = 90, hjust = 1)) +``` +
+```{r show-processed-data, warning=FALSE, echo=FALSE} +processed_data_plot ``` -To create and train a simple auto-regressive forecaster to predict the death rate two weeks into the future using past (lagged) deaths and cases, we could use the following function. +To make a forecast, we will use a "canned" simple auto-regressive forecaster to +predict the death rate four weeks into the future using lagged[^3] deaths and +cases + +[^3]: lagged by 3 in this context meaning using the value from 3 days ago. ```{r make-forecasts, warning=FALSE} -two_week_ahead <- arx_forecaster( - covid_case_death_rates, +four_week_ahead <- arx_forecaster( + cases_deaths |> filter(time_value <= forecast_date), outcome = "death_rate", predictors = c("case_rate", "death_rate"), args_list = arx_args_list( lags = list(c(0, 1, 2, 3, 7, 14), c(0, 7, 14)), - ahead = 14 + ahead = 4 * 7, + quantile_levels = c(0.1, 0.25, 0.5, 0.75, 0.9) ) ) -two_week_ahead +four_week_ahead ``` -In this case, we have used a number of different lags for the case rate, while -only using 3 weekly lags for the death rate (as predictors). The result is both -a fitted model object which could be used any time in the future to create -different forecasts, as well as a set of predicted values (and prediction -intervals) for each location 14 days after the last available time value in the -data. +In this case, we have used 0-3 days, a week, and two week lags for the case +rate, while using only zero, one and two weekly lags for the death rate (as +predictors). +The result `four_week_ahead` is both a fitted model object which could be used +any time in the future to create different forecasts, as well as a set of +predicted values (and prediction intervals) for each location 28 days after the +forecast date. +Plotting the prediction intervals on our subset above[^2]: -```{r print-model} -two_week_ahead$epi_workflow +[^2]: Alternatively, you could call `autoplot(four_week_ahead)` to get the full + collection of forecasts. This is too busy for the space we have for plotting + here. + +
+ Plot +```{r plotting_forecast, warning=FALSE} +epiworkflow <- four_week_ahead$epi_workflow +restricted_predictions <- + four_week_ahead$predictions |> + rename(time_value = target_date, value = .pred) |> + mutate(.response_name = "death_rate") +forecast_plot <- + four_week_ahead |> + autoplot(plot_data = cases_deaths) + + geom_vline(aes(xintercept = forecast_date)) + + geom_text( + data = forecast_date_label %>% filter(.response_name == "death_rate"), + aes(x = dates, label = "forecast\ndate", y = heights), + size = 3, hjust = "right" + ) + + scale_x_date(date_breaks = "3 months", date_labels = "%Y %b") + + theme(axis.text.x = element_text(angle = 90, hjust = 1)) ``` +
-The fitted model here involved preprocessing the data to appropriately generate -lagged predictors, estimating a linear model with `stats::lm()` and then -postprocessing the results to be meaningful for epidemiological tasks. We can -also examine the predictions. +```{r show-single-forecast, warning=FALSE, echo=FALSE} +forecast_plot +``` -```{r show-preds} -two_week_ahead$predictions +And as a tibble of quantile level-value pairs: +```{r pivot_wider} +four_week_ahead$predictions |> + select(-.pred) |> + pivot_quantiles_longer(.pred_distn) ``` -The results above show a distributional forecast produced using data through -the end of 2021 for the 14th of January 2022. A prediction for the death rate -per 100K inhabitants is available for every state (`geo_value`) along with a -90% predictive interval. +The black dot gives the median prediction, while the blue intervals give the +25-75%, the 10-90%, and 2.5-97.5% inter-quantile ranges[^4]. +For this particular day and these locations, the forecasts are relatively +accurate, with the true data being at least within the 10-90% interval. +A couple of things to note: + +1. Our methods are primarily direct forecasters; this means we don't need to + predict 1, 2,..., 27 days ahead to then predict 28 days ahead +2. All of our existing engines are geo-pooled, meaning the training data is + shared across geographies. This has the advantage of increasing the amount of + available training data, with the restriction that the data needs to be on + comparable scales, such as rates. + +## Getting Help +If you encounter a bug or have a feature request, feel free to file an [issue on +our github page](https://github.com/cmu-delphi/epipredict/issues). +For other questions, feel free to reach out to the authors, either via this +[contact +form](https://docs.google.com/forms/d/e/1FAIpQLScqgT1fKZr5VWBfsaSp-DNaN03aV6EoZU4YljIzHJ1Wl_zmtg/viewform), +email, or the Insightnet slack. +[^4]: Note that these are not the same quantiles that we fit when creating + `four_week_ahead`. They are extrapolated from those quantiles using `extrapolate_quantiles()` (which assumes an exponential decay in the tails). diff --git a/README.md b/README.md index 5293a0e9c..fdfa37c72 100644 --- a/README.md +++ b/README.md @@ -1,21 +1,34 @@ -# epipredict +# Epipredict [![R-CMD-check](https://github.com/cmu-delphi/epipredict/actions/workflows/R-CMD-check.yaml/badge.svg)](https://github.com/cmu-delphi/epipredict/actions/workflows/R-CMD-check.yaml) -**Note:** This package is currently in development and may not work as -expected. Please file bug reports as issues in this repo, and we will do -our best to address them quickly. +Epipredict is a framework for building transformation and forecasting +pipelines for epidemiological and other panel time-series datasets. In +addition to tools for building forecasting pipelines, it contains a +number of “canned” forecasters meant to run with little modification as +an easy way to get started forecasting. + +It is designed to work well with +[`epiprocess`](https://cmu-delphi.github.io/epiprocess/), a utility for +handling various time series and geographic processing tools in an +epidemiological context. Both of the packages are meant to work well +with the panel data provided by +[`epidatr`](https://cmu-delphi.github.io/epidatr/). + +If you are looking for more detail beyond the package documentation, see +our [forecasting +book](https://cmu-delphi.github.io/delphi-tooling-book/). ## Installation -To install (unless you’re making changes to the package, use the stable -version): +To install (unless you’re planning on contributing to package +development, we suggest using the stable version): ``` r # Stable version @@ -25,189 +38,294 @@ pak::pkg_install("cmu-delphi/epipredict@main") pak::pkg_install("cmu-delphi/epipredict@dev") ``` -## Documentation +The documentation for the stable version is at +, while the development version +is at . + +## Motivating example -You can view documentation for the `main` branch at -. +To demonstrate the kind of forecast epipredict can make, say we’re +predicting COVID deaths per 100k for each state on + +``` r +forecast_date <- as.Date("2021-08-01") +``` -## Goals for `epipredict` +Below the fold, we construct this dataset as an `epiprocess::epi_df` +from JHU data. -**We hope to provide:** +
+ +Creating the dataset using `{epidatr}` and `{epiprocess}` + + +This dataset can be found in the package as `covid_case_death_rates`; we +demonstrate some of the typically ubiquitous cleaning operations needed +to be able to forecast. First we pull both jhu-csse cases and deaths +from [`{epidatr}`](https://cmu-delphi.github.io/epidatr/) package: + +``` r +cases <- pub_covidcast( + source = "jhu-csse", + signals = "confirmed_incidence_prop", + time_type = "day", + geo_type = "state", + time_values = epirange(20200601, 20211231), + geo_values = "*" +) |> + select(geo_value, time_value, case_rate = value) + +deaths <- pub_covidcast( + source = "jhu-csse", + signals = "deaths_incidence_prop", + time_type = "day", + geo_type = "state", + time_values = epirange(20200601, 20211231), + geo_values = "*" +) |> + select(geo_value, time_value, death_rate = value) +``` -1. A set of basic, easy-to-use forecasters that work out of the box. - You should be able to do a reasonably limited amount of - customization on them. For the basic forecasters, we currently - provide: - - Baseline flatline forecaster - - Autoregressive forecaster - - Autoregressive classifier - - CDC FluSight flatline forecaster -2. A framework for creating custom forecasters out of modular - components. There are four types of components: - - Preprocessor: do things to the data before model training - - Trainer: train a model on data, resulting in a fitted model object - - Predictor: make predictions, using a fitted model object - - Postprocessor: do things to the predictions before returning +Since visualizing the results on every geography is somewhat +overwhelming, we’ll only train on a subset of 5. -**Target audiences:** +``` r +used_locations <- c("ca", "ma", "ny", "tx") +cases_deaths <- + full_join(cases, deaths, by = c("time_value", "geo_value")) |> + filter(geo_value %in% used_locations) |> + as_epi_df(as_of = as.Date("2022-01-01")) +# plotting the data as it was downloaded +cases_deaths |> + autoplot( + case_rate, + death_rate, + .color_by = "none" + ) + + facet_grid( + rows = vars(.response_name), + cols = vars(geo_value), + scale = "free") + + scale_x_date(date_breaks = "3 months", date_labels = "%Y %b") + + theme(axis.text.x = element_text(angle = 90, hjust = 1)) +``` -- Basic. Has data, calls forecaster with default arguments. -- Intermediate. Wants to examine changes to the arguments, take - advantage of built in flexibility. -- Advanced. Wants to write their own forecasters. Maybe willing to build - up from some components. + -The Advanced user should find their task to be relatively easy. Examples -of these tasks are illustrated in the [vignettes and -articles](https://cmu-delphi.github.io/epipredict). +As with basically any dataset, there is some cleaning that we will need +to do to make it actually usable; we’ll use some utilities from +[`{epiprocess}`](https://cmu-delphi.github.io/epiprocess/) for this. -See also the (in progress) [Forecasting -Book](https://cmu-delphi.github.io/delphi-tooling-book/). +First, to eliminate some of the noise coming from daily reporting, we do +7 day averaging over a trailing window[^1]: -## Intermediate example +``` r +cases_deaths <- + cases_deaths |> + group_by(geo_value) |> + epi_slide( + cases_7dav = mean(case_rate, na.rm = TRUE), + death_rate_7dav = mean(death_rate, na.rm = TRUE), + .window_size = 7 + ) |> + ungroup() |> + mutate(case_rate = NULL, death_rate = NULL) |> + rename(case_rate = cases_7dav, death_rate = death_rate_7dav) +``` -The package comes with some built-in historical data for illustration, -but up-to-date versions of this could be downloaded with the -[`{epidatr}` package](https://cmu-delphi.github.io/epidatr/) and -processed using -[`{epiprocess}`](https://cmu-delphi.github.io/epiprocess/).[^1] +Then trimming outliers, most especially negative values: ``` r -library(epipredict) -covid_case_death_rates -#> An `epi_df` object, 20,496 x 4 with metadata: -#> * geo_type = state -#> * time_type = day -#> * as_of = 2022-05-31 12:08:25.791826 -#> -#> # A tibble: 20,496 × 4 -#> geo_value time_value case_rate death_rate -#> * -#> 1 ak 2020-12-31 35.9 0.158 -#> 2 al 2020-12-31 65.1 0.438 -#> 3 ar 2020-12-31 66.0 1.27 -#> 4 as 2020-12-31 0 0 -#> 5 az 2020-12-31 76.8 1.10 -#> 6 ca 2020-12-31 96.0 0.751 -#> 7 co 2020-12-31 35.8 0.649 -#> 8 ct 2020-12-31 52.1 0.819 -#> 9 dc 2020-12-31 31.0 0.601 -#> 10 de 2020-12-31 65.2 0.807 -#> # ℹ 20,486 more rows +cases_deaths <- + cases_deaths |> + group_by(geo_value) |> + mutate( + outlr_death_rate = detect_outlr_rm( + time_value, death_rate, + detect_negatives = TRUE + ), + outlr_case_rate = detect_outlr_rm( + time_value, case_rate, + detect_negatives = TRUE + ) + ) |> + unnest(cols = starts_with("outlr"), names_sep = "_") |> + ungroup() |> + mutate( + death_rate = outlr_death_rate_replacement, + case_rate = outlr_case_rate_replacement + ) |> + select(geo_value, time_value, case_rate, death_rate) ``` -To create and train a simple auto-regressive forecaster to predict the -death rate two weeks into the future using past (lagged) deaths and -cases, we could use the following function. +
+ +After having downloaded and cleaned the data in `cases_deaths`, we plot +a subset of the states, noting the actual forecast date: + +
+ +Plot + ``` r -two_week_ahead <- arx_forecaster( - covid_case_death_rates, +forecast_date_label <- + tibble( + geo_value = rep(used_locations, 2), + .response_name = c(rep("case_rate", 4), rep("death_rate", 4)), + dates = rep(forecast_date - 7 * 2, 2 * length(used_locations)), + heights = c(rep(150, 4), rep(0.75, 4)) + ) +processed_data_plot <- + covid_case_death_rates |> filter(geo_value %in% used_locations) |> + autoplot( + case_rate, + death_rate, + .color_by = "none" + ) + + facet_grid( + rows = vars(.response_name), + cols = vars(geo_value), + scale = "free") + + geom_vline(aes(xintercept = forecast_date)) + + geom_text( + data = forecast_date_label, + aes(x = dates, label = "forecast\ndate", y = heights), + size = 3, hjust = "right" + ) + + scale_x_date(date_breaks = "3 months", date_labels = "%Y %b") + + theme(axis.text.x = element_text(angle = 90, hjust = 1)) +``` + +
+ + + +To make a forecast, we will use a “canned” simple auto-regressive +forecaster to predict the death rate four weeks into the future using +lagged[^2] deaths and cases + +``` r +four_week_ahead <- arx_forecaster( + cases_deaths |> filter(time_value <= forecast_date), outcome = "death_rate", predictors = c("case_rate", "death_rate"), args_list = arx_args_list( lags = list(c(0, 1, 2, 3, 7, 14), c(0, 7, 14)), - ahead = 14 + ahead = 4 * 7, + quantile_levels = c(0.1, 0.25, 0.5, 0.75, 0.9) ) ) -two_week_ahead -#> ══ A basic forecaster of type ARX Forecaster ═══════════════════════════════ +four_week_ahead +#> ══ A basic forecaster of type ARX Forecaster ════════════════════════════════ #> -#> This forecaster was fit on 2024-11-11 11:38:31. +#> This forecaster was fit on 2025-01-31 10:46:32. #> #> Training data was an with: #> • Geography: state, #> • Time type: day, -#> • Using data up-to-date as of: 2022-05-31 12:08:25. +#> • Using data up-to-date as of: 2022-01-01. +#> • With the last data available on 2021-08-01 #> -#> ── Predictions ───────────────────────────────────────────────────────────── +#> ── Predictions ────────────────────────────────────────────────────────────── #> -#> A total of 56 predictions are available for -#> • 56 unique geographic regions, -#> • At forecast date: 2021-12-31, -#> • For target date: 2022-01-14. +#> A total of 4 predictions are available for +#> • 4 unique geographic regions, +#> • At forecast date: 2021-08-01, +#> • For target date: 2021-08-29, #> ``` -In this case, we have used a number of different lags for the case rate, -while only using 3 weekly lags for the death rate (as predictors). The -result is both a fitted model object which could be used any time in the -future to create different forecasts, as well as a set of predicted -values (and prediction intervals) for each location 14 days after the -last available time value in the data. +In this case, we have used 0-3 days, a week, and two week lags for the +case rate, while using only zero, one and two weekly lags for the death +rate (as predictors). The result `four_week_ahead` is both a fitted +model object which could be used any time in the future to create +different forecasts, as well as a set of predicted values (and +prediction intervals) for each location 28 days after the forecast date. +Plotting the prediction intervals on our subset above[^3]: + +
+ +Plot + ``` r -two_week_ahead$epi_workflow -#> -#> ══ Epi Workflow [trained] ══════════════════════════════════════════════════ -#> Preprocessor: Recipe -#> Model: linear_reg() -#> Postprocessor: Frosting -#> -#> ── Preprocessor ──────────────────────────────────────────────────────────── -#> -#> 6 Recipe steps. -#> 1. step_epi_lag() -#> 2. step_epi_lag() -#> 3. step_epi_ahead() -#> 4. step_naomit() -#> 5. step_naomit() -#> 6. step_training_window() -#> -#> ── Model ─────────────────────────────────────────────────────────────────── -#> -#> Call: -#> stats::lm(formula = ..y ~ ., data = data) -#> -#> Coefficients: -#> (Intercept) lag_0_case_rate lag_1_case_rate lag_2_case_rate -#> -0.0073358 0.0030365 0.0012467 0.0009536 -#> lag_3_case_rate lag_7_case_rate lag_14_case_rate lag_0_death_rate -#> 0.0011425 0.0012481 0.0003041 0.1351769 -#> lag_7_death_rate lag_14_death_rate -#> 0.1471127 0.1062473 -#> -#> ── Postprocessor ─────────────────────────────────────────────────────────── -#> -#> 5 Frosting layers. -#> 1. layer_predict() -#> 2. layer_residual_quantiles() -#> 3. layer_add_forecast_date() -#> 4. layer_add_target_date() -#> 5. layer_threshold() -#> +epiworkflow <- four_week_ahead$epi_workflow +restricted_predictions <- + four_week_ahead$predictions |> + rename(time_value = target_date, value = .pred) |> + mutate(.response_name = "death_rate") +forecast_plot <- + four_week_ahead |> + autoplot(plot_data = cases_deaths) + + geom_vline(aes(xintercept = forecast_date)) + + geom_text( + data = forecast_date_label %>% filter(.response_name == "death_rate"), + aes(x = dates, label = "forecast\ndate", y = heights), + size = 3, hjust = "right" + ) + + scale_x_date(date_breaks = "3 months", date_labels = "%Y %b") + + theme(axis.text.x = element_text(angle = 90, hjust = 1)) ``` -The fitted model here involved preprocessing the data to appropriately -generate lagged predictors, estimating a linear model with `stats::lm()` -and then postprocessing the results to be meaningful for epidemiological -tasks. We can also examine the predictions. +
+ + + +And as a tibble of quantile level-value pairs: ``` r -two_week_ahead$predictions -#> # A tibble: 56 × 5 -#> geo_value .pred .pred_distn forecast_date target_date -#> -#> 1 ak 0.449 quantiles(0.45)[2] 2021-12-31 2022-01-14 -#> 2 al 0.574 quantiles(0.57)[2] 2021-12-31 2022-01-14 -#> 3 ar 0.673 quantiles(0.67)[2] 2021-12-31 2022-01-14 -#> 4 as 0 quantiles(0.12)[2] 2021-12-31 2022-01-14 -#> 5 az 0.679 quantiles(0.68)[2] 2021-12-31 2022-01-14 -#> 6 ca 0.575 quantiles(0.57)[2] 2021-12-31 2022-01-14 -#> 7 co 0.862 quantiles(0.86)[2] 2021-12-31 2022-01-14 -#> 8 ct 1.07 quantiles(1.07)[2] 2021-12-31 2022-01-14 -#> 9 dc 2.12 quantiles(2.12)[2] 2021-12-31 2022-01-14 -#> 10 de 1.09 quantiles(1.09)[2] 2021-12-31 2022-01-14 -#> # ℹ 46 more rows +four_week_ahead$predictions |> + select(-.pred) |> + pivot_quantiles_longer(.pred_distn) +#> # A tibble: 20 × 5 +#> geo_value values quantile_levels forecast_date target_date +#> +#> 1 ca 0.199 0.1 2021-08-01 2021-08-29 +#> 2 ca 0.285 0.25 2021-08-01 2021-08-29 +#> 3 ca 0.345 0.5 2021-08-01 2021-08-29 +#> 4 ca 0.405 0.75 2021-08-01 2021-08-29 +#> 5 ca 0.491 0.9 2021-08-01 2021-08-29 +#> 6 ma 0.0285 0.1 2021-08-01 2021-08-29 +#> # ℹ 14 more rows ``` -The results above show a distributional forecast produced using data -through the end of 2021 for the 14th of January 2022. A prediction for -the death rate per 100K inhabitants is available for every state -(`geo_value`) along with a 90% predictive interval. +The black dot gives the median prediction, while the blue intervals give +the 25-75%, the 10-90%, and 2.5-97.5% inter-quantile ranges[^4]. For +this particular day and these locations, the forecasts are relatively +accurate, with the true data being at least within the 10-90% interval. +A couple of things to note: + +1. Our methods are primarily direct forecasters; this means we don’t + need to predict 1, 2,…, 27 days ahead to then predict 28 days ahead +2. All of our existing engines are geo-pooled, meaning the training + data is shared across geographies. This has the advantage of + increasing the amount of available training data, with the + restriction that the data needs to be on comparable scales, such as + rates. + +## Getting Help + +If you encounter a bug or have a feature request, feel free to file an +[issue on our github +page](https://github.com/cmu-delphi/epipredict/issues). For other +questions, feel free to reach out to the authors, either via this +[contact +form](https://docs.google.com/forms/d/e/1FAIpQLScqgT1fKZr5VWBfsaSp-DNaN03aV6EoZU4YljIzHJ1Wl_zmtg/viewform), +email, or the Insightnet slack. + +[^1]: This makes it so that any given day of the processed time-series + only depends on the previous week, which means that we avoid leaking + future values when making a forecast. + +[^2]: lagged by 3 in this context meaning using the value from 3 days + ago. + +[^3]: Alternatively, you could call `autoplot(four_week_ahead)` to get + the full collection of forecasts. This is too busy for the space we + have for plotting here. -[^1]: Other epidemiological signals for non-Covid related illnesses are - also available with - [`{epidatr}`](https://github.com/cmu-delphi/epidatr) which - interfaces directly to Delphi’s [Epidata - API](https://cmu-delphi.github.io/delphi-epidata/) +[^4]: Note that these are not the same quantiles that we fit when + creating `four_week_ahead`. They are extrapolated from those + quantiles using `extrapolate_quantiles()` (which assumes an + exponential decay in the tails). diff --git a/_pkgdown.yml b/_pkgdown.yml index 86b4d16ad..8ecd8facb 100644 --- a/_pkgdown.yml +++ b/_pkgdown.yml @@ -15,6 +15,7 @@ articles: - backtesting - arx-classifier - update + - guts - title: Advanced methods contents: - articles/smooth-qr @@ -47,6 +48,7 @@ reference: contents: - contains("args_list") - contains("_epi_workflow") + - title: Helper functions for Hub submission contents: - flusight_hub_formatter diff --git a/inst/pkgdown-watch.R b/inst/pkgdown-watch.R new file mode 100644 index 000000000..6541a1365 --- /dev/null +++ b/inst/pkgdown-watch.R @@ -0,0 +1,64 @@ +# Run with: Rscript pkgdown-watch.R +# +# Modifying this: https://gist.github.com/gadenbuie/d22e149e65591b91419e41ea5b2e0621 +# - Removed docopts cli interface and various configs/features I didn't need. +# - Sped up reference building by not running examples. +# +# Note that the `pattern` regex is case sensitive, so make sure your Rmd files +# end in `.Rmd` and not `.rmd`. +# +# Also I had issues with `pkgdown::build_reference()` not working, so I just run +# it manually when I need to. + +rlang::check_installed(c("pkgdown", "servr", "devtools", "here", "cli", "fs")) +library(pkgdown) +pkg <- pkgdown::as_pkgdown(here::here()) +devtools::build_readme() +pkgdown::build_articles(pkg) +pkgdown::build_site(pkg, lazy = FALSE, examples = FALSE, devel = TRUE, preview = FALSE) + +servr::httw( + dir = here::here("docs"), + watch = here::here(), + pattern = "[.](Rm?d|y?ml|s[ac]ss|css|js)$", + handler = function(files) { + devtools::load_all() + + files_rel <- fs::path_rel(files, start = getwd()) + cli::cli_inform("{cli::col_yellow('Updated')} {.val {files_rel}}") + + articles <- grep("vignettes.+Rmd$", files, value = TRUE) + + if (length(articles) == 1) { + name <- fs::path_ext_remove(fs::path_rel(articles, fs::path(pkg$src_path, "vignettes"))) + pkgdown::build_article(name, pkg) + } else if (length(articles) > 1) { + pkgdown::build_articles(pkg, preview = FALSE) + } + + refs <- grep("man.+R(m?d)?$", files, value = TRUE) + if (length(refs)) { + # Doesn't work for me, so I run it manually. + # pkgdown::build_reference(pkg, preview = FALSE, examples = FALSE, lazy = FALSE) # nolint: commented_code_linter + } + + pkgdown <- grep("pkgdown", files, value = TRUE) + if (length(pkgdown) && !pkgdown %in% c(articles, refs)) { + pkgdown::init_site(pkg) + } + + pkgdown_index <- grep("index[.]Rmd$", files_rel, value = TRUE) + if (length(pkgdown_index)) { + devtools::build_rmd(pkgdown_index) + pkgdown::build_home(pkg) + } + + readme <- grep("README[.]rmd$", files, value = TRUE, ignore.case = TRUE) + if (length(readme)) { + devtools::build_readme() + pkgdown::build_site(pkg, lazy = TRUE, examples = FALSE, devel = TRUE, preview = FALSE) + } + + cli::cli_alert("Site rebuild done!") + } +) diff --git a/man/figures/README-case_death-1.png b/man/figures/README-case_death-1.png new file mode 100644 index 000000000..f1b4ed98e Binary files /dev/null and b/man/figures/README-case_death-1.png differ diff --git a/man/figures/README-date-1.png b/man/figures/README-date-1.png new file mode 100644 index 000000000..e300bd3e5 Binary files /dev/null and b/man/figures/README-date-1.png differ diff --git a/man/figures/README-show-processed-data-1.png b/man/figures/README-show-processed-data-1.png new file mode 100644 index 000000000..04583bd24 Binary files /dev/null and b/man/figures/README-show-processed-data-1.png differ diff --git a/man/figures/README-show-single-forecast-1.png b/man/figures/README-show-single-forecast-1.png new file mode 100644 index 000000000..0acd24d18 Binary files /dev/null and b/man/figures/README-show-single-forecast-1.png differ diff --git a/tests/testthat/test-step_adjust_latency.R b/tests/testthat/test-step_adjust_latency.R index 7b1f320e4..84b14d0fb 100644 --- a/tests/testthat/test-step_adjust_latency.R +++ b/tests/testthat/test-step_adjust_latency.R @@ -1,6 +1,6 @@ library(dplyr) # Test ideas that were dropped: -# - "epi_adjust_latency works correctly when there's gaps in the timeseries" +# - "epi_adjust_latency works correctly when there's gaps in the time-series" # - "epi_adjust_latency extend_ahead uses the same adjustment when predicting on new data after being baked" # - "`step_adjust_latency` only allows one instance of itself" # - "data with epi_df shorn off works" diff --git a/vignettes/epipredict.Rmd b/vignettes/epipredict.Rmd index 2cf7037c7..d68b7d63e 100644 --- a/vignettes/epipredict.Rmd +++ b/vignettes/epipredict.Rmd @@ -11,459 +11,417 @@ vignette: > source("_common.R") ``` -```{r setup, message=FALSE} +```{r setup, message=FALSE, include = FALSE} library(dplyr) library(parsnip) library(workflows) library(recipes) +library(epidatasets) library(epipredict) +library(ggplot2) +forecast_date <- as.Date("2021-08-01") +used_locations <- c("ca", "ma", "ny", "tx") +library(epidatr) ``` - -# Goals for the package - -At a high level, our goal with `{epipredict}` is to make running simple Machine -Learning / Statistical forecasters for epidemiology easy. However, this package -is extremely extensible, and that is part of its utility. Our hope is that it is -easy for users with epi training and some statistics to fit baseline models -while still allowing those with more nuanced statistical understanding to create -complicated specializations using the same framework. - -Serving both populations is the main motivation for our efforts, but at the same -time, we have tried hard to make it useful. - - -## Baseline models - -We provide a set of basic, easy-to-use forecasters that work out of the box. You -should be able to do a reasonably limited amount of customization on them. Any -serious customization happens with the framework discussed below). - -For the basic forecasters, we provide: - -* Baseline flat-line forecaster -* Autoregressive forecaster -* Autoregressive classifier - -All the forcasters we provide are built on our framework. So we will use these -basic models to illustrate its flexibility. - -## Forecasting framework - -Our framework for creating custom forecasters views the prediction task as a set -of modular components. There are four types of components: - -1. Preprocessor: make transformations to the data before model training -2. Trainer: train a model on data, resulting in a fitted model object -3. Predictor: make predictions, using a fitted model object and processed test data -4. Postprocessor: manipulate or transform the predictions before returning - -Users familiar with [`{tidymodels}`](https://www.tidymodels.org) and especially -the [`{workflows}`](https://workflows.tidymodels.org) package will notice a lot -of overlap. This is by design, and is in fact a feature. The truth is that -`{epipredict}` is a wrapper around much that is contained in these packages. -Therefore, if you want something from this -verse, it should "just work" (we -hope). - -The reason for the overlap is that `{workflows}` *already implements* the first -three steps. And it does this very well. However, it is missing the -postprocessing stage and currently has no plans for such an implementation. And -this feature is important. The baseline forecaster we provide *requires* -postprocessing. Anything more complicated needs this as well. - -The second omission from `{tidymodels}` is support for panel data. Besides -epidemiological data, economics, psychology, sociology, and many other areas -frequently deal with data of this type. So the framework of behind -`{epipredict}` implements this. In principle, this has nothing to do with -epidemiology, and one could simply use this package as a solution for the -missing functionality in `{tidymodels}`. Again, this should "just work". - -All of the *panel data* functionality is implemented through the `epi_df` data -type in the companion [`{epiprocess}`](https://cmu-delphi.github.io/epiprocess/) -package. There is much more to see there, but for the moment, it's enough to -look at a simple one: - -```{r epidf} -jhu <- covid_case_death_rates -jhu +At a high level, the goal of `{epipredict}` is to make running simple machine +learning / statistical forecasters for epidemiology easy. +To do this, we have extended several [tidymodels](https://www.tidymodels.org/) +packages to handle the case of panel time-series data. +Our hope is that it is easy for users with epi training and some statistics to +fit baseline models while still allowing those with more nuanced statistical +understanding to create complicated specializations using the same framework. +Towards that end, epipredict provides two main classes of tools: + +1. A set of basic, easy-to-use "canned" forecasters that work out of the box. + For the basic forecasters, we currently provide: + * Flatline forecaster: predicts a median that is the last value + with increasingly wide quantiles. + * Autoregressive forecaster: fits a model (e.g. linear regression) on + lagged data to predict quantiles for continuous values. + * Autoregressive classifier: fits a model (e.g. logistic regression) on + lagged data to predict a binned version of the growth rate. + * CDC FluSight flatline forecaster: a variant of the flatline forecaster as + used by the CDC in FluSight. +2. A framework for creating custom forecasters out of modular components, from + which the canned forecasters were created. There are three types of + components: + * Preprocessor: do things to the data before model training, such as convert + counts to rates, create smoothed columns, or [any of the recipes + steps](https://recipes.tidymodels.org/reference/index.html) + * Trainer: train a model on data, resulting in a fitted model object. + Examples include linear regression, quantile regression, or [any parsnip + engine](https://parsnip.tidymodels.org/). + * Postprocessor: unique to this package, and used to do things to the + predictions after the model has been fit, such as + - generate quantiles from purely point-prediction models, + - undo operations done in the steps, such as convert back to counts from + rates + - generally adapt the format of the prediction to it's eventual use. + +The rest of the getting started will focus on using and modifying the canned forecasters. +If you need a more complicated model, check out the [Guts +vignette](preprocessing-and-models) for examples of using the forecaster +framework. + +If you are interested in time series in a non-panel data context, you may also +want to look at `{timetk}` and `{modeltime}` for some related techniques. + +For a more in-depth treatment with some practical applications, see also the +[Forecasting Book](https://cmu-delphi.github.io/delphi-tooling-book/). + +# Panel forecasting basics +## Example data + +The forecasting methods in this package are designed to work with panel time +series data, specifically in the form of an `epi_df` from the `{epiprocess}` +package. +This is a collection of one or more time-series indexed by one or more +categorical variables. +For example, on the landing page: + +```{r data_ex} +covid_case_death_rates ``` -This data is built into the package and contains the measured variables -`case_rate` and `death_rate` for COVID-19 at the daily level for each US state -for the year 2021. The "panel" part is because we have repeated measurements -across a number of locations. - -The `epi_df` encodes the time stamp as `time_value` and the `key` as -`geo_value`. While these 2 names are required, the values don't need to actually -represent such objects. Additional `key`'s are also supported (like age group, -ethnicity, taxonomy, etc.). - -The `epi_df` also contains some metadata that describes the keys as well as the -vintage of the data. It's possible that data collected at different times for -the *same set* of `geo_value`'s and `time_value`'s could actually be different. -For more details, see -[`{epiprocess}`](https://cmu-delphi.github.io/epiprocess/articles/epiprocess.html). - -## Why doesn't this package already exist? - -As described above: +`geo_value` is the only key for this dataset, while there are two separate +time-series, `case_rate` and `death_rate`. +The keys are represented in "long" format (so separate columns for the key and +the value), while separate time series are represented in "wide" format (so each +time-series has a separate column). -* Parts actually DO exist. There's a universe called `{tidymodels}`. It handles -preprocessing, training, and prediction, bound together, through a package called -`{workflows}`. We built `{epipredict}` on top of that setup. In this way, you CAN -use almost everything they provide. +`{epiprocess}` is designed to handle data that always has a geographic key, and +potentially other key values, such as age, ethnicity or other demographic +information. +For example, `grad_employ_subset` from `{epidatasets}` also has both `age_group` +and `edu_qual` as additional keys: -* However, `{workflows}` doesn't do postprocessing. And nothing in the -verse -handles _panel data_. - -* The tidy-team doesn't have plans to do either of these things. (We checked). - -* There are two packages that do _time series_ built on `{tidymodels}`, but it's -"basic" time series: 1-step AR models, exponential smoothing, STL decomposition, -etc.[^2] Our group has not prioritized these sorts of models for epidemic -forecasting, but one could also integrate these methods into our framework. - -[^2]: These are [`{timetk}`](https://business-science.github.io/timetk/index.html) -and [`{modeltime}`](https://business-science.github.io/timetk/index.html). There -are *lots* of useful methods there than can be used to do fairly complex machine -learning methodology, though not directly for panel data and not for direct -prediction of future targets. - -# Show me the basics - -We start with the `jhu` data displayed above. One of the "canned" forecasters we -provide is an autoregressive forecaster with (or without) covariates that -*directly* trains on the response. This is in contrast to a typical "iterative" -AR model that trains to predict one-step-ahead, and then plugs in the -predictions to "leverage up" to longer horizons. - -We'll estimate the model jointly across all locations using only the most -recent 30 days. - -```{r demo-workflow} -jhu <- jhu %>% filter(time_value >= max(time_value) - 30) -out <- arx_forecaster( - jhu, - outcome = "death_rate", - predictors = c("case_rate", "death_rate") -) -``` - -The `out` object has two components: - - 1. The predictions which is just another `epi_df`. It contains the predictions for -each location along with additional columns. By default, these are a 90% -predictive interval, the `forecast_date` (the date on which the forecast was -putatively made) and the `target_date` (the date for which the forecast is being -made). - ```{r} -out$predictions - ``` - 2. A list object of class `epi_workflow`. This object encapsulates all the -instructions necessary to create the prediction. More details on this below. - ```{r} -out$epi_workflow - ``` - -By default, the forecaster predicts the outcome (`death_rate`) 1-week ahead, -using 3 lags of each predictor (`case_rate` and `death_rate`) at 0 (today), 1 -week back and 2 weeks back. The predictors and outcome can be changed directly. -The rest of the defaults are encapsulated into a list of arguments. This list is -produced by `arx_args_list()`. - -## Simple adjustments - -Basic adjustments can be made through the `args_list`. - -```{r kill-warnings, echo=FALSE} -knitr::opts_chunk$set(warning = FALSE, message = FALSE) +```{r extra_keys} +grad_employ_subset ``` -```{r differential-lags} -out2week <- arx_forecaster( - jhu, +See `{epiprocess}` for more details +on the format. + +Panel time series are ubiquitous in epidemiology, but are also common in +economics, psychology, sociology, and many other areas. +While this package was designed with epidemiology in mind, many of the +techniques are more broadly applicable. + +## Customizing `arx_forecaster()` +Moving on from the example on the [landing +page](../index.html#motivating-example), let's adjust some parameters for +`arx_forecaster()`. +`trainer` allows us to set a different fitting engines, either one of the +included ones, such as `quantile_reg()`, or one of the relevant [parsnip +models](https://www.tidymodels.org/find/parsnip/): + +```{r make-forecasts, warning=FALSE} +two_week_ahead <- arx_forecaster( + covid_case_death_rates |> filter(time_value <= forecast_date), outcome = "death_rate", - predictors = c("case_rate", "death_rate"), + trainer = quantile_reg(), + predictors = c("death_rate"), args_list = arx_args_list( - lags = list(c(0, 1, 2, 3, 7, 14), c(0, 7, 14)), + lags = list(c(0, 7, 14)), ahead = 14 ) ) +hardhat::extract_fit_engine(two_week_ahead$epi_workflow) ``` -Here, we've used different lags on the `case_rate` and are now predicting 2 -weeks ahead. This example also illustrates a major difficulty with the -"iterative" versions of AR models. This model doesn't produce forecasts for -`case_rate`, and so, would not have data to "plug in" for the necessary -lags.[^1] - -[^1]: An obvious fix is to instead use a VAR and predict both, but this would -likely increase the variance of the model, and therefore, may lead to less -accurate forecasts for the variable of interest. +The default trainer is `parsnip::linear_reg()`, which generates quantiles after +the fact in the post-processing layers, rather than as part of the model. +`quantile_reg()` on the other hand directly estimates different linear models +for each quantile, reflected in the 3 different columns for `tau` above. -Another property of the basic model is the predictive interval. We describe this -in more detail in a different vignette, but it is easy to request multiple -quantiles. +Because of the flexibility of `{parsnip}`, there are a whole host of models +available to us[^5]; as an example, we could have just as easily substituted a +non-linear random forest model from `{ranger}`: -```{r differential-levels} -out_q <- arx_forecaster(jhu, "death_rate", c("case_rate", "death_rate"), +```{r rand_forest_ex, warning=FALSE} +two_week_ahead <- arx_forecaster( + covid_case_death_rates |> filter(time_value <= forecast_date), + outcome = "death_rate", + trainer = rand_forest(mode = "regression"), + predictors = c("death_rate"), args_list = arx_args_list( - quantile_levels = c(.01, .025, seq(.05, .95, by = .05), .975, .99) + lags = list(c(0, 7, 14)), + ahead = 14 ) ) ``` -The column `.pred_dstn` in the `predictions` object is actually a "distribution" -here parameterized by its quantiles. For this default forecaster, these are -created using the quantiles of the residuals of the predictive model (possibly -symmetrized). Here, we used 23 quantiles, but one can grab a particular -quantile, +Any other customization is routed through `arx_args_list()`; for example, if we +wanted to increase the number of quantiles fit: -```{r q1} -round(head(quantile(out_q$predictions$.pred_distn, p = .4)), 3) +```{r make-quantile-levels-forecasts, warning=FALSE} +two_week_ahead <- arx_forecaster( + covid_case_death_rates |> + filter(time_value <= forecast_date, geo_value %in% used_locations), + outcome = "death_rate", + trainer = quantile_reg(), + predictors = c("death_rate"), + args_list = arx_args_list( + lags = list(c(0, 7, 14)), + ahead = 14, + ############ changing quantile_levels ############ + quantile_levels = c(0.05, 0.1, 0.2, 0.3, 0.5, 0.7, 0.8, 0.9, 0.95) + ################################################## + ) +) +hardhat::extract_fit_engine(two_week_ahead$epi_workflow) ``` -or extract the entire distribution into a "long" `epi_df` with `quantile_levels` -being the probability and `values` being the value associated to that quantile. - -```{r q2} -out_q$predictions %>% - pivot_quantiles_longer(.pred_distn) +See the function documentation of `arx_args_list()` for more examples of the modifications available. +If you are looking for even further modifications, you will want a custom +workflow, for which you should see the [guts +vignette](preprocessing-and-models). + +## Generating multiple aheads +Frequently, one doesn't want just a forecast for a single day, but a trajectory +of forecasts for several weeks. +The way to do this using `arx_forecaster()` is by looping over aheads; for +example, to predict every day over 4 weeks: + +```{r temp-thing} +all_canned_results <- lapply( + seq(0, 28), + \(days_ahead) { + arx_forecaster( + covid_case_death_rates |> + filter(time_value <= forecast_date, geo_value %in% used_locations), + outcome = "death_rate", + predictors = c("case_rate", "death_rate"), + trainer = quantile_reg(), + args_list = arx_args_list( + lags = list(c(0, 1, 2, 3, 7, 14), c(0, 7, 14)), + ahead = days_ahead + ) + ) + } +) +# pull out the workflow and the predictions to be able to +# effectively use autoplot +workflow <- all_canned_results[[1]]$epi_workflow +results <- purrr::map_df(all_canned_results, ~ `$`(., "predictions")) +autoplot( + object = workflow, + predictions = results, + plot_data = covid_case_death_rates |> + filter(geo_value %in% used_locations, time_value > "2021-07-01") +) ``` -Additional simple adjustments to the basic forecaster can be made using the -function: - -```{r, eval = FALSE} -arx_args_list( - lags = c(0L, 7L, 14L), ahead = 7L, n_training = Inf, - forecast_date = NULL, target_date = NULL, quantile_levels = c(0.05, 0.95), - symmetrize = TRUE, nonneg = TRUE, quantile_by_key = character(0L), - nafill_buffer = Inf +## Other canned forecasters +### `flatline_forecaster()` +The simplest model we provide is the `flatline_forecaster()`, which predicts a +flat line (with quantiles generated from the residuals using +`layer_residual_quantiles()`). +For example, on the same dataset as above: +```{r make-flatline-forecast, warning=FALSE} +all_flatlines <- lapply( + seq(0, 28), + \(days_ahead) { + flatline_forecaster( + covid_case_death_rates |> + filter(time_value <= forecast_date, geo_value %in% used_locations), + outcome = "death_rate", + args_list = flatline_args_list(ahead = days_ahead) + ) + } +) +# same plotting code as in the arx multi-ahead case +workflow <- all_flatlines[[1]]$epi_workflow +results <- purrr::map_df(all_flatlines, ~ `$`(., "predictions")) +autoplot( + object = workflow, + predictions = results, + plot_data = covid_case_death_rates |> filter(geo_value %in% used_locations, time_value > "2021-07-01") ) ``` +_**the problem is only occurring in the plot, not the underlying prediction**_ + +Note that the `cdc_baseline_forecaster` is a slight modification of this method +for use in [the CDC COVID19 Forecasting Hub](https://covid19forecasthub.org/). -## Changing the engine +### `arx_classifier()` -So far, our forecasts have been produced using simple linear regression. But -this is not the only way to estimate such a model. The `trainer` argument -determines the type of model we want. This takes a -[`{parsnip}`](https://parsnip.tidymodels.org) model. The default is linear -regression, but we could instead use a random forest with the `{ranger}` -package: +The most complicated of the canned forecasters, `arx_classifier` first +translates the outcome into a growth rate, and then classifies that growth rate +into bins. +For example, on the same dataset and `forecast_date` as above, we get: -```{r ranger, warning = FALSE} -out_rf <- arx_forecaster( - jhu, +```{r discrete-rt} +classifier <- arx_classifier( + covid_case_death_rates |> + filter(geo_value %in% used_locations, time_value < forecast_date), outcome = "death_rate", - predictors = c("case_rate", "death_rate"), - trainer = rand_forest(mode = "regression") + predictors = c("death_rate", "case_rate"), + trainer = multinom_reg(), + args_list = arx_class_args_list( + lags = list(c(0, 1, 2, 3, 7, 14), c(0, 7, 14)), + ahead = 2 * 7, + breaks = c(-0.01, 0.01, 0.1) + ) ) +classifier$predictions ``` -Or boosted regression trees with `{xgboost}`: - -```{r xgboost, warning = FALSE} -out_gb <- arx_forecaster( - jhu, - outcome = "death_rate", - predictors = c("case_rate", "death_rate"), - trainer = boost_tree(mode = "regression", trees = 20) -) +The prediction splits into 4 cases: `(-∞, -0.01)`, `(-0.01, 0.01)`, `(0.01, +0.1)`, and `(0.1, ∞)`. +In this case, the classifier put all 4 of the states in the same category, +`(0.01, 0.1)`. **TODO** _effected by the old data._ +The number and size of the categories is controlled by `breaks`, which gives the +boundary values. + +For comparison, the growth rates for the `target_date`, as computed using +`{epiprocess}`: + +```{r growth_rate_results} +growth_rates <- covid_case_death_rates |> + filter(geo_value %in% used_locations) |> + group_by(geo_value) |> + mutate( + deaths_gr = growth_rate(x = time_value, y = death_rate) + ) |> + ungroup() +growth_rates |> filter(time_value == "2021-08-14") ``` -Or quantile regression, using our custom forecasting engine `quantile_reg()`: +Unfortunately, this forecast was not particularly accurate, since for example +`-1.39` is not remotely in the interval `(-0.01, 0.01]`. -```{r quantreg, warning = FALSE} -out_qr <- arx_forecaster( - jhu, - outcome = "death_rate", - predictors = c("case_rate", "death_rate"), - trainer = quantile_reg() -) -``` -FWIW, this last case (using quantile regression), is not far from what the -Delphi production forecast team used for its Covid forecasts over the past few -years. - -## Inner workings - -Underneath the hood, this forecaster creates (and returns) an `epi_workflow`. -Essentially, this is a big S3 object that wraps up the 4 modular steps -(preprocessing - postprocessing) described above. - -### Preprocessing - -Preprocessing is accomplished through a `recipe` (imagine baking a cake) as -provided in the [`{recipes}`](https://recipes.tidymodels.org) package. -We've made a few modifications (to handle -panel data) as well as added some additional options. The recipe gives a -specification of how to handle training data. Think of it like a fancified -`formula` that you would pass to `lm()`: `y ~ x1 + log(x2)`. In general, -there are 2 extensions to the `formula` that `{recipes}` handles: - - 1. Doing transformations of both training and test data that can always be - applied. These are things like taking the log of a variable, leading or - lagging, filtering out rows, handling dummy variables, etc. - 2. Using statistics from the training data to eventually process test data. - This is a major benefit of `{recipes}`. It prevents what the tidy team calls - "data leakage". A simple example is centering a predictor by its mean. We - need to store the mean of the predictor from the training data and use that - value on the test data rather than accidentally calculating the mean of - the test predictor for centering. - -A recipe is processed in 2 steps, first it is "prepped". This calculates and -stores any intermediate statistics necessary for use on the test data. -Then it is "baked" -resulting in training data ready for passing into a statistical model (like `lm`). - -We have introduced an `epi_recipe`. It's just a `recipe` that knows how to handle -the `time_value`, `geo_value`, and any additional keys so that these are available -when necessary. - -The `epi_recipe` from `out_gb` can be extracted from the result: - -```{r} -extract_recipe(out_gb$epi_workflow) -``` +## Fitting multi-key panel data -The "Inputs" are the original `epi_df` and the "roles" that these are assigned. -None of these are predictors or outcomes. Those will be created -by the recipe when it is prepped. The "Operations" are the sequence of -instructions to create the cake (baked training data). -Here we create lagged predictors, lead the outcome, and then remove `NA`s. -Some models like `lm` internally handle `NA`s, but not everything does, so we -deal with them explicitly. The code to do this (inside the forecaster) is - -```{r} -er <- epi_recipe(jhu) %>% - step_epi_lag(case_rate, death_rate, lag = c(0, 7, 14)) %>% - step_epi_ahead(death_rate, ahead = 7) %>% - step_epi_naomit() -``` +If you have multiple keys that are set in the `epi_df` as `other_keys`, +`arx_forecaster` will automatically group by those as well. +For example, predicting the number of graduates in each of the categories in `grad_employ` from above: -While `{recipes}` provides a function `step_lag()`, it assumes that the data -have no breaks in the sequence of `time_values`. This is a bit dangerous, so -we avoid that behaviour. Our `lag/ahead` functions also appropriately adjust the -amount of data to avoid accidentally dropping recent predictors from the test -data. - -### The model specification - -Users with familiarity with the `{parsnip}` package will have no trouble here. -Basically, `{parsnip}` unifies the function signature across statistical models. -For example, `lm()` "likes" to work with formulas, but `glmnet::glmnet()` uses -`x` and `y` for predictors and response. `{parsnip}` is agnostic. Both of these -do "linear regression". Above we switched from `lm()` to `xgboost()` without -any issue despite the fact that these functions couldn't be more different. - -```{r, eval = FALSE} -lm(formula, data, subset, weights, na.action, - method = "qr", - model = TRUE, x = FALSE, y = FALSE, qr = TRUE, singular.ok = TRUE, - contrasts = NULL, offset, ... +```{r multi_key_forecast, warning=FALSE} +# only fitting a subset, otherwise there are ~550 distinct pairs, which is bad for plotting +edu_quals <- c("Undergraduate degree", "Professional degree") +geo_values <- c("Quebec", "British Columbia") +grad_forecast <- arx_forecaster( + grad_employ_subset |> + filter(time_value < 2017) |> + filter(edu_qual %in% edu_quals, geo_value %in% geo_values), + outcome = "num_graduates", + predictors = c("num_graduates"), + args_list = arx_args_list( + lags = list(c(0, 1, 2)), + ahead = 1 + ) ) - -xgboost( - data = NULL, label = NULL, missing = NA, weight = NULL, - params = list(), nrounds, verbose = 1, print_every_n = 1L, - early_stopping_rounds = NULL, maximize = NULL, save_period = NULL, - save_name = "xgboost.model", xgb_model = NULL, callbacks = list(), - ... +# and plotting +autoplot( + grad_forecast$epi_workflow, + grad_forecast$predictions, + grad_employ_subset |> + filter(edu_qual %in% edu_quals, geo_value %in% geo_values), ) ``` -`{epipredict}` provides a few engines/modules (the flatline forecaster and -quantile regression), but you should be able to use any available models -listed [here](https://www.tidymodels.org/find/parsnip/). +The 8 graphs are all pairs of the `geo_values` (`"Quebec"` and `"British Columbia"`), `edu_quals` (`"Undergraduate degree"` and `"Professional degree"`), and age brackets (`"15 to 34 years"` and `"35 to 64 years"`). -To estimate (fit) a preprocessed model, one calls `fit()` on the `epi_workflow`. +# Anatomy of a canned forecaster +## Code object +Let's dissect the forecaster we trained back on the [landing +page](../index.html#motivating-example): -```{r} -ewf <- epi_workflow(er, linear_reg()) %>% fit(jhu) +```{r make-four-forecasts, warning=FALSE} +four_week_ahead <- arx_forecaster( + covid_case_death_rates |> filter(time_value <= forecast_date), + outcome = "death_rate", + predictors = c("case_rate", "death_rate"), + args_list = arx_args_list( + lags = list(c(0, 1, 2, 3, 7, 14), c(0, 7, 14)), + ahead = 4 * 7, + quantile_levels = c(0.1, 0.25, 0.5, 0.75, 0.9) + ) +) ``` -### Postprocessing - -To stretch the metaphor of preparing a cake to its natural limits, we have -created postprocessing functionality called "frosting". Much like the recipe, -each postprocessing operation is a "layer" and we "slather" these onto our -baked cake. To fix ideas, below is the postprocessing `frosting` for -`arx_forecaster()` +`four_week_ahead` has two components: an `epi_workflow`, and a table of +`predictions`. +The table of predictions is simply a tibble of the predictions, -```{r} -extract_frosting(out_q$epi_workflow) +```{r show_predictions} +four_week_ahead$predictions ``` -Here we have 5 layers of frosting. The first generates the forecasts from the test data. -The second uses quantiles of the residuals to create distributional -forecasts. The next two add columns for the date the forecast was made and the -date for which it is intended to occur. Because we are predicting rates, they -should be non-negative, so the last layer thresholds both predicted values and -intervals at 0. The code to do this (inside the forecaster) is - -```{r} -f <- frosting() %>% - layer_predict() %>% - layer_residual_quantiles( - quantile_levels = c(.01, .025, seq(.05, .95, by = .05), .975, .99), - symmetrize = TRUE - ) %>% - layer_add_forecast_date() %>% - layer_add_target_date() %>% - layer_threshold(starts_with(".pred")) -``` +`.pred` gives the point/median prediction, while `.pred_distn` is a +`dist_quantiles()` object representing a distribution through various quantile +levels. +The `[6]` in the name refers to the number of quantiles that have been +explicitly created[^4]; by default, this covers a 90% prediction interval, or 5% +and 95%. -At predict time, we add this object onto the `epi_workflow` and call `forecast()` +The `epi_workflow` is a significantly more complicated object, extending a +`workflows::workflow()` to include post-processing: -```{r, warning=FALSE} -ewf %>% - add_frosting(f) %>% - forecast() +```{r show_workflow} +four_week_ahead$epi_workflow ``` -The above `get_test_data()` function examines the recipe and ensures that enough -test data is available to create the necessary lags and produce a prediction -for the desired future time point (after the end of the training data). This mimics -what would happen if `jhu` contained the most recent available historical data and -we wanted to actually predict the future. We could have instead used any test data -that contained the necessary predictors. - - -## Conclusion - -Internally, we provide some simple functions to create reasonable forecasts. -But ideally, a user could create their own forecasters by building up the -components we provide. In other vignettes, we try to walk through some of these -customizations. - -To illustrate everything above, here is (roughly) the code for the -`flatline_forecaster()` applied to the `case_rate`. +An `epi_workflow()` consists of 3 parts: + +- `Preprocessor`: a collection of steps that transform the data to be ready for + modelling. They come from this package or [any of the recipes + steps](https://recipes.tidymodels.org/reference/index.html); + `four_week_ahead` has 5 of these, and you can inspect them more closely by + running `hardhat::extract_recipe(four_week_ahead$epi_workflow)`. +- `Model`: the actual model that does the fitting, given by a + `parsnip::model_spec`; `four_week_ahead` has the default of + `parsnip::linear_reg()`, which is a wrapper from `{parsnip}` for + `stats::lm()`. You can inspect the model more closely by running + `hardhat::extract_fit_recipe(four_week_ahead$epi_workflow)`. +- `Postprocessor`: a collection of layers to be applied to the resulting + forecast, internal to this package. `four_week_ahead` just so happens to have + 5 of as these well. You can inspect the layers more closely by running + `epipredict::extract_layers(four_week_ahead$epi_workflow)`. + +See the [Guts vignette](preprocessing-and-models) for recreating and then +extending `four_week_ahead` using the custom forecaster framework. + +## Mathematical description + +Let's describe in more detail the actual fit model for a more minimal version of +`four_week_ahead`: + +```{r, four_week_again} +four_week_small <- arx_forecaster( + covid_case_death_rates |> filter(time_value <= forecast_date), + outcome = "death_rate", + predictors = c("case_rate", "death_rate"), + args_list = arx_args_list( + lags = list(c(0, 7, 14), c(0, 7, 14)), + ahead = 4 * 7, + quantile_levels = c(0.1, 0.25, 0.5, 0.75, 0.9) + ) +) +hardhat::extract_fit_engine(four_week_small$epi_workflow) +``` -```{r} -r <- epi_recipe(jhu) %>% - step_epi_ahead(case_rate, ahead = 7, skip = TRUE) %>% - update_role(case_rate, new_role = "predictor") %>% - add_role(all_of(key_colnames(jhu)), new_role = "predictor") +If $d_t$ is the death rate on day $t$ and $c_t$ is the case rate, then the model +we're fitting is: -f <- frosting() %>% - layer_predict() %>% - layer_residual_quantiles() %>% - layer_add_forecast_date() %>% - layer_add_target_date() %>% - layer_threshold(starts_with(".pred")) +$$ +d_{t+28} = a_0 + a_1 d_t + a_2 d_{t-7} + a_3 d_{t-14} + a_4 c_t + a_5 c_{t-7} + a_6 c_{t-14}. +$$ -eng <- linear_reg() %>% set_engine("flatline") -wf <- epi_workflow(r, eng, f) %>% fit(jhu) -preds <- forecast(wf) -``` +For example, $a_1$ is `lag_0_death_rate` above, with a value of `r hardhat::extract_fit_engine(four_week_small$epi_workflow)$coefficients["lag_0_death_rate"] `, +while $a_5$ is `r hardhat::extract_fit_engine(four_week_small$epi_workflow)$coefficients["lag_7_case_rate"] `. -All that really differs from the `arx_forecaster()` is the `recipe`, the -test data, and the engine. The `frosting` is identical, as is the fitting -and predicting procedure. +The training data for fitting this linear model is created by creating a series +of columns shifted by the appropriate amount; this makes it so that each row +without `NA` values is a training point to fit the coefficients $a_0,\ldots, a_6$. -```{r} -preds -``` +[^4]: in the case of a `{parsnip}` engine which doesn't explicitly predict + quantiles, these quantiles are created using `layer_residual_quantiles()`, + which infers the quantiles from the residuals of the fit. +[^5]: in the case of `arx_forecaster`, this is any model with + `mode="regression"` from [this + list](https://www.tidymodels.org/find/parsnip/). diff --git a/vignettes/guts.Rmd b/vignettes/guts.Rmd new file mode 100644 index 000000000..f17065384 --- /dev/null +++ b/vignettes/guts.Rmd @@ -0,0 +1,270 @@ +--- +title: "Guts" +output: rmarkdown::html_vignette +vignette: > + %\VignetteIndexEntry{Guts} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} +--- + +```{r, include = FALSE} +source("_common.R") +``` + +```{r setup, message=FALSE, include = FALSE} +library(dplyr) +library(parsnip) +library(workflows) +library(recipes) +library(epipredict) +forecast_date <- as.Date("2021-08-01") +used_locations <- c("ca", "ma", "ny", "tx") +library(epidatr) +cases <- pub_covidcast( + source = "jhu-csse", + signals = "confirmed_incidence_prop", + time_type = "day", + geo_type = "state", + time_values = epirange(20200601, 20211231), + geo_values = "*" +) |> + select(geo_value, time_value, case_rate = value) + +deaths <- pub_covidcast( + source = "jhu-csse", + signals = "deaths_incidence_prop", + time_type = "day", + geo_type = "state", + time_values = epirange(20200601, 20211231), + geo_values = "*" +) |> + select(geo_value, time_value, death_rate = value) +cases_deaths <- + full_join(cases, deaths, by = c("time_value", "geo_value")) |> + filter(geo_value %in% used_locations) |> + as_epi_df(as_of = as.Date("2022-01-01")) +cases_deaths <- + cases_deaths |> + group_by(geo_value) |> + epi_slide( + cases_7dav = mean(case_rate, na.rm = TRUE), + death_rate_7dav = mean(death_rate, na.rm = TRUE), + .window_size = 7 + ) |> + ungroup() |> + mutate(case_rate = NULL, death_rate = NULL) |> + rename(case_rate = cases_7dav, death_rate = death_rate_7dav) +cases_deaths <- + cases_deaths |> + group_by(geo_value) |> + mutate( + outlr_death_rate = detect_outlr_rm( + time_value, death_rate, + detect_negatives = TRUE + ), + outlr_case_rate = detect_outlr_rm( + time_value, case_rate, + detect_negatives = TRUE + ) + ) |> + unnest(cols = starts_with("outlr"), names_sep = "_") |> + ungroup() |> + mutate( + death_rate = outlr_death_rate_replacement, + case_rate = outlr_case_rate_replacement + ) |> + select(geo_value, time_value, case_rate, death_rate) +jhu <- covid_case_death_rates +out_gb <- arx_forecaster( + jhu, + outcome = "death_rate", + predictors = c("case_rate", "death_rate"), + trainer = boost_tree(mode = "regression", trees = 20) +) +out_q <- arx_forecaster(jhu, "death_rate", c("case_rate", "death_rate"), + args_list = arx_args_list( + quantile_levels = c(.01, .025, seq(.05, .95, by = .05), .975, .99) + ) +) +``` +# Document Title +## Inner workings + +Underneath the hood, this forecaster creates (and returns) an `epi_workflow`. +Essentially, this is a big S3 object that wraps up the 4 modular steps +(preprocessing - postprocessing) described above. + +### Preprocessing + +Preprocessing is accomplished through a `recipe` (imagine baking a cake) as +provided in the [`{recipes}`](https://recipes.tidymodels.org) package. +We've made a few modifications (to handle +panel data) as well as added some additional options. The recipe gives a +specification of how to handle training data. Think of it like a fancified +`formula` that you would pass to `lm()`: `y ~ x1 + log(x2)`. In general, +there are 2 extensions to the `formula` that `{recipes}` handles: + + 1. Doing transformations of both training and test data that can always be + applied. These are things like taking the log of a variable, leading or + lagging, filtering out rows, handling dummy variables, etc. + 2. Using statistics from the training data to eventually process test data. + This is a major benefit of `{recipes}`. It prevents what the tidy team calls + "data leakage". A simple example is centering a predictor by its mean. We + need to store the mean of the predictor from the training data and use that + value on the test data rather than accidentally calculating the mean of + the test predictor for centering. + +A recipe is processed in 2 steps, first it is "prepped". This calculates and +stores any intermediate statistics necessary for use on the test data. +Then it is "baked" +resulting in training data ready for passing into a statistical model (like `lm`). + +We have introduced an `epi_recipe`. It's just a `recipe` that knows how to handle +the `time_value`, `geo_value`, and any additional keys so that these are available +when necessary. + +The `epi_recipe` from `out_gb` can be extracted from the result: + +```{r} +extract_recipe(out_gb$epi_workflow) +``` + +The "Inputs" are the original `epi_df` and the "roles" that these are assigned. +None of these are predictors or outcomes. Those will be created +by the recipe when it is prepped. The "Operations" are the sequence of +instructions to create the cake (baked training data). +Here we create lagged predictors, lead the outcome, and then remove `NA`s. +Some models like `lm` internally handle `NA`s, but not everything does, so we +deal with them explicitly. The code to do this (inside the forecaster) is + +```{r} +er <- epi_recipe(jhu) %>% + step_epi_lag(case_rate, death_rate, lag = c(0, 7, 14)) %>% + step_epi_ahead(death_rate, ahead = 7) %>% + step_epi_naomit() +``` + +While `{recipes}` provides a function `step_lag()`, it assumes that the data +have no breaks in the sequence of `time_values`. This is a bit dangerous, so +we avoid that behaviour. Our `lag/ahead` functions also appropriately adjust the +amount of data to avoid accidentally dropping recent predictors from the test +data. + +### The model specification + +Users with familiarity with the `{parsnip}` package will have no trouble here. +Basically, `{parsnip}` unifies the function signature across statistical models. +For example, `lm()` "likes" to work with formulas, but `glmnet::glmnet()` uses +`x` and `y` for predictors and response. `{parsnip}` is agnostic. Both of these +do "linear regression". Above we switched from `lm()` to `xgboost()` without +any issue despite the fact that these functions couldn't be more different. + +```{r, eval = FALSE} +lm(formula, data, subset, weights, na.action, + method = "qr", + model = TRUE, x = FALSE, y = FALSE, qr = TRUE, singular.ok = TRUE, + contrasts = NULL, offset, ... +) + +xgboost( + data = NULL, label = NULL, missing = NA, weight = NULL, + params = list(), nrounds, verbose = 1, print_every_n = 1L, + early_stopping_rounds = NULL, maximize = NULL, save_period = NULL, + save_name = "xgboost.model", xgb_model = NULL, callbacks = list(), + ... +) +``` + +`{epipredict}` provides a few engines/modules (the flatline forecaster and +quantile regression), but you should be able to use any available models +listed [here](https://www.tidymodels.org/find/parsnip/). + +To estimate (fit) a preprocessed model, one calls `fit()` on the `epi_workflow`. + +```{r} +ewf <- epi_workflow(er, linear_reg()) %>% fit(jhu) +``` + +### Postprocessing + +To stretch the metaphor of preparing a cake to its natural limits, we have +created postprocessing functionality called "frosting". Much like the recipe, +each postprocessing operation is a "layer" and we "slather" these onto our +baked cake. To fix ideas, below is the postprocessing `frosting` for +`arx_forecaster()` + +```{r} +extract_frosting(out_q$epi_workflow) +``` + +Here we have 5 layers of frosting. The first generates the forecasts from the test data. +The second uses quantiles of the residuals to create distributional +forecasts. The next two add columns for the date the forecast was made and the +date for which it is intended to occur. Because we are predicting rates, they +should be non-negative, so the last layer thresholds both predicted values and +intervals at 0. The code to do this (inside the forecaster) is + +```{r} +f <- frosting() %>% + layer_predict() %>% + layer_residual_quantiles( + quantile_levels = c(.01, .025, seq(.05, .95, by = .05), .975, .99), + symmetrize = TRUE + ) %>% + layer_add_forecast_date() %>% + layer_add_target_date() %>% + layer_threshold(starts_with(".pred")) +``` + +At predict time, we add this object onto the `epi_workflow` and call `forecast()` + +```{r, warning=FALSE} +ewf %>% + add_frosting(f) %>% + forecast() +``` + +The above `get_test_data()` function examines the recipe and ensures that enough +test data is available to create the necessary lags and produce a prediction +for the desired future time point (after the end of the training data). This mimics +what would happen if `jhu` contained the most recent available historical data and +we wanted to actually predict the future. We could have instead used any test data +that contained the necessary predictors. + + +## Conclusion + +Internally, we provide some simple functions to create reasonable forecasts. +But ideally, a user could create their own forecasters by building up the +components we provide. In other vignettes, we try to walk through some of these +customizations. + +To illustrate everything above, here is (roughly) the code for the +`flatline_forecaster()` applied to the `case_rate`. + +```{r} +r <- epi_recipe(jhu) %>% + step_epi_ahead(case_rate, ahead = 7, skip = TRUE) %>% + update_role(case_rate, new_role = "predictor") %>% + add_role(all_of(key_colnames(jhu)), new_role = "predictor") + +f <- frosting() %>% + layer_predict() %>% + layer_residual_quantiles() %>% + layer_add_forecast_date() %>% + layer_add_target_date() %>% + layer_threshold(starts_with(".pred")) + +eng <- linear_reg() %>% set_engine("flatline") +wf <- epi_workflow(r, eng, f) %>% fit(jhu) +preds <- forecast(wf) +``` + +All that really differs from the `arx_forecaster()` is the `recipe`, the +test data, and the engine. The `frosting` is identical, as is the fitting +and predicting procedure. + +```{r} +preds +``` + diff --git a/vignettes/panel-data.Rmd b/vignettes/panel-data.Rmd index 1faf5b56f..32da33e8e 100644 --- a/vignettes/panel-data.Rmd +++ b/vignettes/panel-data.Rmd @@ -234,9 +234,9 @@ summary(extract_fit_engine(wf_linreg)) This output tells us the coefficients of the fitted model; for instance, the estimated intercept is $\widehat{\alpha}_0 =$ -`r round(coef(extract_fit_engine(wf_linreg))[1], 3)` and the coefficient for +`r round(coef(hardhat::extract_fit_engine(wf_linreg))[1], 3)` and the coefficient for $y_{tijk}$ is -$\widehat\alpha_1 =$ `r round(coef(extract_fit_engine(wf_linreg))[2], 3)`. +$\widehat\alpha_1 =$ `r round(coef(hardhat::extract_fit_engine(wf_linreg))[2], 3)`. The summary also tells us that all estimated coefficients are significantly different from zero. Extracting the 95% confidence intervals for the coefficients also leads us to