diff --git a/.Rbuildignore b/.Rbuildignore index ef09bbb..f544baf 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -13,3 +13,4 @@ ^pkgdown$ ^touchstone$ ^cran-comments\.md$ +^paper$ diff --git a/.gitignore b/.gitignore index 72cd8b5..fad5789 100644 --- a/.gitignore +++ b/.gitignore @@ -10,3 +10,8 @@ README_cache .vscode/launch.json .pre-commit-config.yaml tests/README.md +paper/*.html +paper/*.pdf +paper/paper_cache/ +paper/paper_files/ +chitra/ \ No newline at end of file diff --git a/DESCRIPTION b/DESCRIPTION index 300f1e7..8d5f85c 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -35,14 +35,13 @@ Description: Builds contact matrices using GAMs and population data. This packag Electoral Commission and Australian Bureau of Statistics) 2020. License: MIT + file LICENSE Depends: - R (>= 2.10) + R (>= 4.1.0) Suggests: covr, knitr, vdiffr, testthat (>= 3.0.0), rmarkdown, - stringr, future, spelling, deSolve @@ -54,7 +53,7 @@ Language: en-US LazyData: true LazyDataCompression: xz Roxygen: list(markdown = TRUE) -RoxygenNote: 7.3.1 +RoxygenNote: 7.3.2 Imports: mgcv, dplyr (>= 1.0.9), @@ -75,6 +74,7 @@ Imports: vctrs, scales, english, - waldo + waldo, + stringr URL: https://github.com/idem-lab/conmat BugReports: https://github.com/idem-lab/conmat/issues diff --git a/NAMESPACE b/NAMESPACE index 5c73f74..ca40c8a 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -53,6 +53,7 @@ export(abs_age_state) export(abs_age_work_lga) export(abs_age_work_state) export(abs_unabbreviate_states) +export(add_age_partial_sum) export(add_intergenerational) export(add_modelling_features) export(add_offset) @@ -68,8 +69,11 @@ export(apply_vaccination) export(as_conmat_population) export(as_setting_prediction_matrix) export(autoplot) +export(clean_term_names) export(conmat_population) +export(create_age_grid) export(estimate_setting_contacts) +export(extract_term_names) export(extrapolate_polymod) export(fit_setting_contacts) export(fit_single_contact_model) @@ -86,16 +90,21 @@ export(get_polymod_per_capita_household_size) export(get_polymod_population) export(get_polymod_setting_data) export(get_setting_transmission_matrices) +export(gg_age_partial_pred_long) +export(gg_age_partial_sum) +export(gg_age_terms_settings) export(matrix_to_predictions) export(new_age_matrix) export(new_ngm_setting_matrix) export(new_setting_data) export(per_capita_household_size) +export(pivot_longer_age_preds) export(polymod) export(population) export(population_label) export(predict_contacts) export(predict_contacts_1y) +export(predict_individual_terms) export(predict_setting_contacts) export(predictions_to_matrix) export(prepare_population_for_modelling) @@ -104,6 +113,14 @@ export(scaling) export(setting_prediction_matrix) export(transmission_probability_matrix) import(rlang) +importFrom(ggplot2,aes) importFrom(ggplot2,autoplot) +importFrom(ggplot2,coord_fixed) +importFrom(ggplot2,facet_grid) +importFrom(ggplot2,facet_wrap) +importFrom(ggplot2,geom_tile) +importFrom(ggplot2,ggplot) +importFrom(ggplot2,scale_fill_viridis_c) +importFrom(ggplot2,theme_minimal) importFrom(magrittr,"%>%") importFrom(stats,predict) diff --git a/R/checkers.R b/R/checkers.R index 8935c93..4f92cb2 100644 --- a/R/checkers.R +++ b/R/checkers.R @@ -142,13 +142,16 @@ check_if_var_numeric <- function(data, var, attribute) { } } -check_if_data_frame <- function(x) { +check_if_data_frame <- function(x, + arg = rlang::caller_arg(x), + call = rlang::caller_env()) { if (!is.data.frame(x)) { cli::cli_abort( - c( - "x must be a {.cls data.frame}", - "i" = "x is {.cls {class(x)}}" - ) + message = c( + "{.arg {arg}} must be a {.cls data.frame}", + "i" = "{.arg {arg}} is {.cls {class(x)}}" + ), + call = call ) } } diff --git a/R/partial-prediction-helpers.R b/R/partial-prediction-helpers.R new file mode 100644 index 0000000..da7cbcb --- /dev/null +++ b/R/partial-prediction-helpers.R @@ -0,0 +1,277 @@ +#' .. content for \description{} (no empty lines) .. +#' +#' .. content for \details{} .. +#' +#' @title +#' @param ages +#' @return +#' @author njtierney +#' @export +create_age_grid <- function(ages) { + ## TODO + ## Wrap this up into a function that generates an age grid data frame + ## with all the terms needed to fit a conmat model + ## (from `fit_single_contact_model.R`) + age_grid <- expand.grid( + age_from = ages, + age_to = ages + ) |> + tibble::as_tibble() |> + # prepare the age data so it has all the right column names + # that are used inside of `fit_single_contact_model()` + # conmat::add_symmetrical_features() |> + add_symmetrical_features() |> + # this ^^^ does the same as the commented part below: + # dplyr::mutate( + # gam_age_offdiag = abs(age_from - age_to), + # gam_age_offdiag_2 = abs(age_from - age_to)^2, + # gam_age_diag_prod = abs(age_from * age_to), + # gam_age_diag_sum = abs(age_from + age_to), + # gam_age_pmax = pmax(age_from, age_to), + # gam_age_pmin = pmin(age_from, age_to) + # ) |> + # This is to add the school_probability and work_probability columns + # that are used inside fit_single_contact_model() when fitting the model. + # conmat::add_modelling_features() + add_modelling_features() + + +} + +#' .. content for \description{} (no empty lines) .. +#' +#' .. content for \details{} .. +#' +#' @title +#' @param fit_home +#' @return +#' @author njtierney +#' @export +extract_term_names <- function(fit_home) { + + coef_names <- names(fit_home$coefficients) |> + stringr::str_remove_all("\\.[^.]*$") |> + unique() |> + stringr::str_subset("^s\\(") + + coef_names + +} + +#' .. content for \description{} (no empty lines) .. +#' +#' .. content for \details{} .. +#' +#' @title +#' @param term_names +#' @return +#' @author njtierney +#' @export +clean_term_names <- function(term_names) { + + term_names |> + stringr::str_remove_all("^s\\(gam_age_") |> + stringr::str_remove_all("\\)") + +} + +#' .. content for \description{} (no empty lines) .. +#' +#' .. content for \details{} .. +#' +#' @title +#' @param age_grid +#' @param term_names +#' @param term_var_names +#' @return +#' @author njtierney +#' @export +predict_individual_terms <- function(age_grid, fit, term_names, term_var_names) { + + predicted_term <- function(age_grid, fit, term_name, term_var_name){ + predict(object = fit, + newdata = age_grid, + type = "terms", + terms = term_name) |> + tibble::as_tibble() |> + setNames(glue::glue("pred_{term_var_name}")) + } + + all_predicted_terms <- purrr::map2_dfc( + .x = term_names, + .y = term_var_names, + .f = function(.x, .y){ + predicted_term(age_grid = age_grid, + fit = fit, + term_name = .x, + term_var_name = .y) + } + ) + + dplyr::bind_cols(age_grid, all_predicted_terms) + +} + +#' .. content for \description{} (no empty lines) .. +#' +#' .. content for \details{} .. +#' +#' @title +#' @param age_predictions_all_settings +#' @return +#' @author njtierney +#' @importFrom ggplot2 ggplot aes geom_tile facet_grid coord_fixed scale_fill_viridis_c theme_minimal facet_wrap +#' @export +gg_age_terms_settings <- function(age_predictions_all_settings) { + + pred_all_setting_longer <- age_predictions_all_settings |> + tidyr::pivot_longer( + dplyr::starts_with("pred"), + names_to = "pred", + values_to = "value", + names_prefix = "pred_" + ) |> + dplyr::select(age_from, + age_to, + value, + pred, + setting) + + facet_age_plot <- function(data, place){ + data |> + dplyr::filter(setting == place) |> + ggplot(aes(x = age_from, + y = age_to, + fill = value)) + + geom_tile() + + facet_grid(setting~pred, + switch = "y") + + coord_fixed() + + } + + p_home <- facet_age_plot(pred_all_setting_longer, "home") + + scale_fill_viridis_c() + p_work <- facet_age_plot(pred_all_setting_longer, "work") + + scale_fill_viridis_c(option = "rocket") + p_school <- facet_age_plot(pred_all_setting_longer, "school") + + scale_fill_viridis_c(option = "plasma") + p_other <- facet_age_plot(pred_all_setting_longer, "other") + + scale_fill_viridis_c(option = "mako") + + patchwork::wrap_plots( + p_home, + p_work, + p_school, + p_other, + nrow = 4 + ) + +} + +#' .. content for \description{} (no empty lines) .. +#' +#' .. content for \details{} .. +#' +#' @title +#' @param age_predictions +#' @return +#' @author njtierney +#' @export +pivot_longer_age_preds <- function(age_predictions) { + age_predictions |> + tidyr::pivot_longer( + dplyr::starts_with("pred"), + names_to = "pred", + values_to = "value", + names_prefix = "pred_" + ) +} + +#' .. content for \description{} (no empty lines) .. +#' +#' .. content for \details{} .. +#' +#' @title +#' @param age_predictions_long +#' @return +#' @author njtierney +#' @export +gg_age_partial_pred_long <- function(age_predictions_long) { + + facet_names <- data.frame( + pred = c("diag_prod", "diag_sum", "offdiag", "offdiag_2", "pmax", "pmin"), + math_name = c("i x j", "i + j", "|i - j|", "|i - j|²", "max(i, j)", "min(i, j)") + ) + + age_predictions_long %>% + dplyr::left_join(facet_names, by = dplyr::join_by("pred")) %>% + ggplot( + aes( + x = age_from, + y = age_to, + group = math_name, + fill = value + ) + ) + + facet_wrap(~math_name, ncol = 3) + + geom_tile() + + scale_fill_viridis_c( + name = "log\ncontacts" + ) + + theme_minimal() + +} + +#' .. content for \description{} (no empty lines) .. +#' +#' .. content for \details{} .. +#' +#' @title +#' @param age_predictions_long +#' @return +#' @author njtierney +#' @export +add_age_partial_sum <- function(age_predictions_long) { + + age_partial_sum <- age_predictions_long |> + dplyr::group_by(age_from, + age_to) |> + dplyr::summarise( + gam_total_term = exp(sum(value)), + .groups = "drop" + ) + + age_partial_sum + +} + +#' .. content for \description{} (no empty lines) .. +#' +#' .. content for \details{} .. +#' +#' @title +#' @param age_predictions_long_sum +#' @return +#' @author njtierney +#' @export +gg_age_partial_sum <- function(age_predictions_long_sum) { + + ggplot( + data = age_predictions_long_sum, + aes( + x = age_from, + y = age_to, + fill = gam_total_term + ) + ) + + geom_tile() + + scale_fill_viridis_c( + name = "Num.\ncontacts", + option = "magma", + limits = c(0, 12) + ) + + theme_minimal() + +} + diff --git a/paper/paper.Rmd b/paper/paper.Rmd new file mode 100644 index 0000000..0391360 --- /dev/null +++ b/paper/paper.Rmd @@ -0,0 +1,251 @@ +--- +title: 'conmat: generate synthetic contact matrices for a given age-stratified population' +authors: +- affiliation: 1 + name: Nicholas Tierney + orcid: 0000-0003-1460-8722 +- affiliation: 1,2 + name: Nick Golding + orcid: 0000-0001-8916-5570 +- affiliation: 1,3 + name: Aarathy Babu + orcid: +- affiliation: 4 + name: Michael Lydeamore + orcid: 0000-0001-6515-827X +- affiliation: 1,3 + name: Chitra Saraswati + orcid: 0000-0002-8159-0414 +date: "03 May 2024" +output: + pdf_document: default + html_document: + keep_md: yes +bibliography: references.bib +tags: +- epidemiology +- R +- infectious disease +affiliations: +- index: 1 + name: Telethon Kids Institute +- index: 2 + name: Curtin University +- index: 3 + name: +- index: 4 + name: Monash University +--- + +```{r setup} +#| label: setup +#| echo: false +#| message: false +#| warning: false +#| comment: false +knitr::opts_chunk$set(comment = "#>", + echo = TRUE, + out.width = "95%", + retina = 3, + fig.height = 4, + fig.align = "center", + dpi = 300, + dev = "png") +options(tinytex.clean = FALSE) +``` + +```{r} +#| label: libraries +#| echo: FALSE + +library(purrr) +library(patchwork) +library(conmat) +library(ggplot2) +``` + +# Summary + +Contact matrices describe the number of contacts between individuals, typically age groups. They are used to create models of infectious disease spread. `conmat` is an R package which generates synthetic contact matrices for arbitrary input demography, ready for use in infectious diseases modelling. + +There are currently few options for a user to access synthetic contact matrices [@socialmixr; @prem2017]. Existing code to generate synthetic contact matrices from @prem2017 are not designed for replicability, and are restricted to only selected countries with no sub-national demographic estimates available. + +The software exposes model fitting and prediction separately to the user. Users can fit a model based on a contact survey such as POLYMOD [@mossong2008], then predict from this model to their own demographic data. This means users can generate synthetic contact matrices for any region, with any contact survey. + +We demonstrate a use-case for `conmat` by creating contact matrices for sub-national level (in this case, a state) in Australia. + +For users who do not wish to run the entire `conmat` pipeline, we have pre-generated synthetic contact matrices for 200 countries, based on a list of countries from the United Nations, using a model fit to the POLYMOD contact survey. These resulting synthetic contact matrices, and the associated code, can be found in the syncomat analysis pipeline ([GitHub](https://github.com/idem-lab/syncomat), [Zenodo](https://zenodo.org/records/11365943)). + +# Statement of need + +Infectious diseases like influenza and COVID19 spread via social contact. If we can understand patterns of contact - which people are more likely be in contact with each other - then we will be able to create models of how disease spreads. Epidemiologists and public policy makers can use these models to make decisions to keep a population safe and healthy. + +Empirical estimates of social contact are provided by social contact surveys. These provide samples of the frequency and type of social contact across different settings (home, work, school, other). + +An prominent contact survey is the "POLYMOD" study by @mossong2008, which surveyed 8 European countries: Belgium, Germany, Finland, Great Britain, Italy, Luxembourg, The Netherlands, and Poland [@mossong2008]. + +These social contact surveys can be projected on to a given demographic structure to produce estimated daily contact rates between age groups. These are known as contact matrices or synthetic contact matrices. A widely used approach by @prem2021 [@prem2021] produced contact matrices for 177 countries at "urban" and "rural" levels for each country. + +However, there were major limitations with the methods in @prem2021. First, not all countries were included in their analyses. Second, the contact matrices only covered broad scale areas. This presents challenges for decision makers who are often working at a sub-national geographical scale. Third, the code provided by Prem et al., was not designed for replicability and easy modification with user-defined inputs. + +The `conmat` package was developed to fill the specific need of creating contact matrices for arbitrary demographic structures. We developed the method primarily to output synthetic contact matrices. We also provided methods to create next generation matrices. + +# Example + +We will generate a contact matrix for Tasmania, a state in Australia, using a model fitted from the POLYMOD contact survey. We can get the age-stratified population data for Tasmania from the helper function `abs_age_state()`: + +```{r} +#| label: abs-age +tasmania <- abs_age_state("TAS") +tasmania +``` + +We can then generate a contact matrix for Tasmania using `extrapolate_polymod()`. + +```{r} +#| label: extrapolate-polymod +tasmania_contact <- extrapolate_polymod(population = tasmania) +tasmania_contact +``` + +We can plot the resulting contact matrix for Tasmania with `autoplot`: + +```{r} +#| label: autoplot-contacts +#| fig.width: 8 +#| fig.height: 8 +autoplot(tasmania_contact) +``` + +# Implementation + +The overall approach of `conmat` has two parts: + +1) Fit a model to predict individual contact rate, using an existing contact survey +2) Predict a synthetic contact matrix using age population data + +## Model fitting + +`conmat` was built to predict at four settings: work, school, home, and other. +One model is fitted for each setting. +Each model fitted is a Poisson generalised additive model (GAM) which predicts the count of contacts, with an offset for the log of participants. +The model has six covariates to explain six key features of the relationship between ages, +and two optional covariates for attendance at school or work. +The two optional covariates are included depending on which setting the model is fitted for. + +Each cell in the resulting contact matrix, indexed ($i$, $j$), is the predicted number of people in age group $j$ that a single individual in age group $i$ will have contact with per day. The sum over all of the $j$ age groups for a particular age group $i$ is the predicted total number of contacts per day for each individual of age group $i$. + +The six covariates are +$|i-j|$, +${|i-j|}^{2}$, +$i + j$, +$i \times j$, +$\text{max}(i, j)$ and +$\text{min}(i, j)$. + +These covariates capture typical features of inter-person contact, where individuals primarily interact with people of similar age (the diagonals of the matrix), and with grandparents and/or children (the so-called 'wings' of the matrix).The key features of the relationship between the age groups, represented by the six covariates, are displayed in \@ref(fig:partial-plots) for the home setting. The $|i-j|$ term gives the strong diagonal, modelling people generally living with similar age people, and the $\max(i,j)$ and $\min(i,j)$ terms give the intergenerational effect of parents and grandparents with children. + +```{r} +#| label: partial-plots +#| echo: FALSE +#| fig.cap: "Partial predictive plot (A) and overall synthetic contact matrix (B) for the Poisson GAM fitted to the POLYMOD contact survey in the home setting. The strong diagonal elements, and parents/grandparents interacting with children result in the classic 'diagonal with wings' shape." + +fit_home <- polymod_setting_models$home +age_grid <- create_age_grid(ages = 1:99) +term_names <- extract_term_names(fit_home) +term_var_names <- clean_term_names(term_names) +age_predictions <- predict_individual_terms( + age_grid = age_grid, + fit = fit_home, + term_names = term_names, + term_var_names = term_var_names +) + +age_predictions_all_settings <- map_dfr( + .x = polymod_setting_models, + .f = function(x) { + predict_individual_terms( + age_grid = age_grid, + fit = x, + term_names = term_names, + term_var_names = term_var_names + ) + }, + .id = "setting" +) + +plot_age_term_settings <- gg_age_terms_settings(age_predictions_all_settings) +age_predictions_long <- pivot_longer_age_preds(age_predictions) +plot_age_predictions_long <- gg_age_partial_pred_long(age_predictions_long) + + coord_equal() + + labs(x = "Age from", + y = "Age to") + + theme( + legend.position = "bottom" + ) + + scale_x_continuous(expand = c(0,0)) + + scale_y_continuous(expand = c(0,0)) + + expand_limits(x = c(0, 100), y = c(0, 100)) + +age_predictions_long_sum <- add_age_partial_sum(age_predictions_long) +plot_age_predictions_sum <- gg_age_partial_sum(age_predictions_long_sum) + coord_equal() + + labs(x = "Age from", + y = "Age to") + + theme( + legend.position = "bottom" + ) + + scale_x_continuous(expand = c(0,0)) + + scale_y_continuous(expand = c(0,0)) + + expand_limits(x = c(0, 100), y = c(0, 100)) +``` + + +```{r} +#| echo: FALSE + +plot_all_terms_sum <- plot_age_predictions_long + + plot_age_predictions_sum + + plot_layout(design = " + AAAABBBB + AAAABBBB + AAAABBBB + AAAABBBB + ") + + plot_annotation( + tag_levels = "A" + ) + + +plot_all_terms_sum +``` + +Visualising the partial predictive plots for other settings (school, work and other), shows patterns that correspond with real-life situations are observed. A full visualisation pipeline is available at https://idem-lab.github.io/conmat/dev/articles/visualising-conmat.html + +# Conclusions and future directions + +The `conmat` software provides a flexible interface to generating synthetic contact matrices using population data and contact surveys. These contact matrices can then be used in infectious disease modelling and surveillance. + +The main strength of `conmat` is its interface requiring only age population data to create a synthetic contact matrix. Current approaches provide only a selection of country level contact matrices. This software can predict to arbitrary demography, such as sub-national, or simulated populations. + +We provide a trained model of contact rate that is fit to the POLYMOD survey for ease of use. The software also has an interface to use other contact surveys, such as @comix. This is important as POLYMOD represents contact patterns in 8 countries in Europe, and contact patterns are known to differ across nations and cultures. + +The covariates used by conmat were designed to represent the key features that are typically present in a contact matrix for different settings (work, school, home, other). Including other sources of information that may better describe these contact patterns, such as inter-generational mixing, or differences in school ages of a local demographic, may improve model performance. + +The interface to the model formula in conmat is fixed; users cannot change the covariates of the model. This means that if there is an unusual structure in their contact data it might not be accurately captured by conmat. It was a design decision that was made to focus on the key feature of conmat: using just age population data to predict a contact matrix. + +Public health decisions are often based on age specific information, which means the more accurate your age specific models are, the better those decisions are likely to be. This is the first piece of software that will provide appropriate contact matrices for a population, which means more accurate models of disease. + +This work was used as a key input into several models for COVID-19 transmission and control in Australia and contributed to decisions around vaccination policy [@DohertyModelling]. + +Some future directions for this software include: + +* Demonstrate fitting a model to other contact surveys from sources such as the `socialmixr` R package [@socialmixr]. +* Predict to any age brackets - such as monthly ages, for example, 1, 3, 6, month year old infants +* Fitting to multiple contact surveys simultaneously, e.g., POLYMOD and CoMix +* Add uncertainty to estimates +* Add methods for including household size distributions +* Include known household age distributions as offsets in the 'home' setting model, in place of the whole population distribution. + +Software is never finished, and the software in its current format has proven useful for infectious disease modelling. In time we hope it can become more widely used and be useful for more applications in epidemiology and public health. + +# References diff --git a/paper/paper.md b/paper/paper.md new file mode 100644 index 0000000..f0a2b85 --- /dev/null +++ b/paper/paper.md @@ -0,0 +1,276 @@ +--- +title: 'conmat: generate synthetic contact matrices for a given age-stratified population' +authors: +- affiliation: 1 + name: Nicholas Tierney + orcid: 0000-0003-1460-8722 +- affiliation: 1,2 + name: Nick Golding + orcid: 0000-0001-8916-5570 +- affiliation: 1,3 + name: Aarathy Babu + orcid: +- affiliation: 4 + name: Michael Lydeamore + orcid: 0000-0001-6515-827X +- affiliation: 1,3 + name: Chitra Saraswati + orcid: 0000-0002-8159-0414 +date: "03 May 2024" +output: + html_document: + keep_md: yes + pdf_document: default +bibliography: references.bib +tags: +- epidemiology +- R +- infectious disease +affiliations: +- index: 1 + name: Telethon Kids Institute +- index: 2 + name: Curtin University +- index: 3 + name: +- index: 4 + name: Monash University +--- + + + +# Summary + +[ A summary describing the high-level functionality and purpose of the software for a diverse, non-specialist audience. ] + +This article introduces `conmat`, an R package which generates synthetic contact matrices. + +There are currently few options for a user to generate their own synthetic contact matrices. +Existing methods to generate synthetic contact matrices are not designed for replicability, do not have enough granularity, and does not cover enough administrative areas (in other words, some countries are not included). + +[ What's different and useful about conmat? ] +Users might have their own contact survey data that they would like to generate synthetic contact matrices from. +Perhaps the population demography is different, or the contact rates varying. +A higher level of granularity however is sometimes required to make public health decisions for a given population. + +`conmat` also provides flexibility, in that it allows users to specify the area in which they would like the contact matrices to be generated; it allows users to specify their own age groups and population structures; it allows users to upload their own contact surveys to fit the model on; and it allows users to generate the contact matrices at different settings. + +[ What else is covered in this paper? ] An example use-case for `conmat` is provided for a local government area (i.e. at the sub-national level) in Australia. +Also provided is an analysis pipeline to support conmat, [`syncomat`](https://github.com/idem-lab/syncomat), which generates synthetic contact matrices for 200 countries. + +# Statement of need + +[ A Statement of need section that clearly illustrates the research purpose of the software and places it in the context of related work. ] + +#TODO - A better first sentence that encapsulates conmat use? +Understanding the dynamics of infectious disease transmission is an important task (?) for epidemiologists and public policy makers. +Identifying vulnerable groups and predicting disease transmission (?)dynamics / how diseases spread are essential for informed public health decision-making. +Infectious diseases such as influenza and coronavirus spread through human-to-human interactions, or in other words, "social contact". Quantifying social contact and its patterns can provide critical insights into how these diseases spread. [ Is this circular? ] / and how best to mitigate the spread of these diseases. + +We can measure social contact through social contact surveys, where people describe the number and type of social contact they have. These surveys provide (?) a measure of contact rates: an empirical estimate of the number of social contacts from one age group to another and the setting of contact. For example, we might learn from a contact survey that homes have higher contact between 25-50 year olds and 0-15 year olds, whereas workplaces might have high contact within 25-60 year olds. + +These social contact surveys exist for a few countries. As an example, the "POLYMOD" study by @mossong2008 covered 8 European countries: Belgium, Germany, Finland, Great Britain, Italy, Luxembourg, The Netherlands, and Poland [@mossong2008]. However, what do we do when we want to estimate contact rates in other countries where this is not yet measured? We can use existing data--the contact rates obtained from contact surveys--to help us project / predict these estimates to countries or places that do not have them. These are called "synthetic contact matrices". A popular approach by @prem2017 projected contact rates from the POLYMOD study to 152 countries. This was later updated to include synthetic contact matrices for 177 countries at "urban" and "rural" levels for each country [@prem2021]. +[ #TODO is project or predict a better word? Does it matter? ] + +However, there were major limitations with the methods in @prem2021. First, not all countries were included in their analyses. Second, some of the synthetic contact matrices did not have enough granularity; in other words, they covered areas that are too large, such as the "urban" or "rural" parts of a country. This is disadvantageous as public health groups might need to make predictions for more fine-grained areas within a country, such as a district or municipality. Third, the methodology used by Prem et al. was challenging to reuse in other contexts. Prem et al. provided the code used for their analysis, but that code was not designed for replicability and easy modification with user-defined inputs. + +[REVISED PARAGRAPH BELOW] The `conmat` package was created to fill a specific need for creating synthetic contact matrices for specific local government areas for Australia, for work commissioned by the Australian government. We created methods and software to facilitate the following: + +The `conmat` package was developed to fill the specific need of creating synthetic contact matrices for local government areas in Australia. This package is used for [ #TODO what work, specifically? Health? Provide example. Or is *this* package commissioned by the Aus govt? ] work commissioned by the Australian government. +We developed methods and software to facilitate the following tasks. + +- Generate, as output, synthetic contact matrices from age-stratfied population data. +- Create next generation matrices (NGMs). +- Apply vaccination reduction to NGMs. +- Use NGMs in disease modelling. +- Provide tidied population data from the Australian Bureau of Statistics. + +# Example + +As an example, let us generate a contact matrix for a local government area within Australia, using a model fitted from the POLYMOD data. + +Suppose we want to generate a contact matrix for the City of Perth. We can get the age-stratified population data for Perth from the helper function `abs_age_lga`: + + +``` r +library(conmat) +perth <- abs_age_lga("Perth (C)") +perth +``` + +``` +#> # A tibble: 18 × 4 (conmat_population) +#> - age: lower.age.limit +#> - population: population +#> lga lower.age.limit year population +#> +#> 1 Perth (C) 0 2020 1331 +#> 2 Perth (C) 5 2020 834 +#> 3 Perth (C) 10 2020 529 +#> 4 Perth (C) 15 2020 794 +#> 5 Perth (C) 20 2020 3615 +#> 6 Perth (C) 25 2020 5324 +#> 7 Perth (C) 30 2020 4667 +#> 8 Perth (C) 35 2020 3110 +#> 9 Perth (C) 40 2020 1650 +#> 10 Perth (C) 45 2020 1445 +#> 11 Perth (C) 50 2020 1299 +#> 12 Perth (C) 55 2020 1344 +#> 13 Perth (C) 60 2020 1359 +#> 14 Perth (C) 65 2020 1145 +#> 15 Perth (C) 70 2020 1004 +#> 16 Perth (C) 75 2020 673 +#> 17 Perth (C) 80 2020 481 +#> 18 Perth (C) 85 2020 367 +``` + +We can then generate a contact matrix for `perth` using the `extrapolate_polymod` function, where the contact matrix is generated using a model fitted from the POLYMOD data. + + +``` r +perth_contact <- extrapolate_polymod(population = perth) +perth_contact +``` + +``` +#> +``` + +``` +#> ── Setting Prediction Matrices ───────────────────────────────────────────────── +``` + +``` +#> A list of matrices containing the model predicted contact rate between ages in +#> each setting. +``` + +``` +#> There are 16 age breaks, ranging 0-75+ years, with a regular 5 year interval +``` + +``` +#> • home: a 16x16 +``` + +``` +#> • work: a 16x16 +``` + +``` +#> • school: a 16x16 +``` + +``` +#> • other: a 16x16 +``` + +``` +#> • all: a 16x16 +``` + +``` +#> ℹ Access each with `x$name` +``` + +``` +#> ℹ e.g., `x$home` +``` + +We can plot the resulting contact matrix for Perth with `autoplot`: + + +``` r +autoplot(perth_contact) +``` + + + + +# Implementation + +`conmat` was built to predict at four settings: work, school, home, and other. +One model is fitted for each setting. +Each model fitted is a Poisson generalised additive model (GAM) which predicts the count of contacts, with an offset for the log of participants. +The model has six (?)covariates/terms to explain six key features of the relationship between ages, +and two optional terms for attendance at school or work. +The two optional terms are included depending on which setting the model is fitted for. + +Each cell in the resulting contact matrix, indexed *i*, *j*, is the predicted number of people in age group *j* that a single individual in age group *i* will have contact with per day. If you sum across all the *j* age groups for each *i* age group, you get the predicted total number of contacts per day for each individual of age group *i*. [ #TODO expected, predicted, or average? Does it matter? ] + +The six terms are +$|i-j|$, +${|i-j|}^{2}$, +$i + j$, +$i \times j$, +$\text{max}(i, j)$ and +$\text{min}(i, j)$. + +The six key features of the relationship between the age groups, represented by the six terms, are displayed in the figure below. +[ #TODO notes-to-self: the model structure wasn't generated through any particularly robust process, it was just coming up with structures that looked mildly appropriate for our use case. ] + + +``` r +# Show partial dep plot of the six main terms +``` + +Note that these partial dependency plots are on the log scale. +When the six terms are added up together for each setting (in other words, each model) and exponentiated, they show the following patterns: + + +``` r +# Show combined partial dep plot (i.e. sum of the partial dependencies for all six terms) in each setting: home, school, work and other +``` + +In other words, the six terms above provide patterns that are useful in modelling the different settings, +and correspond with real-life situations of how contact would look like. +In the home setting for example, [ #TODO describe how children interact with parents and elderly generation, grandparents ]. +When the terms for school and work are added, these terms also provide patterns that correspond with real-life situations. +https://idem-lab.github.io/conmat/dev/articles/visualising-conmat.html +In the school setting, children tend to contact other children in the same age groups as them. +In the work setting, there are no contacts with children under the age of ten and minimal contact with adults beyond retirement age. + +One of the issues with the contact matrices generated by @prem2017 is that some countries are missing. To remedy this we generated synthetic contact matrices for 200 countries, based on a list of country names by the UN, fitted on the POLYMOD contact surveys. +We also ensured that the analysis pipeline is reproducible and transparent by utilising a targets workflow, which allows ease of editing for users. +The resulting synthetic contact matrices, and a replicable / extensible (?) analysis pipeline, can be found in the syncomat analysis pipeline ([GitHub](https://github.com/idem-lab/syncomat), [Zenodo](https://zenodo.org/records/11365943)). + +## Model interfaces + +We provide functions for model fitting at various use cases. Further detail for each of the following functions are available at: https://idem-lab.github.io/conmat/dev/ + +* `fit_single_contact_model()` + * Fits a generalised additive model (GAM) using contact survey data and population size information. This function is recommended when you want to fit a model to only one setting, for which you might want to provide your own contact survey data. + +* `predict_contacts()` + + * This takes a fitted model from `fit_single_contact_model()` and predicts [ #TODO what is predicted? ] to a provided (?) population structure. + +* `fit_setting_contacts()` + * Fits the `fit_single_contact_model()` to each setting. This function is useful for when you have multiple settings to fit. Returns a list of fitted models. + +* `predict_setting_contacts()` + * Takes a list of fitted models from `fit_setting_contacts()` and predicts [ #TODO what is predicted? ] to a given population for each setting. + +* `estimate_setting_contacts()` + * A convenience function that fits multiple models, one for each setting. This means fitting `fit_setting_contacts()` and then `predict_setting_contacts()`. Recommended for when you have multiple settings to fit and want to predict to a given population as well. + +* `extrapolate_polymod()` + * Takes population information and projects pre-fit model from POLYMOD - used for speed when you know you want to take an already fit model from POLYMOD and just fit it to your provided population. + +[ #TODO for the above it's good to explain what exactly is predicted. Otherwise it's confusing for the user to understand what each of the model outputs? ] + +# Conclusions and future directions + +Our future direction for `conmat` includes adding the following functionalities: +* Create a contact matrix using a custom contact survey from another source, such as the `socialmixr` R package. +* Predict to any age brackets - such as monthly ages, for example, 1, 3, 6, month year old infants +* Add ability to fit multiple contact surveys at once, e.g., POLYMOD and another dataset +* Add ability to include known household age distributions as offsets in the 'home' setting model, in place of the whole population distribution. So compute household age matrices (like age-structured contact matrices, but for household members instead of contacts) from POLYMOD data. If we compute a different one for each household size, in the POLYMOD data (probably estimated with another GAM, to make best use of the limited data) we might be able to extrapolate household age matrices to new countries based on the distribution of household sizes. +* Add methods for including household size distributions +* Add uncertainty to estimates +* Move Australian centric data into its own package +* Add documentation on specifying your own GAM model and using this workflow + +[ #TODO Change concluding sentence. The following is copied ad verbatim from JSS bizicount because I like it ] For now, however, we feel that our base `bizicount` package is sufficiently general to assist in estimating the models most often encountered by applied researchers. + +# References diff --git a/paper/paper.pdf.md b/paper/paper.pdf.md new file mode 100644 index 0000000..bb1573b --- /dev/null +++ b/paper/paper.pdf.md @@ -0,0 +1,340 @@ +--- +title: '`conmat`: generate synthetic contact matrices for a given age-stratified population' +authors: +- affiliation: 1 + name: Nicholas Tierney + orcid: 0000-0003-1460-8722 +- affiliation: 1,2 + name: Nick Golding + orcid: 0000-0001-8916-5570 +- affiliation: 1,3 + name: Aarathy Babu + orcid: +- affiliation: 4 + name: Michael Lydeamore + orcid: 0000-0001-6515-827X +- affiliation: 1,3 + name: Chitra Saraswati + orcid: 0000-0002-8159-0414 +date: today +bibliography: references.bib +cite-method: biblatex +tags: +- epidemiology +- R +- infectious disease +affiliations: +- index: 1 + name: Telethon Kids Institute +- index: 2 + name: Curtin University +- index: 3 + name: +- index: 4 + name: Monash University +execute: + echo: true + cache: false +format: + pdf: + keep-md: true + fig-height: 4 + fig-align: center + fig-format: png + dpi: 300 + html: + keep-md: true + fig-height: 4 + fig-align: center + fig-format: png + dpi: 300 +--- + + +::: {.cell} + +::: + +::: {.cell} + +::: + + + + +# Summary + +Contact matrices describe the number of contacts between individuals, typically age groups. They are used to create models of infectious disease spread. `conmat` is an R package which generates synthetic contact matrices for arbitrary input demography, ready for use in infectious diseases modelling. + +There are currently few options for a user to access synthetic contact matrices [@socialmixr; @prem2017]. Existing code to generate synthetic contact matrices from @prem2017 are not designed for replicability, are restricted to only selected countries, and provide no sub-national demographic estimates. + +The `conmat` package exposes model fitting and prediction separately to the user. Users can fit a model based on a contact survey such as POLYMOD [@mossong2008], then predict from this model to their own demographic data. This means users can generate synthetic contact matrices for any region, with any contact survey. + +We demonstrate a use-case for `conmat` by creating contact matrices for sub-national level (in this case, a state) in Australia. + +For users who do not wish to run the entire `conmat` pipeline, we have pre-generated synthetic contact matrices for 200 countries, based on a list of countries from the United Nations, using a model fit to the POLYMOD contact survey. These resulting synthetic contact matrices, and the associated code, can be found in the syncomat analysis pipeline ([GitHub](https://github.com/idem-lab/syncomat), [Zenodo](https://zenodo.org/records/11365943)) [@syncomat]. + +# Statement of need + +Infectious diseases like influenza and COVID19 spread via social contact. If we can understand patterns of contact - which people are more likely be in contact with each other - then we will be able to create models of how disease spreads. Epidemiologists and public policy makers can use these models to make decisions to keep a population safe and healthy. + +Empirical estimates of social contact are provided by social contact surveys. These provide samples of the frequency and type of social contact across different settings (home, work, school, other). + +An prominent contact survey is the "POLYMOD" study by @mossong2008, which surveyed 8 European countries: Belgium, Germany, Finland, Great Britain, Italy, Luxembourg, The Netherlands, and Poland [@mossong2008]. + +These social contact surveys can be projected on to a given demographic structure to produce estimated daily contact rates between age groups. These are known as "contact matrices" or "synthetic contact matrices". A widely used approach by @prem2017 [@prem2021] produced contact matrices for 177 countries at "urban" and "rural" levels for each country. + +However, there were major limitations with the methods in @prem2021. First, not all countries were included in their analyses. Second, the contact matrices only covered broad scale areas. This presents challenges for decision makers who are often working at a sub-national geographical scale. Third, the code provided by Prem et al., was not designed for replicability and easy modification with user-defined inputs. + +The `conmat` package was developed to fill the specific need of creating contact matrices for arbitrary demographic structures. We developed the method primarily to output synthetic contact matrices. We also provided methods to create next generation matrices. + +# Example + +We will generate a contact matrix for Tasmania, a state in Australia, using a model fitted from the POLYMOD contact survey. We can get the age-stratified population data for Tasmania from the Australian Bureau of Statistics (ABS) with the helper function, `abs_age_state()`: + + + + +::: {.cell} + +```{.r .cell-code} +tasmania <- abs_age_state("TAS") +head(tasmania) +``` + +::: {.cell-output .cell-output-stdout} + +``` +# A tibble: 6 x 4 (conmat_population) + - age: lower.age.limit + - population: population + year state lower.age.limit population + +1 2020 TAS 0 29267 +2 2020 TAS 5 31717 +3 2020 TAS 10 33318 +4 2020 TAS 15 31019 +5 2020 TAS 20 31641 +6 2020 TAS 25 34115 +``` + + +::: +::: + + + + +We can then generate a contact matrix for Tasmania, from the POLYMOD study with `extrapolate_polymod()`. + + + + +::: {.cell} + +```{.r .cell-code} +tasmania_contact <- extrapolate_polymod(population = tasmania) +tasmania_contact +``` + +::: {.cell-output .cell-output-stderr} + +``` + +``` + + +::: + +::: {.cell-output .cell-output-stderr} + +``` +-- Setting Prediction Matrices ------------------------------------------------- +``` + + +::: + + +::: {.cell-output .cell-output-stderr} + +``` +A list of matrices containing the model predicted contact rate between ages in +each setting. +``` + + +::: + + +::: {.cell-output .cell-output-stderr} + +``` +There are 16 age breaks, ranging 0-75+ years, with a regular 5 year interval +``` + + +::: + + +::: {.cell-output .cell-output-stderr} + +``` +* home: a 16x16 +``` + + +::: + +::: {.cell-output .cell-output-stderr} + +``` +* work: a 16x16 +``` + + +::: + +::: {.cell-output .cell-output-stderr} + +``` +* school: a 16x16 +``` + + +::: + +::: {.cell-output .cell-output-stderr} + +``` +* other: a 16x16 +``` + + +::: + +::: {.cell-output .cell-output-stderr} + +``` +* all: a 16x16 +``` + + +::: + +::: {.cell-output .cell-output-stderr} + +``` +i Access each with `x$name` +``` + + +::: + +::: {.cell-output .cell-output-stderr} + +``` +i e.g., `x$home` +``` + + +::: +::: + + + + +We can plot the resulting contact matrix for Tasmania with `autoplot`: + + + + +::: {.cell} + +```{.r .cell-code} +autoplot(tasmania_contact) +``` + +::: {.cell-output-display} +![](paper_files/figure-pdf/autoplot-contacts-1.png){fig-pos='H'} +::: +::: + + + + +# Implementation + +The overall approach of `conmat` has two parts: + +1) Fit a model to predict individual contact rate, using an existing contact survey +2) Predict a synthetic contact matrix using age population data + +## Model fitting + +`conmat` was built to predict at four settings: work, school, home, and other. +One model is fitted for each setting. +Each model fitted is a Poisson generalised additive model (GAM) with a log link function, which predicts the count of contacts, with an offset for the log of participants. +The model has six covariates to explain six key features of the relationship between ages, +and two optional covariates for attendance at school or work. +The two optional covariates are included depending on which setting the model is fitted for. + +Each cell in the resulting contact matrix (after back-transformation of the link function), indexed ($i$, $j$), is the predicted number of people in age group $j$ that a single individual in age group $i$ will have contact with per day. The sum over all of the $j$ age groups for a particular age group $i$ is the predicted total number of contacts per day for each individual of age group $i$. + +The six covariates are: + +- $|i-j|$, +- ${|i-j|}^{2}$, +- $i + j$, +- $i \times j$, +- $\text{max}(i, j)$ and +- $\text{min}(i, j)$. + +These covariates capture typical features of inter-person contact, where individuals primarily interact with people of similar age (the diagonals of the matrix), and with grandparents and/or children (the so-called 'wings' of the matrix).The key features of the relationship between the age groups, represented by the six covariates, are displayed in \@ref(fig:partial-plots) for the home setting. The $|i-j|$ term gives the strong diagonal, modelling people generally living with similar age people, and the $\max(i,j)$ and $\min(i,j)$ terms give the intergenerational effect of parents and grandparents with children. + + + + +::: {.cell} + +::: + +::: {.cell} +::: {.cell-output-display} +![Partial predictive plot (A) and overall synthetic contact matrix (B) for the Poisson GAM fitted to the POLYMOD contact survey in the home setting. The strong diagonal elements, and parents/grandparents interacting with children result in the classic 'diagonal with wings' shape.](paper_files/figure-pdf/show-partial-plots-1.png) +::: +::: + + + + +Visualising the partial predictive plots for other settings (school, work and other), shows patterns that correspond with real-life situations are observed. A full visualisation pipeline is available at https://idem-lab.github.io/conmat/dev/articles/visualising-conmat.html + +# Conclusions and future directions + +The `conmat` software provides a flexible interface to generating synthetic contact matrices using population data and contact surveys. These contact matrices can then be used in infectious disease modelling and surveillance. + +The main strength of `conmat` is its interface requiring only age population data to create a synthetic contact matrix. Current approaches provide only a selection of country level contact matrices. This software can predict to arbitrary demography, such as sub-national, or simulated populations. + +We provide a trained model of contact rate that is fit to the POLYMOD survey for ease of use. The software also has an interface to use other contact surveys, such as @comix. This is important as POLYMOD represents contact patterns in 8 countries in Europe, and contact patterns are known to differ across nations and cultures. + +The covariates used by `conmat` were designed to represent the key features that are typically present in a contact matrix for different settings (work, school, home, other). Including other sources of information that may better describe these contact patterns, such as inter-generational mixing, or differences in school ages of a local demographic, may improve model performance. + +The interface to the model formula in `conmat` is fixed; users cannot change the covariates of the model. This means if there is an unusual structure in their contact data it might not be accurately captured by `conmat`. This fixed formula was a design decision made to focus on the key feature of `conmat`: using just age population data to predict a contact matrix. + +Public health decisions are often based on age specific information, which means the more accurate your age specific models are, the better those decisions are likely to be. This is the first piece of software that will provide appropriate contact matrices for a population, which means more accurate models of disease. + +This work was used as a key input into several models for COVID-19 transmission and control in Australia and contributed to decisions around vaccination policy [@DohertyModelling]. + +Some future directions for this software include: + +* Demonstrate fitting a model to other contact surveys from sources such as the `socialmixr` R package [@socialmixr]. +* Predict to any age brackets - such as monthly ages, for example, 1, 3, 6, month year old infants +* Fitting to multiple contact surveys simultaneously, e.g., POLYMOD and CoMix +* Add uncertainty to estimates +* Add methods for including household size distributions +* Include known household age distributions as offsets in the 'home' setting model, in place of the whole population distribution. + +Software is never finished, and the software in its current format has proven useful for infectious disease modelling. In time we hope it can become more widely used and be useful for more applications in epidemiology and public health. + +# References diff --git a/paper/paper.qmd b/paper/paper.qmd new file mode 100644 index 0000000..b9aa026 --- /dev/null +++ b/paper/paper.qmd @@ -0,0 +1,261 @@ +--- +title: '`conmat`: generate synthetic contact matrices for a given age-stratified population' +authors: +- affiliation: 1 + name: Nicholas Tierney + orcid: 0000-0003-1460-8722 +- affiliation: 1,2 + name: Nick Golding + orcid: 0000-0001-8916-5570 +- affiliation: 1,3 + name: Aarathy Babu + orcid: +- affiliation: 4 + name: Michael Lydeamore + orcid: 0000-0001-6515-827X +- affiliation: 1,3 + name: Chitra Saraswati + orcid: 0000-0002-8159-0414 +date: today +bibliography: references.bib +cite-method: biblatex +tags: +- epidemiology +- R +- infectious disease +affiliations: +- index: 1 + name: Telethon Kids Institute +- index: 2 + name: Curtin University +- index: 3 + name: +- index: 4 + name: Monash University +execute: + echo: true + cache: false +format: + pdf: + keep-md: true + fig-height: 4 + fig-align: center + fig-format: png + dpi: 300 + html: + keep-md: true + fig-height: 4 + fig-align: center + fig-format: png + dpi: 300 +--- + +```{r} +#| label: setup +#| echo: false +#| message: false +#| warning: false +options(tinytex.clean = FALSE) +``` + +```{r} +#| label: libraries +#| echo: false +#| output: false + +library(purrr) +library(patchwork) +library(conmat) +library(ggplot2) +``` + +# Summary + +Contact matrices describe the number of contacts between individuals, typically age groups. They are used to create models of infectious disease spread. `conmat` is an R package which generates synthetic contact matrices for arbitrary input demography, ready for use in infectious diseases modelling. + +There are currently few options for a user to access synthetic contact matrices [@socialmixr; @prem2017]. Existing code to generate synthetic contact matrices from @prem2017 are not designed for replicability, are restricted to only selected countries, and provide no sub-national demographic estimates. + +The `conmat` package exposes model fitting and prediction separately to the user. Users can fit a model based on a contact survey such as POLYMOD [@mossong2008], then predict from this model to their own demographic data. This means users can generate synthetic contact matrices for any region, with any contact survey. + +We demonstrate a use-case for `conmat` by creating contact matrices for sub-national level (in this case, a state) in Australia. + +For users who do not wish to run the entire `conmat` pipeline, we have pre-generated synthetic contact matrices for 200 countries, based on a list of countries from the United Nations, using a model fit to the POLYMOD contact survey. These resulting synthetic contact matrices, and the associated code, can be found in the syncomat analysis pipeline ([GitHub](https://github.com/idem-lab/syncomat), [Zenodo](https://zenodo.org/records/11365943)) [@syncomat]. + +# Statement of need + +Infectious diseases like influenza and COVID19 spread via social contact. If we can understand patterns of contact - which people are more likely be in contact with each other - then we will be able to create models of how disease spreads. Epidemiologists and public policy makers can use these models to make decisions to keep a population safe and healthy. + +Empirical estimates of social contact are provided by social contact surveys. These provide samples of the frequency and type of social contact across different settings (home, work, school, other). + +An prominent contact survey is the "POLYMOD" study by @mossong2008, which surveyed 8 European countries: Belgium, Germany, Finland, Great Britain, Italy, Luxembourg, The Netherlands, and Poland [@mossong2008]. + +These social contact surveys can be projected on to a given demographic structure to produce estimated daily contact rates between age groups. These are known as "contact matrices" or "synthetic contact matrices". A widely used approach by @prem2017 [@prem2021] produced contact matrices for 177 countries at "urban" and "rural" levels for each country. + +However, there were major limitations with the methods in @prem2021. First, not all countries were included in their analyses. Second, the contact matrices only covered broad scale areas. This presents challenges for decision makers who are often working at a sub-national geographical scale. Third, the code provided by Prem et al., was not designed for replicability and easy modification with user-defined inputs. + +The `conmat` package was developed to fill the specific need of creating contact matrices for arbitrary demographic structures. We developed the method primarily to output synthetic contact matrices. We also provided methods to create next generation matrices. + +# Example + +We will generate a contact matrix for Tasmania, a state in Australia, using a model fitted from the POLYMOD contact survey. We can get the age-stratified population data for Tasmania from the Australian Bureau of Statistics (ABS) with the helper function, `abs_age_state()`: + +```{r} +#| label: abs-age +tasmania <- abs_age_state("TAS") +head(tasmania) +``` + +We can then generate a contact matrix for Tasmania, from the POLYMOD study with `extrapolate_polymod()`. + +```{r} +#| label: extrapolate-polymod +tasmania_contact <- extrapolate_polymod(population = tasmania) +tasmania_contact +``` + +We can plot the resulting contact matrix for Tasmania with `autoplot`: + +```{r} +#| label: autoplot-contacts +#| fig-width: 8 +#| fig-height: 8 +autoplot(tasmania_contact) +``` + +# Implementation + +The overall approach of `conmat` has two parts: + +1) Fit a model to predict individual contact rate, using an existing contact survey +2) Predict a synthetic contact matrix using age population data + +## Model fitting + +`conmat` was built to predict at four settings: work, school, home, and other. +One model is fitted for each setting. +Each model fitted is a Poisson generalised additive model (GAM) with a log link function, which predicts the count of contacts, with an offset for the log of participants. +The model has six covariates to explain six key features of the relationship between ages, +and two optional covariates for attendance at school or work. +The two optional covariates are included depending on which setting the model is fitted for. + +Each cell in the resulting contact matrix (after back-transformation of the link function), indexed ($i$, $j$), is the predicted number of people in age group $j$ that a single individual in age group $i$ will have contact with per day. The sum over all of the $j$ age groups for a particular age group $i$ is the predicted total number of contacts per day for each individual of age group $i$. + +The six covariates are: + +- $|i-j|$, +- ${|i-j|}^{2}$, +- $i + j$, +- $i \times j$, +- $\text{max}(i, j)$ and +- $\text{min}(i, j)$. + +These covariates capture typical features of inter-person contact, where individuals primarily interact with people of similar age (the diagonals of the matrix), and with grandparents and/or children (the so-called 'wings' of the matrix).The key features of the relationship between the age groups, represented by the six covariates, are displayed in \@ref(fig:partial-plots) for the home setting. The $|i-j|$ term gives the strong diagonal, modelling people generally living with similar age people, and the $\max(i,j)$ and $\min(i,j)$ terms give the intergenerational effect of parents and grandparents with children. + +```{r} +#| label: partial-plots-create +#| echo: false + +fit_home <- polymod_setting_models$home +age_grid <- create_age_grid(ages = 1:99) +term_names <- extract_term_names(fit_home) +term_var_names <- clean_term_names(term_names) +age_predictions <- predict_individual_terms( + age_grid = age_grid, + fit = fit_home, + term_names = term_names, + term_var_names = term_var_names +) + +age_predictions_all_settings <- map_dfr( + .x = polymod_setting_models, + .f = function(x) { + predict_individual_terms( + age_grid = age_grid, + fit = x, + term_names = term_names, + term_var_names = term_var_names + ) + }, + .id = "setting" +) + +plot_age_term_settings <- gg_age_terms_settings(age_predictions_all_settings) +age_predictions_long <- pivot_longer_age_preds(age_predictions) +plot_age_predictions_long <- gg_age_partial_pred_long(age_predictions_long) + + coord_equal() + + labs( + x = "Age from", + y = "Age to" + ) + + theme( + legend.position = "bottom", + axis.text = element_text(size = 6), + panel.spacing = unit(x = 1, units = "lines") + ) + + scale_x_continuous(expand = c(0,0)) + + scale_y_continuous(expand = c(0,0)) + + expand_limits(x = c(0, 100), y = c(0, 100)) + +age_predictions_long_sum <- add_age_partial_sum(age_predictions_long) +plot_age_predictions_sum <- gg_age_partial_sum(age_predictions_long_sum) + coord_equal() + + labs(x = "Age from", + y = "Age to") + + theme( + legend.position = "bottom" + ) + + scale_x_continuous(expand = c(0,0)) + + scale_y_continuous(expand = c(0,0)) + + expand_limits(x = c(0, 100), y = c(0, 100)) +``` + + +```{r} +#| label: show-partial-plots +#| echo: false +#| fig-cap: "Partial predictive plot (A) and overall synthetic contact matrix (B) for the Poisson GAM fitted to the POLYMOD contact survey in the home setting. The strong diagonal elements, and parents/grandparents interacting with children result in the classic 'diagonal with wings' shape." +plot_all_terms_sum <- plot_age_predictions_long + + plot_age_predictions_sum + + plot_layout(design = " + AAAAABBBB + AAAAABBBB + AAAAABBBB + AAAAABBBB + ") + + plot_annotation( + tag_levels = "A" + ) + + +plot_all_terms_sum +``` + +Visualising the partial predictive plots for other settings (school, work and other), shows patterns that correspond with real-life situations are observed. A full visualisation pipeline is available at https://idem-lab.github.io/conmat/dev/articles/visualising-conmat.html + +# Conclusions and future directions + +The `conmat` software provides a flexible interface to generating synthetic contact matrices using population data and contact surveys. These contact matrices can then be used in infectious disease modelling and surveillance. + +The main strength of `conmat` is its interface requiring only age population data to create a synthetic contact matrix. Current approaches provide only a selection of country level contact matrices. This software can predict to arbitrary demography, such as sub-national, or simulated populations. + +We provide a trained model of contact rate that is fit to the POLYMOD survey for ease of use. The software also has an interface to use other contact surveys, such as @comix. This is important as POLYMOD represents contact patterns in 8 countries in Europe, and contact patterns are known to differ across nations and cultures. + +The covariates used by `conmat` were designed to represent the key features that are typically present in a contact matrix for different settings (work, school, home, other). Including other sources of information that may better describe these contact patterns, such as inter-generational mixing, or differences in school ages of a local demographic, may improve model performance. + +The interface to the model formula in `conmat` is fixed; users cannot change the covariates of the model. This means if there is an unusual structure in their contact data it might not be accurately captured by `conmat`. This fixed formula was a design decision made to focus on the key feature of `conmat`: using just age population data to predict a contact matrix. + +Public health decisions are often based on age specific information, which means the more accurate your age specific models are, the better those decisions are likely to be. This is the first piece of software that will provide appropriate contact matrices for a population, which means more accurate models of disease. + +This work was used as a key input into several models for COVID-19 transmission and control in Australia and contributed to decisions around vaccination policy [@DohertyModelling]. + +Some future directions for this software include: + +* Demonstrate fitting a model to other contact surveys from sources such as the `socialmixr` R package [@socialmixr]. +* Predict to any age brackets - such as monthly ages, for example, 1, 3, 6, month year old infants +* Fitting to multiple contact surveys simultaneously, e.g., POLYMOD and CoMix +* Add uncertainty to estimates +* Add methods for including household size distributions +* Include known household age distributions as offsets in the 'home' setting model, in place of the whole population distribution. + +Software is never finished, and the software in its current format has proven useful for infectious disease modelling. In time we hope it can become more widely used and be useful for more applications in epidemiology and public health. + +# References diff --git a/paper/paper_files/figure-html/autoplot-contacts-1.png b/paper/paper_files/figure-html/autoplot-contacts-1.png new file mode 100644 index 0000000..dbc0871 Binary files /dev/null and b/paper/paper_files/figure-html/autoplot-contacts-1.png differ diff --git a/paper/references.bib b/paper/references.bib new file mode 100644 index 0000000..83d827a --- /dev/null +++ b/paper/references.bib @@ -0,0 +1,123 @@ +@article{prem2017, + title = {Projecting social contact matrices in 152 countries using contact surveys and demographic data}, + volume = {13}, + issn = {1553-7358}, + url = {https://journals.plos.org/ploscompbiol/article?id=10.1371/journal.pcbi.1005697}, + doi = {10.1371/journal.pcbi.1005697}, + language = {en}, + number = {9}, + urldate = {2024-05-03}, + journal = {PLOS Computational Biology}, + author = {Prem, Kiesha and Cook, Alex R. and Jit, Mark}, + month = sep, + year = {2017}, + keywords = {Infectious disease epidemiology, Schools, South Africa, Age groups, Asia, Bolivia, Germany, Home education}, + pages = {e1005697}, +} + +@article{prem2021, + title = {Projecting contact matrices in 177 geographical regions: {An} update and comparison with empirical data for the {COVID}-19 era}, + volume = {17}, + issn = {1553-7358}, + shorttitle = {Projecting contact matrices in 177 geographical regions}, + url = {https://journals.plos.org/ploscompbiol/article?id=10.1371/journal.pcbi.1009098}, + doi = {10.1371/journal.pcbi.1009098}, + language = {en}, + number = {7}, + urldate = {2024-05-03}, + journal = {PLOS Computational Biology}, + author = {Prem, Kiesha and Zandvoort, Kevin van and Klepac, Petra and Eggo, Rosalind M. and Davies, Nicholas G. and Group, Centre for the Mathematical Modelling of Infectious Diseases COVID-19 Working and Cook, Alex R. and Jit, Mark}, + month = jul, + year = {2021}, + keywords = {Infectious disease epidemiology, COVID 19, Geographical regions, Age groups, Pandemics, Schools, Urban geography, Surveys}, + pages = {e1009098}, +} + +@article{mossong2008, + title = {Social contacts and mixing patterns relevant to the spread of infectious diseases}, + volume = {5}, + issn = {1549-1676}, + url = {https://journals.plos.org/plosmedicine/article?id=10.1371/journal.pmed.0050074}, + doi = {10.1371/journal.pmed.0050074}, + language = {en}, + number = {3}, + urldate = {2024-05-03}, + journal = {PLOS Medicine}, + author = {Mossong, Joël and Hens, Niel and Jit, Mark and Beutels, Philippe and Auranen, Kari and Mikolajczyk, Rafael and Massari, Marco and Salmaso, Stefania and Tomba, Gianpaolo Scalia and Wallinga, Jacco and Heijne, Janneke and Sadkowska-Todys, Malgorzata and Rosinska, Magdalena and Edmunds, W. John}, + month = mar, + year = {2008}, + keywords = {Infectious disease epidemiology, Respiratory infections, Age groups, Epidemiology, Europe, Mathematical models, Surveys, Infectious diseases}, + pages = {e74}, +} + +@dataset{comix, + author = {Jarvis, Christopher and + Coletti, Pietro and + Backer, Jantien and + Munday, James and + Faes, Christel and + Beutels, Philippe and + Althaus, Christian and + Low, Nicola and + Wallinga, Jacco and + Hens, Niel and + Edmunds, John}, + title = {CoMix data (last round in BE, CH, NL and UK)}, + month = may, + year = 2024, + publisher = {Zenodo}, + doi = {10.5281/zenodo.11154066}, + url = {https://doi.org/10.5281/zenodo.11154066} +} + + +@misc{socialmixr, + title = {Socialmixr: social mixing matrices for infectious disease modelling}, + shorttitle = {Socialmixr}, + url = {https://CRAN.R-project.org/package=socialmixr}, + doi = {10.32614/CRAN.package.socialmixr}, + abstract = {Provides methods for sampling contact matrices from diary data for use in infectious disease modelling, as discussed in Mossong et al. (2008) {\textless}doi:10.1371/journal.pmed.0050074{\textgreater}.}, + language = {en}, + urldate = {2024-08-09}, + author = {Funk, Sebastian and Willem, Lander and Gruson, Hugo}, + month = jan, + year = {2018}, +} + + @misc{DohertyModelling, + title = {Doherty Institute - Modelling}, + author = {McVernon, Jodie and + McCaw, James and + Tierney, Nicholas and + Miller, Joel and + Lydeamore, Michael and + Golding, Nick and + Shearer, Freya and + Geard, Nic and + Zachreson Cameron and + Baker, Chris and + Walker, Camelia and + Ross, Joshua and + Wood, James and + Conway, Eamon and + Mueller, Ivo}, + urldate = {2024-08-19}, + month = aug, + url={https://www.doherty.edu.au/our-work/institute-themes/viral-infectious-diseases/covid-19/covid-19-modelling/modelling} +} + +@dataset{syncomat, + author = {Saraswati, Chitra M and + Lydeamore, Michael and + Golding, Nick and + Babu, Aarathy and + Tierney, Nicholas}, + title = {{syncomat: Synthetic Contact Matrices for 200 UN + Countries}}, + month = may, + year = 2024, + publisher = {Zenodo}, + version = {v1.0.0}, + doi = {10.5281/zenodo.11365943}, + url = {https://doi.org/10.5281/zenodo.11365943} +} \ No newline at end of file diff --git a/tests/testthat/_snaps/autoplot/autoplot-all-settinge.new.svg b/tests/testthat/_snaps/autoplot/autoplot-all-settinge.new.svg new file mode 100644 index 0000000..481d5bd --- /dev/null +++ b/tests/testthat/_snaps/autoplot/autoplot-all-settinge.new.svg @@ -0,0 +1,307 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +[0,5) +[5,10) +[10,15) +[15,Inf) +age_group_to +[0,5) +[5,10) +[10,15) +[15,Inf) +age_group_from +contacts + + + + + + + +1 +2 +3 +home + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +[0,5) +[5,10) +[10,15) +[15,Inf) +age_group_to +[0,5) +[5,10) +[10,15) +[15,Inf) +age_group_from +contacts + + + + + + + +2 +4 +6 +school + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +[0,5) +[5,10) +[10,15) +[15,Inf) +age_group_to +[0,5) +[5,10) +[10,15) +[15,Inf) +age_group_from +contacts + + + + + + + + + + + +0.5 +1.0 +1.5 +2.0 +2.5 +work + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +[0,5) +[5,10) +[10,15) +[15,Inf) +age_group_to +[0,5) +[5,10) +[10,15) +[15,Inf) +age_group_from +contacts + + + + + + + + + +1 +2 +3 +4 +other +Setting-specific synthetic contact matrices + + diff --git a/tests/testthat/_snaps/autoplot/autoplot-ngm.new.svg b/tests/testthat/_snaps/autoplot/autoplot-ngm.new.svg new file mode 100644 index 0000000..6da1398 --- /dev/null +++ b/tests/testthat/_snaps/autoplot/autoplot-ngm.new.svg @@ -0,0 +1,302 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +[0,5) +[5,10) +[10,15) +[15,Inf) +age_group_to +[0,5) +[5,10) +[10,15) +[15,Inf) +age_group_from +contacts + + + + + + + +0.2 +0.4 +0.6 +home + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +[0,5) +[5,10) +[10,15) +[15,Inf) +age_group_to +[0,5) +[5,10) +[10,15) +[15,Inf) +age_group_from +contacts + + + + + +0.1 +0.2 +school + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +[0,5) +[5,10) +[10,15) +[15,Inf) +age_group_to +[0,5) +[5,10) +[10,15) +[15,Inf) +age_group_from +contacts + + + + + + + + + +0.05 +0.10 +0.15 +0.20 +work + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +[0,5) +[5,10) +[10,15) +[15,Inf) +age_group_to +[0,5) +[5,10) +[10,15) +[15,Inf) +age_group_from +contacts + + + + + + + + + +0.1 +0.2 +0.3 +0.4 +The number of newly infected individuals for a specified age group in each setting +other +Setting-specific NGM matrices + + diff --git a/tests/testthat/_snaps/autoplot/autoplot-single-setting.new.svg b/tests/testthat/_snaps/autoplot/autoplot-single-setting.new.svg new file mode 100644 index 0000000..822c695 --- /dev/null +++ b/tests/testthat/_snaps/autoplot/autoplot-single-setting.new.svg @@ -0,0 +1,92 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +[0,5) +[5,10) +[10,15) +[15,Inf) +[0,5) +[5,10) +[10,15) +[15,Inf) +age_group_from +age_group_to +contacts + + + + + + + + + + + +0.5 +1.0 +1.5 +2.0 +2.5 +Work + + diff --git a/tests/testthat/_snaps/autoplot/autoplot-vaccination.new.svg b/tests/testthat/_snaps/autoplot/autoplot-vaccination.new.svg new file mode 100644 index 0000000..985dfe3 --- /dev/null +++ b/tests/testthat/_snaps/autoplot/autoplot-vaccination.new.svg @@ -0,0 +1,305 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +[0,5) +[5,10) +[10,15) +[15,Inf) +age_group_to +[0,5) +[5,10) +[10,15) +[15,Inf) +age_group_from +contacts + + + + + + + + + +0.025 +0.050 +0.075 +0.100 +home + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +[0,5) +[5,10) +[10,15) +[15,Inf) +age_group_to +[0,5) +[5,10) +[10,15) +[15,Inf) +age_group_from +contacts + + + + + + + +0.02 +0.04 +0.06 +school + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +[0,5) +[5,10) +[10,15) +[15,Inf) +age_group_to +[0,5) +[5,10) +[10,15) +[15,Inf) +age_group_from +contacts + + + + + + + + + +0.004 +0.008 +0.012 +0.016 +work + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +[0,5) +[5,10) +[10,15) +[15,Inf) +age_group_to +[0,5) +[5,10) +[10,15) +[15,Inf) +age_group_from +contacts + + + + + + + +0.01 +0.02 +0.03 +Number of newly infected individuals for age groups, adjusted based on proposed age group vaccination rates +other +Setting-specific vaccination matrices + + diff --git a/tests/testthat/_snaps/autoplot/autoplot.new.svg b/tests/testthat/_snaps/autoplot/autoplot.new.svg new file mode 100644 index 0000000..e654bd0 --- /dev/null +++ b/tests/testthat/_snaps/autoplot/autoplot.new.svg @@ -0,0 +1,317 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +[0,5) +[5,10) +[10,15) +[15,Inf) +age_group_to +[0,5) +[5,10) +[10,15) +[15,Inf) +age_group_from +contacts + + + + + + + +0.2 +0.3 +0.4 +home + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +[0,5) +[5,10) +[10,15) +[15,Inf) +age_group_to +[0,5) +[5,10) +[10,15) +[15,Inf) +age_group_from +contacts + + + + + + + + + + + +0.050 +0.075 +0.100 +0.125 +0.150 +school + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +[0,5) +[5,10) +[10,15) +[15,Inf) +age_group_to +[0,5) +[5,10) +[10,15) +[15,Inf) +age_group_from +contacts + + + + + + + + + + + +0.050 +0.075 +0.100 +0.125 +0.150 +work + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +[0,5) +[5,10) +[10,15) +[15,Inf) +age_group_to +[0,5) +[5,10) +[10,15) +[15,Inf) +age_group_from +contacts + + + + + + + + + + + +0.050 +0.075 +0.100 +0.125 +0.150 +Relative probability of individuals in an age group infecting an individual in another age group +other +Setting-specific transmission probability matrices + + diff --git a/tests/testthat/_snaps/check-if-data-frame.md b/tests/testthat/_snaps/check-if-data-frame.md index bf7f00d..ee9a05f 100644 --- a/tests/testthat/_snaps/check-if-data-frame.md +++ b/tests/testthat/_snaps/check-if-data-frame.md @@ -8,7 +8,7 @@ Code check_if_data_frame(volcano) Condition - Error in `check_if_data_frame()`: - ! x must be a - i x is + Error: + ! `volcano` must be a + i `volcano` is