diff --git a/.DS_Store b/.DS_Store new file mode 100644 index 0000000..7fff937 Binary files /dev/null and b/.DS_Store differ diff --git a/DESCRIPTION b/DESCRIPTION index 4c66bd1..38121a6 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Package: bspme Type: Package Title: Bayesian Spatial Measurement Error Models -Version: 0.1.0 +Version: 0.2.0 Author: Changwoo Lee, Eun Sug Park Maintainer: Changwoo Lee Description: Functions for fitting Bayesian linear and genearlized linear models in presence of measurement error of the covariate. diff --git a/NAMESPACE b/NAMESPACE index 1419102..5aeb339 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,5 +1,5 @@ # Generated by roxygen2: do not edit by hand -export(blm_me) +export(blinreg_me) export(vecchia_cov) importFrom(Matrix,Diagonal) diff --git a/R/blm_me.R b/R/blinreg_me.R similarity index 57% rename from R/blm_me.R rename to R/blinreg_me.R index 1f3a4c5..38c2400 100644 --- a/R/blm_me.R +++ b/R/blinreg_me.R @@ -1,13 +1,19 @@ -#' Bayesian normal linear regression models with (spatially) correlated measurement errors +#' Bayesian normal linear regression models with correlated measurement errors #' -#' This function implements the Bayesian normal linear regression model subject to measurement error with semiconjugate priors. -#' -#' Model: Y = beta0 + beta_x X + beta_z Z + epsilon, epsilon ~ N(0, sigma2y) -#' Measurement error: sigma_Y^2 ~ IG(0.01, 0.01), beta|sigma_Y^2 ~ N(0, sigma_Y^2*diag(var_beta)) +#' This function implements the Bayesian normal linear regression model with correlated measurement error of covariate(s). +#' Denote \eqn{Y_i} be a continuous response, \eqn{X_i} be a \eqn{q\times 1} covariate of \eqn{i}th observation that is subject to measurement error, +#' and \eqn{Z_i} be a \eqn{p\times 1} covariate without measurement error. +#' The likelihood model is Bayesian normal linear regression, +#' \deqn{Y_i = \beta_0 + X_i^\top \beta_x + Z_i^\top \beta_z + \epsilon_i,\quad \epsilon_i \stackrel{iid}{\sim} N(0, \sigma^2_Y), \quad i=1,\dots,n} +#' and correlated measurement error of \eqn{X_i, i=1,\dots,n} is incorporated into the model as a multivariate normal prior. For example when \eqn{q=1}, we have \eqn{n-}dimensional multivariate normal prior +#' \deqn{(X_1,\dots,X_n)\sim N_n(\mu_X, Q_X^{-1}).} +#' Also, we consider semiconjugate priors for regression coefficients and noise variance; +#' \deqn{\beta_0 \sim N(0, V_\beta), \quad \beta_{x,j} \stackrel{iid}{\sim} N(0, V_\beta), \quad \beta_{z,k} \stackrel{iid}{\sim} N(0, V_\beta), \quad \sigma_Y^2 \sim IG(a_Y, b_Y).} +#' The function \code{blinreg_me()} implements the Gibbs sampler for posterior inference. Most importantly, it allows sparse matrix input for \eqn{Q_X} for scalable computation. #' #' @param Y n by 1 matrix, response -#' @param X_mean n by 1 matrix or list of n by 1 matrices of length q, mean of X. -#' @param X_prec n by n matrix or list of n by n matrices of length q, precision matrix of X. Support sparse matrix format from Matrix package. +#' @param X_mean n by 1 matrix or list of n by 1 matrices of length q, mean of X \eqn{\mu_X}. +#' @param X_prec n by n matrix or list of n by n matrices of length q, precision matrix of X \eqn{Q_X}. Support sparse matrix format from Matrix package. 
#' @param Z n by p matrix, covariates without measurement error #' @param nburn integer, burn-in iteration #' @param nthin integer, thin-in rate @@ -22,12 +28,65 @@ #' #' @examples #' +#'\dontrun{ +#' data(ozone) +#' data(health_sim) +#' library(bspme) +#' data(ozone) +#' data(health_sim) +#' library(fields) +#' # exposure mean and covariance at health subject locations with 06/18/1987 data (date id = 16) +#' # using Gaussian process with prior mean zero and exponential covariance kernel +#' # with fixed range 300 (in miles) and stdev 15 (in ppb) +#' +#' ozone16 = ozone[ozone$date_id==16,] +#' +#' Dxx = rdist.earth(cbind(ozone16$coords_lon, ozone16$coords_lat)) +#' Dyy = rdist.earth(cbind(health_sim$coords_y_lon, health_sim$coords_y_lat)) +#' Dxy = rdist.earth(cbind(ozone16$coords_lon, ozone16$coords_lat), +#' cbind(health_sim$coords_y_lon, health_sim$coords_y_lat)) +#' +#' Kxx = Exponential(Dxx, range = 300, phi=15^2) +#' Kyy = Exponential(Dyy, range = 300, phi=15^2) +#' Kxy = Exponential(Dxy, range = 300, phi=15^2) +#' +#' X_mean = t(Kxy) %*% solve(Kxx, ozone16$ozone_ppb) +#' X_cov = Kyy - t(Kxy) %*% solve(Kxx, Kxy) #' +#' # visualize +#' par(mfrow = c(1,3)) +#' quilt.plot(cbind(ozone16$coords_lon, ozone16$coords_lat), +#' ozone16$ozone_ppb, main = "ozone measurements"); US(add= T) #' +#' quilt.plot(cbind(health_sim$coords_y_lon, health_sim$coords_y_lat), +#' X_mean, main = "health subjects, mean of exposure"); US(add= T) +#' points(cbind(ozone16$coords_lon, ozone16$coords_lat), pch = 17) #' +#' quilt.plot(cbind(health_sim$coords_y_lon, health_sim$coords_y_lat), +#' sqrt(diag(X_cov)), main = "health subjects, sd of exposure"); US(add= T) +#' points(cbind(ozone16$coords_lon, ozone16$coords_lat), pch = 17) #' +#' # vecchia approximation +#' run_vecchia = vecchia_cov(X_cov, coords = cbind(health_sim$coords_y_lon, health_sim$coords_y_lat), +#' n.neighbors = 10) +#' Q_sparse = run_vecchia$Q +#' run_vecchia$cputime #' -blm_me <- function(Y, +#' # fit the model +#' fit_me = blm_me(Y = health_sim$Y, +#' X_mean = X_mean, +#' X_prec = Q_sparse, # sparse precision matrix +#' Z = health_sim$Z, +#' nburn = 100, +#' nsave = 1000, +#' nthin = 1) +#' fit_me$cputime +#' summary(fit_me$posterior) +#' library(bayesplot) +#' bayesplot::mcmc_trace(fit_me$posterior) +#' } +#' +blinreg_me <- function(Y, X_mean, X_prec, Z, @@ -160,54 +219,3 @@ blm_me <- function(Y, if(saveX) out$X_save = X_save out } - - - - - - - - - - - - - - - - - - - - - - - - - - -# data(mtcars) -# -# fitme = linreg_me(Y = mtcars$mpg, -# X_mean = mtcars$wt, -# X_prec = diag(rep(10, 32)), -# Z = cbind(mtcars$am, mtcars$vs), var_beta = 10000, nsave = 1000, nthin = 10, saveX = T) -# fitme_conj = linreg_me_conj(Y = mtcars$mpg, -# X_mean = mtcars$wt, -# X_prec = Matrix(diag(rep(10, 32))), -# Z = cbind(mtcars$am, mtcars$vs), var_beta = 10000, nsave = 1000, nthin = 10, saveX = T) -# -# fitme2 = linreg_me(Y = mtcars$mpg, -# X_mean = list(wt = mtcars$wt, cyl = mtcars$cyl), -# X_prec = list(wt = diag(rep(10, 32)), cyl = diag(rep(10, 32))), -# Z = cbind(mtcars$am, mtcars$vs), var_beta = 10000, nsave = 1000, nthin = 10, saveX = T) - -# -# -# summary(fit$posterior) -# -# summary(fitme$posterior) -# -# summary(fitme2$posterior) -# str(fitme2$X_save) -# plot(fitme2$posterior) diff --git a/R/bspme-package.R b/R/bspme-package.R index 2115ecb..829e0ae 100644 --- a/R/bspme-package.R +++ b/R/bspme-package.R @@ -25,13 +25,15 @@ NULL #' Dataset, simulated health data #' -#' Simulated health data based on ozone exposures on 06/18/1987. 
+#' Simulated health data based on ozone exposures on 06/18/1987. For details, see \code{health_sim.R}. #' -#' @format A data frame with 3000 rows and 4 variables: +#' @format A data frame with n = 3000 rows and 4 variables: #' \describe{ -#' \item{y}{numeric, simulated health outcome} -#' \item{coords_y_lon}{numeric, simulated health subject longitude} -#' \item{coords_y_lat}{numeric, simulated health subject latitude,} -#' \item{covariate}{numeric, covariate} +#' \item{Y}{n by 1 matrix, numeric, simulated continuous health outcome} +#' \item{Ybinary}{n by 1 matrix, numeric, simulated binary health outcome} +#' \item{coords_y_lon}{n by 1 matrix, numeric, simulated health subject longitude} +#' \item{coords_y_lat}{n by 1 matrix, numeric, simulated health subject latitude} +#' \item{Z}{n by 1 matrix, numeric, covariate} +#' \item{X_true}{n by 1 matrix, numeric, true ozone exposure used for simulation} #' } "health_sim" diff --git a/R/vecchia_cov.R b/R/vecchia_cov.R index 64846a0..44412bb 100644 --- a/R/vecchia_cov.R +++ b/R/vecchia_cov.R @@ -82,9 +82,10 @@ vecchia_cov = function(Sigma, coords, n.neighbors, ord = NULL, KLdiv = FALSE){ A[i+1, Nlist[[i+1]]]= temp D[i+1]= Sigma[i+1, i+1] - sum(Sigma[i+1, Nlist[[i+1]]]*temp) } - D = Diagonal(n, x = D) - Q_reordered = Matrix::t((Matrix::Diagonal(n) - A))%*%solve(D)%*%(Matrix::Diagonal(n) - A) + D = Matrix::Diagonal(n, x = D) + Q_reordered = Matrix::t((Matrix::Diagonal(n) - A))%*%Matrix::solve(D)%*%(Matrix::Diagonal(n) - A) Q = Q_reordered[order(ord),order(ord)] + Q = as(Q, "dgCMatrix") end.time = Sys.time() out = list() diff --git a/README.Rmd b/README.Rmd index fecf17f..0225a2b 100644 --- a/README.Rmd +++ b/README.Rmd @@ -22,10 +22,19 @@ knitr::opts_chunk$set( -**bspme** is an R package that provides a set of functions for **B**ayesian **sp**atially correlated **m**easurement **e**rror models, the Bayesian linear models with the presence of (spatailly) correlated measurement error of covariate(s). For illustration, consider a normal linear regression model with intercept, one covariate $X$ with measurement error, and one covariate $Z$ without measurement error: -\[ -Y_i = \beta_0 + \beta_x X_i + \beta_z Z_i + \epsilon, \quad \epsilon \stackrel{iid}{\sim} N(0, \sigma^2_Y), \quad i=1,\dots,n, -\] +**bspme** is an R package that provides a set of functions for **B**ayesian **sp**atially correlated **m**easurement **e**rror models, the Bayesian linear models with the presence of (spatailly) correlated measurement error of covariate(s). For more details, please see the following paper: + + +> A scalable two-stage Bayesian approach accounting for exposure measurement error in +environmental epidemiology. +> Lee, C. J., Symanski, E., Rammah, A., Kang, D.H., Hopke, P., Park, E.S. (2023+). preprint (url here). + + +For illustration, consider a normal linear regression model with intercept, one covariate $X$ with measurement error, and one covariate $Z$ without measurement error: + +$$ +Y_i = \beta_0 + \beta_x X_i + \beta_z Z_i + \epsilon_i, \quad \epsilon_i \stackrel{iid}{\sim} N(0, \sigma^2_Y), \quad i=1,\dots,n, +$$ For example in the context of environmental epidemiology, covariate $X_i$ can be an exposure to air pollution of subject $i$ at different locations, $Z_i$ can be demographic information, and $Y_i$ can be associated health outcome. Since exposure $X_i$ are not directly measured at health subject locations but are predicted from air pollution monitoring station locations, this induces spatially correlated measurement error in $X_i$. 
Also, the uncertainty information should be taken account, which depends on the proximity of the monitoring station to the subject location. One way to incorporate this information is to use a multivariate normal prior on the covariate $(X_1,\dots,X_n)$ (MVN prior approach) adopted by **bspme** package, the **B**ayesian **sp**atially correlated **m**easurement **e**rror models. @@ -45,15 +54,19 @@ devtools::install_github("changwoo-lee/bspme") | Function | Description | | ---------------------- | -------------------------------------------------------------------------| -| `blm_me()` | #' Bayesian normal linear regression models with (spatially) correlated measurement errors | -| `bglm_me()` | #' Bayesian generalized linear regression models with (spatially) correlated measurement errors | -| `vecchia_cov()` | Vecchia approximation given a covariance matrix | +| `blinreg_me()` | Bayesian normal linear regression models with (spatially) correlated measurement errors | +| `blogireg_me()` | Bayesian generalized linear regression models with (spatially) correlated measurement errors | +| `vecchia_cov()` | Vecchia approximation given a covariance matrix | ## datasets -| Data call | Description | +| Dataset call | Description | | ---------------------- | -------------------------------------------------------------------------| -| `data("ozone")` | ozone exposure data | -| `data("health_sim")` | Simulated health data | +| `data("ozone")` | 1987 midwest ozone exposure data | +| `data("health_sim")` | Simulated health data | + +## Examples + +Please see the vignette "Ozone exposure and simulated health data". diff --git a/README.md b/README.md index 0c23273..c09fbfa 100644 --- a/README.md +++ b/README.md @@ -9,10 +9,20 @@ **bspme** is an R package that provides a set of functions for **B**ayesian **sp**atially correlated **m**easurement **e**rror models, the Bayesian linear models with the presence of (spatailly) correlated -measurement error of covariate(s). For illustration, consider a normal -linear regression model with intercept, one covariate $X$ with -measurement error, and one covariate $Z$ without measurement error: $$ -Y_i = \beta_0 + \beta_x X_i + \beta_z Z_i + \epsilon, \quad \epsilon \stackrel{iid}{\sim} N(0, \sigma^2_Y), \quad i=1,\dots,n, +measurement error of covariate(s). For more details, please see the +following paper: + +> A scalable two-stage Bayesian approach accounting for exposure +> measurement error in environmental epidemiology. Lee, C. J., Symanski, +> E., Rammah, A., Kang, D.H., Hopke, P., Park, E.S. (2023+). preprint +> (url here). 
+ +For illustration, consider a normal linear regression model with +intercept, one covariate $X$ with measurement error, and one covariate +$Z$ without measurement error: + +$$ +Y_i = \beta_0 + \beta_x X_i + \beta_z Z_i + \epsilon_i, \quad \epsilon_i \stackrel{iid}{\sim} N(0, \sigma^2_Y), \quad i=1,\dots,n, $$ For example in the context of environmental epidemiology, covariate @@ -51,15 +61,19 @@ devtools::install_github("changwoo-lee/bspme") ## Functionality -| Function | Description | -|-----------------|--------------------------------------------------------------------------------------------------| -| `blm_me()` | \#’ Bayesian normal linear regression models with (spatially) correlated measurement errors | -| `bglm_me()` | \#’ Bayesian generalized linear regression models with (spatially) correlated measurement errors | -| `vecchia_cov()` | Vecchia approximation given a covariance matrix | +| Function | Description | +|-----------------|----------------------------------------------------------------------------------------------| +| `blinreg_me()` | Bayesian normal linear regression models with (spatially) correlated measurement errors | +| `blogireg_me()` | Bayesian generalized linear regression models with (spatially) correlated measurement errors | +| `vecchia_cov()` | Vecchia approximation given a covariance matrix | ## datasets -| Data call | Description | -|----------------------|-----------------------| -| `data("ozone")` | ozone exposure data | -| `data("health_sim")` | Simulated health data | +| Dataset call | Description | +|----------------------|----------------------------------| +| `data("ozone")` | 1987 midwest ozone exposure data | +| `data("health_sim")` | Simulated health data | + +## Examples + +Please see the vignette “Ozone exposure and simulated health data”. 
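+
+A condensed sketch of the two-stage workflow from that vignette (Gaussian process exposure prediction, Vecchia approximation, then fitting the health model with measurement error); object, dataset, and argument names follow the vignette code:
+
+``` r
+# sketch adapted from the package vignette; see the vignette for plots and timings
+library(bspme)
+library(fields)
+data(ozone)
+data(health_sim)
+
+# Stage 1: Gaussian process prediction of ozone exposure at the n = 3000 health
+# subject locations, using the 67 monitoring sites on 06/18/1987 (date id = 16)
+ozone16 = ozone[ozone$date_id == 16, ]
+Dxx = rdist.earth(cbind(ozone16$coords_lon, ozone16$coords_lat))
+Dyy = rdist.earth(cbind(health_sim$coords_y_lon, health_sim$coords_y_lat))
+Dxy = rdist.earth(cbind(ozone16$coords_lon, ozone16$coords_lat),
+                  cbind(health_sim$coords_y_lon, health_sim$coords_y_lat))
+Kxx = Exponential(Dxx, range = 300, phi = 15^2)
+Kyy = Exponential(Dyy, range = 300, phi = 15^2)
+Kxy = Exponential(Dxy, range = 300, phi = 15^2)
+X_mean = t(Kxy) %*% solve(Kxx, ozone16$ozone_ppb)  # predictive mean of exposure
+X_cov  = Kyy - t(Kxy) %*% solve(Kxx, Kxy)           # predictive covariance of exposure
+
+# Vecchia approximation: sparse precision matrix for the exposure prior
+run_vecchia = vecchia_cov(X_cov,
+                          coords = cbind(health_sim$coords_y_lon, health_sim$coords_y_lat),
+                          n.neighbors = 10)
+
+# Stage 2: health model with correlated exposure measurement error
+fit_me = blinreg_me(Y = health_sim$Y,
+                    X_mean = X_mean,
+                    X_prec = run_vecchia$Q,  # sparse precision matrix
+                    Z = health_sim$Z,
+                    nburn = 100, nsave = 1000, nthin = 1)
+summary(fit_me$posterior)
+```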
diff --git a/data-raw/health_sim.R b/data-raw/health_sim.R index 9d60b31..d7bfa56 100644 --- a/data-raw/health_sim.R +++ b/data-raw/health_sim.R @@ -4,28 +4,70 @@ n_y = 3000 library(fields) data(ozone2) lon.lat = ozone2$lon.lat -set.seed(1234) -coords_test = cbind(runif(2*n_y,min(lon.lat[,1]),max(lon.lat[,1])), runif(2*n_y,min(lon.lat[,2]),max(lon.lat[,2]))) -# extract index so that x is between -88 and -86 -idx1 = which(coords_test[,1] > -88 & coords_test[,1] < -86) +set.seed(1) +#coords_test = cbind(runif(2*n_y,min(lon.lat[,1]),max(lon.lat[,1])), runif(2*n_y,min(lon.lat[,2]),max(lon.lat[,2]))) + +# Chicago +coords_test1 = cbind(rnorm(1000, -87.73, 0.4), rnorm(1000, 41.89, 0.4)) +# Milwaukee +coords_test2 = cbind(rnorm(500, -87.91, 0.4), rnorm(500, 43.03, 0.4)) +# Indianapolis +coords_test3 = cbind(rnorm(500, -86.16, 0.4), rnorm(500, 39.76)) +# St Louis +coords_test4 = cbind(rnorm(500, -90.20, 0.4), rnorm(500, 38.63, 0.4)) +# Detroit +coords_test5 = cbind(rnorm(500, -83.1, 0.3), rnorm(500, 42.34, 0.3)) +# Louisville +coords_test6 = cbind(rnorm(200, -85.76, 0.3), rnorm(200, 38.25, 0.3)) +# Cincinnati +coords_test7 = cbind(rnorm(500, -84.51, 0.3), rnorm(500, 39.10, 0.3)) +# Columbus +coords_test8 = cbind(rnorm(500, -82.99, 0.4), rnorm(500, 39.96, 0.4)) +# Champaign +coords_test9 = cbind(rnorm(200, -88.24, 0.4), rnorm(200, 40.11, 0.4)) +# grand rapids +coords_test10 = cbind(rnorm(200, -85.67, 0.4), rnorm(200, 42.96, 0.4)) +# lansing +coords_test11 = cbind(rnorm(200, -84.55, 0.4), rnorm(200, 42.73, 0.4)) +# lexington +coords_test12 = cbind(rnorm(200, -84.49, 0.2), rnorm(200, 38.05, 0.2)) +# madison +coords_test13 = cbind(rnorm(100, -89.40, 0.4), rnorm(100, 43.07, 0.4)) +# davenport +coords_test14 = cbind(rnorm(100, -90.58, 0.4), rnorm(100, 41.52, 0.4)) +# peoria +coords_test15 = cbind(rnorm(100, -89.61, 0.2), rnorm(100, 40.69, 0.2)) +# evansville +coords_test16 = cbind(rnorm(100, -87.57, 0.2), rnorm(100, 37.97, 0.2)) +# la crosse +coords_test17 = cbind(rnorm(100, -91.25, 0.2), rnorm(100, 43.81, 0.2)) +# fort wayne +coords_test18 = cbind(rnorm(100, -85.14, 0.2), rnorm(100, 41.08, 0.2)) + +coords_test = rbind(coords_test1, coords_test2, coords_test3, coords_test4, coords_test5, coords_test6, + coords_test7, coords_test8, coords_test9, coords_test10, coords_test11, coords_test12, + coords_test13, coords_test14, coords_test15, coords_test16, coords_test17, coords_test18) + +# extract index so that x is between -87.9 and -86 +idx1 = which(coords_test[,1] > -87.9 & coords_test[,1] < -86) # extract index so that y is between 41.7 and 45 idx2 = which(coords_test[,2] > 41.7 & coords_test[,2] < 45) idx = intersect(idx1, idx2) -coords_test = coords_test[-idx, ] +if(length(idx) > 0) coords_test = coords_test[-idx, ] # extract index so that x is between -84 and -83 idx1 = which(coords_test[,1] > -84 & coords_test[,1] < -82) # extract index so that y is between 43.5 and 45 idx2 = which(coords_test[,2] > 43.5 & coords_test[,2] < 45) idx = intersect(idx1, idx2) -coords_test = coords_test[-idx, ] +if(length(idx) > 0) coords_test = coords_test[-idx, ] # extract index so that x is between -84 and -82 idx1 = which(coords_test[,1] > -83.5 & coords_test[,1] < -82) # extract index so that y is between 41.5 and 42.5 idx2 = which(coords_test[,2] > 41.5 & coords_test[,2] < 42.5) idx = intersect(idx1, idx2) -coords_test = coords_test[-idx, ] +if(length(idx) > 0) coords_test = coords_test[-idx, ] nrow(coords_test) coords_test = coords_test[-sample.int(nrow(coords_test), nrow(coords_test) - n_y),] @@ -56,28 +98,31 @@ 
pred_cov <- Sigmayy - t(Sigmaxy) %*% solve(Sigmaxx,Sigmaxy) set.seed(1) true_exposure = mvnfast::rmvn(1, pred_mean, pred_cov) true_exposure = as.vector(true_exposure) -par(mfrow = c(1,3)) -quilt.plot(coords_test, pred_mean, main = "mean"); US(add= T) -quilt.plot(coords_test, sqrt(diag(pred_cov)), main = "sd"); US(add= T) -quilt.plot(coords_test, true_exposure, main = "draw from MVN"); US(add= T) +# par(mfrow = c(1,3)) +# quilt.plot(coords_test, pred_mean, main = "mean"); US(add= T) +# quilt.plot(coords_test, sqrt(diag(pred_cov)), main = "sd"); US(add= T) +# quilt.plot(coords_test, true_exposure, main = "draw from MVN"); US(add= T) # simulated covariate z = runif(n_y) # generate health data -y = true_exposure + 10*z + rnorm(n_y, sd = 5) +Y = 100 -0.5*true_exposure + 5*z + rnorm(n_y, sd = 5) +var(Y) +var(-0.5*true_exposure + 5*z) -var(y) -var(true_exposure + 10*z) +linpred = -5 +0.05*true_exposure + 0.5*z +Ybinary = rbinom(n_y, 1, prob = 1/(1+exp(-linpred))) -healthdata_simul = data.frame(y = y, +health_sim = data.frame(Y = Y, + Ybinary = Ybinary, + #X_mean = pred_mean, + #X_cov = pred_cov, coords_y_lon = coords_test[,1], coords_y_lat = coords_test[,2], - covariate = z) -hist(true_exposure) -which(true_exposure<0) -which(y<0) -summary(lm(y ~ z + true_exposure)) + Z = z, + X_true = true_exposure) + +usethis::use_data(health_sim, overwrite = TRUE) -usethis::use_data(healthdata_simul, overwrite = TRUE) diff --git a/data/health_sim.rda b/data/health_sim.rda index d6163c7..ce5a313 100644 Binary files a/data/health_sim.rda and b/data/health_sim.rda differ diff --git a/data/healthdata_simul.rda b/data/healthdata_simul.rda deleted file mode 100644 index e930504..0000000 Binary files a/data/healthdata_simul.rda and /dev/null differ diff --git a/man/blinreg_me.Rd b/man/blinreg_me.Rd new file mode 100644 index 0000000..fdde82e --- /dev/null +++ b/man/blinreg_me.Rd @@ -0,0 +1,115 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/blinreg_me.R +\name{blinreg_me} +\alias{blinreg_me} +\title{Bayesian normal linear regression models with correlated measurement errors} +\usage{ +blinreg_me( + Y, + X_mean, + X_prec, + Z, + nburn = 2000, + nsave = 2000, + nthin = 5, + prior = NULL, + saveX = F +) +} +\arguments{ +\item{Y}{n by 1 matrix, response} + +\item{X_mean}{n by 1 matrix or list of n by 1 matrices of length q, mean of X \eqn{\mu_X}.} + +\item{X_prec}{n by n matrix or list of n by n matrices of length q, precision matrix of X \eqn{Q_X}. Support sparse matrix format from Matrix package.} + +\item{Z}{n by p matrix, covariates without measurement error} + +\item{nburn}{integer, burn-in iteration} + +\item{nsave}{integer, number of posterior samples} + +\item{nthin}{integer, thin-in rate} + +\item{prior}{list of prior parameters, default is var_beta = 100,a_Y = 0.01, b_Y = 0.01} + +\item{saveX}{logical, save X or not} +} +\value{ +list of (1) posterior, the (nsave)x(q+p) matrix of posterior samples as a coda object, +(2) cputime, cpu time taken in seconds, +and optionally (3) X_save, posterior samples of X +} +\description{ +This function implements the Bayesian normal linear regression model with correlated measurement error of covariate(s). +Denote \eqn{Y_i} be a continuous response, \eqn{X_i} be a \eqn{q\times 1} covariate of \eqn{i}th observation that is subject to measurement error, +and \eqn{Z_i} be a \eqn{p\times 1} covariate without measurement error. 
+The likelihood model is Bayesian normal linear regression, +\deqn{Y_i = \beta_0 + X_i^\top \beta_x + Z_i^\top \beta_z + \epsilon_i,\quad \epsilon_i \stackrel{iid}{\sim} N(0, \sigma^2_Y), \quad i=1,\dots,n} +and correlated measurement error of \eqn{X_i, i=1,\dots,n} is incorporated into the model as a multivariate normal prior. For example when \eqn{q=1}, we have \eqn{n-}dimensional multivariate normal prior +\deqn{(X_1,\dots,X_n)\sim N_n(\mu_X, Q_X^{-1}).} +Also, we consider semiconjugate priors for regression coefficients and noise variance; +\deqn{\beta_0 \sim N(0, V_\beta), \quad \beta_{x,j} \stackrel{iid}{\sim} N(0, V_\beta), \quad \beta_{z,k} \stackrel{iid}{\sim} N(0, V_\beta), \quad \sigma_Y^2 \sim IG(a_Y, b_Y).} +The function \code{blinreg_me()} implements the Gibbs sampler for posterior inference. Most importantly, it allows sparse matrix input for \eqn{Q_X} for scalable computation. +} +\examples{ + +\dontrun{ +data(ozone) +data(health_sim) +library(bspme) +data(ozone) +data(health_sim) +library(fields) +# exposure mean and covariance at health subject locations with 06/18/1987 data (date id = 16) +# using Gaussian process with prior mean zero and exponential covariance kernel +# with fixed range 300 (in miles) and stdev 15 (in ppb) + +ozone16 = ozone[ozone$date_id==16,] + +Dxx = rdist.earth(cbind(ozone16$coords_lon, ozone16$coords_lat)) +Dyy = rdist.earth(cbind(health_sim$coords_y_lon, health_sim$coords_y_lat)) +Dxy = rdist.earth(cbind(ozone16$coords_lon, ozone16$coords_lat), + cbind(health_sim$coords_y_lon, health_sim$coords_y_lat)) + +Kxx = Exponential(Dxx, range = 300, phi=15^2) +Kyy = Exponential(Dyy, range = 300, phi=15^2) +Kxy = Exponential(Dxy, range = 300, phi=15^2) + +X_mean = t(Kxy) \%*\% solve(Kxx, ozone16$ozone_ppb) +X_cov = Kyy - t(Kxy) \%*\% solve(Kxx, Kxy) + +# visualize +par(mfrow = c(1,3)) +quilt.plot(cbind(ozone16$coords_lon, ozone16$coords_lat), + ozone16$ozone_ppb, main = "ozone measurements"); US(add= T) + +quilt.plot(cbind(health_sim$coords_y_lon, health_sim$coords_y_lat), + X_mean, main = "health subjects, mean of exposure"); US(add= T) +points(cbind(ozone16$coords_lon, ozone16$coords_lat), pch = 17) + +quilt.plot(cbind(health_sim$coords_y_lon, health_sim$coords_y_lat), + sqrt(diag(X_cov)), main = "health subjects, sd of exposure"); US(add= T) +points(cbind(ozone16$coords_lon, ozone16$coords_lat), pch = 17) + +# vecchia approximation +run_vecchia = vecchia_cov(X_cov, coords = cbind(health_sim$coords_y_lon, health_sim$coords_y_lat), + n.neighbors = 10) +Q_sparse = run_vecchia$Q +run_vecchia$cputime + +# fit the model +fit_me = blm_me(Y = health_sim$Y, + X_mean = X_mean, + X_prec = Q_sparse, # sparse precision matrix + Z = health_sim$Z, + nburn = 100, + nsave = 1000, + nthin = 1) +fit_me$cputime +summary(fit_me$posterior) +library(bayesplot) +bayesplot::mcmc_trace(fit_me$posterior) +} + +} diff --git a/man/blm_me.Rd b/man/blm_me.Rd deleted file mode 100644 index b6e2d46..0000000 --- a/man/blm_me.Rd +++ /dev/null @@ -1,49 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/blm_me.R -\name{blm_me} -\alias{blm_me} -\title{Bayesian normal linear regression models with (spatially) correlated measurement errors} -\usage{ -blm_me( - Y, - X_mean, - X_prec, - Z, - nburn = 2000, - nsave = 2000, - nthin = 5, - prior = NULL, - saveX = F -) -} -\arguments{ -\item{Y}{n by 1 matrix, response} - -\item{X_mean}{n by 1 matrix or list of n by 1 matrices of length q, mean of X.} - -\item{X_prec}{n by n matrix or list of n by n matrices of 
length q, precision matrix of X. Support sparse matrix format from Matrix package.} - -\item{Z}{n by p matrix, covariates without measurement error} - -\item{nburn}{integer, burn-in iteration} - -\item{nsave}{integer, number of posterior samples} - -\item{nthin}{integer, thin-in rate} - -\item{prior}{list of prior parameters, default is var_beta = 100,a_Y = 0.01, b_Y = 0.01} - -\item{saveX}{logical, save X or not} -} -\value{ -list of (1) posterior, the (nsave)x(q+p) matrix of posterior samples as a coda object, -(2) cputime, cpu time taken in seconds, -and optionally (3) X_save, posterior samples of X -} -\description{ -This function implements the Bayesian normal linear regression model subject to measurement error with semiconjugate priors. -} -\details{ -Model: Y = beta0 + beta_x X + beta_z Z + epsilon, epsilon ~ N(0, sigma2y) -Measurement error: sigma_Y^2 ~ IG(0.01, 0.01), beta|sigma_Y^2 ~ N(0, sigma_Y^2*diag(var_beta)) -} diff --git a/man/health_sim.Rd b/man/health_sim.Rd index cb5cdfe..3f524ed 100644 --- a/man/health_sim.Rd +++ b/man/health_sim.Rd @@ -5,18 +5,20 @@ \alias{health_sim} \title{Dataset, simulated health data} \format{ -A data frame with 3000 rows and 4 variables: +A data frame with n = 3000 rows and 4 variables: \describe{ -\item{y}{numeric, simulated health outcome} -\item{coords_y_lon}{numeric, simulated health subject longitude} -\item{coords_y_lat}{numeric, simulated health subject latitude,} -\item{covariate}{numeric, covariate} +\item{Y}{n by 1 matrix, numeric, simulated continuous health outcome} +\item{Ybinary}{n by 1 matrix, numeric, simulated binary health outcome} +\item{coords_y_lon}{n by 1 matrix, numeric, simulated health subject longitude} +\item{coords_y_lat}{n by 1 matrix, numeric, simulated health subject latitude} +\item{Z}{n by 1 matrix, numeric, covariate} +\item{X_true}{n by 1 matrix, numeric, true ozone exposure used for simulation} } } \usage{ health_sim } \description{ -Simulated health data based on ozone exposures on 06/18/1987. +Simulated health data based on ozone exposures on 06/18/1987. For details, see \code{health_sim.R}. } \keyword{datasets} diff --git a/vignettes/Ozone-exposure-and-health-data-analysis.Rmd b/vignettes/Ozone-exposure-and-health-data-analysis.Rmd new file mode 100644 index 0000000..8eaebfc --- /dev/null +++ b/vignettes/Ozone-exposure-and-health-data-analysis.Rmd @@ -0,0 +1,180 @@ +--- +title: "Ozone exposure and health data analysis" +output: rmarkdown::html_vignette +vignette: > + %\VignetteIndexEntry{ozone_simuldata} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} +--- + + + +This is a vignette for the `bspme` package based on ozone exposure data of midwest US, 1987 and simulated health dataset. + +First load the package and data: + + +```r +rm(list=ls()) +library(bspme) +library(fields) +data(ozone) +data(health_sim) +``` + +The ozone data are collected daily at 67 monitoring sites in midwest US during 89 days. We focus on ozone exposure on a specific date 06/18/1987 (date id = 16). We also consider simulated health dataset, with $n=3000$ subject locations and continuous health responses. + + +```r +ozone16 = ozone[ozone$date_id==16,] +par(mfrow = c(1,2)) +quilt.plot(cbind(ozone16$coords_lon, ozone16$coords_lat), + ozone16$ozone_ppb, main = "67 ozone monitoring sites"); US(add= T) +plot(health_sim$coords_y_lon, health_sim$coords_y_lat, pch = 19, cex = 0.5, col = "grey", + xlab = "Longitude", ylab = "Latitude", main = "3000 simulated health subject locations"); US(add = T) +``` + +
+![plot of chunk ozone_eda](figure/ozone_eda-1.png)
+ +### Obtain posterior predictive distribution at health subject locations + +In this example, we will use Gaussian process to obtain a predictive distribution of $(X_1,\dots,X_n)$, the ozone exposure at the health subject locations. The posterior predictive distribution is n-dimensional multivariate normal $N(\mu_X, Q_X^{-1})$, which will be used as a prior in health model to incorporate measurement error information of ozone exposure. +Specifically, we will use the exponential covariance kernel with fixed range 300 (in miles) and standard deviation 15 (in ppb). + + +```r + +# distance calculation +Dxx = rdist.earth(cbind(ozone16$coords_lon, ozone16$coords_lat)) +Dyy = rdist.earth(cbind(health_sim$coords_y_lon, health_sim$coords_y_lat)) +Dxy = rdist.earth(cbind(ozone16$coords_lon, ozone16$coords_lat), + cbind(health_sim$coords_y_lon, health_sim$coords_y_lat)) + +# kernel matrices +Kxx = Exponential(Dxx, range = 300, phi=15^2) +Kyy = Exponential(Dyy, range = 300, phi=15^2) +Kxy = Exponential(Dxy, range = 300, phi=15^2) + +# posterior predictive mean and covariance +X_mean = t(Kxy) %*% solve(Kxx, ozone16$ozone_ppb) +X_cov = Kyy - t(Kxy) %*% solve(Kxx, Kxy) + +# visualization of posterior predictive distribution. Black triangle is monitoring station locations. +par(mfrow = c(1,2)) +quilt.plot(cbind(health_sim$coords_y_lon, health_sim$coords_y_lat), + X_mean, main = "health subjects, mean of exposure"); US(add= T) +points(cbind(ozone16$coords_lon, ozone16$coords_lat), pch = 17) + +quilt.plot(cbind(health_sim$coords_y_lon, health_sim$coords_y_lat), + sqrt(diag(X_cov)), main = "health subjects, sd of exposure"); US(add= T) +points(cbind(ozone16$coords_lon, ozone16$coords_lat), pch = 17) +``` + +
+![plot of chunk ozone_eda3](figure/ozone_eda3-1.png)
+ +The covariance of $(X_1,\dots,X_n)$ contains all spatial dependency information up to a second order, but its inverse $Q_X$ become a dense matrix. +When it comes to fitting the Bayesian linear model with measurement error, the naive implementation with n-dimensional multivariate normal prior $N(\mu_X, Q_X^{-1})$ takes cubic time complexity of posterior inference for each MCMC iteration, which becomes infeasible to run when $n$ is large such as tens of thousands. + + +### Run Vecchia approximation to get sparse precision matrix + +We propose to use Vecchia approximation to get a new multivariate normal prior of $(X_1,\dots,X_n)$ with sparse precision matrix, which is crucial for scalable implementation of health model with measurement error, but also at the same time it aims to best preserve the spatial dependency information in the original covariance matrix. + + +```r +#Run the Vecchia approximation to get the sparse precision matrix. + +# Vecchia approximation +run_vecchia = vecchia_cov(X_cov, coords = cbind(health_sim$coords_y_lon, health_sim$coords_y_lat), + n.neighbors = 10) +Q_sparse = run_vecchia$Q +run_vecchia$cputime +#> Time difference of 0.796325 secs +``` + + +### Fit bspme with sparse precision matrix + +We can now fit the main function of bspme package with sparse precision matrix. + + +```r +# fit the model +fit_me = blinreg_me(Y = health_sim$Y, + X_mean = X_mean, + X_prec = Q_sparse, # sparse precision matrix + Z = health_sim$Z, + nburn = 100, + nsave = 1000, + nthin = 1) +``` + +```r +fit_me$cputime +#> Time difference of 6.17423 secs +``` +It took less than 10 seconds to (1) run Vecchia approximation and (2) run the MCMC algorithm with total 1100 iterations. + + +```r +summary(fit_me$posterior) +#> +#> Iterations = 1:1000 +#> Thinning interval = 1 +#> Number of chains = 1 +#> Sample size per chain = 1000 +#> +#> 1. Empirical mean and standard deviation for each variable, +#> plus standard error of the mean: +#> +#> Mean SD Naive SE Time-series SE +#> (Intercept) 100.0234 0.624015 0.0197331 0.0259492 +#> exposure.1 -0.5021 0.007004 0.0002215 0.0003135 +#> covariate.1 4.3374 0.337301 0.0106664 0.0126324 +#> sigma2_Y 24.7826 0.712924 0.0225446 0.0280662 +#> +#> 2. Quantiles for each variable: +#> +#> 2.5% 25% 50% 75% 97.5% +#> (Intercept) 98.8360 99.6014 100.0057 100.4462 101.2774 +#> exposure.1 -0.5163 -0.5066 -0.5021 -0.4974 -0.4882 +#> covariate.1 3.6657 4.1060 4.3402 4.5681 4.9898 +#> sigma2_Y 23.3910 24.3080 24.7964 25.2340 26.1523 +bayesplot::mcmc_trace(fit_me$posterior) +``` + +![plot of chunk unnamed-chunk-3](figure/unnamed-chunk-3-1.png) + +### Fit bspme without sparse precision matrix + + +As mentioned before, if precision matrix is dense, the posterior computation becomes infeasible when $n$ becomes large, such as tens of thousands. See the following example when $n=3000$. + + +```r +# fit the model, without vecchia approximation +# I will only run 11 iteration for illustration purpose +Q_dense = solve(X_cov) # inverting 3000 x 3000 matrix, takes some time +# run +fit_me_dense = blinreg_me(Y = health_sim$Y, + X_mean = X_mean, + X_prec = Q_dense, # dense precision + Z = health_sim$Z, + nburn = 1, + nsave = 10, + nthin = 1) +``` + +```r +fit_me_dense$cputime +#> Time difference of 58.06338 secs +``` + +With even only 11 MCMC iterations, It took about 1 mins to run the MCMC algorithm. 
+If number of health subject locations is tens of thousands, the naive algorithm using dense precision matrix becomes infeasible, and Vecchia approximation becomes necesssary to carry out the inference. diff --git a/vignettes/figure/.DS_Store b/vignettes/figure/.DS_Store new file mode 100644 index 0000000..5008ddf Binary files /dev/null and b/vignettes/figure/.DS_Store differ diff --git a/vignettes/figure/ozone_eda-1.png b/vignettes/figure/ozone_eda-1.png new file mode 100644 index 0000000..c6c53af Binary files /dev/null and b/vignettes/figure/ozone_eda-1.png differ diff --git a/vignettes/figure/ozone_eda3-1.png b/vignettes/figure/ozone_eda3-1.png new file mode 100644 index 0000000..c8b8d88 Binary files /dev/null and b/vignettes/figure/ozone_eda3-1.png differ diff --git a/vignettes/figure/unnamed-chunk-3-1.png b/vignettes/figure/unnamed-chunk-3-1.png new file mode 100644 index 0000000..6d1c0e6 Binary files /dev/null and b/vignettes/figure/unnamed-chunk-3-1.png differ diff --git a/vignettes/ozone-exposure-and-health-data-analysis.Rmd.orig b/vignettes/ozone-exposure-and-health-data-analysis.Rmd.orig new file mode 100644 index 0000000..f9d676c --- /dev/null +++ b/vignettes/ozone-exposure-and-health-data-analysis.Rmd.orig @@ -0,0 +1,139 @@ +--- +title: "Ozone exposure and health data analysis" +output: rmarkdown::html_vignette +vignette: > + %\VignetteIndexEntry{ozone_simuldata} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} +--- + +```{r, include = FALSE} +knitr::opts_chunk$set( + collapse = TRUE, + comment = "#>" +) +``` + +This is a vignette for the `bspme` package based on ozone exposure data of midwest US, 1987 and simulated health dataset. + +First load the package and data: + +```{r setup} +rm(list=ls()) +library(bspme) +library(fields) +data(ozone) +data(health_sim) +``` + +The ozone data are collected daily at 67 monitoring sites in midwest US during 89 days. We focus on ozone exposure on a specific date 06/18/1987 (date id = 16). We also consider simulated health dataset, with $n=3000$ subject locations and continuous health responses. + +```{r, ozone_eda, fig.width = 10, fig.height=5, out.width ="100%"} +ozone16 = ozone[ozone$date_id==16,] +par(mfrow = c(1,2)) +quilt.plot(cbind(ozone16$coords_lon, ozone16$coords_lat), + ozone16$ozone_ppb, main = "67 ozone monitoring sites"); US(add= T) +plot(health_sim$coords_y_lon, health_sim$coords_y_lat, pch = 19, cex = 0.5, col = "grey", + xlab = "Longitude", ylab = "Latitude", main = "3000 simulated health subject locations"); US(add = T) +``` + +### Obtain posterior predictive distribution at health subject locations + +In this example, we will use Gaussian process to obtain a predictive distribution of $(X_1,\dots,X_n)$, the ozone exposure at the health subject locations. The posterior predictive distribution is n-dimensional multivariate normal $N(\mu_X, Q_X^{-1})$, which will be used as a prior in health model to incorporate measurement error information of ozone exposure. +Specifically, we will use the exponential covariance kernel with fixed range 300 (in miles) and standard deviation 15 (in ppb). 
+ +```{r, ozone_eda3, fig.width = 10, fig.height=5, out.width ="100%"} + +# distance calculation +Dxx = rdist.earth(cbind(ozone16$coords_lon, ozone16$coords_lat)) +Dyy = rdist.earth(cbind(health_sim$coords_y_lon, health_sim$coords_y_lat)) +Dxy = rdist.earth(cbind(ozone16$coords_lon, ozone16$coords_lat), + cbind(health_sim$coords_y_lon, health_sim$coords_y_lat)) + +# kernel matrices +Kxx = Exponential(Dxx, range = 300, phi=15^2) +Kyy = Exponential(Dyy, range = 300, phi=15^2) +Kxy = Exponential(Dxy, range = 300, phi=15^2) + +# posterior predictive mean and covariance +X_mean = t(Kxy) %*% solve(Kxx, ozone16$ozone_ppb) +X_cov = Kyy - t(Kxy) %*% solve(Kxx, Kxy) + +# visualization of posterior predictive distribution. Black triangle is monitoring station locations. +par(mfrow = c(1,2)) +quilt.plot(cbind(health_sim$coords_y_lon, health_sim$coords_y_lat), + X_mean, main = "health subjects, mean of exposure"); US(add= T) +points(cbind(ozone16$coords_lon, ozone16$coords_lat), pch = 17) + +quilt.plot(cbind(health_sim$coords_y_lon, health_sim$coords_y_lat), + sqrt(diag(X_cov)), main = "health subjects, sd of exposure"); US(add= T) +points(cbind(ozone16$coords_lon, ozone16$coords_lat), pch = 17) +``` + +The covariance of $(X_1,\dots,X_n)$ contains all spatial dependency information up to a second order, but its inverse $Q_X$ become a dense matrix. +When it comes to fitting the Bayesian linear model with measurement error, the naive implementation with n-dimensional multivariate normal prior $N(\mu_X, Q_X^{-1})$ takes cubic time complexity of posterior inference for each MCMC iteration, which becomes infeasible to run when $n$ is large such as tens of thousands. + + +### Run Vecchia approximation to get sparse precision matrix + +We propose to use Vecchia approximation to get a new multivariate normal prior of $(X_1,\dots,X_n)$ with sparse precision matrix, which is crucial for scalable implementation of health model with measurement error, but also at the same time it aims to best preserve the spatial dependency information in the original covariance matrix. + +```{r, vecchia} +#Run the Vecchia approximation to get the sparse precision matrix. + +# Vecchia approximation +run_vecchia = vecchia_cov(X_cov, coords = cbind(health_sim$coords_y_lon, health_sim$coords_y_lat), + n.neighbors = 10) +Q_sparse = run_vecchia$Q +run_vecchia$cputime +``` + + +### Fit bspme with sparse precision matrix + +We can now fit the main function of bspme package with sparse precision matrix. + +```{r, sparse, results = "hide"} +# fit the model +fit_me = blinreg_me(Y = health_sim$Y, + X_mean = X_mean, + X_prec = Q_sparse, # sparse precision matrix + Z = health_sim$Z, + nburn = 100, + nsave = 1000, + nthin = 1) +``` +```{r} +fit_me$cputime +``` +It took less than 10 seconds to (1) run Vecchia approximation and (2) run the MCMC algorithm with total 1100 iterations. + +```{r} +summary(fit_me$posterior) +bayesplot::mcmc_trace(fit_me$posterior) +``` + +### Fit bspme without sparse precision matrix + + +As mentioned before, if precision matrix is dense, the posterior computation becomes infeasible when $n$ becomes large, such as tens of thousands. See the following example when $n=3000$. 
+ +```{r, dense, results = "hide"} +# fit the model, without vecchia approximation +# I will only run 11 iteration for illustration purpose +Q_dense = solve(X_cov) # inverting 3000 x 3000 matrix, takes some time +# run +fit_me_dense = blinreg_me(Y = health_sim$Y, + X_mean = X_mean, + X_prec = Q_dense, # dense precision + Z = health_sim$Z, + nburn = 1, + nsave = 10, + nthin = 1) +``` +```{r} +fit_me_dense$cputime +``` + +With even only 11 MCMC iterations, It took about 1 mins to run the MCMC algorithm. +If number of health subject locations is tens of thousands, the naive algorithm using dense precision matrix becomes infeasible, and Vecchia approximation becomes necesssary to carry out the inference. diff --git a/vignettes/ozone_simuldata.Rmd b/vignettes/ozone_simuldata.Rmd deleted file mode 100644 index 8795ca9..0000000 --- a/vignettes/ozone_simuldata.Rmd +++ /dev/null @@ -1,26 +0,0 @@ ---- -title: "Ozone exposure and simulated health data" -output: rmarkdown::html_vignette -vignette: > - %\VignetteIndexEntry{ozone_simuldata} - %\VignetteEngine{knitr::rmarkdown} - %\VignetteEncoding{UTF-8} ---- - -```{r, include = FALSE} -knitr::opts_chunk$set( - collapse = TRUE, - comment = "#>" -) -``` - -This is a vignette for the `bspme` package using ozone data and simulated health dataset. - -```{r setup} -#library(bspme) -``` - -bspme is an R package that provides a set of functions for Bayesian linear models with the presence of (spatial) measurement error of covariates. For example, consider a normal linear regression model with intercept, one covariate $X$ with measurement error, and one covariate $Z$ without measurement error: -\[ -Y_i = \beta_0 + \beta_x X_i + \beta_z Z_i + \epsilon, \quad \epsilon \stackrel{iid}{\sim} N(0, \sigma^2_Y), \quad i=1,\dots,n, -\]