diff --git a/.gitignore b/.gitignore index 234f028..0d7f03b 100644 --- a/.gitignore +++ b/.gitignore @@ -3,3 +3,4 @@ .RData .Ruserdata docs +inst/doc diff --git a/DESCRIPTION b/DESCRIPTION index dd2aee6..4c66bd1 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -11,8 +11,15 @@ LazyData: true Roxygen: list(markdown = TRUE) RoxygenNote: 7.2.3 Imports: - Matrix, + coda, + fields, + spam, spNNGP Depends: + Matrix, R (>= 2.10) URL: https://changwoo-lee.github.io/bspme/ +Suggests: + knitr, + rmarkdown +VignetteBuilder: knitr diff --git a/NAMESPACE b/NAMESPACE index d75f824..1419102 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1 +1,5 @@ -exportPattern("^[[:alpha:]]+") +# Generated by roxygen2: do not edit by hand + +export(blm_me) +export(vecchia_cov) +importFrom(Matrix,Diagonal) diff --git a/R/blm_me.R b/R/blm_me.R new file mode 100644 index 0000000..1f3a4c5 --- /dev/null +++ b/R/blm_me.R @@ -0,0 +1,213 @@ +#' Bayesian normal linear regression models with (spatially) correlated measurement errors +#' +#' This function implements the Bayesian normal linear regression model subject to measurement error with semiconjugate priors. +#' +#' Model: Y = beta0 + beta_x X + beta_z Z + epsilon, epsilon ~ N(0, sigma2y) +#' Measurement error: sigma_Y^2 ~ IG(0.01, 0.01), beta|sigma_Y^2 ~ N(0, sigma_Y^2*diag(var_beta)) +#' +#' @param Y n by 1 matrix, response +#' @param X_mean n by 1 matrix or list of n by 1 matrices of length q, mean of X. +#' @param X_prec n by n matrix or list of n by n matrices of length q, precision matrix of X. Support sparse matrix format from Matrix package. +#' @param Z n by p matrix, covariates without measurement error +#' @param nburn integer, burn-in iteration +#' @param nthin integer, thin-in rate +#' @param nsave integer, number of posterior samples +#' @param prior list of prior parameters, default is var_beta = 100,a_Y = 0.01, b_Y = 0.01 +#' @param saveX logical, save X or not +#' +#' @return list of (1) posterior, the (nsave)x(q+p) matrix of posterior samples as a coda object, +#' (2) cputime, cpu time taken in seconds, +#' and optionally (3) X_save, posterior samples of X +#' @export +#' +#' @examples +#' +#' +#' +#' +#' +#' +blm_me <- function(Y, + X_mean, + X_prec, + Z, + nburn = 2000, + nsave = 2000, + nthin = 5, + prior = NULL, + saveX = F){ + + # prior input, default + if(is.null(prior)){ + prior = list(var_beta = 100,a_Y = 0.01, b_Y = 0.01) + } + var_beta = 100 + a_Y = 0.01 + b_Y = 0.01 + + n_y = length(Y) + if(is.vector(Z)) Z = as.matrix(Z) + + if(!is.list(X_mean) & !is.list(X_prec)){ + q = 1 + X_mean = list(X_mean) + X_prec = list(X_prec) + }else if(is.list(X_mean) & is.list(X_prec)){ + q = length(X_mean) + if(length(X_prec)!=q) stop("list length does not match") + }else{ + stop("X_mean is not vector/matrix or list") + } + X_prec_X_mean = list() + X_spamstruct = vector(mode = 'list', length = q) + sparsealgo = rep(T,q) + + for(qq in 1:q){ + X_prec_X_mean[[qq]] = as.numeric(X_prec[[qq]]%*%X_mean[[qq]]) + + if(!("sparseMatrix" %in% is(X_prec[[qq]]))){ + print(paste0(qq,"th X_prec is not a sparse matrix! Using dense algorithm, which may very slow when n is large")) + sparsealgo[qq] = F + }else{ + X_prec[[qq]] = as(as(X_prec[[qq]], "generalMatrix"), "CsparseMatrix") + X_prec[[qq]] = spam::as.spam.dgCMatrix(X_prec[[qq]])# spam object + X_spamstruct[[qq]] = spam::chol(X_prec[[qq]]) + } + } + + X = matrix(0, n_y, q) + for(qq in 1:q) X[,qq] = X_mean[[q]] + if(is.null(names(X_mean))){ + colnames(X) = paste0("exposure.",1:q) + }else{ + colnames(X) = paste0("exposure.",names(X_mean)) + } + + p = ncol(Z) + if(is.null(colnames(Z))){ + colnames(Z) = paste0("covariate.",1:p) + }else{ + colnames(Z) = paste0("covariate.",colnames(Z)) + } + df_temp = as.data.frame(cbind(X,Z)) + D = model.matrix( ~ ., df_temp) + + + # prior + Sigma_beta = diag(var_beta, ncol(D))# 3 coefficients(beta0, beta1, betaz) + Sigma_betainv = solve(Sigma_beta) + + # initialize + sigma2_Y = 1 + beta = rep(0.1, ncol(D)) + + sigma2_save = matrix(0, nsave, 1) + colnames(sigma2_save) = "sigma2_Y" + beta_save = matrix(0, nsave, ncol(D)) + colnames(beta_save) <- colnames(D) + if(saveX){ + X_save = array(0, dim = c(nsave, n_y, q)) + dimnames(X_save)[[3]] = names(X_mean) + } + + YtY = crossprod(Y) + #browser() + # sampler starts + isave = 0 + isnegative = numeric(n_y) + pb <- txtProgressBar(style=3) + t_start = Sys.time() + for(imcmc in 1:(nsave*nthin + nburn)){ + setTxtProgressBar(pb, imcmc/(nsave*nthin + nburn)) + # sample beta + Vbetainv = Sigma_betainv + crossprod(D)/sigma2_Y + betatilde = solve(Vbetainv,crossprod(D,Y)/sigma2_Y) + beta = as.numeric(spam::rmvnorm.prec(1, mu = betatilde, Q = Vbetainv)) + # sample sigma2_Y + SSR = crossprod(Y - D%*%beta) + sigma2_Y = 1/rgamma(1, a_Y + n_y/2, b_Y + SSR/2 ) + + for(qq in 1:q){ + # 1st is intercept + b_G = X_prec_X_mean[[qq]] + beta[qq + 1]/sigma2_Y*(Y-D[,-(qq+1)]%*%beta[-(qq+1)]) + Qtilde = X_prec[[qq]] # dense or spam + if(sparsealgo[qq]){ + Qtilde = Qtilde + spam::diag.spam(beta[qq + 1]^2/sigma2_Y, n_y, n_y) + }else{ + diag(Qtilde) = diag(Qtilde) + beta[qq + 1]^2/sigma2_Y + } + Xstar = spam::rmvnorm.canonical(1, b = as.vector(b_G), + Q = Qtilde,# dense or spam + Rstruct = X_spamstruct[[qq]]) #browser() + if(imcmc > nburn) isnegative = isnegative + (Xstar<0) + D[,(qq+1)] = as.vector(Xstar) + } + + + if((imcmc > nburn)&(imcmc%%nthin==0)){ + isave = isave + 1 + sigma2_save[isave] = sigma2_Y + beta_save[isave,] = beta + if(saveX) X_save[isave,,] = D[,2:(q+1)] + } + } + t_diff = difftime(Sys.time(), t_start, units = "secs") + #print(paste0("Exposure components contains negative vaules total ",sum(isnegative)," times among (# exposures) x n_y x (MCMC iter after burnin) = ",q," x ",n_y," x ",nsave*nthin," instances")) + out = list() + out$posterior = cbind(beta_save, sigma2_save) + out$posterior = coda::mcmc(out$posterior) + out$cputime = t_diff + if(saveX) out$X_save = X_save + out +} + + + + + + + + + + + + + + + + + + + + + + + + + + +# data(mtcars) +# +# fitme = linreg_me(Y = mtcars$mpg, +# X_mean = mtcars$wt, +# X_prec = diag(rep(10, 32)), +# Z = cbind(mtcars$am, mtcars$vs), var_beta = 10000, nsave = 1000, nthin = 10, saveX = T) +# fitme_conj = linreg_me_conj(Y = mtcars$mpg, +# X_mean = mtcars$wt, +# X_prec = Matrix(diag(rep(10, 32))), +# Z = cbind(mtcars$am, mtcars$vs), var_beta = 10000, nsave = 1000, nthin = 10, saveX = T) +# +# fitme2 = linreg_me(Y = mtcars$mpg, +# X_mean = list(wt = mtcars$wt, cyl = mtcars$cyl), +# X_prec = list(wt = diag(rep(10, 32)), cyl = diag(rep(10, 32))), +# Z = cbind(mtcars$am, mtcars$vs), var_beta = 10000, nsave = 1000, nthin = 10, saveX = T) + +# +# +# summary(fit$posterior) +# +# summary(fitme$posterior) +# +# summary(fitme2$posterior) +# str(fitme2$X_save) +# plot(fitme2$posterior) diff --git a/R/bspme-package.R b/R/bspme-package.R new file mode 100644 index 0000000..2115ecb --- /dev/null +++ b/R/bspme-package.R @@ -0,0 +1,37 @@ +#' @keywords internal +"_PACKAGE" + +## usethis namespace: start +## usethis namespace: end +NULL + + +#' Dataset, ozone exposure +#' +#' This is a subset of "ozone2" dataset in fields package, only containing data from monitoring station with no missing values. +#' The 8-hour average (surface) ozone (from 9AM-4PM) measured in parts per billion (PPB) for 67 sites in the midwestern US over the period June 3,1987 through August 31, 1987, 89 days. +#' +#' @format A data frame with 5963 rows and 6 variables: +#' \describe{ +#' \item{date_id}{integer, 1 corresponds to 06/03/1987 and 89 corresponds to 08/31/1987} +#' \item{date}{POIXct, date} +#' \item{station_id}{character, station id} +#' \item{coords_lon}{numeric, longitude of monitoring station} +#' \item{coords_lat}{numeric, latitude of monitoring station} +#' \item{ozone_ppb}{8-hour average surface ozone from 9am-4pm in parts per billion (PPB)} +#' } +"ozone" + + +#' Dataset, simulated health data +#' +#' Simulated health data based on ozone exposures on 06/18/1987. +#' +#' @format A data frame with 3000 rows and 4 variables: +#' \describe{ +#' \item{y}{numeric, simulated health outcome} +#' \item{coords_y_lon}{numeric, simulated health subject longitude} +#' \item{coords_y_lat}{numeric, simulated health subject latitude,} +#' \item{covariate}{numeric, covariate} +#' } +"health_sim" diff --git a/R/hello.R b/R/hello.R deleted file mode 100644 index 3d348f2..0000000 --- a/R/hello.R +++ /dev/null @@ -1,18 +0,0 @@ -# Hello, world! -# -# This is an example function named 'hello' -# which prints 'Hello, world!'. -# -# You can learn more about package authoring with RStudio at: -# -# http://r-pkgs.had.co.nz/ -# -# Some useful keyboard shortcuts for package authoring: -# -# Install Package: 'Cmd + Shift + B' -# Check Package: 'Cmd + Shift + E' -# Test Package: 'Cmd + Shift + T' - -hello <- function() { - print("Hello, world!") -} diff --git a/R/runvecchia.R b/R/vecchia_cov.R similarity index 51% rename from R/runvecchia.R rename to R/vecchia_cov.R index 8585491..64846a0 100644 --- a/R/runvecchia.R +++ b/R/vecchia_cov.R @@ -1,34 +1,34 @@ - -#' Title +#' +#' Vecchia approximation given a covariance matrix +#' +#' This function finds a sparse precision matrix (inverse precision matrix) Q that approximates the covariance matrix Sigma based on Vecchia's approximation. +#' Specifically, the nearest neighbor conditioning set are used based on the coords argument. #' #' @param Sigma n by n covariance matrix #' @param coords n by 2 matrix, coordinates to determine neighborhood #' @param n.neighbors positive integer, number of nearest neighbors to construct knn graph #' @param ord length n vector, ordering of data. If NULL, ordering based on first coordinate will be used. -#' @param KLdiv logical return KL divergence? +#' @param KLdiv logical, return KL divergence? #' -#' @return list of (1) sparse precision matrix Q, (2) ordering, and (3) cpu time in seconds +#' @return list of (1) Q, the sparse precision matrix, +#' (2) ord, the ordering used for Vecchia approximation, +#' (3) cputime, time taken in seconds, +#' and optionally (4) KLdiv, the KL divergence KL(p, ptilde), where p is multivariate normal with mean 0 and covariance Sigma, and ptilde is multivariate normal with mean 0 and precision Q. #' @importFrom Matrix Diagonal #' #' @export #' #' @examples #' +#' n = 1000 +#' coords = cbind(runif(n), runif(n)) +#' Sigma = fields::Exp.cov(coords, aRange = 1) +#' fit5 = vecchia_cov(Sigma, coords, n.neighbors = 5, KLdiv = TRUE) +#' fit5$KLdiv +#' fit10 = vecchia_cov(Sigma, coords, n.neighbors = 10, KLdiv = TRUE) +#' fit10$KLdiv #' -library(fields) -n = 100 -coords = cbind(runif(n), runif(n)) -Sigma = fields::Exp.cov(coords, aRange = 1) - -fit5 = runvecchia(Sigma, coords, n.neighbors = 1, KLdiv = T) - -fit5 = runvecchia(Sigma, coords, n.neighbors = 5, KLdiv = T) -fit$KLdiv -fit10 = runvecchia(Sigma, coords, n.neighbors = 10, KLdiv = T) -fit$KLdiv -#' -#' -runvecchia = function(Sigma, coords, n.neighbors, ord = NULL, KLdiv = F){ +vecchia_cov = function(Sigma, coords, n.neighbors, ord = NULL, KLdiv = FALSE){ start.time = Sys.time() n = ncol(Sigma) @@ -47,8 +47,19 @@ runvecchia = function(Sigma, coords, n.neighbors, ord = NULL, KLdiv = F){ nn.indx <- indx$nnIndx n.indx = spNNGP:::mk.n.indx.list(nn.indx, n, n.neighbors) - NN_ind <- t(sapply(1: (n - 1), get_NN_ind, n.indx[-1], n.neighbors)) - browser() + if(n.neighbors > 1){ + NN_ind <- t(sapply(1: (n - 1), get_NN_ind, n.indx[-1], n.neighbors)) + }else if(n.neighbors == 1){ # n.neighbors == 1 + NN_ind <- as.matrix(sapply(1: (n - 1), get_NN_ind, n.indx[-1], n.neighbors)) + }else{ # n.neighbors == 0 + Q = Matrix::Diagonal(n, x=1/diag(Sigma)) + Q = as(Q, "dgCMatrix") + out = list(Q = Q, ord = ord, cputime = difftime(Sys.time(),start.time, units = "secs")) + if(KLdiv){ + out$KLdiv = as.numeric(0.5*(- determinant(Q)$modulus - determinant(Sigma)$modulus)) + } + return(out) + } Sigma = Sigma[ord,ord] # Sigma is now reordered, overwrite to reduce memory usage Nlist = list() @@ -61,7 +72,7 @@ runvecchia = function(Sigma, coords, n.neighbors, ord = NULL, KLdiv = F){ } } # sparse matrix - A = Matrix(0, n, n); + A = Matrix::Matrix(0, n, n); D = numeric(n) #D = Matrix(0, n, n); D[1] = 1 # pseudocode 2 of Finley et al. @@ -72,7 +83,7 @@ runvecchia = function(Sigma, coords, n.neighbors, ord = NULL, KLdiv = F){ D[i+1]= Sigma[i+1, i+1] - sum(Sigma[i+1, Nlist[[i+1]]]*temp) } D = Diagonal(n, x = D) - Q_reordered = t((Matrix::Diagonal(n) - A))%*%solve(D)%*%(Matrix::Diagonal(n) - A) + Q_reordered = Matrix::t((Matrix::Diagonal(n) - A))%*%solve(D)%*%(Matrix::Diagonal(n) - A) Q = Q_reordered[order(ord),order(ord)] end.time = Sys.time() @@ -81,7 +92,7 @@ runvecchia = function(Sigma, coords, n.neighbors, ord = NULL, KLdiv = F){ out$ord = ord out$cputime = difftime(end.time,start.time, units = "secs") if(KLdiv){ - out$KLdiv = as.numeric(0.5*(sum(Q*Sigma[order(ord),order(ord)]) - n - determinant(Q)$modulus - determinant(Sigma)$modulus)) + out$KLdiv = as.numeric(0.5*(sum(Q*Sigma[order(ord),order(ord)]) - n - Matrix::determinant(Q)$modulus - determinant(Sigma)$modulus)) } return(out) } @@ -90,8 +101,10 @@ runvecchia = function(Sigma, coords, n.neighbors, ord = NULL, KLdiv = F){ -#https://github.com/LuZhangstat/NNGP_STAN/blob/master/NNMatrix.R +#' helper function, get nearest neighbor indices +#' source: https://github.com/LuZhangstat/NNGP_STAN/blob/master/NNMatrix.R +#' @noRd get_NN_ind <- function (ind, ind_distM_i, M) { if (ind < M ){l = ind } else {l = M}; D_i <- rep(0, M); diff --git a/README.Rmd b/README.Rmd index 597327e..fecf17f 100644 --- a/README.Rmd +++ b/README.Rmd @@ -1,5 +1,9 @@ --- -output: github_document +output: + github_document: + pandoc_args: [ + "--mathjax" + ] --- @@ -9,16 +13,23 @@ knitr::opts_chunk$set( collapse = TRUE, comment = "#>", fig.path = "man/figures/README-", - out.width = "100%" -) + out.width = "100%") ``` + # bspme -bspme is an R package that provides a set of functions for Bayesian spatial measurement error models based on Markov chain Monte Carlo (MCMC) methods. +**bspme** is an R package that provides a set of functions for **B**ayesian **sp**atially correlated **m**easurement **e**rror models, the Bayesian linear models with the presence of (spatailly) correlated measurement error of covariate(s). For illustration, consider a normal linear regression model with intercept, one covariate $X$ with measurement error, and one covariate $Z$ without measurement error: +\[ +Y_i = \beta_0 + \beta_x X_i + \beta_z Z_i + \epsilon, \quad \epsilon \stackrel{iid}{\sim} N(0, \sigma^2_Y), \quad i=1,\dots,n, +\] + +For example in the context of environmental epidemiology, covariate $X_i$ can be an exposure to air pollution of subject $i$ at different locations, $Z_i$ can be demographic information, and $Y_i$ can be associated health outcome. Since exposure $X_i$ are not directly measured at health subject locations but are predicted from air pollution monitoring station locations, this induces spatially correlated measurement error in $X_i$. Also, the uncertainty information should be taken account, which depends on the proximity of the monitoring station to the subject location. One way to incorporate this information is to use a multivariate normal prior on the covariate $(X_1,\dots,X_n)$ (MVN prior approach) adopted by **bspme** package, the **B**ayesian **sp**atially correlated **m**easurement **e**rror models. + +The **bspme** package provides fast, scalable computational tools for Bayesian linear regression models with spatially correlated measurement errors represented as a MVN prior distribution. The posterior inference is carried out using Markov chain Monte Carlo (MCMC) methods. When implemented naively, running the MCMC is impossible because of the $O(n^3)$ computational cost associated with the $n$-dimensional MVN prior, where $n$ is the number of subjects. By adopting sparse MVN prior that has sparse precision matrices based on Vecchia approximation, the **bspme** package provides a fast algorithm to carry out posterior inference for large datasets, with the number of subjects $n$ possibly as big as tens of thousands. ## Installation @@ -29,25 +40,20 @@ You can install the development version of bspme like so: devtools::install_github("changwoo-lee/bspme") ``` -## Example -This is a basic example which shows you how to solve a common problem: +## Functionality -```{r example} -#library(bspme) -## basic example code -``` +| Function | Description | +| ---------------------- | -------------------------------------------------------------------------| +| `blm_me()` | #' Bayesian normal linear regression models with (spatially) correlated measurement errors | +| `bglm_me()` | #' Bayesian generalized linear regression models with (spatially) correlated measurement errors | +| `vecchia_cov()` | Vecchia approximation given a covariance matrix | -What is special about using `README.Rmd` instead of just `README.md`? You can include R chunks like so: -```{r cars} -summary(cars) -``` - -You can also embed plots, for example: -```{r pressure, echo = FALSE} -plot(pressure) -``` +## datasets -In that case, don't forget to commit and push the resulting figure files, so they display on GitHub and CRAN. +| Data call | Description | +| ---------------------- | -------------------------------------------------------------------------| +| `data("ozone")` | ozone exposure data | +| `data("health_sim")` | Simulated health data | diff --git a/README.md b/README.md index 4b6dd16..0c23273 100644 --- a/README.md +++ b/README.md @@ -6,9 +6,39 @@ -bspme is an R package that provides a set of functions for Bayesian -spatial measurement error models based on Markov chain Monte Carlo -(MCMC) methods. +**bspme** is an R package that provides a set of functions for +**B**ayesian **sp**atially correlated **m**easurement **e**rror models, +the Bayesian linear models with the presence of (spatailly) correlated +measurement error of covariate(s). For illustration, consider a normal +linear regression model with intercept, one covariate $X$ with +measurement error, and one covariate $Z$ without measurement error: $$ +Y_i = \beta_0 + \beta_x X_i + \beta_z Z_i + \epsilon, \quad \epsilon \stackrel{iid}{\sim} N(0, \sigma^2_Y), \quad i=1,\dots,n, +$$ + +For example in the context of environmental epidemiology, covariate +$X_i$ can be an exposure to air pollution of subject $i$ at different +locations, $Z_i$ can be demographic information, and $Y_i$ can be +associated health outcome. Since exposure $X_i$ are not directly +measured at health subject locations but are predicted from air +pollution monitoring station locations, this induces spatially +correlated measurement error in $X_i$. Also, the uncertainty information +should be taken account, which depends on the proximity of the +monitoring station to the subject location. One way to incorporate this +information is to use a multivariate normal prior on the covariate +$(X_1,\dots,X_n)$ (MVN prior approach) adopted by **bspme** package, the +**B**ayesian **sp**atially correlated **m**easurement **e**rror models. + +The **bspme** package provides fast, scalable computational tools for +Bayesian linear regression models with spatially correlated measurement +errors represented as a MVN prior distribution. The posterior inference +is carried out using Markov chain Monte Carlo (MCMC) methods. When +implemented naively, running the MCMC is impossible because of the +$O(n^3)$ computational cost associated with the $n$-dimensional MVN +prior, where $n$ is the number of subjects. By adopting sparse MVN prior +that has sparse precision matrices based on Vecchia approximation, the +**bspme** package provides a fast algorithm to carry out posterior +inference for large datasets, with the number of subjects $n$ possibly +as big as tens of thousands. ## Installation @@ -19,32 +49,17 @@ You can install the development version of bspme like so: devtools::install_github("changwoo-lee/bspme") ``` -## Example - -This is a basic example which shows you how to solve a common problem: - -``` r -#library(bspme) -## basic example code -``` - -What is special about using `README.Rmd` instead of just `README.md`? -You can include R chunks like so: - -``` r -summary(cars) -#> speed dist -#> Min. : 4.0 Min. : 2.00 -#> 1st Qu.:12.0 1st Qu.: 26.00 -#> Median :15.0 Median : 36.00 -#> Mean :15.4 Mean : 42.98 -#> 3rd Qu.:19.0 3rd Qu.: 56.00 -#> Max. :25.0 Max. :120.00 -``` +## Functionality -You can also embed plots, for example: +| Function | Description | +|-----------------|--------------------------------------------------------------------------------------------------| +| `blm_me()` | \#’ Bayesian normal linear regression models with (spatially) correlated measurement errors | +| `bglm_me()` | \#’ Bayesian generalized linear regression models with (spatially) correlated measurement errors | +| `vecchia_cov()` | Vecchia approximation given a covariance matrix | - +## datasets -In that case, don’t forget to commit and push the resulting figure -files, so they display on GitHub and CRAN. +| Data call | Description | +|----------------------|-----------------------| +| `data("ozone")` | ozone exposure data | +| `data("health_sim")` | Simulated health data | diff --git a/data-raw/health_sim.R b/data-raw/health_sim.R new file mode 100644 index 0000000..9d60b31 --- /dev/null +++ b/data-raw/health_sim.R @@ -0,0 +1,83 @@ + +## code to prepare `healthdata_simul` dataset goes here +n_y = 3000 +library(fields) +data(ozone2) +lon.lat = ozone2$lon.lat +set.seed(1234) +coords_test = cbind(runif(2*n_y,min(lon.lat[,1]),max(lon.lat[,1])), runif(2*n_y,min(lon.lat[,2]),max(lon.lat[,2]))) +# extract index so that x is between -88 and -86 +idx1 = which(coords_test[,1] > -88 & coords_test[,1] < -86) +# extract index so that y is between 41.7 and 45 +idx2 = which(coords_test[,2] > 41.7 & coords_test[,2] < 45) +idx = intersect(idx1, idx2) +coords_test = coords_test[-idx, ] + +# extract index so that x is between -84 and -83 +idx1 = which(coords_test[,1] > -84 & coords_test[,1] < -82) +# extract index so that y is between 43.5 and 45 +idx2 = which(coords_test[,2] > 43.5 & coords_test[,2] < 45) +idx = intersect(idx1, idx2) +coords_test = coords_test[-idx, ] + +# extract index so that x is between -84 and -82 +idx1 = which(coords_test[,1] > -83.5 & coords_test[,1] < -82) +# extract index so that y is between 41.5 and 42.5 +idx2 = which(coords_test[,2] > 41.5 & coords_test[,2] < 42.5) +idx = intersect(idx1, idx2) +coords_test = coords_test[-idx, ] +nrow(coords_test) + +coords_test = coords_test[-sample.int(nrow(coords_test), nrow(coords_test) - n_y),] + +# simulate health data based on day 16 +load("data/ozone.rda") +ozone_day16 = ozone[ozone$date_id == 16,] +coords_train = cbind(ozone_day16$coords_lon, ozone_day16$coords_lat) +quilt.plot(coords_train, ozone_day16$ozone_ppb) +sigma <- 15 # standard deviation +a <- 300 # range, in miles +## Matrices needed for prediction + +distmat_xx <- rdist.earth(coords_train) +distmat_xy <- rdist.earth(coords_train, coords_test) +distmat_yy <- rdist.earth(coords_test) + +Sigmaxx = Matern(distmat_xx, smoothness = 0.5, range = a, phi = sigma^2) +Sigmaxy = Matern(distmat_xy, smoothness = 0.5, range = a, phi = sigma^2) +Sigmayy = Matern(distmat_yy, smoothness = 0.5, range = a, phi = sigma^2) + +## GP mean +pred_mean <- t(Sigmaxy) %*% solve(Sigmaxx,ozone_day16$ozone_ppb) +## GP cov +pred_cov <- Sigmayy - t(Sigmaxy) %*% solve(Sigmaxx,Sigmaxy) + +# draw predictive sample +set.seed(1) +true_exposure = mvnfast::rmvn(1, pred_mean, pred_cov) +true_exposure = as.vector(true_exposure) +par(mfrow = c(1,3)) +quilt.plot(coords_test, pred_mean, main = "mean"); US(add= T) +quilt.plot(coords_test, sqrt(diag(pred_cov)), main = "sd"); US(add= T) +quilt.plot(coords_test, true_exposure, main = "draw from MVN"); US(add= T) + +# simulated covariate +z = runif(n_y) + +# generate health data +y = true_exposure + 10*z + rnorm(n_y, sd = 5) + +var(y) +var(true_exposure + 10*z) + +healthdata_simul = data.frame(y = y, + coords_y_lon = coords_test[,1], + coords_y_lat = coords_test[,2], + covariate = z) +hist(true_exposure) +which(true_exposure<0) +which(y<0) +summary(lm(y ~ z + true_exposure)) + + +usethis::use_data(healthdata_simul, overwrite = TRUE) diff --git a/data-raw/ozone.R b/data-raw/ozone.R new file mode 100644 index 0000000..06d8e84 --- /dev/null +++ b/data-raw/ozone.R @@ -0,0 +1,17 @@ +## code to prepare `ozone` dataset goes here + +library(fields) +data(ozone2) +idx = which(colSums(is.na(ozone2$y)) == 0) +station_id = ozone2$station.id[idx] +date = as.POSIXct(paste0("19",ozone2$dates), format = "%Y%m%d", tz = "UTC") +ymat = ozone2$y[,idx] +str(ymat) +ozone = data.frame( date_id = rep(1:89, each = 67), + date = rep(date, each = 67), + station_id = rep(station_id, 89), + coords_lon = rep(ozone2$lon.lat[idx,1], 89), + coords_lat = rep(ozone2$lon.lat[idx,2], 89), + ozone_ppb = as.vector(t(ymat))) + +usethis::use_data(ozone, overwrite = TRUE) diff --git a/data/health_sim.rda b/data/health_sim.rda new file mode 100644 index 0000000..d6163c7 Binary files /dev/null and b/data/health_sim.rda differ diff --git a/data/healthdata_simul.rda b/data/healthdata_simul.rda new file mode 100644 index 0000000..e930504 Binary files /dev/null and b/data/healthdata_simul.rda differ diff --git a/data/ozone.rda b/data/ozone.rda new file mode 100644 index 0000000..4f836a6 Binary files /dev/null and b/data/ozone.rda differ diff --git a/man/blm_me.Rd b/man/blm_me.Rd new file mode 100644 index 0000000..b6e2d46 --- /dev/null +++ b/man/blm_me.Rd @@ -0,0 +1,49 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/blm_me.R +\name{blm_me} +\alias{blm_me} +\title{Bayesian normal linear regression models with (spatially) correlated measurement errors} +\usage{ +blm_me( + Y, + X_mean, + X_prec, + Z, + nburn = 2000, + nsave = 2000, + nthin = 5, + prior = NULL, + saveX = F +) +} +\arguments{ +\item{Y}{n by 1 matrix, response} + +\item{X_mean}{n by 1 matrix or list of n by 1 matrices of length q, mean of X.} + +\item{X_prec}{n by n matrix or list of n by n matrices of length q, precision matrix of X. Support sparse matrix format from Matrix package.} + +\item{Z}{n by p matrix, covariates without measurement error} + +\item{nburn}{integer, burn-in iteration} + +\item{nsave}{integer, number of posterior samples} + +\item{nthin}{integer, thin-in rate} + +\item{prior}{list of prior parameters, default is var_beta = 100,a_Y = 0.01, b_Y = 0.01} + +\item{saveX}{logical, save X or not} +} +\value{ +list of (1) posterior, the (nsave)x(q+p) matrix of posterior samples as a coda object, +(2) cputime, cpu time taken in seconds, +and optionally (3) X_save, posterior samples of X +} +\description{ +This function implements the Bayesian normal linear regression model subject to measurement error with semiconjugate priors. +} +\details{ +Model: Y = beta0 + beta_x X + beta_z Z + epsilon, epsilon ~ N(0, sigma2y) +Measurement error: sigma_Y^2 ~ IG(0.01, 0.01), beta|sigma_Y^2 ~ N(0, sigma_Y^2*diag(var_beta)) +} diff --git a/man/bspme-package.Rd b/man/bspme-package.Rd new file mode 100644 index 0000000..ce3536a --- /dev/null +++ b/man/bspme-package.Rd @@ -0,0 +1,18 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/bspme-package.R +\docType{package} +\name{bspme-package} +\alias{bspme} +\alias{bspme-package} +\title{bspme: Bayesian Spatial Measurement Error Models} +\description{ +Functions for fitting Bayesian linear and genearlized linear models in presence of measurement error of the covariate. +} +\seealso{ +Useful links: +\itemize{ + \item \url{https://changwoo-lee.github.io/bspme/} +} + +} +\keyword{internal} diff --git a/man/figures/README-pressure-1.png b/man/figures/README-pressure-1.png deleted file mode 100644 index 7bc4c8f..0000000 Binary files a/man/figures/README-pressure-1.png and /dev/null differ diff --git a/man/health_sim.Rd b/man/health_sim.Rd new file mode 100644 index 0000000..cb5cdfe --- /dev/null +++ b/man/health_sim.Rd @@ -0,0 +1,22 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/bspme-package.R +\docType{data} +\name{health_sim} +\alias{health_sim} +\title{Dataset, simulated health data} +\format{ +A data frame with 3000 rows and 4 variables: +\describe{ +\item{y}{numeric, simulated health outcome} +\item{coords_y_lon}{numeric, simulated health subject longitude} +\item{coords_y_lat}{numeric, simulated health subject latitude,} +\item{covariate}{numeric, covariate} +} +} +\usage{ +health_sim +} +\description{ +Simulated health data based on ozone exposures on 06/18/1987. +} +\keyword{datasets} diff --git a/man/hello.Rd b/man/hello.Rd deleted file mode 100644 index 0fa7c4b..0000000 --- a/man/hello.Rd +++ /dev/null @@ -1,12 +0,0 @@ -\name{hello} -\alias{hello} -\title{Hello, World!} -\usage{ -hello() -} -\description{ -Prints 'Hello, world!'. -} -\examples{ -hello() -} diff --git a/man/ozone.Rd b/man/ozone.Rd new file mode 100644 index 0000000..4037bd5 --- /dev/null +++ b/man/ozone.Rd @@ -0,0 +1,25 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/bspme-package.R +\docType{data} +\name{ozone} +\alias{ozone} +\title{Dataset, ozone exposure} +\format{ +A data frame with 5963 rows and 6 variables: +\describe{ +\item{date_id}{integer, 1 corresponds to 06/03/1987 and 89 corresponds to 08/31/1987} +\item{date}{POIXct, date} +\item{station_id}{character, station id} +\item{coords_lon}{numeric, longitude of monitoring station} +\item{coords_lat}{numeric, latitude of monitoring station} +\item{ozone_ppb}{8-hour average surface ozone from 9am-4pm in parts per billion (PPB)} +} +} +\usage{ +ozone +} +\description{ +This is a subset of "ozone2" dataset in fields package, only containing data from monitoring station with no missing values. +The 8-hour average (surface) ozone (from 9AM-4PM) measured in parts per billion (PPB) for 67 sites in the midwestern US over the period June 3,1987 through August 31, 1987, 89 days. +} +\keyword{datasets} diff --git a/man/vecchia_cov.Rd b/man/vecchia_cov.Rd new file mode 100644 index 0000000..f91941a --- /dev/null +++ b/man/vecchia_cov.Rd @@ -0,0 +1,40 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/vecchia_cov.R +\name{vecchia_cov} +\alias{vecchia_cov} +\title{Vecchia approximation given a covariance matrix} +\usage{ +vecchia_cov(Sigma, coords, n.neighbors, ord = NULL, KLdiv = FALSE) +} +\arguments{ +\item{Sigma}{n by n covariance matrix} + +\item{coords}{n by 2 matrix, coordinates to determine neighborhood} + +\item{n.neighbors}{positive integer, number of nearest neighbors to construct knn graph} + +\item{ord}{length n vector, ordering of data. If NULL, ordering based on first coordinate will be used.} + +\item{KLdiv}{logical, return KL divergence?} +} +\value{ +list of (1) Q, the sparse precision matrix, +(2) ord, the ordering used for Vecchia approximation, +(3) cputime, time taken in seconds, +and optionally (4) KLdiv, the KL divergence KL(p, ptilde), where p is multivariate normal with mean 0 and covariance Sigma, and ptilde is multivariate normal with mean 0 and precision Q. +} +\description{ +This function finds a sparse precision matrix (inverse precision matrix) Q that approximates the covariance matrix Sigma based on Vecchia's approximation. +Specifically, the nearest neighbor conditioning set are used based on the coords argument. +} +\examples{ + +n = 1000 +coords = cbind(runif(n), runif(n)) +Sigma = fields::Exp.cov(coords, aRange = 1) +fit5 = vecchia_cov(Sigma, coords, n.neighbors = 5, KLdiv = TRUE) +fit5$KLdiv +fit10 = vecchia_cov(Sigma, coords, n.neighbors = 10, KLdiv = TRUE) +fit10$KLdiv + +} diff --git a/vignettes/.gitignore b/vignettes/.gitignore new file mode 100644 index 0000000..097b241 --- /dev/null +++ b/vignettes/.gitignore @@ -0,0 +1,2 @@ +*.html +*.R diff --git a/vignettes/ozone_simuldata.Rmd b/vignettes/ozone_simuldata.Rmd new file mode 100644 index 0000000..8795ca9 --- /dev/null +++ b/vignettes/ozone_simuldata.Rmd @@ -0,0 +1,26 @@ +--- +title: "Ozone exposure and simulated health data" +output: rmarkdown::html_vignette +vignette: > + %\VignetteIndexEntry{ozone_simuldata} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} +--- + +```{r, include = FALSE} +knitr::opts_chunk$set( + collapse = TRUE, + comment = "#>" +) +``` + +This is a vignette for the `bspme` package using ozone data and simulated health dataset. + +```{r setup} +#library(bspme) +``` + +bspme is an R package that provides a set of functions for Bayesian linear models with the presence of (spatial) measurement error of covariates. For example, consider a normal linear regression model with intercept, one covariate $X$ with measurement error, and one covariate $Z$ without measurement error: +\[ +Y_i = \beta_0 + \beta_x X_i + \beta_z Z_i + \epsilon, \quad \epsilon \stackrel{iid}{\sim} N(0, \sigma^2_Y), \quad i=1,\dots,n, +\]