diff --git a/NAMESPACE b/NAMESPACE index c1822ea..79ec643 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -65,6 +65,7 @@ exportMethods(assocTestSingle, fitNullModelFastScore, kingToMatrix, makeSparseMatrix, + metaPrepScores, pcair, pcrelate, pcrelateToMatrix) diff --git a/R/meta.R b/R/meta.R new file mode 100644 index 0000000..3440b32 --- /dev/null +++ b/R/meta.R @@ -0,0 +1,255 @@ +# assocMetaSingle() or metaTestSingle() or metaAnalysisSingle() or metaSingle() + +# metaSingle <- function() + +# metaAggregate <- function() + +setGeneric("metaPrepScores", function(gdsobj, ...) standardGeneric("metaPrepScores")) + +setMethod("metaPrepScores", + "SeqVarIterator", + function(gdsobj, null.model, AF.max = 1, MAC.min = 0, + geno.coding=c("additive", "dominant", "recessive"), + sparse=TRUE, imputed=FALSE, DS.min = 0, + male.diploid=TRUE, genome.build=c("hg19", "hg38"), + BPPARAM=bpparam(), verbose=TRUE) { + + # don't return score.cov for block iterator (single variant test) + score.cov <- !is(gdsobj, 'SeqVarBlockIterator') + + # genotype coding + geno.coding <- match.arg(geno.coding) + + # don't use sparse matrices for imputed dosages + if (imputed) sparse <- FALSE + + # Convert old null model format if necessary. + null.model <- .updateNullModelFormat(null.model) + + # coerce null.model if necessary + if (sparse) null.model <- .nullModelAsMatrix(null.model) + + # filter samples to match null model + sample.index <- .setFilterNullModel(gdsobj, null.model, verbose=verbose) + + # get sex for calculating allele freq + sex <- validateSex(gdsobj)[sample.index] + + # check ploidy + if (SeqVarTools:::.ploidy(gdsobj) == 1) male.diploid <- FALSE + + if(verbose) message('Using ', bpnworkers(BPPARAM), ' CPU cores') + + i <- 1 + ITER <- function() { + iterate <- TRUE + if (i > 1) { + iterate <- iterateFilter(gdsobj, verbose=FALSE) + } + i <<- i + 1 + if (!iterate) { + return(NULL) + } + + # variant info + var.info <- variantInfo(gdsobj, alleles=TRUE, expanded=TRUE) + if (nrow(var.info) > 0) { + # chr:pos:ref:alt ID for easy merging across studies + #VARID <- paste(paste0("chr", var.info$chr), var.info$pos, var.info$ref, var.info$alt, sep = ':') + #var.info <- cbind(VARID, var.info[,1:5]) + names(var.info)[names(var.info) == 'ref'] <- 'other.allele' + names(var.info)[names(var.info) == 'alt'] <- 'effect.allele' + } + + # read in genotype data + if (!imputed) { + geno <- expandedAltDosage(gdsobj, use.names=FALSE, sparse=sparse)[sample.index,,drop=FALSE] + } else { + geno <- imputedDosage(gdsobj, use.names=FALSE)[sample.index,,drop=FALSE] + } + + chr <- chromWithPAR(gdsobj, genome.build=genome.build) + + return(list(var.info=var.info, geno=geno, chr=chr)) + } + + # worker function + res <- bpiterate(ITER, .metaPrepScores, BPPARAM=BPPARAM, + sex=sex, null.model=null.model, + AF.max=AF.max, MAC.min=MAC.min, + geno.coding=geno.coding, + sparse=sparse, imputed=imputed, DS.min=DS.min, + male.diploid=male.diploid, + score.cov=score.cov) + .stopOnError(res) + + # prepare output + if(score.cov){ + res1 <- lapply(res, function(x) x[[1]]) + res2 <- lapply(res, function(x) x[[2]]) + rm(res) + + # group names + grnames <- .metaGroupNames(gdsobj) + # variant info + names(res1) <- grnames + #res1 <- as.data.frame(rbindlist(res1, use.names=TRUE, fill=TRUE, idcol = 'group.id')) + # scores.cov + names(res2) <- grnames + list(variants = res1, scores.cov = res2) + + }else{ + out <- as.data.frame(rbindlist(res)) + #out <- out[!duplicated(out)] + #as.data.frame(out) + } + }) + + + +setMethod("metaPrepScores", + "GenotypeIterator", + function(gdsobj, null.model, AF.max = 1, MAC.min = 0, + geno.coding=c("additive", "dominant", "recessive"), + male.diploid=TRUE, + BPPARAM=bpparam(), verbose=TRUE) { + + geno.coding <- match.arg(geno.coding) + + # Convert old null model format if necessary. + null.model <- .updateNullModelFormat(null.model) + + # filter samples to match null model + sample.index <- .sampleIndexNullModel(gdsobj, null.model) + + # get sex for calculating allele freq + sex <- validateSex(gdsobj)[sample.index] + + if(verbose) message('Using ', bpnworkers(BPPARAM), ' CPU cores') + + i <- 1 + ITER <- function() { + iterate <- TRUE + if (i > 1) { + iterate <- GWASTools::iterateFilter(gdsobj) + } + i <<- i + 1 + if (!iterate) { + return(NULL) + } + + var.info <- variantInfo(gdsobj, alleles=TRUE) + + geno <- getGenotypeSelection(gdsobj, scan=sample.index, order="selection", + transpose=TRUE, use.names=FALSE, drop=FALSE) + + return(list(var.info=var.info, geno=geno, chr=var.info$chr)) + } + + # worker function + res <- bpiterate(ITER, .metaPrepScores, BPPARAM=BPPARAM, + sex=sex, null.model=null.model, + AF.max=AF.max, MAC.min=MAC.min, + geno.coding=geno.coding, + sparse=FALSE, imputed=FALSE, DS.min=0, + male.diploid=male.diploid, + score.cov=FALSE) + .stopOnError(res) + + # prepare output + out <- as.data.frame(rbindlist(res)) + }) + + + +.metaPrepScores <- function(x, sex, null.model, AF.max, MAC.min, geno.coding, + sparse, imputed, DS.min, male.diploid, score.cov, ...) { + # prep the geno data + x <- .prepGenoBlock(x, AF.max=AF.max, MAC.min=MAC.min, geno.coding=geno.coding, + imputed=imputed, DS.min=DS.min, sex=sex, male.diploid=male.diploid) + var.info <- x$var.info + n.obs <- x$n.obs + freq <- x$freq + geno <- x$geno + rm(x) + + # mean impute missing values + if (any(n.obs < nrow(geno))) { + geno <- .meanImpute(geno, freq$freq) + } + + if (ncol(geno) == 0){ + res.i <- NULL + } else { + # calc score + G <- .genoAsMatrix(null.model, geno) + score <- as.vector(crossprod(G, null.model$fit$resid.PY)) # G^T P Y + + Gtilde <- calcGtilde(null.model, G) + + if(score.cov){ + # calc score covariance matrix + V <- crossprod(Gtilde) # G^T P G + # pack() stores only one triangle of symmetric matrix + # if matrix is too large, pack will fail + if(nrow(V) < 46340){ + V <- pack(V) + } + score.SE <- sqrt(diag(V)) + #colnames(V) <- rownames(V) <- var.info$VARID + + # # melt the covariance matrix to a data.table + # V <- meltMatrix(GPG, drop.lower = TRUE, drop.diag = FALSE) + # setnames(GPG, c('ID1', 'ID2'), c('VARID1', 'VARID2')) + + # output + res.i <- list(cbind(var.info, n.obs, freq, Score = score, Score.SE = score.SE), V) + + }else{ + # calc score SE only + GPG <- colSums(Gtilde^2) # vector of G^T P G (for each variant) + score.SE <- sqrt(GPG) + + # output + res.i <- cbind(var.info, n.obs, freq, Score = score, Score.SE = score.SE) + } + } + + return(res.i) +} + + +setGeneric(".metaGroupNames", function(gdsobj) standardGeneric(".metaGroupNames")) + +setMethod(".metaGroupNames", + "SeqVarWindowIterator", + function(gdsobj) { + gr <- variantRanges(gdsobj) + grnames <- paste(paste0("chr", as.character(GenomicRanges::seqnames(gr))), + BiocGenerics::start(gr), BiocGenerics::end(gr), sep = "_") + grnames + }) + +setMethod(".metaGroupNames", + "SeqVarRangeIterator", + function(gdsobj) { + # get variant group IDs + gr <- variantRanges(gdsobj) + grnames <- names(gr) + if(is.null(grnames)){ + grnames <- paste(paste0("chr", as.character(GenomicRanges::seqnames(gr))), + BiocGenerics::start(gr), BiocGenerics::end(gr), sep = "_") + } + grnames + }) + +setMethod(".metaGroupNames", + "SeqVarListIterator", + function(gdsobj) { + gr <- variantRanges(gdsobj) + grnames <- names(gr) + if(is.null(grnames)){ + grnames <- seq(1, length(gr)) + } + grnames + }) diff --git a/R/utils.R b/R/utils.R index 425473a..6fd33da 100644 --- a/R/utils.R +++ b/R/utils.R @@ -23,10 +23,15 @@ setMethod("validateSex", setMethod("variantInfo", "GenotypeData", function(gdsobj, alleles=FALSE) { - data.frame(variant.id=getSnpID(gdsobj), + x <- data.frame(variant.id=getSnpID(gdsobj), chr=getChromosome(gdsobj, char=TRUE), pos=getPosition(gdsobj), stringsAsFactors=FALSE) + if (alleles) { + x$effect.allele <- getAlleleA(gdsobj) + x$other.allele <- getAlleleB(gdsobj) + } + x }) setMethod("variantFilter", @@ -37,28 +42,41 @@ setMethod("variantFilter", # function to pre-process genotype data before testing -.prepGenoBlock <- function(x, AF.max=1, geno.coding="additive", imputed=FALSE, - sex=NULL, male.diploid=TRUE) { - +.prepGenoBlock <- function(x, AF.max=1, MAC.min=0, geno.coding="additive", + imputed=FALSE, DS.min=0, sex=NULL, male.diploid=TRUE) { + var.info <- x$var.info geno <- x$geno chr <- x$chr weight <- x$weight # only applies to aggregate tests, NULL otherwise rm(x) - + # take note of number of non-missing samples #n.obs <- colSums(!is.na(geno)) n.obs <- .countNonMissing(geno, MARGIN = 2) - + # allele frequency - freq <- .alleleFreq(geno, chr, sex, male.diploid=male.diploid) - + freq <- .alleleFreq(geno, chr, sex, male.diploid=male.diploid, imputed=imputed) + # filter monomorphic variants keep <- .filterMonomorphic(geno, count=n.obs, freq=freq$freq, imputed=imputed) - + # exclude variants with freq > max keep <- keep & freq$freq <= AF.max - + + if(!imputed){ + # exclude variants with MAC < min + keep <- keep & freq$MAC >= MAC.min + }else{ + # exclude variants with effMAC < min + keep <- keep & freq$effMAC >= MAC.min + } + + if(imputed){ + # exclude variants where the smaller of the ref/alt max dosage < min + keep <- keep & freq$maxDS >= DS.min + } + # exclude variants with weight 0 if (!is.null(weight)) { weight0 <- is.na(weight) | weight == 0 @@ -66,7 +84,7 @@ setMethod("variantFilter", keep <- keep & !weight0 } } - + # recessive or dominant coding if (geno.coding != "additive") { if (geno.coding == "recessive") { @@ -90,14 +108,14 @@ setMethod("variantFilter", geno[geno == 2] <- 1L out.col <- "n.any.eff" } - + # count number of carriers freq[[out.col]] <- colSums(geno, na.rm=TRUE) - + # remove variants which are now monomorphic (all 0s or all 1s) keep <- keep & !(freq[[out.col]] == 0 | freq[[out.col]] == n.obs) } - + if (!all(keep)) { var.info <- var.info[keep,,drop=FALSE] geno <- geno[,keep,drop=FALSE] @@ -105,7 +123,7 @@ setMethod("variantFilter", freq <- freq[keep,,drop=FALSE] weight <- weight[keep] } - + return(list(var.info=var.info, n.obs=n.obs, freq=freq, geno=geno, weight=weight)) } @@ -132,7 +150,7 @@ setMethod("variantFilter", } -.alleleFreq <- function(geno, chr=NULL, sex=NULL, male.diploid=TRUE) { +.alleleFreq <- function(geno, chr=NULL, sex=NULL, male.diploid=TRUE, imputed=FALSE) { if (is.null(sex) | is.null(chr)) { #freq <- 0.5*colMeans(geno, na.rm=TRUE) @@ -140,8 +158,19 @@ setMethod("variantFilter", # nsamp <- colSums(!is.na(geno)) nsamp <- .countNonMissing(geno, MARGIN = 2) freq <- count/(2*nsamp) - mac <- round(pmin(count, 2*nsamp - count)) - return(data.frame(freq=freq, MAC=mac)) + mac <- pmin(count, 2*nsamp - count) + if(imputed){ + # imputation r^2 + var.dosage <- .apply(geno, MARGIN = 2, FUN = var) + r2.dosage <- var.dosage/(2*freq*(1-freq)) + # effective MAC + effMAC <- mac*r2.dosage + # smaller of max dosage for alt and ref alleles + maxDS <- .apply(geno, MARGIN = 2, FUN = function(x){ min(max(x), 2-min(x)) }) + return(data.frame(freq=freq, MAC=round(mac), effMAC=effMAC, maxDS=maxDS)) + }else{ + return(data.frame(freq=freq, MAC=round(mac))) # does MAC need to be rounded? + } } # check chromosome @@ -194,8 +223,44 @@ setMethod("variantFilter", } freq <- count / possible - mac <- round(pmin(count, possible - count)) - data.frame(freq=freq, MAC=mac) + mac <- pmin(count, possible - count) + + if(imputed){ + # imputation r^2 + var.dosage <- .apply(geno, MARGIN = 2, FUN = var) + r2.dosage <- rep(NA, ncol(geno)) + maxDS <- rep(NA, ncol(geno)) + if(any(auto)){ + r2.dosage[auto] <- var.dosage[auto]/(2*freq[auto]*(1-freq[auto])) + maxDS[auto] <- .apply(geno[, auto, drop=FALSE], MARGIN = 2, FUN = function(x){ min(max(x), 2-min(x)) }) + } + if(any(X)){ + F.prop <- F.nsamp/(F.nsamp + M.nsamp) + if(male.diploid){ + r2.dosage[X] <- var.dosage[X]/(2*freq[X]*(1-freq[X])*(2 - F.prop)) + maxDS[X] <- .apply(geno[, X, drop=FALSE], MARGIN = 2, FUN = function(x){ min(max(x), 2-min(x)) }) + }else{ + r2.dosage[X] <- var.dosage[X]/(freq[X]*(1-freq[X])*(1 + F.prop)) + maxDS.F <- .apply(geno[female, X, drop=FALSE], MARGIN = 2, FUN = function(x){ min(max(x), 2-min(x)) }) + maxDS.M <- .apply(geno[male, X, drop=FALSE], MARGIN = 2, FUN = function(x){ min(max(x), 1-min(x)) }) + maxDS[X] <- pmin(maxDS.F, maxDS.M) + } + } + if(any(Y)){ + if(male.diploid){ + r2.dosage[Y] <- var.dosage[Y]/(4*freq[Y]*(1-freq[Y])) + maxDS[Y] <- .apply(geno[male, Y, drop=FALSE], MARGIN = 2, FUN = function(x){ min(max(x), 2-min(x)) }) + }else{ + r2.dosage[Y] <- var.dosage[Y]/(freq[Y]*(1-freq[Y])) + maxDS[Y] <- .apply(geno[male, Y, drop=FALSE], MARGIN = 2, FUN = function(x){ min(max(x), 1-min(x)) }) + } + } + # effective MAC + effMAC <- mac*r2.dosage + return(data.frame(freq=freq, MAC=round(mac), effMAC=effMAC, maxDS=maxDS)) + }else{ + return(data.frame(freq=freq, MAC=round(mac))) # does this need to be rounded? + } } diff --git a/tests/testthat/test_meta.R b/tests/testthat/test_meta.R new file mode 100644 index 0000000..45348b8 --- /dev/null +++ b/tests/testthat/test_meta.R @@ -0,0 +1,83 @@ +context("meta analysis tests") +library(SeqVarTools) +library(GenomicRanges) +library(Biobase) + +BPPARAM <- BiocParallel::SerialParam() +#BPPARAM <- BiocParallel::MulticoreParam() + +test_that("metaPrepScores - block", { + svd <- .testData() + seqSetFilterChrom(svd, include=1, verbose=FALSE) + iterator <- SeqVarBlockIterator(svd, verbose=FALSE) + nullmod <- fitNullModel(iterator, outcome="outcome", covars=c("sex", "age"), verbose=FALSE) + scores <- metaPrepScores(iterator, nullmod, BPPARAM=BPPARAM, verbose=FALSE) + expect_true(is.data.frame(scores)) + expect_true(all(c("effect.allele", "other.allele") %in% names(scores))) + expect_true(all(c("Score", "Score.SE") %in% names(scores))) + seqClose(svd) +}) + +test_that("metaPrepScores - window", { + svd <- .testData() + seqSetFilterChrom(svd, include=1, verbose=FALSE) + iterator <- SeqVarWindowIterator(svd, windowSize=5e5, windowShift=2.5e5, verbose=FALSE) + nullmod <- fitNullModel(iterator, outcome="outcome", covars=c("sex", "age"), verbose=FALSE) + scores <- metaPrepScores(iterator, nullmod, BPPARAM=BPPARAM, verbose=FALSE) + expect_equal(names(scores), c("variants", "scores.cov")) + nwin <- length(variantRanges(iterator)) + expect_equal(length(scores$variants), nwin) + expect_equal(length(scores$scores.cov), nwin) + expect_true(all(c("effect.allele", "other.allele") %in% names(scores$variants[[1]]))) + expect_true(all(c("Score", "Score.SE") %in% names(scores$variants[[1]]))) + vr <- variantRanges(iterator) + nm <- paste0("chr", seqnames(vr), "_", start(vr), "_", end(vr)) + expect_equal(nm, names(scores$variants)) + expect_equal(nm, names(scores$scores.cov)) + for (i in nwin) expect_equal(nrow(scores$variants[[i]]), nrow(scores$scores.cov[[i]])) + seqClose(svd) +}) + +test_that("metaPrepScores - ranges", { + svd <- .testData() + gr <- GRanges(seqnames=rep(1,3), ranges=IRanges(start=c(1e6, 2e6, 3e6), width=1e6)) + names(gr) <- letters[1:3] + iterator <- SeqVarRangeIterator(svd, variantRanges=gr, verbose=FALSE) + nullmod <- fitNullModel(iterator, outcome="outcome", covars=c("sex", "age"), verbose=FALSE) + scores <- metaPrepScores(iterator, nullmod, BPPARAM=BPPARAM, verbose=FALSE) + expect_equal(length(scores$variants), length(gr)) + expect_equal(length(scores$scores.cov), length(gr)) + expect_equal(names(scores$variants), letters[1:3]) + expect_equal(names(scores$scores.cov), letters[1:3]) + for (i in length(gr)) expect_equal(nrow(scores$variants[[i]]), nrow(scores$scores.cov[[i]])) + seqClose(svd) +}) + +test_that("metaPrepScores - list", { + svd <- .testData() + gr <- GRangesList( + GRanges(seqnames=rep(1,2), ranges=IRanges(start=c(1e6, 3e6), width=1e6)), + GRanges(seqnames=rep(1,2), ranges=IRanges(start=c(3e6, 34e6), width=1e6))) + names(gr) <- letters[1:2] + iterator <- SeqVarListIterator(svd, variantRanges=gr, verbose=FALSE) + nullmod <- fitNullModel(iterator, outcome="outcome", covars=c("sex", "age"), verbose=FALSE) + scores <- metaPrepScores(iterator, nullmod, BPPARAM=BPPARAM, verbose=FALSE) + expect_equal(length(scores$variants), length(gr)) + expect_equal(length(scores$scores.cov), length(gr)) + expect_equal(names(scores$variants), letters[1:2]) + expect_equal(names(scores$scores.cov), letters[1:2]) + for (i in length(gr)) expect_equal(nrow(scores$variants[[i]]), nrow(scores$scores.cov[[i]])) + seqClose(svd) +}) + +test_that("metaPrepScores - GenotypeIterator", { + genoData <- .testGenoData() + covMat <- .testGenoDataGRM(genoData) + iterator <- GenotypeBlockIterator(genoData, snpBlock=1000) + nullmod <- fitNullModel(genoData, outcome="outcome", covars="sex", cov.mat=covMat, verbose=FALSE) + scores <- metaPrepScores(iterator, nullmod, BPPARAM=BPPARAM, verbose=FALSE) + expect_true(is.data.frame(scores)) + expect_true(all(c("effect.allele", "other.allele") %in% names(scores))) + expect_true(all(c("Score", "Score.SE") %in% names(scores))) + close(genoData) +})