From 2c5f851420b3d9d93a5e247537620d2721c83f91 Mon Sep 17 00:00:00 2001 From: Lilin Yin Date: Mon, 16 Dec 2024 16:54:45 +0800 Subject: [PATCH] optimized computational efficiency --- DESCRIPTION | 4 +- R/MVP.Data.r | 58 +-- R/MVP.FarmCPU.r | 97 +++-- R/MVP.GLM.r | 23 +- R/MVP.K.VanRaden.r | 69 ++-- R/MVP.MLM.r | 20 +- R/MVP.PCA.r | 11 +- R/MVP.Utility.r | 8 +- R/MVP.r | 121 ++++--- R/RcppExports.R | 28 +- README.md | 4 +- inst/extdata/05_mvp/mvp.geno.bin | Bin 151 -> 151 bytes inst/extdata/05_mvp/mvp.geno.desc | 6 +- inst/extdata/06_mvp-impute/mvp.imp.geno.bin | Bin 151 -> 151 bytes inst/extdata/06_mvp-impute/mvp.imp.geno.desc | 6 +- man/MVP.Data.Kin.Rd | 3 + man/MVP.Data.PC.Rd | 3 + man/MVP.Data.Rd | 10 +- man/MVP.Data.impute.Rd | 3 + man/MVP.FarmCPU.Rd | 7 +- man/MVP.GLM.Rd | 7 +- man/MVP.K.VanRaden.Rd | 11 +- man/MVP.MLM.Rd | 10 +- man/MVP.PCA.Rd | 11 +- man/MVP.Rd | 11 +- man/MVP.Version.Rd | 2 +- src/RcppExports.cpp | 65 ++-- src/assoc.cpp | 357 +++++++++++++------ src/data_converter.cpp | 70 ++-- src/impute.cpp | 204 +++++++---- src/kinship.cpp | 260 ++++++++++---- src/mvp_omp.h | 1 - 32 files changed, 959 insertions(+), 531 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 2851b80..40d065e 100755 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -2,8 +2,8 @@ Package: rMVP Type: Package Title: Memory-Efficient, Visualize-Enhanced, Parallel-Accelerated GWAS Tool -Version: 1.2.0 -Date: 2024-09-18 +Version: 1.3.0 +Date: 2024-12-12 Authors@R: c( person("Lilin", "Yin", role = "aut"), person("Haohao", "Zhang", role = "aut"), person("Zhenshuang", "Tang", role = "aut"), diff --git a/R/MVP.Data.r b/R/MVP.Data.r index 5aaa77a..2adc7df 100755 --- a/R/MVP.Data.r +++ b/R/MVP.Data.r @@ -12,16 +12,13 @@ #' MVP.Data: To prepare data for MVP package -#' Author: Xiaolei Liu, Lilin Yin and Haohao Zhang -#' Build date: Aug 30, 2016 -#' Last update: Sep 12, 2018 #' #' @param fileMVP Genotype in MVP format #' @param fileVCF Genotype in VCF format #' @param fileHMP Genotype in hapmap format #' @param fileBed Genotype in PLINK binary format #' @param fileInd Individual name file -#' @param fileNum Genotype in numeric format; pure 0, 1, 2 matrix; m * n, m is marker size, n is sample size +#' @param fileNum Genotype in numeric format; pure 0, 1, 2 matrix; m * n or n * m, m is marker size, n is sample size #' @param filePhe Phenotype, the first column is taxa name, the subsequent columns are traits #' @param fileMap SNP map information, there are three columns, including SNP_ID, Chromosome, and Position #' @param fileKin Kinship that represents relationship among individuals, n * n matrix, n is sample size @@ -246,8 +243,8 @@ MVP.Data.VCF2MVP <- function(vcf_file, out='mvp', maxLine = 1e4, type.geno='char # parse genotype bigmat <- filebacked.big.matrix( - nrow = m, - ncol = n, + nrow = n, + ncol = m, type = type.geno, backingfile = backingfile, backingpath = dirname(out), @@ -308,8 +305,8 @@ MVP.Data.Bfile2MVP <- function(bfile, out='mvp', maxLine=1e4, type.geno='char', # parse genotype bigmat <- filebacked.big.matrix( - nrow = m, - ncol = n, + nrow = n, + ncol = m, type = type.geno, backingfile = backingfile, backingpath = dirname(out), @@ -365,8 +362,8 @@ MVP.Data.Hapmap2MVP <- function(hmp_file, out='mvp', maxLine = 1e4, type.geno='c # parse genotype bigmat <- filebacked.big.matrix( - nrow = m, - ncol = n, + nrow = n, + ncol = m, type = type.geno, backingfile = backingfile, backingpath = dirname(out), @@ -432,8 +429,8 @@ MVP.Data.Numeric2MVP <- function(num_file, map_file, out='mvp', maxLine=1e4, row # define bigmat bigmat <- filebacked.big.matrix( - nrow = m, - ncol = n, + nrow = n, + ncol = m, type = type.geno, backingfile = backingfile, backingpath = dirname(out), @@ -476,15 +473,15 @@ MVP.Data.Numeric2MVP <- function(num_file, map_file, out='mvp', maxLine=1e4, row len <- length(line) if (len == 0) { break } - line <- do.call(rbind, strsplit(line, '\\s+')) - if (row_names) { line <- line[, 2:ncol(line)]} + line <- do.call(cbind, strsplit(line, '\\s+')) + if (row_names) { line <- line[2:ncol(line), ]} if (transposed) { - bigmat[, (i + 1):(i + ncol(line))] <- line - i <- i + ncol(line) - percent <- 100 * i / n - } else { bigmat[(i + 1):(i + nrow(line)), ] <- line i <- i + nrow(line) + percent <- 100 * i / n + } else { + bigmat[, (i + 1):(i + ncol(line))] <- line + i <- i + ncol(line) percent <- 100 * i / m } @@ -527,8 +524,11 @@ MVP.Data.Numeric2MVP <- function(num_file, map_file, out='mvp', maxLine=1e4, row #' MVP.Data.MVP2Bfile <- function(bigmat, map, pheno=NULL, out='mvp.plink', threads=1, verbose=TRUE) { t1 <- as.numeric(Sys.time()) + + if(nrow(map) != nrow(bigmat) && nrow(map) != ncol(bigmat)) stop("mismatched number of SNPs between genotype and map.") + mrk_bycol <- nrow(map) == ncol(bigmat) # write bed file - write_bfile(bigmat@address, out, threads=threads, verbose = verbose) + write_bfile(bigmat@address, out, mrkbycol=mrk_bycol, threads=threads, verbose = verbose) # write fam # 1. Family ID ('FID') @@ -537,13 +537,14 @@ MVP.Data.MVP2Bfile <- function(bigmat, map, pheno=NULL, out='mvp.plink', threads # 4. Within-family ID of mother ('0' if mother isn't in dataset) # 5. Sex code ('1' = male, '2' = female, '0' = unknown) # 6. Phenotype value ('1' = control, '2' = case, '-9'/'0'/non-numeric = missing data if case/control) + n <- ifelse(mrk_bycol, nrow(bigmat), ncol(bigmat)) if (is.null(pheno)) { - ind <- paste0("ind", 1:ncol(bigmat)) - pheno <- rep(-9, ncol(bigmat)) + ind <- paste0("ind", 1:n) + pheno <- rep(-9, n) message("pheno is NULL, automatically named individuals.") } else if (ncol(pheno) == 1) { ind <- pheno[, 1] - pheno <- rep(-9, ncol(bigmat)) + pheno <- rep(-9, n) } else { if (ncol(pheno) > 2) { message("Only the first phenotype is written to the fam file, and the remaining ", ncol(pheno) - 1, " phenotypes are ignored.") @@ -704,6 +705,7 @@ MVP.Data.Map <- function(map, out='mvp', cols=1:5, header=TRUE, sep='\t', verbos #' @param out prefix of output file name #' @param pcs.keep how many PCs to keep #' @param maxLine the number of markers handled at a time, smaller value would reduce the memory cost +#' @param mrk_bycol whether the markers are stored by columns in genotype (i.e. genotype is a n by m matrix) #' @param sep seperator for PC file. #' @param cpu the number of cpu #' @param verbose whether to print detail. @@ -727,6 +729,7 @@ MVP.Data.PC <- function( out=NULL, pcs.keep=5, maxLine=10000, + mrk_bycol=TRUE, sep='\t', cpu=1, verbose=TRUE @@ -743,7 +746,7 @@ MVP.Data.PC <- function( } else if (filePC == TRUE) { if(is.null(K)){ geno <- attach.big.matrix(paste0(mvp_prefix, ".geno.desc")) - myPC <- MVP.PCA(M=geno, pcs.keep = pcs.keep, maxLine = maxLine, cpu=cpu) + myPC <- MVP.PCA(M=geno, mrk_bycol=mrk_bycol, pcs.keep = pcs.keep, maxLine = maxLine, cpu=cpu) }else{ myPC <- MVP.PCA(K=K, pcs.keep = pcs.keep, maxLine = maxLine, cpu=cpu) } @@ -773,6 +776,7 @@ MVP.Data.PC <- function( #' @param mvp_prefix Prefix for mvp format files #' @param out prefix of output file name #' @param maxLine the number of markers handled at a time, smaller value would reduce the memory cost +#' @param mrk_bycol whether the markers are stored by columns in genotype (i.e. genotype is a n by m matrix) #' @param sep seperator for Kinship file. #' @param cpu the number of cpu #' @param verbose whether to print detail. @@ -794,6 +798,7 @@ MVP.Data.Kin <- function( mvp_prefix='mvp', out=NULL, maxLine=10000, + mrk_bycol=TRUE, sep='\t', cpu=1, verbose=TRUE @@ -809,7 +814,7 @@ MVP.Data.Kin <- function( myKin <- read.big.matrix(fileKin, header = FALSE, type = 'double', sep = sep) } else if (fileKin == TRUE) { geno <- attach.big.matrix(paste0(mvp_prefix, ".geno.desc")) - myKin <- MVP.K.VanRaden(geno, maxLine = maxLine, cpu = cpu, verbose = verbose) + myKin <- MVP.K.VanRaden(geno, mrk_bycol = mrk_bycol, maxLine = maxLine, cpu = cpu, verbose = verbose) } else { stop("ERROR: The value of fileKin is invalid.") } @@ -838,6 +843,7 @@ MVP.Data.Kin <- function( #' #' @param mvp_prefix the prefix of mvp file #' @param out the prefix of output file +#' @param mrk_bycol whether the markers are stored by columns in genotype (i.e. genotype is a n by m matrix) #' @param method 'Major', 'Minor', "Middle" #' @param ncpus number of threads for imputing #' @param verbose whether to print the reminder @@ -849,7 +855,7 @@ MVP.Data.Kin <- function( #' #' MVP.Data.impute(mvpPath, tempfile("outfile"), ncpus=1) #' -MVP.Data.impute <- function(mvp_prefix, out=NULL, method='Major', ncpus=NULL, verbose=TRUE) { +MVP.Data.impute <- function(mvp_prefix, out=NULL, mrk_bycol=TRUE, method='Major', ncpus=NULL, verbose=TRUE) { # input desc <- paste0(mvp_prefix, ".geno.desc") bigmat <- attach.big.matrix(desc) @@ -888,7 +894,7 @@ MVP.Data.impute <- function(mvp_prefix, out=NULL, method='Major', ncpus=NULL, ve file.copy(paste0(mvp_prefix, ".geno.map"), paste0(out, ".geno.map")) } - impute_marker(outmat@address, threads = ncpus, verbose = verbose) + impute_marker(outmat@address, mrkbycol = mrk_bycol, threads = ncpus, verbose = verbose) logging.log("Impute Genotype File is done!\n", verbose = verbose) } diff --git a/R/MVP.FarmCPU.r b/R/MVP.FarmCPU.r index cbde427..ba28518 100755 --- a/R/MVP.FarmCPU.r +++ b/R/MVP.FarmCPU.r @@ -20,7 +20,7 @@ #' @author Xiaolei Liu and Zhiwu Zhang #' #' @param phe phenotype, n by t matrix, n is sample size, t is number of phenotypes -#' @param geno genotype, m by n matrix, m is marker size, n is sample size. This is Pure Genotype Data Matrix(GD). THERE IS NO COLUMN FOR TAXA. +#' @param geno genotype, either m by n or n by m is supportable, m is marker size, n is population size. This is Pure Genotype Data Matrix(GD). THERE IS NO COLUMN FOR TAXA. #' @param map SNP map information, m by 3 matrix, m is marker size, the three columns are SNP_ID, Chr, and Pos #' @param CV covariates, n by c matrix, n is sample size, c is number of covariates #' @param ind_idx the index of effective genotyped individuals @@ -35,6 +35,7 @@ #' @param Prior prior information, four columns, which are SNP_ID, Chr, Pos, P-value #' @param ncpus number of threads used for parallele computation #' @param maxLoop maximum number of iterations +#' @param maxLine the number of markers handled at a time, smaller value would reduce the memory cost #' @param threshold.output only the GWAS results with p-values lower than threshold.output will be output #' @param converge a number, 0 to 1, if selected pseudo QTNs in the last and the second last iterations have a certain probality (the probability is converge) of overlap, the loop will stop #' @param iteration.output whether to output results of all iterations @@ -55,7 +56,7 @@ #' print(dim(phenotype)) #' genoPath <- system.file("extdata", "06_mvp-impute", "mvp.imp.geno.desc", package = "rMVP") #' genotype <- attach.big.matrix(genoPath) -#' genotype <- deepcopy(genotype, cols=idx) +#' genotype <- deepcopy(genotype, rows=idx) #' print(dim(genotype)) #' mapPath <- system.file("extdata", "06_mvp-impute", "mvp.imp.geno.map", package = "rMVP") #' map <- read.table(mapPath , head = TRUE) @@ -66,15 +67,20 @@ #' `MVP.FarmCPU` <- function(phe, geno, map, CV=NULL, ind_idx=NULL, mrk_idx=NULL, P=NULL, method.sub="reward", method.sub.final="reward", method.bin=c("EMMA", "static", "FaST-LMM"), bin.size=c(5e5,5e6,5e7), bin.selection=seq(10,100,10), - memo="MVP.FarmCPU", Prior=NULL, ncpus=2, maxLoop=10, + memo="MVP.FarmCPU", Prior=NULL, ncpus=2, maxLoop=10, maxLine=5000, threshold.output=.01, converge=1, iteration.output=FALSE, p.threshold=NA, QTN.threshold=0.01, bound=NULL, verbose=TRUE){ #print("--------------------- Welcome to FarmCPU ----------------------------") - n <- ifelse(is.null(ind_idx), ncol(geno), length(ind_idx)) if(!is.big.matrix(geno)) stop("genotype should be in 'big.matrix' format.") if(sum(is.na(phe[, 2])) != 0) stop("NAs are not allowed in phenotype.") - if(nrow(phe) != n) stop("number of individuals does not match in phenotype and genotype.") + if(is.null(ind_idx)){ + if(nrow(phe) != ncol(geno) && nrow(phe) != nrow(geno)) stop("number of individuals does not match in phenotype and genotype.") + n <- ifelse(nrow(phe) == ncol(geno), ncol(geno), nrow(geno)) + }else{ + n <- length(ind_idx) + if(nrow(phe) != n) stop("number of individuals does not match in phenotype and genotype.") + } echo=TRUE nm=nrow(map) @@ -233,14 +239,10 @@ theCV=CV if(!is.null(myRemove$bin)){ - if(length(myRemove$seqQTN) == 1){ - #myRemove$bin = as.matrix(myRemove$bin) - myRemove$bin = t(myRemove$bin) - } theCV=cbind(CV,myRemove$bin) } - myGLM=FarmCPU.LM(y=phe[,2],GDP=geno,GDP_index=ind_idx,w=theCV,ncpus=ncpus,npc=npc, verbose=verbose) + myGLM=FarmCPU.LM(y=phe[,2],GDP=geno,GDP_index=ind_idx,w=theCV,maxLine=maxLine,ncpus=ncpus,npc=npc, verbose=verbose) if(!is.null(seqQTN)){ if(ncol(myGLM$P) != (npc + length(seqQTN) + 1)) stop("wrong dimensions.") } @@ -474,7 +476,7 @@ #' @author Xiaolei Liu and Zhiwu Zhang #' #' @param Y a n by 2 matrix, the fist column is taxa id and the second is trait -#' @param GDP genotype, m by n matrix, m is marker size, n is sample size. This is Pure Genotype Data Matrix(GD). THERE IS NO COLUMN FOR TAXA +#' @param GDP genotype #' @param GDP_index the index of effective genotyped individuals #' @param GM SNP map information, m by 3 matrix, m is marker size, the three columns are SNP_ID, Chr, and Pos #' @param CV covariates, n by c matrix, n is sample size, c is number of covariates @@ -497,7 +499,12 @@ FarmCPU.BIN <- if(is.null(P)) return(list(bin=NULL,binmap=NULL,seqQTN=NULL)) - if(is.null(GDP_index)) GDP_index=seq(1, ncol(GDP)) + if(nrow(Y) == nrow(GDP)){ + if(is.null(GDP_index)) GDP_index=seq(1, nrow(GDP)) + }else{ + if(is.null(GDP_index)) GDP_index=seq(1, ncol(GDP)) + } + #Set upper bound for bin selection to squareroot of sample size n=nrow(Y) #bound=round(sqrt(n)/log10(n)) @@ -551,7 +558,11 @@ FarmCPU.BIN <- GP=cbind(GM,P,NA,NA,NA) mySpecify=FarmCPU.Specify(GI=GM,GP=GP,bin.size=bin,inclosure.size=inc) seqQTN=which(mySpecify$index==TRUE) - GK=t(GDP[seqQTN, GDP_index]) + if(nrow(Y) == nrow(GDP)){ + GK=GDP[GDP_index, seqQTN] + }else{ + GK=t(GDP[seqQTN, GDP_index]) + } myBurger=FarmCPU.Burger(Y=Y[,1:2], CV=CV, GK=GK, ncpus=ncpus, method=method) myREML=myBurger$REMLs myVG=myBurger$vg #it is unused @@ -595,7 +606,11 @@ FarmCPU.BIN <- GP=cbind(GM,P,NA,NA,NA) mySpecify=FarmCPU.Specify(GI=GM,GP=GP,bin.size=bin,inclosure.size=inc) seqQTN=which(mySpecify$index==TRUE) - GK=t(GDP[seqQTN,GDP_index]) + if(nrow(Y) == nrow(GDP)){ + GK=GDP[GDP_index, seqQTN] + }else{ + GK=t(GDP[seqQTN, GDP_index]) + } myBurger=FarmCPU.Burger(Y=Y[,1:2], CV=CV, GK=GK, ncpus=ncpus, method=method) myREML=myBurger$REMLs myVG=myBurger$vg #it is unused @@ -632,7 +647,11 @@ FarmCPU.BIN <- GP=cbind(GM,P,NA,NA,NA) mySpecify=FarmCPU.Specify(GI=GM,GP=GP,bin.size=bin[ii],inclosure.size=inc[ii]) seqQTN=which(mySpecify$index==TRUE) - GK=t(GDP[seqQTN,GDP_index]) + if(nrow(Y) == nrow(GDP)){ + GK=GDP[GDP_index, seqQTN] + }else{ + GK=t(GDP[seqQTN, GDP_index]) + } myBurger=FarmCPU.Burger(Y=Y[,1:2], CV=CV, GK=GK, ncpus=ncpus, method=method) myREML=myBurger$REMLs myVG=myBurger$vg #it is unused @@ -754,8 +773,9 @@ FarmCPU.Specify <- #' #' @param y one column matrix, dependent variable #' @param w covariates, n by c matrix, n is sample size, c is number of covariates -#' @param GDP genotype, m by n matrix, m is marker size, n is sample size. This is Pure Genotype Data Matrix(GD). THERE IS NO COLUMN FOR TAXA. +#' @param GDP genotype #' @param GDP_index index of effective genotyped individuals +#' @param maxLine the number of markers handled at a time, smaller value would reduce the memory cost #' @param ncpus number of threads used for parallele computation #' @param npc number of covariates without pseudo QTNs #' @param verbose whether to print detail. @@ -767,7 +787,7 @@ FarmCPU.Specify <- #' #' @keywords internal FarmCPU.LM <- - function(y, w=NULL, GDP, GDP_index=NULL, ncpus=2, npc=0, verbose=TRUE){ + function(y, w=NULL, GDP, GDP_index=NULL, maxLine=5000, ncpus=2, npc=0, verbose=TRUE){ #print("FarmCPU.LM started") if(is.null(y)) return(NULL) if(is.null(GDP)) return(NULL) @@ -843,7 +863,7 @@ FarmCPU.LM <- # if(ncol(w) == 50) write.csv(P, "P.csv") mkl_env({ - results <- glm_c(y=y, X=w, iXX = wwi, GDP@address, geno_ind=GDP_index, verbose=verbose, threads=ncpus) + results <- glm_c(y=y, X=w, iXX = wwi, GDP@address, geno_ind=GDP_index, step=maxLine, verbose=verbose, threads=ncpus) }) return(list(P=results[ ,-c(1:3), drop=FALSE], betapred=betapred, sepred=sepred, B=results[ , 1, drop=FALSE], S=results[ , 2, drop=FALSE])) # return(list(P=P, betapred=betapred, B=as.matrix(B), S=S)) @@ -904,7 +924,7 @@ FarmCPU.Burger <- } if(method=="EMMA"){ - K <- MVP.K.VanRaden(M=as.big.matrix(t(theGK)), verbose=FALSE) + K <- MVP.K.VanRaden(M=as.big.matrix(theGK), verbose=FALSE) myEMMAREML <- MVP.EMMA.Vg.Ve(y=matrix(Y[,-1],nrow(Y),1), X=theCV, K=K) REMLs=-2*myEMMAREML$REML delta=myEMMAREML$delta @@ -982,7 +1002,7 @@ FarmCPU.SUB <- #' #' @author Xiaolei Liu and Zhiwu Zhang #' -#' @param GDP genotype, m by n matrix, m is marker size, n is sample size. This is Pure Genotype Data Matrix(GD). THERE IS NO COLUMN FOR TAXA +#' @param GDP genotype #' @param GDP_index the index of effective genotyped individuals #' @param GM SNP map information, m by 3 matrix, m is marker size, the three columns are SNP_ID, Chr, and Pos #' @param seqQTN s by 1 vecter for index of QTN on GM @@ -1001,7 +1021,11 @@ FarmCPU.Remove <- if(is.null(seqQTN))return(list(bin=NULL,binmap=NULL,seqQTN=NULL)) seqQTN=seqQTN[order(seqQTN.p)] - if(is.null(GDP_index)) GDP_index=seq(1, ncol(GDP)) + if(nrow(GM) == ncol(GDP)){ + if(is.null(GDP_index)) GDP_index=seq(1, nrow(GDP)) + }else{ + if(is.null(GDP_index)) GDP_index=seq(1, ncol(GDP)) + } hugeNum=10e10 n=length(seqQTN) @@ -1025,7 +1049,12 @@ FarmCPU.Remove <- #Set sample ratio=.1 maxNum=100000 - m=nrow(GDP) + if(nrow(GM) == ncol(GDP)){ + m=ncol(GDP) + }else{ + m=nrow(GDP) + } + s=length(GDP_index) sampled=s @@ -1038,9 +1067,17 @@ FarmCPU.Remove <- #This section has problem of turning big.matrix to R matrix #It is OK as x is small if(is.big.matrix(GDP)){ - x=t(as.matrix(deepcopy(GDP,rows=seqQTN,cols=index))) + if(nrow(GM) == ncol(GDP)){ + x=deepcopy(GDP,cols=seqQTN,rows=index)[,, drop=FALSE] + }else{ + x=t(deepcopy(GDP,rows=seqQTN,cols=index)[,, drop=FALSE]) + } }else{ - x=t(GDP[seqQTN,index] ) + if(nrow(GM) == ncol(GDP)){ + x=GDP[index, seqQTN, drop=FALSE] + }else{ + x=t(GDP[seqQTN, index, drop=FALSE]) + } } r=cor(as.matrix(x)) @@ -1062,9 +1099,17 @@ FarmCPU.Remove <- #This section has problem of turning big.matrix to R matrix if(is.big.matrix(GDP)){ - bin=t(as.matrix(deepcopy(GDP,rows=seqQTN,cols=GDP_index) )) + if(nrow(GM) == ncol(GDP)){ + bin=deepcopy(GDP,cols=seqQTN,rows=GDP_index)[,, drop=FALSE] + }else{ + bin=t(deepcopy(GDP,rows=seqQTN,cols=GDP_index)[,, drop=FALSE]) + } }else{ - bin=t(GDP[seqQTN, GDP_index, drop=FALSE] ) + if(nrow(GM) == ncol(GDP)){ + bin=GDP[GDP_index, seqQTN, drop=FALSE] + }else{ + bin=t(GDP[seqQTN, GDP_index, drop=FALSE]) + } } binmap=GM[seqQTN, , drop=FALSE] diff --git a/R/MVP.GLM.r b/R/MVP.GLM.r index a7d0614..044e6da 100644 --- a/R/MVP.GLM.r +++ b/R/MVP.GLM.r @@ -19,10 +19,11 @@ #' @author Lilin Yin and Xiaolei Liu #' #' @param phe phenotype, n * 2 matrix -#' @param geno Genotype in numeric format, pure 0, 1, 2 matrix; m * n, m is marker size, n is population size +#' @param geno genotype, either m by n or n by m is supportable, m is marker size, n is population size #' @param CV Covariance, design matrix(n * x) for the fixed effects #' @param ind_idx the index of effective genotyped individuals #' @param mrk_idx the index of effective markers used in analysis +#' @param maxLine the number of markers handled at a time, smaller value would reduce the memory cost #' @param cpu number of cpus used for parallel computation #' @param verbose whether to print detail. #' @@ -38,7 +39,7 @@ #' print(dim(phenotype)) #' genoPath <- system.file("extdata", "06_mvp-impute", "mvp.imp.geno.desc", package = "rMVP") #' genotype <- attach.big.matrix(genoPath) -#' genotype <- deepcopy(genotype, cols=idx) +#' genotype <- deepcopy(genotype, rows=idx) #' print(dim(genotype)) #' #' glm <- MVP.GLM(phe=phenotype, geno=genotype, cpu=1) @@ -49,18 +50,24 @@ function( phe, geno, CV=NULL, - ind_idx = NULL, + ind_idx=NULL, mrk_idx=NULL, + maxLine=5000, cpu=1, verbose=TRUE ){ - n <- ifelse(is.null(ind_idx), ncol(geno), length(ind_idx)) - ys <- as.numeric(as.matrix(phe[,2])) - + if(is.null(ind_idx)){ + if(nrow(phe) != ncol(geno) && nrow(phe) != nrow(geno)) stop("number of individuals does not match in phenotype and genotype.") + n <- ifelse(nrow(phe) == ncol(geno), ncol(geno), nrow(geno)) + }else{ + n <- length(ind_idx) + if(nrow(phe) != n) stop("number of individuals does not match in phenotype and genotype.") + } + if(!is.big.matrix(geno)) stop("genotype should be in 'big.matrix' format.") + ys <- as.numeric(as.matrix(phe[,2])) if(sum(is.na(ys)) != 0) stop("NAs are not allowed in phenotype.") - if(nrow(phe) != n) stop("number of individuals does not match in phenotype and genotype.") if(is.null(CV)){ X0 <- matrix(1, n) @@ -77,7 +84,7 @@ function( logging.log("scanning...\n", verbose = verbose) mkl_env({ - results <- glm_c(y = ys, X = X0, iXX = iX0X0, geno@address, geno_ind = ind_idx, marker_ind = mrk_idx, verbose = verbose, threads = cpu) + results <- glm_c(y = ys, X = X0, iXX = iX0X0, geno@address, geno_ind = ind_idx, marker_ind = mrk_idx, step = maxLine, verbose = verbose, threads = cpu) }) return(results[, c(1, 2, ncol(results))]) diff --git a/R/MVP.K.VanRaden.r b/R/MVP.K.VanRaden.r index 169bac7..fe80911 100755 --- a/R/MVP.K.VanRaden.r +++ b/R/MVP.K.VanRaden.r @@ -13,13 +13,12 @@ #' Calculate Kinship matrix by VanRaden method #' -#' Build date: Dec 12, 2016 -#' Last update: Dec 12, 2019 -#' -#' @param M Genotype, m * n, m is marker size, n is population size +#' @param M genotype, either m by n or n by m is supportable, m is marker size, n is population size #' @param maxLine the number of markers handled at a time, smaller value would reduce the memory cost #' @param ind_idx the index of effective genotyped individuals used in analysis #' @param mrk_idx the index of effective markers used in analysis +#' @param mrk_freq the prior calculated major allele frequency (not MAF) for all markers used in analysis +#' @param mrk_bycol whether the markers are stored by columns in genotype (i.e. M is a n by m matrix) #' @param cpu the number of cpu #' @param verbose whether to print detail. #' @param checkNA whether to check NA in genotype. @@ -42,6 +41,8 @@ function( maxLine=5000, ind_idx=NULL, mrk_idx=NULL, + mrk_freq = NULL, + mrk_bycol = TRUE, cpu=1, verbose=TRUE, checkNA=TRUE @@ -53,19 +54,32 @@ function( mac <- (!linux) & (!wind) HPCMathLib <- mac | grepl("mkl", sessionInfo()$LAPACK) | grepl("openblas", sessionInfo()$LAPACK) | eval(parse(text = "!inherits(try(Revo.version,silent=TRUE),'try-error')")) - if (!is.big.matrix(M)) stop("Format of Genotype Data must be big.matrix") + if(!is.big.matrix(M)) stop("Format of Genotype Data must be big.matrix") if(checkNA){ - if(hasNA(M@address, geno_ind = ind_idx, threads = cpu)) stop("NA is not allowed in genotype, use 'MVP.Data.impute' to impute.") + if(hasNA(M@address, mrkbycol = mrk_bycol, geno_ind = ind_idx, marker_ind = mrk_idx, threads = cpu)) stop("NA is not allowed in genotype, use 'MVP.Data.impute' to impute.") } - n <- ifelse(is.null(ind_idx), ncol(M), length(ind_idx)) - m <- ifelse(is.null(mrk_idx), nrow(M), length(mrk_idx)) + + if(!is.null(mrk_freq)){ + if(!is.null(mrk_idx)){ + if(length(mrk_freq) != length(mrk_idx)) stop("mismatched number of markers between provided frequency and index.") + }else{ + if(mrk_bycol){ + if(length(mrk_freq) != ncol(M)) stop("mismatched number of markers between genotype and provided frequency.") + }else{ + if(length(mrk_freq) != nrow(M)) stop("mismatched number of markers between genotype and provided frequency.") + } + } + } + + n <- ifelse(is.null(ind_idx), ifelse(mrk_bycol, nrow(M), ncol(M)), length(ind_idx)) + m <- ifelse(is.null(mrk_idx), ifelse(mrk_bycol, ncol(M), nrow(M)), length(mrk_idx)) # logging.log("Relationship matrix mode in", priority[1], "\n", verbose = verbose) # if(is.null(dim(M))) M <- t(as.matrix(M)) logging.log(paste0("Computing GRM for ", n, " individuals using ", m, " markers with a step of ", maxLine), "\n", verbose = verbose) - K <- try(kin_cal(M@address, geno_ind = ind_idx, marker_ind = mrk_idx, threads = cpu, step = maxLine, verbose = verbose, mkl = HPCMathLib), silent=TRUE) + K <- try(kin_cal(M@address, geno_ind = ind_idx, marker_ind = mrk_idx, marker_freq = mrk_freq, marker_bycol = mrk_bycol, threads = cpu, step = maxLine, verbose = verbose, mkl = HPCMathLib), silent=TRUE) # K <- try(kin_cal_s(M@address, threads = cpu, verbose = verbose, mkl = HPCMathLib), silent=TRUE) if(inherits(K,"try-error")){ @@ -177,40 +191,3 @@ function( logging.log("Deriving relationship matrix successfully", "\n", verbose = verbose); gc() return(K) }#end of MVP.k.VanRaden function - -# #' Calculate Kinship matrix by Blocking strategy -# #' -# #' Build date: Apr 14, 2021 -# #' Last update: Apr 14, 2021 -# #' -# #' @param M Genotype, m * n, m is marker size, n is population size -# #' @param step Number of markers processed at one time -# #' -# #' @return K, n * n matrix -# #' @export -# #' -# #' @examples -# #' genoPath <- system.file("extdata", "06_mvp-impute", "mvp.imp.geno.desc", package = "rMVP") -# #' genotype <- attach.big.matrix(genoPath) -# #' print(dim(genotype)) -# #' -# #' K <- MVP.calk(genotype) -# #' -# MVP.calk <- function(M, step = 1000) { -# n_marker = nrow(M) -# n_sample = ncol(M) -# if (step > n_marker) { step = n_marker } -# idx = 0 -# sum = 0 -# K = matrix(0, n_sample, n_sample) -# while(idx < n_marker){ -# G_buffer = M[idx + seq_len(step), ] -# means = rowMeans(G_buffer) -# sum = sum + (0.5 * means) %*% (1 - 0.5 * means) -# K = K + crossprod(G_buffer - means) -# idx = idx + step -# if (idx + step > n_marker) { step = n_marker - idx } -# } -# K = K / (2 * drop(sum)) -# return(K) -# } diff --git a/R/MVP.MLM.r b/R/MVP.MLM.r index 820689c..1c2ad02 100644 --- a/R/MVP.MLM.r +++ b/R/MVP.MLM.r @@ -12,20 +12,18 @@ #' To perform GWAS with GLM and MLM model and get the P value of SNPs -#' -#' Build date: Aug 30, 2016 -#' Last update: Aug 30, 2016 #' #' @author Lilin Yin and Xiaolei Liu #' #' @param phe phenotype, n * 2 matrix -#' @param geno genotype, m * n, m is marker size, n is population size +#' @param geno genotype, either m by n or n by m is supportable, m is marker size, n is population size #' @param K Kinship, Covariance matrix(n * n) for random effects; must be positive semi-definite #' @param eigenK list of eigen Kinship #' @param CV covariates #' @param ind_idx the index of effective genotyped individuals #' @param mrk_idx the index of effective markers used in analysis #' @param REML a list that contains ve and vg +#' @param maxLine the number of markers handled at a time, smaller value would reduce the memory cost #' @param cpu number of cpus used for parallel computation #' @param vc.method the methods for estimating variance component("emma" or "he" or "brent") #' @param verbose whether to print detail. @@ -43,7 +41,7 @@ #' print(dim(phenotype)) #' genoPath <- system.file("extdata", "06_mvp-impute", "mvp.imp.geno.desc", package = "rMVP") #' genotype <- attach.big.matrix(genoPath) -#' genotype <- deepcopy(genotype, cols=idx) +#' genotype <- deepcopy(genotype, rows=idx) #' print(dim(genotype)) #' K <- MVP.K.VanRaden(genotype, cpu=1) #' @@ -62,18 +60,24 @@ function( ind_idx=NULL, mrk_idx=NULL, REML=NULL, + maxLine=5000, cpu=1, vc.method=c("BRENT", "EMMA", "HE"), verbose=TRUE ){ + if(is.null(ind_idx)){ + if(nrow(phe) != ncol(geno) && nrow(phe) != nrow(geno)) stop("number of individuals does not match in phenotype and genotype.") + n <- ifelse(nrow(phe) == ncol(geno), ncol(geno), nrow(geno)) + }else{ + n <- length(ind_idx) + if(nrow(phe) != n) stop("number of individuals does not match in phenotype and genotype.") + } vc.method <- match.arg(vc.method) - n <- ifelse(is.null(ind_idx), ncol(geno), length(ind_idx)) ys <- as.numeric(as.matrix(phe[,2])) if(!is.big.matrix(geno)) stop("genotype should be in 'big.matrix' format.") if(sum(is.na(ys)) != 0) stop("NAs are not allowed in phenotype.") - if(nrow(phe) != n) stop("number of individuals does not match in phenotype and genotype.") if(is.null(K)){ if(vc.method == "EMMA" | vc.method == "he") stop("Kinship must be provided!") @@ -121,7 +125,7 @@ function( logging.log("scanning...\n", verbose = verbose) mkl_env({ - results <- mlm_c(y = ys, X = X0, U = U, vgs = vgs, geno@address, ind_idx, mrk_idx, verbose = verbose, threads = cpu) + results <- mlm_c(y = ys, X = X0, U = U, vgs = vgs, geno@address, ind_idx, mrk_idx, step = maxLine, verbose = verbose, threads = cpu) }) return(results) diff --git a/R/MVP.PCA.r b/R/MVP.PCA.r index 9485120..2d8261d 100755 --- a/R/MVP.PCA.r +++ b/R/MVP.PCA.r @@ -13,15 +13,12 @@ #' Principal Component Analysis #' -#' Build date: Dec 14, 2016 -#' Last update: Oct 29, 2018 -#' @author Xiaolei Liu, Lilin Yin and Haohao Zhang -#' -#' @param M Genotype in numeric format, pure 0, 1, 2 matrix; m * n, m is marker size, n is population size +#' @param M genotype, either m by n or n by m is supportable, m is marker size, n is population size #' @param K kinship matrix #' @param maxLine the number of markers handled at a time, smaller value would reduce the memory cost #' @param ind_idx the index of effective genotyped individuals used in analysis #' @param mrk_idx the index of effective markers used in analysis +#' @param mrk_bycol whether the markers are stored by columns in genotype (i.e. M is a n by m matrix) #' @param pcs.keep maximum number of PCs for output #' @param cpu the number of cpu #' @param verbose whether to print detail. @@ -42,7 +39,7 @@ #' } #' MVP.PCA <- -function(M=NULL, K=NULL, maxLine=10000, ind_idx=NULL, mrk_idx=NULL, pcs.keep=5, cpu=1, verbose=TRUE){ +function(M=NULL, K=NULL, maxLine=10000, ind_idx=NULL, mrk_idx=NULL, mrk_bycol=TRUE, pcs.keep=5, cpu=1, verbose=TRUE){ #Data Check if (is.null(M) & is.null(K)) { @@ -58,7 +55,7 @@ function(M=NULL, K=NULL, maxLine=10000, ind_idx=NULL, mrk_idx=NULL, pcs.keep=5, } if(is.null(K)){ - K <- MVP.K.VanRaden(M=M, ind_idx = ind_idx, mrk_idx = mrk_idx, maxLine = maxLine, cpu = cpu, verbose = verbose) + K <- MVP.K.VanRaden(M=M, ind_idx = ind_idx, mrk_idx = mrk_idx, mrk_bycol = mrk_bycol, maxLine = maxLine, cpu = cpu, verbose = verbose) }else{ if(!is.null(ind_idx)) K <- K[ind_idx, ind_idx] } diff --git a/R/MVP.Utility.r b/R/MVP.Utility.r index 8e4dd72..d959e65 100755 --- a/R/MVP.Utility.r +++ b/R/MVP.Utility.r @@ -25,12 +25,12 @@ #' #' @examples #' MVP.Version() -MVP.Version <- function(width=60, verbose=TRUE) { +MVP.Version <- function(width=65, verbose=TRUE) { welcome <- "Welcome to MVP" - title <- "A Memory-efficient, Visualization-enhanced, and Parallel-accelerated Tool For GWAS" - authors <- c("Designed and Maintained by Lilin Yin, Haohao Zhang, and Xiaolei Liu", + title <- "an R package for Memory-efficient, Visualization-enhanced and Parallel-accelerated genome-wide association study" + authors <- c("Design & Maintain: Lilin Yin, Haohao Zhang, and Xiaolei Liu", "Contributors: Zhenshuang Tang, Jingya Xu, Dong Yin, Zhiwu Zhang, Xiaohui Yuan, Mengjin Zhu, Shuhong Zhao, Xinyun Li") - contact <- "Contact: xiaoleiliu@mail.hzau.edu.cn" + contact <- "Mailto: xiaoleiliu@mail.hzau.edu.cn, ylilin@mail.hzau.edu.cn" logo_s <- c(" __ __ __ __ ___", "| \\/ | \\ \\ / / | _ \\", "| |\\/| | \\ V / | _/", diff --git a/R/MVP.r b/R/MVP.r index a31f29c..e453aa0 100755 --- a/R/MVP.r +++ b/R/MVP.r @@ -18,13 +18,10 @@ #' Object 3: Estimate variance components using EMMA, FaST-LMM, and HE regression #' Object 4: Generate high-quality figures #' -#' Build date: Aug 30, 2017 -#' Last update: Dec 14, 2018 -#' #' @author Lilin Yin, Haohao Zhang, and Xiaolei Liu #' #' @param phe phenotype, n * 2 matrix, n is sample size -#' @param geno Genotype in bigmatrix format; m * n, m is marker size, n is sample size +#' @param geno genotype, either m by n or n by m is supportable, m is marker size, n is population size #' @param map SNP map information, SNP name, Chr, Pos #' @param K Kinship, Covariance matrix(n * n) for random effects, must be positive semi-definite #' @param nPC.GLM number of PCs added as fixed effects in GLM @@ -49,7 +46,7 @@ #' @param permutation.rep number of permutation replicates #' @param col for color of points in each chromosome on manhattan plot #' @param memo Character. A text marker on output files -#' @param tmppath the path of the temporary file +#' @param outpath the path of the output files #' @param file.output whether to output files or not #' @param file.type figure formats, "jpg", "tiff" #' @param dpi resolution for output figures @@ -57,7 +54,7 @@ #' @param verbose whether to print detail. #' #' @export -#' @return a m * 2 matrix, the first column is the SNP effect, the second column is the P values +#' @return #' Output: MVP.return$map - SNP map information, SNP name, Chr, Pos #' Output: MVP.return$glm.results - p-values obtained by GLM method #' Output: MVP.return$mlm.results - p-values obtained by MLM method @@ -116,14 +113,15 @@ function(phe, geno, map, K=NULL, nPC.GLM=NULL, nPC.MLM=NULL, nPC.FarmCPU=NULL, } logging.initialize("MVP", logging.outpath) - MVP.Version(width = 60, verbose = verbose) - logging.log("Start:", as.character(Sys.time()), "\n", verbose = verbose) + MVP.Version(width = 65, verbose = verbose) + time_start <- Sys.time() + logging.log("Start:", format(time_start, format = "%F %T %Z"), "\n", verbose = verbose) if ("log" %in% file.output) { logging.log("The log has been output to the file:", get("logging.file", envir = package.env), "\n", verbose = verbose) } vc.method <- match.arg(vc.method) - if (nrow(phe) != ncol(geno)) stop("The number of individuals in phenotype and genotype doesn't match!") - if (nrow(geno) != nrow(map)) stop("The number of markers in genotype and map doesn't match!") + if (nrow(phe) != ncol(geno) & nrow(phe) != nrow(geno)) stop("The number of individuals in phenotype and genotype doesn't match!") + if (nrow(map) != ncol(geno) & nrow(map) != nrow(geno)) stop("The number of markers in genotype and map doesn't match!") if (!is.big.matrix(geno)) stop("genotype should be in 'big.matrix' format.") #list -> matrix @@ -135,27 +133,39 @@ function(phe, geno, map, K=NULL, nPC.GLM=NULL, nPC.MLM=NULL, nPC.FarmCPU=NULL, na.index <- NULL if (!is.null(CV.GLM)) { CV.GLM <- as.matrix(CV.GLM) - if (nrow(CV.GLM) != ncol(geno)) stop("The number of individuals in covariates and genotype doesn't match!") + if (nrow(CV.GLM) != nrow(phe)) stop("The number of individuals in covariates and phenotype doesn't match!") na.index <- c(na.index, which(is.na(CV.GLM), arr.ind = TRUE)[, 1]) } if (!is.null(CV.MLM)) { CV.MLM <- as.matrix(CV.MLM) - if (nrow(CV.MLM) != ncol(geno)) stop("The number of individuals in covariates and genotype doesn't match!") + if (nrow(CV.MLM) != nrow(phe)) stop("The number of individuals in covariates and phenotype doesn't match!") na.index <- c(na.index, which(is.na(CV.MLM), arr.ind = TRUE)[, 1]) } if (!is.null(CV.FarmCPU)) { CV.FarmCPU <- as.matrix(CV.FarmCPU) - if (nrow(CV.FarmCPU) != ncol(geno)) stop("The number of individuals in covariates and genotype doesn't match!") + if (nrow(CV.FarmCPU) != nrow(phe)) stop("The number of individuals in covariates and phenotype doesn't match!") na.index <- c(na.index, which(is.na(CV.FarmCPU), arr.ind = TRUE)[, 1]) } na.index <- unique(na.index) #Data information - m <- nrow(geno) - n <- ncol(geno) - logging.log(paste("Input data has", n, "individuals,", m, "markers"), "\n", verbose = verbose) + MrkByCol <- nrow(phe) == nrow(geno) + m <- ifelse(MrkByCol, ncol(geno), nrow(geno)) + n <- nrow(phe) + logging.log(paste("Input data has", n, "individuals and", m, "markers"), "\n", verbose = verbose) + logging.log(paste("Markers are detected to be stored by", ifelse(MrkByCol, "column", "row")), "\n", verbose = verbose) logging.log("Analyzed trait:", colnames(phe)[2], "\n", verbose = verbose) logging.log("Number of threads used:", ncpus, "\n", verbose = verbose) + hpclib <- grepl("mkl", sessionInfo()$LAPACK) | grepl("openblas", sessionInfo()$LAPACK) | !inherits(try(Revo.version, silent=TRUE),"try-error") + if(!hpclib){ + logging.log("No high performance math library detected! The computational efficiency would be greatly reduced\n", verbose = verbose) + }else{ + if(grepl("mkl", sessionInfo()$LAPACK) | !inherits(try(Revo.version, silent=TRUE),"try-error")){ + logging.log("Math Kernel Library is detected, nice job!\n", verbose = verbose) + }else{ + logging.log("OpenBLAS Library is detected, nice job!\n", verbose = verbose) + } + } #remove samples with missing phenotype seqTaxa <- which(!is.na(phe[,2])) @@ -169,15 +179,23 @@ function(phe, geno, map, K=NULL, nPC.GLM=NULL, nPC.MLM=NULL, nPC.FarmCPU=NULL, if (!is.null(CV.GLM)) { CV.GLM = CV.GLM[seqTaxa, , drop = FALSE] } if (!is.null(CV.MLM)) { CV.MLM = CV.MLM[seqTaxa, , drop = FALSE] } if (!is.null(CV.FarmCPU)) { CV.FarmCPU = CV.FarmCPU[seqTaxa, , drop = FALSE] } - if(length(seqTaxa) < n * 0.85){ - geno <- deepcopy(geno, cols = seqTaxa) + if(length(seqTaxa) < n * 0.8){ + logging.log("Re-build memory-mapping file for remaining individuals", "\n", verbose = verbose) + if(!MrkByCol){ + geno <- deepcopy(geno, cols = seqTaxa) + }else{ + geno <- deepcopy(geno, rows = seqTaxa) + } seqTaxa <- NULL } } - logging.log("Check if NAs exist in genotype...", verbose = verbose) - if(hasNA(geno@address, geno_ind = seqTaxa, threads = ncpus)) stop("NA is not allowed in genotype, use 'MVP.Data.impute' to impute.") - logging.log("(OK!)", "\n", verbose = verbose) + logging.log("Check if NAs exist in genotype...", "\n", verbose = verbose) + if(hasNA(geno@address, mrkbycol = MrkByCol, geno_ind = seqTaxa, threads = ncpus)) stop("NA is not allowed in genotype, use 'MVP.Data.impute' to impute.") + + logging.log("Calculate allele frequency...", "\n", verbose = verbose) + marker_freq <- BigRowMean(geno@address, MrkByCol, threads = ncpus, geno_ind = seqTaxa) / 2 + map$MAF <- ifelse(marker_freq > 0.5, 1 - marker_freq, marker_freq) #remove SNPs with low MAF geno_marker_index <- NULL @@ -185,10 +203,7 @@ function(phe, geno, map, K=NULL, nPC.GLM=NULL, nPC.MLM=NULL, nPC.FarmCPU=NULL, if (!is.null(maf)){ if(length(maf) != 1) stop("maf should be a value") if(maf <= 0 || maf >= 0.5) stop("maf should be at the range of 0-0.5") - logging.log("Calculate allele frequency...", "\n", verbose = verbose) - marker_maf <- BigRowMean(geno@address, threads = ncpus, geno_ind = seqTaxa) / 2 - marker_maf <- ifelse(marker_maf > 0.5, 1 - marker_maf, marker_maf) - geno_marker_index <- which(marker_maf > maf) + geno_marker_index <- which(map$MAF >= maf) if(length(geno_marker_index) == 0) stop(paste("MAFs of all markers are smaller than the threshold", maf)) if(length(geno_marker_index) == 1) stop(paste("only 1 marker left on the given MAF threshold", maf)) if(length(geno_marker_index) == m) geno_marker_index <- NULL @@ -197,12 +212,23 @@ function(phe, geno, map, K=NULL, nPC.GLM=NULL, nPC.MLM=NULL, nPC.FarmCPU=NULL, logging.log("Total", m - length(geno_marker_index), "markers are removed at MAF threshold", maf, "\n", verbose = verbose) m <- length(geno_marker_index) map_sub <- map[geno_marker_index, ] - if(length(geno_marker_index) < m * 0.85){ + marker_freq <- marker_freq[geno_marker_index] + if(length(geno_marker_index) < m * 0.8){ if(!is.null(seqTaxa)){ - geno <- deepcopy(geno, cols = seqTaxa, rows = geno_marker_index) + logging.log("Re-build memory-mapping file for remaining individuals and markers", "\n", verbose = verbose) + if(MrkByCol){ + geno <- deepcopy(geno, rows = seqTaxa, cols = geno_marker_index) + }else{ + geno <- deepcopy(geno, cols = seqTaxa, rows = geno_marker_index) + } seqTaxa <- NULL }else{ - geno <- deepcopy(geno, rows = geno_marker_index) + logging.log("Re-build memory-mapping file for remaining markers", "\n", verbose = verbose) + if(MrkByCol){ + geno <- deepcopy(geno, cols = geno_marker_index) + }else{ + geno <- deepcopy(geno, rows = geno_marker_index) + } } geno_marker_index <- NULL } @@ -229,11 +255,13 @@ function(phe, geno, map, K=NULL, nPC.GLM=NULL, nPC.MLM=NULL, nPC.FarmCPU=NULL, if (!is.null(nPC) | "MLM" %in% method) { if (is.null(K)) { K <- MVP.K.VanRaden( - M = geno, - ind_idx = seqTaxa, - mrk_idx = geno_marker_index, - maxLine = maxLine, - cpu = ncpus, + M = geno, + ind_idx = seqTaxa, + mrk_idx = geno_marker_index, + mrk_freq = marker_freq, + mrk_bycol = MrkByCol, + maxLine = maxLine, + cpu = ncpus, verbose = verbose, checkNA = FALSE ) @@ -312,7 +340,7 @@ function(phe, geno, map, K=NULL, nPC.GLM=NULL, nPC.MLM=NULL, nPC.FarmCPU=NULL, logging.log("-------------------------GWAS Start-------------------------", "\n", verbose = verbose) if (glm.run) { logging.log("General Linear Model (GLM) Start...", "\n", verbose = verbose) - glm.results <- MVP.GLM(phe=phe, geno=geno, CV=CV.GLM, ind_idx=seqTaxa, mrk_idx=geno_marker_index, cpu=ncpus, verbose = verbose);gc() + glm.results <- MVP.GLM(phe=phe, geno=geno, CV=CV.GLM, ind_idx=seqTaxa, mrk_idx=geno_marker_index, maxLine = maxLine, cpu=ncpus, verbose = verbose);gc() colnames(glm.results) <- c("Effect", "SE", paste(colnames(phe)[2],"GLM",sep=".")) z = glm.results[, 1]/glm.results[, 2] lambda = median(z^2, na.rm=TRUE)/qchisq(1/2, df = 1,lower.tail=FALSE) @@ -327,7 +355,7 @@ function(phe, geno, map, K=NULL, nPC.GLM=NULL, nPC.MLM=NULL, nPC.FarmCPU=NULL, if (mlm.run) { logging.log("Mixed Linear Model (MLM) Start...", "\n", verbose = verbose) - mlm.results <- MVP.MLM(phe=phe, geno=geno, K=K, eigenK=eigenK, CV=CV.MLM, ind_idx=seqTaxa, mrk_idx=geno_marker_index, cpu=ncpus, vc.method=vc.method, verbose = verbose);gc() + mlm.results <- MVP.MLM(phe=phe, geno=geno, K=K, eigenK=eigenK, CV=CV.MLM, ind_idx=seqTaxa, mrk_idx=geno_marker_index, maxLine = maxLine, cpu=ncpus, vc.method=vc.method, verbose = verbose);gc() colnames(mlm.results) <- c("Effect", "SE", paste(colnames(phe)[2],"MLM",sep=".")) z = mlm.results[, 1]/mlm.results[, 2] lambda = median(z^2, na.rm=TRUE)/qchisq(1/2, df = 1,lower.tail=FALSE) @@ -342,7 +370,7 @@ function(phe, geno, map, K=NULL, nPC.GLM=NULL, nPC.MLM=NULL, nPC.FarmCPU=NULL, if (farmcpu.run) { logging.log("FarmCPU Start...", "\n", verbose = verbose) - farmcpu.results <- MVP.FarmCPU(phe=phe, geno=geno, map=map[,1:3], CV=CV.FarmCPU, ind_idx=seqTaxa, mrk_idx=geno_marker_index, ncpus=ncpus, memo="MVP.FarmCPU", p.threshold=p.threshold, QTN.threshold=QTN.threshold, method.bin=method.bin, bin.size=bin.size, bin.selection=bin.selection, maxLoop=maxLoop, verbose = verbose) + farmcpu.results <- MVP.FarmCPU(phe=phe, geno=geno, map=map[,1:3], CV=CV.FarmCPU, ind_idx=seqTaxa, mrk_idx=geno_marker_index, maxLine = maxLine, ncpus=ncpus, memo="MVP.FarmCPU", p.threshold=p.threshold, QTN.threshold=QTN.threshold, method.bin=method.bin, bin.size=bin.size, bin.selection=bin.selection, maxLoop=maxLoop, verbose = verbose) colnames(farmcpu.results) <- c("Effect", "SE", paste(colnames(phe)[2],"FarmCPU",sep=".")) z = farmcpu.results[, 1]/farmcpu.results[, 2] lambda = median(z^2, na.rm=TRUE)/qchisq(1/2, df = 1,lower.tail=FALSE) @@ -366,12 +394,12 @@ function(phe, geno, map, K=NULL, nPC.GLM=NULL, nPC.MLM=NULL, nPC.FarmCPU=NULL, myY.shuffle = phe myY.shuffle[,2] = myY.shuffle[index.shuffle,2] #GWAS using t.test... - myPermutation = MVP.GLM(phe=myY.shuffle[,c(1,2)], geno=geno, ind_idx=seqTaxa, mrk_idx=geno_marker_index, cpu=ncpus) + myPermutation = MVP.GLM(phe=myY.shuffle[,c(1,2)], geno=geno, ind_idx=seqTaxa, mrk_idx=geno_marker_index, maxLine=maxLine, cpu=ncpus) pvalue = min(myPermutation[,3],na.rm=TRUE) if(i==1){ - pvalue.final=pvalue - }else{ - pvalue.final=c(pvalue.final,pvalue) + pvalue.final=pvalue + }else{ + pvalue.final=c(pvalue.final,pvalue) } }#end of permutation.rep permutation.cutoff = sort(pvalue.final)[ceiling(permutation.rep*0.05)] @@ -451,11 +479,20 @@ function(phe, geno, map, K=NULL, nPC.GLM=NULL, nPC.MLM=NULL, nPC.FarmCPU=NULL, ) } } - now <- Sys.time() + time_end <- Sys.time() if (length(file.output) > 0) { logging.log("Results are stored at Working Directory:", outpath, "\n", verbose = verbose) } - logging.log("End:", as.character(now), "\n", verbose = verbose) + logging.log("End:", format(time_end, format = "%F %T %Z"), "\n", verbose = verbose) + time_diff <- as.numeric(time_end) - as.numeric(time_start) + h <- time_diff %/% 3600 + m <- (time_diff %% 3600) %/% 60 + s <- ((time_diff %% 3600) %% 60) + index <- which(c(h, m, s) != 0) + num <- c(h, m, s)[index] + num <- round(num, 0) + char <- c("h", "m", "s")[index] + logging.log("Total running time:", paste(num, char, sep="", collapse=""), "\n", verbose = verbose) print_accomplished(width = 60, verbose = verbose) return(invisible(MVP.return)) diff --git a/R/RcppExports.R b/R/RcppExports.R index 1b2c695..6cda37c 100644 --- a/R/RcppExports.R +++ b/R/RcppExports.R @@ -5,12 +5,12 @@ getRow <- function(pBigMat, row) { .Call(`_rMVP_getRow`, pBigMat, row) } -glm_c <- function(y, X, iXX, pBigMat, geno_ind = NULL, marker_ind = NULL, verbose = TRUE, threads = 0L) { - .Call(`_rMVP_glm_c`, y, X, iXX, pBigMat, geno_ind, marker_ind, verbose, threads) +glm_c <- function(y, X, iXX, pBigMat, geno_ind = NULL, marker_ind = NULL, step = 10000L, verbose = TRUE, threads = 0L) { + .Call(`_rMVP_glm_c`, y, X, iXX, pBigMat, geno_ind, marker_ind, step, verbose, threads) } -mlm_c <- function(y, X, U, vgs, pBigMat, geno_ind = NULL, marker_ind = NULL, verbose = TRUE, threads = 0L) { - .Call(`_rMVP_mlm_c`, y, X, U, vgs, pBigMat, geno_ind, marker_ind, verbose, threads) +mlm_c <- function(y, X, U, vgs, pBigMat, geno_ind = NULL, marker_ind = NULL, step = 10000L, verbose = TRUE, threads = 0L) { + .Call(`_rMVP_mlm_c`, y, X, U, vgs, pBigMat, geno_ind, marker_ind, step, verbose, threads) } vcf_parser_map <- function(vcf_file, out) { @@ -33,8 +33,8 @@ numeric_scan <- function(num_file) { .Call(`_rMVP_numeric_scan`, num_file) } -write_bfile <- function(pBigMat, bed_file, threads = 0L, verbose = TRUE) { - invisible(.Call(`_rMVP_write_bfile`, pBigMat, bed_file, threads, verbose)) +write_bfile <- function(pBigMat, bed_file, mrkbycol = TRUE, threads = 0L, verbose = TRUE) { + invisible(.Call(`_rMVP_write_bfile`, pBigMat, bed_file, mrkbycol, threads, verbose)) } read_bfile <- function(bed_file, pBigMat, maxLine, threads = 0L, verbose = TRUE) { @@ -53,16 +53,16 @@ geninv <- function(GG) { .Call(`_rMVP_geninv`, GG) } -impute_marker <- function(pBigMat, threads = 0L, verbose = TRUE) { - invisible(.Call(`_rMVP_impute_marker`, pBigMat, threads, verbose)) +impute_marker <- function(pBigMat, mrkbycol = TRUE, threads = 0L, verbose = TRUE) { + invisible(.Call(`_rMVP_impute_marker`, pBigMat, mrkbycol, threads, verbose)) } -hasNA <- function(pBigMat, geno_ind = NULL, threads = 1L) { - .Call(`_rMVP_hasNA`, pBigMat, geno_ind, threads) +hasNA <- function(pBigMat, mrkbycol = TRUE, geno_ind = NULL, marker_ind = NULL, threads = 1L) { + .Call(`_rMVP_hasNA`, pBigMat, mrkbycol, geno_ind, marker_ind, threads) } -BigRowMean <- function(pBigMat, threads = 0L, geno_ind = NULL) { - .Call(`_rMVP_BigRowMean`, pBigMat, threads, geno_ind) +BigRowMean <- function(pBigMat, mrkbycol = TRUE, threads = 0L, geno_ind = NULL) { + .Call(`_rMVP_BigRowMean`, pBigMat, mrkbycol, threads, geno_ind) } kin_cal_m <- function(pBigMat, threads = 0L, verbose = TRUE) { @@ -73,7 +73,7 @@ kin_cal_s <- function(pBigMat, threads = 0L, mkl = FALSE, verbose = TRUE) { .Call(`_rMVP_kin_cal_s`, pBigMat, threads, mkl, verbose) } -kin_cal <- function(pBigMat, geno_ind = NULL, marker_ind = NULL, threads = 0L, step = 10000L, mkl = FALSE, verbose = TRUE) { - .Call(`_rMVP_kin_cal`, pBigMat, geno_ind, marker_ind, threads, step, mkl, verbose) +kin_cal <- function(pBigMat, geno_ind = NULL, marker_ind = NULL, marker_freq = NULL, marker_bycol = TRUE, threads = 0L, step = 10000L, mkl = FALSE, verbose = TRUE) { + .Call(`_rMVP_kin_cal`, pBigMat, geno_ind, marker_ind, marker_freq, marker_bycol, threads, step, mkl, verbose) } diff --git a/README.md b/README.md index 515744f..5ddb3fd 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -# rMVP [![](https://img.shields.io/badge/Issues-%2B-brightgreen.svg)](https://github.com/XiaoleiLiuBio/rMVP/issues/new) [![](http://www.r-pkg.org/badges/version/rMVP?color=red)](https://cran.r-project.org/package=rMVP) [![CRAN Version](https://www.r-pkg.org/badges/version/rMVP?color=yellow)](https://CRAN.R-project.org/package=rMVP) [![](https://img.shields.io/badge/GitHub-1.2.0-blueviolet.svg)]() ![](http://cranlogs.r-pkg.org/badges/grand-total/rMVP?color=green) [![](https://cranlogs.r-pkg.org/badges/rMVP)](https://cran.r-project.org/package=rMVP) +# rMVP [![](https://img.shields.io/badge/Issues-%2B-brightgreen.svg)](https://github.com/XiaoleiLiuBio/rMVP/issues/new) [![](http://www.r-pkg.org/badges/version/rMVP?color=red)](https://cran.r-project.org/package=rMVP) [![CRAN Version](https://www.r-pkg.org/badges/version/rMVP?color=yellow)](https://CRAN.R-project.org/package=rMVP) [![](https://img.shields.io/badge/GitHub-1.3.0-blueviolet.svg)]() ![](http://cranlogs.r-pkg.org/badges/grand-total/rMVP?color=green) [![](https://cranlogs.r-pkg.org/badges/rMVP)](https://cran.r-project.org/package=rMVP) ## An [r](https://) package for [M](https://)emory-efficient, [V](https://)isualization-enhanced, and [P](https://)arallel-accelerated Genome-Wide Association Study @@ -227,7 +227,7 @@ MVP.Data(fileHMP=c("hmp.chr1.txt", "hmp.chr2.txt", "hmp.chr3.txt", "hmp.chr4.txt ### Numeric **[back to top](#contents)** -If you have genotype data in **Numeric** (m * n, m rows and n columns, m is the number of SNPs, n is the number of individuals) format:   +If you have genotype data in **Numeric** (either m * n or n * m is acceptable, m is the number of SNPs, n is the number of individuals) format:   **fileNum**, name of genotype data in Numeric format **filePhe**, name of phenotype file diff --git a/inst/extdata/05_mvp/mvp.geno.bin b/inst/extdata/05_mvp/mvp.geno.bin index 400f2cbd1194ee6833a9b5ec8bd9050dc8a466fd..3dc46b8aedd5ec0e9afea41601122bf646afe1da 100644 GIT binary patch literal 151 zcmZXMu@L|;2m*#RJjnX+*kr|tI9hYqbM{vj|i8{cPI_cBy}s{zaa literal 151 zcmYjJ2@(K61Zut9)3?JO8Ki(J08IBbr6AZ)8GiaQ=`KPv& geno_ind, const Nullable marker_ind, const bool verbose, const int threads); -RcppExport SEXP _rMVP_glm_c(SEXP ySEXP, SEXP XSEXP, SEXP iXXSEXP, SEXP pBigMatSEXP, SEXP geno_indSEXP, SEXP marker_indSEXP, SEXP verboseSEXP, SEXP threadsSEXP) { +SEXP glm_c(const arma::vec& y, const arma::mat& X, const arma::mat& iXX, SEXP pBigMat, const Nullable geno_ind, const Nullable marker_ind, const int step, const bool verbose, const int threads); +RcppExport SEXP _rMVP_glm_c(SEXP ySEXP, SEXP XSEXP, SEXP iXXSEXP, SEXP pBigMatSEXP, SEXP geno_indSEXP, SEXP marker_indSEXP, SEXP stepSEXP, SEXP verboseSEXP, SEXP threadsSEXP) { BEGIN_RCPP Rcpp::RObject rcpp_result_gen; Rcpp::RNGScope rcpp_rngScope_gen; @@ -36,15 +36,16 @@ BEGIN_RCPP Rcpp::traits::input_parameter< SEXP >::type pBigMat(pBigMatSEXP); Rcpp::traits::input_parameter< const Nullable >::type geno_ind(geno_indSEXP); Rcpp::traits::input_parameter< const Nullable >::type marker_ind(marker_indSEXP); + Rcpp::traits::input_parameter< const int >::type step(stepSEXP); Rcpp::traits::input_parameter< const bool >::type verbose(verboseSEXP); Rcpp::traits::input_parameter< const int >::type threads(threadsSEXP); - rcpp_result_gen = Rcpp::wrap(glm_c(y, X, iXX, pBigMat, geno_ind, marker_ind, verbose, threads)); + rcpp_result_gen = Rcpp::wrap(glm_c(y, X, iXX, pBigMat, geno_ind, marker_ind, step, verbose, threads)); return rcpp_result_gen; END_RCPP } // mlm_c -SEXP mlm_c(const arma::vec& y, const arma::mat& X, const arma::mat& U, const double vgs, SEXP pBigMat, const Nullable geno_ind, const Nullable marker_ind, const bool verbose, const int threads); -RcppExport SEXP _rMVP_mlm_c(SEXP ySEXP, SEXP XSEXP, SEXP USEXP, SEXP vgsSEXP, SEXP pBigMatSEXP, SEXP geno_indSEXP, SEXP marker_indSEXP, SEXP verboseSEXP, SEXP threadsSEXP) { +SEXP mlm_c(const arma::vec& y, const arma::mat& X, const arma::mat& U, const double vgs, SEXP pBigMat, const Nullable geno_ind, const Nullable marker_ind, const int step, const bool verbose, const int threads); +RcppExport SEXP _rMVP_mlm_c(SEXP ySEXP, SEXP XSEXP, SEXP USEXP, SEXP vgsSEXP, SEXP pBigMatSEXP, SEXP geno_indSEXP, SEXP marker_indSEXP, SEXP stepSEXP, SEXP verboseSEXP, SEXP threadsSEXP) { BEGIN_RCPP Rcpp::RObject rcpp_result_gen; Rcpp::RNGScope rcpp_rngScope_gen; @@ -55,9 +56,10 @@ BEGIN_RCPP Rcpp::traits::input_parameter< SEXP >::type pBigMat(pBigMatSEXP); Rcpp::traits::input_parameter< const Nullable >::type geno_ind(geno_indSEXP); Rcpp::traits::input_parameter< const Nullable >::type marker_ind(marker_indSEXP); + Rcpp::traits::input_parameter< const int >::type step(stepSEXP); Rcpp::traits::input_parameter< const bool >::type verbose(verboseSEXP); Rcpp::traits::input_parameter< const int >::type threads(threadsSEXP); - rcpp_result_gen = Rcpp::wrap(mlm_c(y, X, U, vgs, pBigMat, geno_ind, marker_ind, verbose, threads)); + rcpp_result_gen = Rcpp::wrap(mlm_c(y, X, U, vgs, pBigMat, geno_ind, marker_ind, step, verbose, threads)); return rcpp_result_gen; END_RCPP } @@ -126,15 +128,16 @@ BEGIN_RCPP END_RCPP } // write_bfile -void write_bfile(SEXP pBigMat, std::string bed_file, int threads, bool verbose); -RcppExport SEXP _rMVP_write_bfile(SEXP pBigMatSEXP, SEXP bed_fileSEXP, SEXP threadsSEXP, SEXP verboseSEXP) { +void write_bfile(SEXP pBigMat, std::string bed_file, bool mrkbycol, int threads, bool verbose); +RcppExport SEXP _rMVP_write_bfile(SEXP pBigMatSEXP, SEXP bed_fileSEXP, SEXP mrkbycolSEXP, SEXP threadsSEXP, SEXP verboseSEXP) { BEGIN_RCPP Rcpp::RNGScope rcpp_rngScope_gen; Rcpp::traits::input_parameter< SEXP >::type pBigMat(pBigMatSEXP); Rcpp::traits::input_parameter< std::string >::type bed_file(bed_fileSEXP); + Rcpp::traits::input_parameter< bool >::type mrkbycol(mrkbycolSEXP); Rcpp::traits::input_parameter< int >::type threads(threadsSEXP); Rcpp::traits::input_parameter< bool >::type verbose(verboseSEXP); - write_bfile(pBigMat, bed_file, threads, verbose); + write_bfile(pBigMat, bed_file, mrkbycol, threads, verbose); return R_NilValue; END_RCPP } @@ -194,40 +197,44 @@ BEGIN_RCPP END_RCPP } // impute_marker -void impute_marker(SEXP pBigMat, int threads, bool verbose); -RcppExport SEXP _rMVP_impute_marker(SEXP pBigMatSEXP, SEXP threadsSEXP, SEXP verboseSEXP) { +void impute_marker(SEXP pBigMat, bool mrkbycol, int threads, bool verbose); +RcppExport SEXP _rMVP_impute_marker(SEXP pBigMatSEXP, SEXP mrkbycolSEXP, SEXP threadsSEXP, SEXP verboseSEXP) { BEGIN_RCPP Rcpp::RNGScope rcpp_rngScope_gen; Rcpp::traits::input_parameter< SEXP >::type pBigMat(pBigMatSEXP); + Rcpp::traits::input_parameter< bool >::type mrkbycol(mrkbycolSEXP); Rcpp::traits::input_parameter< int >::type threads(threadsSEXP); Rcpp::traits::input_parameter< bool >::type verbose(verboseSEXP); - impute_marker(pBigMat, threads, verbose); + impute_marker(pBigMat, mrkbycol, threads, verbose); return R_NilValue; END_RCPP } // hasNA -bool hasNA(SEXP pBigMat, const Nullable geno_ind, const int threads); -RcppExport SEXP _rMVP_hasNA(SEXP pBigMatSEXP, SEXP geno_indSEXP, SEXP threadsSEXP) { +bool hasNA(SEXP pBigMat, bool mrkbycol, const Nullable geno_ind, const Nullable marker_ind, const int threads); +RcppExport SEXP _rMVP_hasNA(SEXP pBigMatSEXP, SEXP mrkbycolSEXP, SEXP geno_indSEXP, SEXP marker_indSEXP, SEXP threadsSEXP) { BEGIN_RCPP Rcpp::RObject rcpp_result_gen; Rcpp::RNGScope rcpp_rngScope_gen; Rcpp::traits::input_parameter< SEXP >::type pBigMat(pBigMatSEXP); + Rcpp::traits::input_parameter< bool >::type mrkbycol(mrkbycolSEXP); Rcpp::traits::input_parameter< const Nullable >::type geno_ind(geno_indSEXP); + Rcpp::traits::input_parameter< const Nullable >::type marker_ind(marker_indSEXP); Rcpp::traits::input_parameter< const int >::type threads(threadsSEXP); - rcpp_result_gen = Rcpp::wrap(hasNA(pBigMat, geno_ind, threads)); + rcpp_result_gen = Rcpp::wrap(hasNA(pBigMat, mrkbycol, geno_ind, marker_ind, threads)); return rcpp_result_gen; END_RCPP } // BigRowMean -arma::vec BigRowMean(SEXP pBigMat, int threads, const Nullable geno_ind); -RcppExport SEXP _rMVP_BigRowMean(SEXP pBigMatSEXP, SEXP threadsSEXP, SEXP geno_indSEXP) { +arma::vec BigRowMean(SEXP pBigMat, bool mrkbycol, int threads, const Nullable geno_ind); +RcppExport SEXP _rMVP_BigRowMean(SEXP pBigMatSEXP, SEXP mrkbycolSEXP, SEXP threadsSEXP, SEXP geno_indSEXP) { BEGIN_RCPP Rcpp::RObject rcpp_result_gen; Rcpp::RNGScope rcpp_rngScope_gen; Rcpp::traits::input_parameter< SEXP >::type pBigMat(pBigMatSEXP); + Rcpp::traits::input_parameter< bool >::type mrkbycol(mrkbycolSEXP); Rcpp::traits::input_parameter< int >::type threads(threadsSEXP); Rcpp::traits::input_parameter< const Nullable >::type geno_ind(geno_indSEXP); - rcpp_result_gen = Rcpp::wrap(BigRowMean(pBigMat, threads, geno_ind)); + rcpp_result_gen = Rcpp::wrap(BigRowMean(pBigMat, mrkbycol, threads, geno_ind)); return rcpp_result_gen; END_RCPP } @@ -259,43 +266,45 @@ BEGIN_RCPP END_RCPP } // kin_cal -SEXP kin_cal(SEXP pBigMat, const Nullable geno_ind, const Nullable marker_ind, int threads, size_t step, bool mkl, bool verbose); -RcppExport SEXP _rMVP_kin_cal(SEXP pBigMatSEXP, SEXP geno_indSEXP, SEXP marker_indSEXP, SEXP threadsSEXP, SEXP stepSEXP, SEXP mklSEXP, SEXP verboseSEXP) { +SEXP kin_cal(SEXP pBigMat, const Nullable geno_ind, const Nullable marker_ind, const Nullable marker_freq, const bool marker_bycol, int threads, size_t step, bool mkl, bool verbose); +RcppExport SEXP _rMVP_kin_cal(SEXP pBigMatSEXP, SEXP geno_indSEXP, SEXP marker_indSEXP, SEXP marker_freqSEXP, SEXP marker_bycolSEXP, SEXP threadsSEXP, SEXP stepSEXP, SEXP mklSEXP, SEXP verboseSEXP) { BEGIN_RCPP Rcpp::RObject rcpp_result_gen; Rcpp::RNGScope rcpp_rngScope_gen; Rcpp::traits::input_parameter< SEXP >::type pBigMat(pBigMatSEXP); Rcpp::traits::input_parameter< const Nullable >::type geno_ind(geno_indSEXP); Rcpp::traits::input_parameter< const Nullable >::type marker_ind(marker_indSEXP); + Rcpp::traits::input_parameter< const Nullable >::type marker_freq(marker_freqSEXP); + Rcpp::traits::input_parameter< const bool >::type marker_bycol(marker_bycolSEXP); Rcpp::traits::input_parameter< int >::type threads(threadsSEXP); Rcpp::traits::input_parameter< size_t >::type step(stepSEXP); Rcpp::traits::input_parameter< bool >::type mkl(mklSEXP); Rcpp::traits::input_parameter< bool >::type verbose(verboseSEXP); - rcpp_result_gen = Rcpp::wrap(kin_cal(pBigMat, geno_ind, marker_ind, threads, step, mkl, verbose)); + rcpp_result_gen = Rcpp::wrap(kin_cal(pBigMat, geno_ind, marker_ind, marker_freq, marker_bycol, threads, step, mkl, verbose)); return rcpp_result_gen; END_RCPP } static const R_CallMethodDef CallEntries[] = { {"_rMVP_getRow", (DL_FUNC) &_rMVP_getRow, 2}, - {"_rMVP_glm_c", (DL_FUNC) &_rMVP_glm_c, 8}, - {"_rMVP_mlm_c", (DL_FUNC) &_rMVP_mlm_c, 9}, + {"_rMVP_glm_c", (DL_FUNC) &_rMVP_glm_c, 9}, + {"_rMVP_mlm_c", (DL_FUNC) &_rMVP_mlm_c, 10}, {"_rMVP_vcf_parser_map", (DL_FUNC) &_rMVP_vcf_parser_map, 2}, {"_rMVP_vcf_parser_genotype", (DL_FUNC) &_rMVP_vcf_parser_genotype, 5}, {"_rMVP_hapmap_parser_map", (DL_FUNC) &_rMVP_hapmap_parser_map, 2}, {"_rMVP_hapmap_parser_genotype", (DL_FUNC) &_rMVP_hapmap_parser_genotype, 6}, {"_rMVP_numeric_scan", (DL_FUNC) &_rMVP_numeric_scan, 1}, - {"_rMVP_write_bfile", (DL_FUNC) &_rMVP_write_bfile, 4}, + {"_rMVP_write_bfile", (DL_FUNC) &_rMVP_write_bfile, 5}, {"_rMVP_read_bfile", (DL_FUNC) &_rMVP_read_bfile, 5}, {"_rMVP_fit_diago_brent", (DL_FUNC) &_rMVP_fit_diago_brent, 9}, {"_rMVP_crossprodcpp", (DL_FUNC) &_rMVP_crossprodcpp, 1}, {"_rMVP_geninv", (DL_FUNC) &_rMVP_geninv, 1}, - {"_rMVP_impute_marker", (DL_FUNC) &_rMVP_impute_marker, 3}, - {"_rMVP_hasNA", (DL_FUNC) &_rMVP_hasNA, 3}, - {"_rMVP_BigRowMean", (DL_FUNC) &_rMVP_BigRowMean, 3}, + {"_rMVP_impute_marker", (DL_FUNC) &_rMVP_impute_marker, 4}, + {"_rMVP_hasNA", (DL_FUNC) &_rMVP_hasNA, 5}, + {"_rMVP_BigRowMean", (DL_FUNC) &_rMVP_BigRowMean, 4}, {"_rMVP_kin_cal_m", (DL_FUNC) &_rMVP_kin_cal_m, 3}, {"_rMVP_kin_cal_s", (DL_FUNC) &_rMVP_kin_cal_s, 4}, - {"_rMVP_kin_cal", (DL_FUNC) &_rMVP_kin_cal, 7}, + {"_rMVP_kin_cal", (DL_FUNC) &_rMVP_kin_cal, 9}, {NULL, NULL, 0} }; diff --git a/src/assoc.cpp b/src/assoc.cpp index 7d67bdc..eb34db0 100755 --- a/src/assoc.cpp +++ b/src/assoc.cpp @@ -67,19 +67,20 @@ NumericVector getRow(SEXP pBigMat, const int row){ } template -SEXP glm_c(const arma::vec &y, const arma::mat &X, const arma::mat & iXX, XPtr pMat, const Nullable geno_ind = R_NilValue, const Nullable marker_ind = R_NilValue, const bool verbose = true, const int threads = 0){ +SEXP glm_c(const arma::vec &y, const arma::mat &X, const arma::mat & iXX, XPtr pMat, const Nullable geno_ind = R_NilValue, const Nullable marker_ind = R_NilValue, const int step = 10000, const bool verbose = true, const int threads = 0){ omp_setup(threads); MatrixAccessor genomat = MatrixAccessor(*pMat); - + bool marker_bycol = (y.n_elem == pMat->nrow()); + int n; uvec _geno_ind; if(geno_ind.isNotNull()){ _geno_ind = as(geno_ind) - 1; n = _geno_ind.n_elem; }else{ - n = pMat->ncol(); + n = marker_bycol ? pMat->nrow() : pMat->ncol(); } int m; @@ -88,7 +89,7 @@ SEXP glm_c(const arma::vec &y, const arma::mat &X, const arma::mat & iXX, XPtr(marker_ind) - 1; m = _marker_ind.n_elem; }else{ - m = pMat->nrow(); + m = marker_bycol ? pMat->ncol() : pMat->nrow(); } int q0 = X.n_cols; @@ -101,111 +102,174 @@ SEXP glm_c(const arma::vec &y, const arma::mat &X, const arma::mat & iXX, XPtr geno_ind = R_NilValue, const Nullable marker_ind = R_NilValue, const bool verbose = true, const int threads = 0){ +SEXP glm_c(const arma::vec & y, const arma::mat & X, const arma::mat & iXX, SEXP pBigMat, const Nullable geno_ind = R_NilValue, const Nullable marker_ind = R_NilValue, const int step = 10000, const bool verbose = true, const int threads = 0){ XPtr xpMat(pBigMat); switch(xpMat->matrix_type()){ case 1: - return glm_c(y, X, iXX, xpMat, geno_ind, marker_ind, verbose, threads); + return glm_c(y, X, iXX, xpMat, geno_ind, marker_ind, step, verbose, threads); case 2: - return glm_c(y, X, iXX, xpMat, geno_ind, marker_ind, verbose, threads); + return glm_c(y, X, iXX, xpMat, geno_ind, marker_ind, step, verbose, threads); case 4: - return glm_c(y, X, iXX, xpMat, geno_ind, marker_ind, verbose, threads); + return glm_c(y, X, iXX, xpMat, geno_ind, marker_ind, step, verbose, threads); case 8: - return glm_c(y, X, iXX, xpMat, geno_ind, marker_ind, verbose, threads); + return glm_c(y, X, iXX, xpMat, geno_ind, marker_ind, step, verbose, threads); default: throw Rcpp::exception("unknown type detected for big.matrix object!"); } } template -SEXP mlm_c(const arma::vec & y, const arma::mat & X, const arma::mat & U, const double vgs, XPtr pMat, const Nullable geno_ind = R_NilValue, const Nullable marker_ind = R_NilValue, const bool verbose = true, const int threads = 0){ +SEXP mlm_c(const arma::vec & y, const arma::mat & X, const arma::mat & U, const double vgs, XPtr pMat, const Nullable geno_ind = R_NilValue, const Nullable marker_ind = R_NilValue, const int step = 10000, const bool verbose = true, const int threads = 0){ omp_setup(threads); MatrixAccessor genomat = MatrixAccessor(*pMat); + bool marker_bycol = (y.n_elem == pMat->nrow()); int n; uvec _geno_ind; @@ -213,7 +277,7 @@ SEXP mlm_c(const arma::vec & y, const arma::mat & X, const arma::mat & U, const _geno_ind = as(geno_ind) - 1; n = _geno_ind.n_elem; }else{ - n = pMat->ncol(); + n = marker_bycol ? pMat->nrow() : pMat->ncol(); } int m; @@ -222,7 +286,7 @@ SEXP mlm_c(const arma::vec & y, const arma::mat & X, const arma::mat & U, const _marker_ind = as(marker_ind) - 1; m = _marker_ind.n_elem; }else{ - m = pMat->nrow(); + m = marker_bycol ? pMat->ncol() : pMat->nrow(); } int q0 = X.n_cols; @@ -237,79 +301,144 @@ SEXP mlm_c(const arma::vec & y, const arma::mat & X, const arma::mat & U, const arma::mat iUXUX = GInv(UX.t() * UX); arma::mat res(m, 3); - arma::vec snp(n); arma::mat iXXs(q0 + 1, q0 + 1); - #pragma omp parallel for schedule(dynamic) firstprivate(snp, iXXs) - for(int i = 0; i < m; i++){ + + arma::mat Z_buffer(n, step, fill::none); + int i = 0, j = 0; + int i_marker = 0; + for (;i < m;) { + + int cnt = 0; + for (; j < m && cnt < step; j++) + { + cnt++; + } + + if (cnt != step) { + Z_buffer.set_size(n, cnt); + } if(_geno_ind.is_empty()){ if(_marker_ind.is_empty()){ - for(int ii = 0; ii < n; ii++){ - snp[ii] = genomat[ii][i]; + if(marker_bycol){ + #pragma omp parallel for + for(int l = 0; l < cnt; l++){ + for(int k = 0; k < n; k++){ + Z_buffer(k, l) = genomat[(i_marker + l)][k]; + } + } + }else{ + #pragma omp parallel for + for(int k = 0; k < n; k++){ + for(int l = 0; l < cnt; l++){ + Z_buffer(k, l) = genomat[k][(i_marker + l)]; + } + } } }else{ - for(int ii = 0; ii < n; ii++){ - snp[ii] = genomat[ii][_marker_ind[i]]; + if(marker_bycol){ + #pragma omp parallel for + for(int l = 0; l < cnt; l++){ + for(int k = 0; k < n; k++){ + Z_buffer(k, l) = genomat[_marker_ind[(i_marker + l)]][k]; + } + } + }else{ + #pragma omp parallel for + for(int k = 0; k < n; k++){ + for(int l = 0; l < cnt; l++){ + Z_buffer(k, l) = genomat[k][_marker_ind[(i_marker + l)]]; + } + } } } }else{ if(_marker_ind.is_empty()){ - for(int ii = 0; ii < n; ii++){ - snp[ii] = genomat[_geno_ind[ii]][i]; + if(marker_bycol){ + #pragma omp parallel for + for(int l = 0; l < cnt; l++){ + for(int k = 0; k < n; k++){ + Z_buffer(k, l) = genomat[(i_marker + l)][_geno_ind[k]]; + } + } + }else{ + #pragma omp parallel for + for(int k = 0; k < n; k++){ + for(int l = 0; l < cnt; l++){ + Z_buffer(k, l) = genomat[_geno_ind[k]][(i_marker + l)]; + } + } } }else{ - for(int ii = 0; ii < n; ii++){ - snp[ii] = genomat[_geno_ind[ii]][_marker_ind[i]]; + if(marker_bycol){ + #pragma omp parallel for + for(int l = 0; l < cnt; l++){ + for(int k = 0; k < n; k++){ + Z_buffer(k, l) = genomat[_marker_ind[(i_marker + l)]][_geno_ind[k]]; + } + } + }else{ + #pragma omp parallel for + for(int k = 0; k < n; k++){ + for(int l = 0; l < cnt; l++){ + Z_buffer(k, l) = genomat[_geno_ind[k]][_marker_ind[(i_marker + l)]]; + } + } } } } - - arma::mat Us = U.t() * snp; - arma::mat UXUs = UX.t() * Us; - - double UsUs = as_scalar(Us.t() * Us); - double UsUy = as_scalar(Us.t() * Uy); - double B22 = UsUs - as_scalar(UXUs.t() * iUXUX * UXUs); - double invB22 = 1 / B22; - arma::mat B21 = UXUs.t() * iUXUX; - arma::mat NeginvB22B21 = -1 * invB22 * B21; - - iXXs(q0, q0)=invB22; - iXXs.submat(0, 0, q0 - 1, q0 - 1) = iUXUX + invB22 * B21.t() * B21; - iXXs(q0, span(0, q0 - 1)) = NeginvB22B21; - iXXs(span(0, q0 - 1), q0) = NeginvB22B21.t(); - - // statistics - arma::mat rhs(UXUy.n_rows + 1, 1); - rhs.rows(0, UXUy.n_rows - 1) = UXUy; - rhs(UXUy.n_rows, 0) = UsUy; - arma::mat beta = iXXs * rhs; - int df = n - q0 - 1; - - res(i, 0) = beta(q0, 0); - res(i, 1) = sqrt(iXXs(q0, q0) * vgs); - res(i, 2) = 2 * R::pt(abs(res(i, 0) / res(i, 1)), df, false, false); - progress.increment(); - } + #pragma omp parallel for firstprivate(iXXs) + for(int l = 0; l < cnt; l++){ + arma::mat Us = U.t() * Z_buffer.col(l); + arma::mat UXUs = UX.t() * Us; + double UsUs = as_scalar(Us.t() * Us); + double UsUy = as_scalar(Us.t() * Uy); + double B22 = UsUs - as_scalar(UXUs.t() * iUXUX * UXUs); + double invB22 = 1 / B22; + arma::mat B21 = UXUs.t() * iUXUX; + arma::mat NeginvB22B21 = -1 * invB22 * B21; + + iXXs(q0, q0)=invB22; + iXXs.submat(0, 0, q0 - 1, q0 - 1) = iUXUX + invB22 * B21.t() * B21; + iXXs(q0, span(0, q0 - 1)) = NeginvB22B21; + iXXs(span(0, q0 - 1), q0) = NeginvB22B21.t(); + + // statistics + arma::mat rhs(UXUy.n_rows + 1, 1); + rhs.rows(0, UXUy.n_rows - 1) = UXUy; + rhs(UXUy.n_rows, 0) = UsUy; + arma::mat beta = iXXs * rhs; + int df = n - q0 - 1; + + res(i_marker + l, 0) = beta(q0, 0); + res(i_marker + l, 1) = sqrt(iXXs(q0, q0) * vgs); + res(i_marker + l, 2) = 2 * R::pt(abs(res(i_marker + l, 0) / res(i_marker + l, 1)), df, false, false); + } + + i = j; + i_marker += cnt; + if(!Progress::check_abort()) progress.increment(cnt); + } + Z_buffer.reset(); return wrap(res); } // [[Rcpp::export]] -SEXP mlm_c(const arma::vec & y, const arma::mat & X, const arma::mat & U, const double vgs, SEXP pBigMat, const Nullable geno_ind = R_NilValue, const Nullable marker_ind = R_NilValue, const bool verbose = true, const int threads = 0){ +SEXP mlm_c(const arma::vec & y, const arma::mat & X, const arma::mat & U, const double vgs, SEXP pBigMat, const Nullable geno_ind = R_NilValue, const Nullable marker_ind = R_NilValue, const int step = 10000, const bool verbose = true, const int threads = 0){ XPtr xpMat(pBigMat); switch(xpMat->matrix_type()){ case 1: - return mlm_c(y, X, U, vgs, xpMat, geno_ind, marker_ind, verbose, threads); + return mlm_c(y, X, U, vgs, xpMat, geno_ind, marker_ind, step, verbose, threads); case 2: - return mlm_c(y, X, U, vgs, xpMat, geno_ind, marker_ind, verbose, threads); + return mlm_c(y, X, U, vgs, xpMat, geno_ind, marker_ind, step, verbose, threads); case 4: - return mlm_c(y, X, U, vgs, xpMat, geno_ind, marker_ind, verbose, threads); + return mlm_c(y, X, U, vgs, xpMat, geno_ind, marker_ind, step, verbose, threads); case 8: - return mlm_c(y, X, U, vgs, xpMat, geno_ind, marker_ind, verbose, threads); + return mlm_c(y, X, U, vgs, xpMat, geno_ind, marker_ind, step, verbose, threads); default: throw Rcpp::exception("unknown type detected for big.matrix object!"); } diff --git a/src/data_converter.cpp b/src/data_converter.cpp index 18c886d..ad52aec 100755 --- a/src/data_converter.cpp +++ b/src/data_converter.cpp @@ -138,7 +138,7 @@ void vcf_parser_genotype(std::string vcf_file, XPtr pMat, long maxLin // progress bar MinimalProgressBar_perc pb; - Progress progress(pMat->nrow(), verbose, pb); + Progress progress(pMat->ncol(), verbose, pb); // Skip Header string prefix("#CHROM"); @@ -179,7 +179,7 @@ void vcf_parser_genotype(std::string vcf_file, XPtr pMat, long maxLin // mat[j][m + i] = markers[j]; // } for(size_t j = 0; j < (l.size() - 9); j++) { - mat[j][m + i] = static_cast(vcf_marker_parser(l[j + 9], NA_C)); + mat[m + i][j] = static_cast(vcf_marker_parser(l[j + 9], NA_C)); } } progress.increment(buffer.size()); @@ -339,7 +339,7 @@ void hapmap_parser_genotype(std::string hmp_file, std::vector Major // progress bar MinimalProgressBar_perc pb; - Progress progress(pMat->nrow(), verbose, pb); + Progress progress(pMat->ncol(), verbose, pb); // Skip Header string prefix("rs#"); @@ -380,7 +380,7 @@ void hapmap_parser_genotype(std::string hmp_file, std::vector Major Rcpp::stop(("line " + to_string(m+i+2) + " does not have " + to_string(n_col) + " elements in HAPMAP file.").c_str()); major = Major[m + i][0]; for(size_t j = 0; j < (l.size() - 11); j++) { - mat[j][m + i] = static_cast(hapmap_marker_parser(l[j + 11], major, NA_C)); + mat[m + i][j] = static_cast(hapmap_marker_parser(l[j + 11], major, NA_C)); } } progress.increment(n_marker); @@ -436,7 +436,7 @@ List numeric_scan(std::string num_file) { // ***** BFILE ***** template -void write_bfile(XPtr pMat, std::string bed_file, double NA_C, int threads=0, bool verbose=true) { +void write_bfile(XPtr pMat, std::string bed_file, double NA_C, bool mrkbycol = true, int threads=0, bool verbose=true) { // check input string ending = ".bed"; if (bed_file.length() <= ending.length() || @@ -447,10 +447,10 @@ void write_bfile(XPtr pMat, std::string bed_file, double NA_C, int th // define T c; omp_setup(threads); - int m = pMat->nrow(); - int nind = pMat->ncol(); - int n = pMat->ncol() / 4; // 4 individual = 1 bit - if (pMat->ncol() % 4 != 0) + int m = (mrkbycol ? pMat->ncol() : pMat->nrow()); + int nind = (mrkbycol ? pMat->nrow() : pMat->ncol()); + int n = nind / 4; // 4 individual = 1 bit + if (nind % 4 != 0) n++; vector geno(n); @@ -460,7 +460,7 @@ void write_bfile(XPtr pMat, std::string bed_file, double NA_C, int th // progress bar MinimalProgressBar_perc pb; - Progress progress(pMat->nrow(), verbose, pb); + Progress progress(m, verbose, pb); // magic number of bfile const unsigned char magic_bytes[] = { 0x6c, 0x1b, 0x01 }; @@ -474,36 +474,52 @@ void write_bfile(XPtr pMat, std::string bed_file, double NA_C, int th code[static_cast(NA_C)] = 1; // write bfile - for (int i = 0; i < m; i++) { - #pragma omp parallel for private(c) - for (int j = 0; j < n; j++) { - uint8_t p = 0; - for (int x = 0; x < 4 && (4 * j + x) < nind; x++) { - c = mat[4 * j + x][i]; - p |= code[c] << (x*2); + if(mrkbycol){ + for (int i = 0; i < m; i++) { + #pragma omp parallel for private(c) + for (int j = 0; j < n; j++) { + uint8_t p = 0; + for (int x = 0; x < 4 && (4 * j + x) < nind; x++) { + c = mat[i][4 * j + x]; + p |= code[c] << (x*2); + } + geno[j] = p; } - geno[j] = p; + fwrite((char*)geno.data(), 1, geno.size(), fout); + progress.increment(); + } + }else{ + for (int i = 0; i < m; i++) { + #pragma omp parallel for private(c) + for (int j = 0; j < n; j++) { + uint8_t p = 0; + for (int x = 0; x < 4 && (4 * j + x) < nind; x++) { + c = mat[4 * j + x][i]; + p |= code[c] << (x*2); + } + geno[j] = p; + } + fwrite((char*)geno.data(), 1, geno.size(), fout); + progress.increment(); } - fwrite((char*)geno.data(), 1, geno.size(), fout); - progress.increment(); } fclose(fout); return; } // [[Rcpp::export]] -void write_bfile(SEXP pBigMat, std::string bed_file, int threads=0, bool verbose=true) { +void write_bfile(SEXP pBigMat, std::string bed_file, bool mrkbycol = true, int threads=0, bool verbose=true) { XPtr xpMat(pBigMat); switch(xpMat->matrix_type()) { case 1: - return write_bfile(xpMat, bed_file, NA_CHAR, threads, verbose); + return write_bfile(xpMat, bed_file, NA_CHAR, mrkbycol, threads, verbose); case 2: - return write_bfile(xpMat, bed_file, NA_SHORT, threads, verbose); + return write_bfile(xpMat, bed_file, NA_SHORT, mrkbycol, threads, verbose); case 4: - return write_bfile(xpMat, bed_file, NA_INTEGER, threads, verbose); + return write_bfile(xpMat, bed_file, NA_INTEGER, mrkbycol, threads, verbose); case 8: - return write_bfile(xpMat, bed_file, NA_REAL, threads, verbose); + return write_bfile(xpMat, bed_file, NA_REAL, mrkbycol, threads, verbose); default: throw Rcpp::exception("unknown type detected for big.matrix object!"); } @@ -518,7 +534,7 @@ void read_bfile(std::string bed_file, XPtr pMat, long maxLine, double // define omp_setup(threads); - size_t ind = pMat->ncol(); + size_t ind = pMat->nrow(); long n = ind / 4; // 4 individual = 1 bit if (ind % 4 != 0) n++; @@ -572,7 +588,7 @@ void read_bfile(std::string bed_file, XPtr pMat, long maxLine, double uint8_t p = buffer[j]; for (size_t x = 0; x < 4 && (c + x) < ind; x++) { - mat[c + x][r] = code[(p >> (2*x)) & 0x03]; + mat[r][c + x] = code[(p >> (2*x)) & 0x03]; } } progress.increment(); diff --git a/src/impute.cpp b/src/impute.cpp index 7b20ca9..edde606 100755 --- a/src/impute.cpp +++ b/src/impute.cpp @@ -23,116 +23,200 @@ using namespace Rcpp; using namespace arma; template -void impute_marker(XPtr pMat, int threads=0, bool verbose=true) { +void impute_marker(XPtr pMat, bool mrkbycol = true, int threads=0, bool verbose=true) { omp_setup(threads); - MinimalProgressBar_perc pb; - Progress progress(pMat->nrow(), verbose, pb); - MatrixAccessor mat = MatrixAccessor(*pMat); - const size_t n = pMat->ncol(); - const size_t m = pMat->nrow(); + const size_t n = (mrkbycol ? pMat->nrow() : pMat->ncol()); + const size_t m = (mrkbycol ? pMat->ncol() : pMat->nrow()); + + MinimalProgressBar_perc pb; + Progress progress(m, verbose, pb); // loop marker - #pragma omp parallel for - for (size_t i = 0; i < m; i++) { - std::vector na_index = {};; - size_t counts[3] = { 0 }; - - // count allele, record missing index - for (size_t j = 0; j < n; j++) { - switch(int(mat[j][i])) { - case 0: counts[0]++; break; - case 1: counts[1]++; break; - case 2: counts[2]++; break; - default: na_index.push_back(j); + if(mrkbycol){ + #pragma omp parallel for + for (size_t i = 0; i < m; i++) { + std::vector na_index = {};; + size_t counts[3] = { 0 }; + + // count allele, record missing index + for (size_t j = 0; j < n; j++) { + switch(int(mat[i][j])) { + case 0: counts[0]++; break; + case 1: counts[1]++; break; + case 2: counts[2]++; break; + default: na_index.push_back(j); + } } + + // find major allele + T major = counts[2] > counts[1] ? (counts[2] > counts[0] ? 2 : 0) : (counts[1] > counts[0] ? 1 : 0); + + // impute + for (auto&& x: na_index) { + mat[i][x] = major; + } + progress.increment(); } - - // find major allele - T major = counts[2] > counts[1] ? (counts[2] > counts[0] ? 2 : 0) : (counts[1] > counts[0] ? 1 : 0); - - // impute - for (auto&& x: na_index) { - mat[x][i] = major; + }else{ + #pragma omp parallel for + for (size_t i = 0; i < m; i++) { + std::vector na_index = {};; + size_t counts[3] = { 0 }; + + // count allele, record missing index + for (size_t j = 0; j < n; j++) { + switch(int(mat[j][i])) { + case 0: counts[0]++; break; + case 1: counts[1]++; break; + case 2: counts[2]++; break; + default: na_index.push_back(j); + } + } + + // find major allele + T major = counts[2] > counts[1] ? (counts[2] > counts[0] ? 2 : 0) : (counts[1] > counts[0] ? 1 : 0); + + // impute + for (auto&& x: na_index) { + mat[x][i] = major; + } + progress.increment(); } - progress.increment(); } } // [[Rcpp::export]] -void impute_marker(SEXP pBigMat, int threads=0, bool verbose=true) { +void impute_marker(SEXP pBigMat, bool mrkbycol = true, int threads=0, bool verbose=true) { XPtr xpMat(pBigMat); switch(xpMat->matrix_type()) { case 1: - return impute_marker(xpMat, threads, verbose); + return impute_marker(xpMat, mrkbycol, threads, verbose); case 2: - return impute_marker(xpMat, threads, verbose); + return impute_marker(xpMat, mrkbycol, threads, verbose); case 4: - return impute_marker(xpMat, threads, verbose); + return impute_marker(xpMat, mrkbycol, threads, verbose); case 8: - return impute_marker(xpMat, threads, verbose); + return impute_marker(xpMat, mrkbycol, threads, verbose); default: throw Rcpp::exception("unknown type detected for big.matrix object!"); } } template -bool hasNA(XPtr pMat, double NA_C, const Nullable geno_ind=R_NilValue, const int threads=1) { +bool hasNA(XPtr pMat, double NA_C, bool mrkbycol = true, const Nullable geno_ind = R_NilValue, const Nullable marker_ind = R_NilValue, const int threads = 1) { omp_setup(threads); - size_t m = pMat->nrow(); - size_t n; - - uvec _geno_ind; - if(geno_ind.isNotNull()){ - _geno_ind = as(geno_ind) - 1; - n = _geno_ind.n_elem; - }else{ - n = pMat->ncol(); - } bool HasNA = false; MatrixAccessor mat = MatrixAccessor(*pMat); - if(_geno_ind.is_empty()){ - #pragma omp parallel for shared(HasNA) - for (size_t j = 0; j < n; j++) { - if(HasNA) continue; - for (size_t i = 0; i < m; i++) { - if (mat[j][i] == NA_C) { - HasNA = true; + if(geno_ind.isNotNull()){ + uvec _geno_ind = as(geno_ind) - 1; + int n = _geno_ind.n_elem; + if(marker_ind.isNotNull()){ + uvec _marker_ind = as(marker_ind) - 1; + int m = _marker_ind.n_elem; + if(mrkbycol){ + #pragma omp parallel for shared(HasNA) + for (int j = 0; j < m; j++) { + if(HasNA) continue; + for (int i = 0; i < n; i++) { + if (mat[_marker_ind[j]][_geno_ind[i]] == NA_C) { + HasNA = true; + } + } + } + }else{ + #pragma omp parallel for shared(HasNA) + for (int j = 0; j < n; j++) { + if(HasNA) continue; + for (int i = 0; i < m; i++) { + if (mat[_geno_ind[j]][_marker_ind[i]] == NA_C) { + HasNA = true; + } + } + } + } + }else{ + if(mrkbycol){ + #pragma omp parallel for shared(HasNA) + for (int j = 0; j < pMat->ncol(); j++) { + if(HasNA) continue; + for (int i = 0; i < n; i++) { + if (mat[j][_geno_ind[i]] == NA_C) { + HasNA = true; + } + } + } + }else{ + #pragma omp parallel for shared(HasNA) + for (int j = 0; j < n; j++) { + if(HasNA) continue; + for (int i = 0; i < pMat->nrow(); i++) { + if (mat[_geno_ind[j]][i] == NA_C) { + HasNA = true; + } + } } } } }else{ - #pragma omp parallel for shared(HasNA) - for (size_t j = 0; j < n; j++) { - if(HasNA) continue; - for (size_t i = 0; i < m; i++) { - if (mat[_geno_ind[j]][i] == NA_C) { - HasNA = true; + if(marker_ind.isNotNull()){ + uvec _marker_ind = as(marker_ind) - 1; + int m = _marker_ind.n_elem; + if(mrkbycol){ + #pragma omp parallel for shared(HasNA) + for (int j = 0; j < m; j++) { + if(HasNA) continue; + for (int i = 0; i < pMat->nrow(); i++) { + if (mat[_marker_ind[j]][i] == NA_C) { + HasNA = true; + } + } + } + }else{ + #pragma omp parallel for shared(HasNA) + for (int j = 0; j < pMat->ncol(); j++) { + if(HasNA) continue; + for (int i = 0; i < m; i++) { + if (mat[j][_marker_ind[i]] == NA_C) { + HasNA = true; + } + } + } + } + }else{ + #pragma omp parallel for shared(HasNA) + for (int j = 0; j < pMat->ncol(); j++) { + if(HasNA) continue; + for (int i = 0; i < pMat->nrow(); i++) { + if (mat[j][i] == NA_C) { + HasNA = true; + } } } } } + return HasNA; } // [[Rcpp::export]] -bool hasNA(SEXP pBigMat, const Nullable geno_ind = R_NilValue, const int threads=1) { +bool hasNA(SEXP pBigMat, bool mrkbycol = true, const Nullable geno_ind = R_NilValue, const Nullable marker_ind = R_NilValue, const int threads=1) { XPtr xpMat(pBigMat); switch(xpMat->matrix_type()) { case 1: - return hasNA(xpMat, NA_CHAR, geno_ind, threads); + return hasNA(xpMat, NA_CHAR, mrkbycol, geno_ind, marker_ind, threads); case 2: - return hasNA(xpMat, NA_SHORT, geno_ind, threads); + return hasNA(xpMat, NA_SHORT, mrkbycol, geno_ind, marker_ind, threads); case 4: - return hasNA(xpMat, NA_INTEGER, geno_ind, threads); + return hasNA(xpMat, NA_INTEGER, mrkbycol, geno_ind, marker_ind, threads); case 8: - return hasNA(xpMat, NA_REAL, geno_ind, threads); + return hasNA(xpMat, NA_REAL, mrkbycol, geno_ind, marker_ind, threads); default: throw Rcpp::exception("unknown type detected for big.matrix object!"); } diff --git a/src/kinship.cpp b/src/kinship.cpp index 652ebbc..f993274 100755 --- a/src/kinship.cpp +++ b/src/kinship.cpp @@ -14,13 +14,13 @@ using namespace Rcpp; using namespace arma; template -arma::vec BigRowMean(XPtr pMat, int threads = 0, const Nullable geno_ind = R_NilValue){ +arma::vec BigRowMean(XPtr pMat, bool mrkbycol = true, int threads = 0, const Nullable geno_ind = R_NilValue){ omp_setup(threads); MatrixAccessor bigm = MatrixAccessor(*pMat); int n; - int m = pMat->nrow(); + int m = mrkbycol ? pMat->ncol() : pMat->nrow(); arma::vec mean(m); uvec _geno_ind; @@ -28,26 +28,48 @@ arma::vec BigRowMean(XPtr pMat, int threads = 0, const Nullable(geno_ind) - 1; n = _geno_ind.n_elem; }else{ - n = pMat->ncol(); + n = mrkbycol ? pMat->nrow() : pMat->ncol(); } - if(_geno_ind.is_empty()){ - #pragma omp parallel for - for (int j = 0; j < m; j++){ - double p1 = 0.0; - for(int k = 0; k < n; k++){ - p1 += bigm[k][j]; + if(mrkbycol){ + if(_geno_ind.is_empty()){ + #pragma omp parallel for + for (int j = 0; j < m; j++){ + double p1 = 0.0; + for(int k = 0; k < n; k++){ + p1 += bigm[j][k]; + } + mean[j] = p1 / n; + } + }else{ + #pragma omp parallel for + for (int j = 0; j < m; j++){ + double p1 = 0.0; + for(int k = 0; k < n; k++){ + p1 += bigm[j][_geno_ind[k]]; + } + mean[j] = p1 / n; } - mean[j] = p1 / n; } }else{ - #pragma omp parallel for - for (int j = 0; j < m; j++){ - double p1 = 0.0; - for(int k = 0; k < n; k++){ - p1 += bigm[_geno_ind[k]][j]; + if(_geno_ind.is_empty()){ + #pragma omp parallel for + for (int j = 0; j < m; j++){ + double p1 = 0.0; + for(int k = 0; k < n; k++){ + p1 += bigm[k][j]; + } + mean[j] = p1 / n; + } + }else{ + #pragma omp parallel for + for (int j = 0; j < m; j++){ + double p1 = 0.0; + for(int k = 0; k < n; k++){ + p1 += bigm[_geno_ind[k]][j]; + } + mean[j] = p1 / n; } - mean[j] = p1 / n; } } @@ -55,19 +77,19 @@ arma::vec BigRowMean(XPtr pMat, int threads = 0, const Nullable geno_ind = R_NilValue){ +arma::vec BigRowMean(SEXP pBigMat, bool mrkbycol = true, int threads = 0, const Nullable geno_ind = R_NilValue){ XPtr xpMat(pBigMat); switch(xpMat->matrix_type()) { case 1: - return BigRowMean(xpMat, threads, geno_ind); + return BigRowMean(xpMat, mrkbycol, threads, geno_ind); case 2: - return BigRowMean(xpMat, threads, geno_ind); + return BigRowMean(xpMat, mrkbycol, threads, geno_ind); case 4: - return BigRowMean(xpMat, threads, geno_ind); + return BigRowMean(xpMat, mrkbycol, threads, geno_ind); case 8: - return BigRowMean(xpMat, threads, geno_ind); + return BigRowMean(xpMat, mrkbycol, threads, geno_ind); default: throw Rcpp::exception("unknown type detected for big.matrix object!"); } @@ -228,7 +250,7 @@ SEXP kin_cal_s(SEXP pBigMat, int threads = 0, bool mkl = false, bool verbose = t template -SEXP kin_cal(XPtr pMat, const Nullable geno_ind = R_NilValue, const Nullable marker_ind = R_NilValue, int threads = 0, int step = 5000, bool mkl = false, bool verbose = true){ +SEXP kin_cal(XPtr pMat, const Nullable geno_ind = R_NilValue, const Nullable marker_ind = R_NilValue, const Nullable marker_freq = R_NilValue, const bool marker_bycol = true, int threads = 0, int step = 5000, bool mkl = false, bool verbose = true){ omp_setup(threads); @@ -248,7 +270,7 @@ SEXP kin_cal(XPtr pMat, const Nullable geno_ind = R_NilVa _geno_ind = as(geno_ind) - 1; n = _geno_ind.n_elem; }else{ - n = pMat->ncol(); + n = marker_bycol ? pMat->nrow() : pMat->ncol(); } int m; @@ -257,52 +279,102 @@ SEXP kin_cal(XPtr pMat, const Nullable geno_ind = R_NilVa _marker_ind = as(marker_ind) - 1; m = _marker_ind.n_elem; }else{ - m = pMat->nrow(); + m = marker_bycol ? pMat->ncol() : pMat->nrow(); } - arma::vec means(m); - if(_geno_ind.is_empty()){ - if(_marker_ind.is_empty()){ - #pragma omp parallel for - for(int j = 0; j < m; j++){ - double p1 = 0.0; - for(int k = 0; k < n; k++){ - p1 += bigm[k][j]; - } - means[j] = p1 / n; - } - }else{ - #pragma omp parallel for - for(int j = 0; j < m; j++){ - double p1 = 0.0; - for(int k = 0; k < n; k++){ - p1 += bigm[k][_marker_ind[j]]; - } - means[j] = p1 / n; - } - } + vec means; + if(marker_freq.isNotNull()){ + means = as(marker_freq) * 2; }else{ - if(_marker_ind.is_empty()){ - #pragma omp parallel for - for(int j = 0; j < m; j++){ - double p1 = 0.0; - for(int k = 0; k < n; k++){ - p1 += bigm[_geno_ind[k]][j]; + means.resize(m); + if(_geno_ind.is_empty()){ + if(_marker_ind.is_empty()){ + if(marker_bycol){ + #pragma omp parallel for + for(int j = 0; j < m; j++){ + double p1 = 0.0; + for(int k = 0; k < n; k++){ + p1 += bigm[j][k]; + } + means[j] = p1 / n; + } + }else{ + #pragma omp parallel for + for(int j = 0; j < m; j++){ + double p1 = 0.0; + for(int k = 0; k < n; k++){ + p1 += bigm[k][j]; + } + means[j] = p1 / n; + } + } + }else{ + if(marker_bycol){ + #pragma omp parallel for + for(int j = 0; j < m; j++){ + double p1 = 0.0; + for(int k = 0; k < n; k++){ + p1 += bigm[_marker_ind[j]][k]; + } + means[j] = p1 / n; + } + }else{ + #pragma omp parallel for + for(int j = 0; j < m; j++){ + double p1 = 0.0; + for(int k = 0; k < n; k++){ + p1 += bigm[k][_marker_ind[j]]; + } + means[j] = p1 / n; + } } - means[j] = p1 / n; } }else{ - #pragma omp parallel for - for(int j = 0; j < m; j++){ - double p1 = 0.0; - for(int k = 0; k < n; k++){ - p1 += bigm[_geno_ind[k]][_marker_ind[j]]; + if(_marker_ind.is_empty()){ + if(marker_bycol){ + #pragma omp parallel for + for(int j = 0; j < m; j++){ + double p1 = 0.0; + for(int k = 0; k < n; k++){ + p1 += bigm[j][_geno_ind[k]]; + } + means[j] = p1 / n; + } + }else{ + #pragma omp parallel for + for(int j = 0; j < m; j++){ + double p1 = 0.0; + for(int k = 0; k < n; k++){ + p1 += bigm[_geno_ind[k]][j]; + } + means[j] = p1 / n; + } + } + }else{ + if(marker_bycol){ + #pragma omp parallel for + for(int j = 0; j < m; j++){ + double p1 = 0.0; + for(int k = 0; k < n; k++){ + p1 += bigm[_marker_ind[j]][_geno_ind[k]]; + } + means[j] = p1 / n; + } + }else{ + #pragma omp parallel for + for(int j = 0; j < m; j++){ + double p1 = 0.0; + for(int k = 0; k < n; k++){ + p1 += bigm[_geno_ind[k]][_marker_ind[j]]; + } + means[j] = p1 / n; + } } - means[j] = p1 / n; } } + if(means.has_nan()) throw Rcpp::exception("NA is not allowed in genotype, use 'MVP.Data.impute' to impute!"); } - + arma::mat kin = zeros(n, n); arma::mat Z_buffer(step, n, fill::none); @@ -325,33 +397,69 @@ SEXP kin_cal(XPtr pMat, const Nullable geno_ind = R_NilVa if(_geno_ind.is_empty()){ if(_marker_ind.is_empty()){ - #pragma omp parallel for - for(int k = 0; k < n; k++){ + if(marker_bycol){ + #pragma omp parallel for for(int l = 0; l < cnt; l++){ - Z_buffer(l, k) = bigm[k][(i_marker + l)] - means[(i_marker + l)]; + for(int k = 0; k < n; k++){ + Z_buffer(l, k) = bigm[(i_marker + l)][k] - means[(i_marker + l)]; + } + } + }else{ + #pragma omp parallel for + for(int k = 0; k < n; k++){ + for(int l = 0; l < cnt; l++){ + Z_buffer(l, k) = bigm[k][(i_marker + l)] - means[(i_marker + l)]; + } } } }else{ - #pragma omp parallel for - for(int k = 0; k < n; k++){ + if(marker_bycol){ + #pragma omp parallel for for(int l = 0; l < cnt; l++){ - Z_buffer(l, k) = bigm[k][_marker_ind[(i_marker + l)]] - means[(i_marker + l)]; + for(int k = 0; k < n; k++){ + Z_buffer(l, k) = bigm[_marker_ind[(i_marker + l)]][k] - means[(i_marker + l)]; + } + } + }else{ + #pragma omp parallel for + for(int k = 0; k < n; k++){ + for(int l = 0; l < cnt; l++){ + Z_buffer(l, k) = bigm[k][_marker_ind[(i_marker + l)]] - means[(i_marker + l)]; + } } } } }else{ if(_marker_ind.is_empty()){ - #pragma omp parallel for - for(int k = 0; k < n; k++){ + if(marker_bycol){ + #pragma omp parallel for for(int l = 0; l < cnt; l++){ - Z_buffer(l, k) = bigm[_geno_ind[k]][(i_marker + l)] - means[(i_marker + l)]; + for(int k = 0; k < n; k++){ + Z_buffer(l, k) = bigm[(i_marker + l)][_geno_ind[k]] - means[(i_marker + l)]; + } + } + }else{ + #pragma omp parallel for + for(int k = 0; k < n; k++){ + for(int l = 0; l < cnt; l++){ + Z_buffer(l, k) = bigm[_geno_ind[k]][(i_marker + l)] - means[(i_marker + l)]; + } } } }else{ - #pragma omp parallel for - for(int k = 0; k < n; k++){ + if(marker_bycol){ + #pragma omp parallel for for(int l = 0; l < cnt; l++){ - Z_buffer(l, k) = bigm[_geno_ind[k]][_marker_ind[(i_marker + l)]] - means[(i_marker + l)]; + for(int k = 0; k < n; k++){ + Z_buffer(l, k) = bigm[_marker_ind[(i_marker + l)]][_geno_ind[k]] - means[(i_marker + l)]; + } + } + }else{ + #pragma omp parallel for + for(int k = 0; k < n; k++){ + for(int l = 0; l < cnt; l++){ + Z_buffer(l, k) = bigm[_geno_ind[k]][_marker_ind[(i_marker + l)]] - means[(i_marker + l)]; + } } } } @@ -391,19 +499,19 @@ SEXP kin_cal(XPtr pMat, const Nullable geno_ind = R_NilVa } // [[Rcpp::export]] -SEXP kin_cal(SEXP pBigMat, const Nullable geno_ind = R_NilValue, const Nullable marker_ind = R_NilValue, int threads = 0, size_t step = 10000, bool mkl = false, bool verbose = true){ +SEXP kin_cal(SEXP pBigMat, const Nullable geno_ind = R_NilValue, const Nullable marker_ind = R_NilValue, const Nullable marker_freq = R_NilValue, const bool marker_bycol = true, int threads = 0, size_t step = 10000, bool mkl = false, bool verbose = true){ XPtr xpMat(pBigMat); switch(xpMat->matrix_type()){ case 1: - return kin_cal(xpMat, geno_ind, marker_ind, threads, step, mkl, verbose); + return kin_cal(xpMat, geno_ind, marker_ind, marker_freq, marker_bycol, threads, step, mkl, verbose); case 2: - return kin_cal(xpMat, geno_ind, marker_ind, threads, step, mkl, verbose); + return kin_cal(xpMat, geno_ind, marker_ind, marker_freq, marker_bycol, threads, step, mkl, verbose); case 4: - return kin_cal(xpMat, geno_ind, marker_ind, threads, step, mkl, verbose); + return kin_cal(xpMat, geno_ind, marker_ind, marker_freq, marker_bycol, threads, step, mkl, verbose); case 8: - return kin_cal(xpMat, geno_ind, marker_ind, threads, step, mkl, verbose); + return kin_cal(xpMat, geno_ind, marker_ind, marker_freq, marker_bycol, threads, step, mkl, verbose); default: throw Rcpp::exception("unknown type detected for big.matrix object!"); } diff --git a/src/mvp_omp.h b/src/mvp_omp.h index 8eb9c99..de1d07f 100755 --- a/src/mvp_omp.h +++ b/src/mvp_omp.h @@ -17,7 +17,6 @@ #if defined(_OPENMP) #include -#else #endif #include