0.1.7 speedups and RAM decrease
rwdavies committed Apr 22, 2021
1 parent cd684f0 commit 6bd7b0b
Showing 15 changed files with 431 additions and 192 deletions.
2 changes: 2 additions & 0 deletions CHANGELOG.md
@@ -1,3 +1,5 @@
* v0.1.7
* Speedups and RAM decrease when using reference panels with fewer haplotypes
* v0.1.6
* Discovered bug where only 1 read caused an error. Fix (obviate) this by requiring 2 or more reads through parameter minimum_number_of_sample_reads with default currently 2, where samples with fewer than 2 reads have all missing output
* v0.1.5
6 changes: 3 additions & 3 deletions QUILT.R
@@ -388,9 +388,9 @@ option_list <- list(
),
make_option(
"--use_small_eHapsCurrent_tc",
type = "integer",
help = "For testing purposes only [default TRUE] ",
default = TRUE
type = "logical",
help = "For testing purposes only [default FALSE] ",
default = FALSE
)
)
opt <- suppressWarnings(parse_args(OptionParser(option_list = option_list)))
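
An aside on the option change above: with type = "logical", optparse stores a supplied TRUE/FALSE value as a logical and otherwise falls back on the new FALSE default. A minimal standalone sketch (the one-option list here is illustrative, not the full QUILT.R option list):

library("optparse")

opts <- list(
    make_option(
        "--use_small_eHapsCurrent_tc",
        type = "logical",
        help = "For testing purposes only [default FALSE] ",
        default = FALSE
    )
)

## no flag given: the FALSE default is returned
parse_args(OptionParser(option_list = opts))$use_small_eHapsCurrent_tc

## flag given as a string: coerced to logical TRUE
parse_args(
    OptionParser(option_list = opts),
    args = c("--use_small_eHapsCurrent_tc=TRUE")
)$use_small_eHapsCurrent_tc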
4 changes: 2 additions & 2 deletions QUILT/DESCRIPTION
@@ -1,8 +1,8 @@
Package: QUILT
Type: Package
Title: QUILT
Version: 0.1.6
Date: 2021-04-09
Version: 0.1.7
Date: 2021-04-22
Author: Robert William Davies
Maintainer: Robert William Davies <[email protected]>
Description: QUILT performs imputation of individuals sequenced to low coverage
4 changes: 2 additions & 2 deletions QUILT/R/RcppExports.R
@@ -182,8 +182,8 @@ rcpp_fly_weighter <- function(i, gCurrent, gAverage, ll_current, log_mult, ll_re
}

#' @export
rcpp_forwardBackwardGibbsNIPT <- function(sampleReads, priorCurrent_m, alphaMatCurrent_tc, eHapsCurrent_tc, transMatRate_tc_H, ff, blocks_for_output, alphaHat_t1, betaHat_t1, alphaHat_t2, betaHat_t2, alphaHat_t3, betaHat_t3, hapSum_tc, hapMatcher, distinctHapsIE, rhb_t, ref_error, which_haps_to_use, wif0, grid_has_read, L_grid, smooth_cm, param_list, Jmax_local = 100L, maxDifferenceBetweenReads = 1000, maxEmissionMatrixDifference = 10000000000, run_fb_subset = FALSE, run_fb_grid_offset = 0L, grid = 0L, snp_start_1_based = -1L, snp_end_1_based = -1L, generate_fb_snp_offsets = FALSE, suppressOutput = 1L, n_gibbs_starts = 1L, n_gibbs_sample_its = 1L, n_gibbs_burn_in_its = 1L, use_starting_read_labels = FALSE, verbose = FALSE, double_list_of_starting_read_labels = NULL, seed_vector = -1L, prev_list_of_alphaBetaBlocks = NULL, i_snp_block_for_alpha_beta = 1L, haploid_gibbs_equal_weighting = TRUE, gibbs_initialize_iteratively = FALSE, gibbs_initialize_at_first_read = TRUE, do_block_resampling = FALSE, artificial_relabel = -1L, pass_in_alphaBeta = FALSE, update_in_place = FALSE, update_hapSum = FALSE, record_read_set = FALSE, class_sum_cutoff = 0.06, perform_block_gibbs = FALSE, shuffle_bin_radius = 5000L, block_gibbs_iterations = as.integer( c(0)), rescale_eMatRead_t = TRUE, use_smooth_cm_in_block_gibbs = FALSE, block_gibbs_quantile_prob = 0.9, do_shard_ff0_block_gibbs = TRUE, use_small_eHapsCurrent_tc = TRUE) {
.Call('_QUILT_rcpp_forwardBackwardGibbsNIPT', PACKAGE = 'QUILT', sampleReads, priorCurrent_m, alphaMatCurrent_tc, eHapsCurrent_tc, transMatRate_tc_H, ff, blocks_for_output, alphaHat_t1, betaHat_t1, alphaHat_t2, betaHat_t2, alphaHat_t3, betaHat_t3, hapSum_tc, hapMatcher, distinctHapsIE, rhb_t, ref_error, which_haps_to_use, wif0, grid_has_read, L_grid, smooth_cm, param_list, Jmax_local, maxDifferenceBetweenReads, maxEmissionMatrixDifference, run_fb_subset, run_fb_grid_offset, grid, snp_start_1_based, snp_end_1_based, generate_fb_snp_offsets, suppressOutput, n_gibbs_starts, n_gibbs_sample_its, n_gibbs_burn_in_its, use_starting_read_labels, verbose, double_list_of_starting_read_labels, seed_vector, prev_list_of_alphaBetaBlocks, i_snp_block_for_alpha_beta, haploid_gibbs_equal_weighting, gibbs_initialize_iteratively, gibbs_initialize_at_first_read, do_block_resampling, artificial_relabel, pass_in_alphaBeta, update_in_place, update_hapSum, record_read_set, class_sum_cutoff, perform_block_gibbs, shuffle_bin_radius, block_gibbs_iterations, rescale_eMatRead_t, use_smooth_cm_in_block_gibbs, block_gibbs_quantile_prob, do_shard_ff0_block_gibbs, use_small_eHapsCurrent_tc)
rcpp_forwardBackwardGibbsNIPT <- function(sampleReads, priorCurrent_m, alphaMatCurrent_tc, eHapsCurrent_tc, transMatRate_tc_H, ff, blocks_for_output, alphaHat_t1, betaHat_t1, alphaHat_t2, betaHat_t2, alphaHat_t3, betaHat_t3, eMatGrid_t1, eMatGrid_t2, eMatGrid_t3, gammaMT_t_local, gammaMU_t_local, gammaP_t_local, hapSum_tc, hapMatcher, distinctHapsIE, rhb_t, ref_error, which_haps_to_use, wif0, grid_has_read, L_grid, smooth_cm, param_list, Jmax_local = 100L, maxDifferenceBetweenReads = 1000, maxEmissionMatrixDifference = 10000000000, run_fb_grid_offset = 0L, grid = 0L, snp_start_1_based = -1L, snp_end_1_based = -1L, generate_fb_snp_offsets = FALSE, suppressOutput = 1L, n_gibbs_starts = 1L, n_gibbs_sample_its = 1L, n_gibbs_burn_in_its = 1L, double_list_of_starting_read_labels = NULL, seed_vector = -1L, prev_list_of_alphaBetaBlocks = NULL, i_snp_block_for_alpha_beta = 1L, haploid_gibbs_equal_weighting = TRUE, gibbs_initialize_iteratively = FALSE, gibbs_initialize_at_first_read = TRUE, do_block_resampling = FALSE, artificial_relabel = -1L, pass_in_alphaBeta = FALSE, update_in_place = FALSE, update_hapSum = FALSE, record_read_set = FALSE, class_sum_cutoff = 0.06, perform_block_gibbs = FALSE, shuffle_bin_radius = 5000L, block_gibbs_iterations = as.integer( c(0)), rescale_eMatRead_t = TRUE, use_smooth_cm_in_block_gibbs = FALSE, block_gibbs_quantile_prob = 0.9, do_shard_ff0_block_gibbs = TRUE, use_small_eHapsCurrent_tc = TRUE) {
.Call('_QUILT_rcpp_forwardBackwardGibbsNIPT', PACKAGE = 'QUILT', sampleReads, priorCurrent_m, alphaMatCurrent_tc, eHapsCurrent_tc, transMatRate_tc_H, ff, blocks_for_output, alphaHat_t1, betaHat_t1, alphaHat_t2, betaHat_t2, alphaHat_t3, betaHat_t3, eMatGrid_t1, eMatGrid_t2, eMatGrid_t3, gammaMT_t_local, gammaMU_t_local, gammaP_t_local, hapSum_tc, hapMatcher, distinctHapsIE, rhb_t, ref_error, which_haps_to_use, wif0, grid_has_read, L_grid, smooth_cm, param_list, Jmax_local, maxDifferenceBetweenReads, maxEmissionMatrixDifference, run_fb_grid_offset, grid, snp_start_1_based, snp_end_1_based, generate_fb_snp_offsets, suppressOutput, n_gibbs_starts, n_gibbs_sample_its, n_gibbs_burn_in_its, double_list_of_starting_read_labels, seed_vector, prev_list_of_alphaBetaBlocks, i_snp_block_for_alpha_beta, haploid_gibbs_equal_weighting, gibbs_initialize_iteratively, gibbs_initialize_at_first_read, do_block_resampling, artificial_relabel, pass_in_alphaBeta, update_in_place, update_hapSum, record_read_set, class_sum_cutoff, perform_block_gibbs, shuffle_bin_radius, block_gibbs_iterations, rescale_eMatRead_t, use_smooth_cm_in_block_gibbs, block_gibbs_quantile_prob, do_shard_ff0_block_gibbs, use_small_eHapsCurrent_tc)
}

#' @export
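
A note on the regenerated wrapper above: run_fb_subset, use_starting_read_labels and verbose no longer appear as arguments (they now travel inside param_list), and the caller instead supplies pre-allocated working matrices (eMatGrid_t1-3 and the gammaMT/gammaMU/gammaP locals) that the C++ side can fill in place via pass_in_alphaBeta / update_in_place. A rough caller-side sketch of that allocate-once pattern, with illustrative sizes and a stand-in worker rather than the real rcpp_forwardBackwardGibbsNIPT:

K <- 400        ## haplotypes carried per sample (illustrative)
nGrids <- 3000  ## 32-SNP grids in the region (illustrative)

alphaHat_t1     <- array(0, c(K, nGrids))
betaHat_t1      <- array(0, c(K, nGrids))
eMatGrid_t1     <- array(0, c(K, nGrids))
gammaMT_t_local <- array(0, c(K, nGrids))

## stand-in for the Rcpp worker, which receives these matrices by reference
## and overwrites them; here we only check shapes so the sketch runs in plain R
run_one_sample <- function(alphaHat_t1, betaHat_t1, eMatGrid_t1, gammaMT_t_local) {
    stopifnot(identical(dim(alphaHat_t1), dim(eMatGrid_t1)))
    invisible(NULL)
}

for (iSample in 1:5) {
    ## the same buffers are reused, so no fresh K x nGrids allocations per sample
    run_one_sample(alphaHat_t1, betaHat_t1, eMatGrid_t1, gammaMT_t_local)
}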
78 changes: 35 additions & 43 deletions QUILT/R/functions.R
@@ -467,10 +467,16 @@ get_and_impute_one_sample <- function(
small_transMatRate_tc_H,
alphaHat_t1,
betaHat_t1,
eMatGrid_t1,
alphaHat_t2,
betaHat_t2,
eMatGrid_t2,
alphaHat_t3,
betaHat_t3,
eMatGrid_t3,
gammaMT_t_local,
gammaMU_t_local,
gammaP_t_local,
small_alphaMatCurrent_tc,
small_priorCurrent_m,
small_eHapsCurrent_tc,
@@ -811,10 +817,16 @@ get_and_impute_one_sample <- function(
small_transMatRate_tc_H = small_transMatRate_tc_H,
alphaHat_t1 = alphaHat_t1,
betaHat_t1 = betaHat_t1,
eMatGrid_t1 = eMatGrid_t1,
alphaHat_t2 = alphaHat_t2,
betaHat_t2 = betaHat_t2,
eMatGrid_t2 = eMatGrid_t2,
alphaHat_t3 = alphaHat_t3,
betaHat_t3 = betaHat_t3,
eMatGrid_t3 = eMatGrid_t3,
gammaMT_t_local = gammaMT_t_local,
gammaMU_t_local = gammaMU_t_local,
gammaP_t_local = gammaP_t_local,
small_alphaMatCurrent_tc = small_alphaMatCurrent_tc,
small_priorCurrent_m = small_priorCurrent_m,
smooth_cm = smooth_cm,
@@ -846,7 +858,7 @@ get_and_impute_one_sample <- function(
return_extra = FALSE,
return_genProbs = return_genProbs,
return_hapProbs = return_hapProbs,
return_gibbs_block_output = TRUE,
return_gibbs_block_output = FALSE,
gibbs_initialize_iteratively = gibbs_initialize_iteratively,
gibbs_initialize_at_first_read = FALSE,
maxDifferenceBetweenReads = maxDifferenceBetweenReads,
@@ -962,7 +974,7 @@ get_and_impute_one_sample <- function(

if (have_truth_haplotypes) {
w <- i_it + n_seek_its * (i_gibbs_sample - 1)
x <- calculate_pse_and_r2_during_gibbs(inRegion2 = inRegion2, hap1 = hap1, hap2 = hap2, truth_haps = truth_haps, af = af, verbose = FALSE)
x <- calculate_pse_and_r2_during_gibbs(inRegion2 = inRegion2, hap1 = hap1, hap2 = hap2, truth_haps = truth_haps, af = af, verbose = verbose)
pse_mat[w, ] <- c(i_gibbs_sample, i_it, as.integer(phasing_it), x)
}
}
@@ -1858,10 +1870,16 @@ impute_one_sample <- function(
small_transMatRate_tc_H,
alphaHat_t1,
betaHat_t1,
eMatGrid_t1,
alphaHat_t2,
betaHat_t2,
eMatGrid_t2,
alphaHat_t3,
betaHat_t3,
eMatGrid_t3,
gammaMT_t_local,
gammaMU_t_local,
gammaP_t_local,
small_alphaMatCurrent_tc,
small_priorCurrent_m,
smooth_cm,
@@ -1916,42 +1934,6 @@ impute_one_sample <- function(
##
K <- length(which_haps_to_use)
S <- 1

## print("TEMPORARY")
## print(paste0("/dev/shm/rwdavies/", sample_name, ".RData"))
## save(
## rhb_t,
## small_eHapsCurrent_tc,
## which_haps_to_use,
## nSNPs,
## ref_error,
## maxDifferenceBetweenReads,
## Jmax,
## hapMatcher,
## grid,
## distinctHapsIE,
## sampleReads,
## file = paste0("/dev/shm/rwdavies/", sample_name, ".RData"),
## compress = FALSE
## )
## stop("WER")
## load("~/temp.RData")
## temp_compare_two_versions(
## rhb_t,
## small_eHapsCurrent_tc,
## which_haps_to_use,
## nSNPs,
## ref_error,
## maxDifferenceBetweenReads,
## Jmax,
## hapMatcher,
## grid,
## distinctHapsIE,
## sampleReads
## )
## print("TEMPORARY")

## argh
## print(paste0("start = ", Sys.time()))
if (use_small_eHapsCurrent_tc) {
inflate_fhb_t_in_place(
@@ -1977,7 +1959,10 @@
## return_hapProbs = TRUE,
## return_gamma = TRUE,
## return_gibbs_block_output = FALSE,
## return_advanced_gibbs_block_output = FALSE
## return_advanced_gibbs_block_output = FALSE,
## use_starting_read_labels = FALSE,
## verbose = FALSE,
## run_fb_subset = FALSE
## )
param_list <- list(
return_alpha = FALSE,
@@ -1987,8 +1972,12 @@
return_hapProbs = return_hapProbs,
return_p_store = return_p_store,
return_gibbs_block_output = return_gibbs_block_output,
return_advanced_gibbs_block_output = return_advanced_gibbs_block_output
return_advanced_gibbs_block_output = return_advanced_gibbs_block_output,
use_starting_read_labels = TRUE,
verbose = verbose,
run_fb_subset = FALSE
)
##
out <- rcpp_forwardBackwardGibbsNIPT(
sampleReads = sampleReads,
priorCurrent_m = small_priorCurrent_m,
@@ -2004,7 +1993,6 @@
maxDifferenceBetweenReads = maxDifferenceBetweenReads,
Jmax = Jmax,
maxEmissionMatrixDifference = maxEmissionMatrixDifference,
run_fb_subset = FALSE,
run_fb_grid_offset = FALSE,
blocks_for_output = array(0, c(1, 1)),
grid = grid,
@@ -2015,20 +2003,24 @@
betaHat_t1 = betaHat_t1,
betaHat_t2 = betaHat_t2,
betaHat_t3 = betaHat_t3,
eMatGrid_t1 = eMatGrid_t1,
eMatGrid_t2 = eMatGrid_t2,
eMatGrid_t3 = eMatGrid_t3,
gammaMT_t_local = gammaMT_t_local,
gammaMU_t_local = gammaMU_t_local,
gammaP_t_local = gammaP_t_local,
hapSum_tc = array(0, c(1, 1, 1)),
snp_start_1_based = -1,
snp_end_1_based = -1,
generate_fb_snp_offsets = FALSE,
suppressOutput = suppressOutput,
n_gibbs_burn_in_its = n_gibbs_burn_in_its,
n_gibbs_sample_its = n_gibbs_sample_its,
use_starting_read_labels = TRUE,
n_gibbs_starts = n_gibbs_starts,
double_list_of_starting_read_labels = double_list_of_starting_read_labels,
prev_list_of_alphaBetaBlocks = as.list(c(1, 2)),
i_snp_block_for_alpha_beta = -1,
haploid_gibbs_equal_weighting = TRUE,
verbose = verbose,
gibbs_initialize_iteratively = gibbs_initialize_iteratively,
gibbs_initialize_at_first_read = gibbs_initialize_at_first_read, ## experiment with
do_block_resampling = FALSE, ## turn off for now
80 changes: 0 additions & 80 deletions QUILT/R/gibbs-small.R
@@ -70,83 +70,3 @@ R_make_eMatRead_t_for_gibbs_using_objects <- function(



temp_compare_two_versions <- function(
rhb_t,
small_eHapsCurrent_tc,
which_haps_to_use,
nSNPs,
ref_error,
maxDifferenceBetweenReads,
Jmax,
hapMatcher,
grid,
distinctHapsIE,
sampleReads
) {

K <- length(which_haps_to_use)
rescale_eMatRead_t <- TRUE
nReads <- length(sampleReads)

f1 <- function() {
##
S <- 1
## argh
## print(paste0("start = ", Sys.time()))
inflate_fhb_t_in_place(
rhb_t,
small_eHapsCurrent_tc,
haps_to_get = which_haps_to_use - 1,
nSNPs = nSNPs,
ref_error = ref_error
)
eMatRead_t_old <- array(1, c(K, nReads))
rcpp_make_eMatRead_t(
eMatRead_t = eMatRead_t_old,
sampleReads = sampleReads,
eHapsCurrent_tc = small_eHapsCurrent_tc,
s = 0,
maxDifferenceBetweenReads = maxDifferenceBetweenReads,
Jmax = Jmax,
eMatHapOri_t = matrix(0, nrow = 0, ncol = 0),
pRgivenH1 = vector(),
pRgivenH2 = vector(),
prev = 1,
suppressOutput = 1,
prev_section = "N/A",
next_section = "N/A",
run_pseudo_haploid = FALSE,
rescale_eMatRead_t = rescale_eMatRead_t
)
eMatRead_t_old
}


f2 <- function() {
eMatRead_t_new <- array(1, c(K, nReads))
Rcpp_make_eMatRead_t_for_gibbs_using_objects(
eMatRead_t = eMatRead_t_new,
sampleReads = sampleReads,
hapMatcher = hapMatcher,
grid = grid,
rhb_t = rhb_t,
distinctHapsIE = distinctHapsIE,
ref_error = ref_error,
which_haps_to_use = which_haps_to_use,
rescale_eMatRead_t = rescale_eMatRead_t,
Jmax = Jmax,
maxDifferenceBetweenReads = maxDifferenceBetweenReads
)
eMatRead_t_new
}

library("testthat")
expect_equal(f1(), f2())

library("microbenchmark")
## can be slower (on tall data, HRC sized)
## so can be faster (on wide data)
print(microbenchmark(f1(), f2(), times = 2))
## so def slower. hmm

}
21 changes: 18 additions & 3 deletions QUILT/R/quilt.R
@@ -133,7 +133,7 @@ QUILT <- function(
block_gibbs_iterations = c(3,6,9),
n_gibbs_burn_in_its = 20,
plot_per_sample_likelihoods = FALSE,
use_small_eHapsCurrent_tc = TRUE
use_small_eHapsCurrent_tc = FALSE
) {


@@ -575,10 +575,17 @@
S <- 1
alphaHat_t1 <- array(0, c(K, nGrids))
betaHat_t1 <- array(0, c(K, nGrids))
eMatGrid_t1 <- array(0, c(K, nGrids))
alphaHat_t2 <- array(0, c(K, nGrids))
betaHat_t2 <- array(0, c(K, nGrids))
eMatGrid_t2 <- array(0, c(K, nGrids))
alphaHat_t3 <- array(0, c(K, nGrids))
betaHat_t3 <- array(0, c(K, nGrids))
eMatGrid_t3 <- array(0, c(K, nGrids))
gammaMT_t_local <- array(0, c(K, nGrids))
gammaMU_t_local <- array(0, c(K, nGrids))
gammaP_t_local <- array(0, c(K, nGrids))
##
small_priorCurrent_m <- array(1 / K, c(K, S))
small_alphaMatCurrent_tc <- array(1 / K, c(K, nGrids - 1, S))
if (use_small_eHapsCurrent_tc) {
@@ -587,6 +594,9 @@
small_eHapsCurrent_tc <- array(0, c(1, 1, 1))
}




results_across_samples <- as.list(sampleRange[2] - sampleRange[1] + 1)

for(iSample in sampleRange[1]:sampleRange[2]) {
@@ -606,10 +616,16 @@
small_transMatRate_tc_H = small_transMatRate_tc_H,
alphaHat_t1 = alphaHat_t1,
betaHat_t1 = betaHat_t1,
eMatGrid_t1 = eMatGrid_t1,
alphaHat_t2 = alphaHat_t2,
betaHat_t2 = betaHat_t2,
eMatGrid_t2 = eMatGrid_t2,
alphaHat_t3 = alphaHat_t3,
betaHat_t3 = betaHat_t3,
eMatGrid_t3 = eMatGrid_t3,
gammaMT_t_local = gammaMT_t_local,
gammaMU_t_local = gammaMU_t_local,
gammaP_t_local = gammaP_t_local,
small_alphaMatCurrent_tc = small_alphaMatCurrent_tc,
small_priorCurrent_m = small_priorCurrent_m,
small_eHapsCurrent_tc = small_eHapsCurrent_tc,
@@ -683,12 +699,11 @@
}

## optionally, do some gc here, if longer running job
if (as.numeric(K) * as.numeric(nSNPs) > (1e8)) {
if (as.numeric(K) * as.numeric(nSNPs) > (1e6)) {
for(i in 1:5) {
gc(reset = TRUE)
}
}


}

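Some illustrative arithmetic on why the new use_small_eHapsCurrent_tc = FALSE default and the reused K x nGrids buffers above reduce RAM. The numbers below are made up, and assume the inflated small_eHapsCurrent_tc would be a K x nSNPs double array; they are not measurements from this commit:

K      <- 1000    ## haplotypes used per sample (illustrative)
nSNPs  <- 500000  ## SNPs in the imputation region (illustrative)
nGrids <- ceiling(nSNPs / 32)

## inflating small_eHapsCurrent_tc costs roughly one double per haplotype per SNP
eHaps_gb <- K * nSNPs * 8 / 1e9        ## about 4 GB, skipped when the flag is FALSE

## the twelve K x nGrids working matrices are allocated once and reused across samples
work_gb <- 12 * K * nGrids * 8 / 1e9   ## about 1.5 GB in total

## the relaxed trigger above now runs gc() after each sample even for modest problems
as.numeric(K) * as.numeric(nSNPs) > 1e6

c(eHaps_gb = eHaps_gb, work_gb = work_gb)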
16 changes: 14 additions & 2 deletions QUILT/R/reference-single.R
@@ -413,7 +413,19 @@ make_rhb_t_equality <- function(
##
if (infer_nMaxDH) {
running_count <- cumsum(rowSums(temp_counter) / (nrow(rhb_t) * nGrids))
suggested_value <- which.max(running_count > 0.99)
## really need to tune this better
## basically, larger K, important to set large
if (K > 50000) {
thresh <- 0.9999
} else if (K > 10000) {
thresh <- 0.9995
} else if (K > 1000) {
thresh <- 0.999
} else {
thresh <- 0.99
}
## really want to almost never need this, within reason, for large K
suggested_value <- which.max(running_count > thresh)
nMaxDH <- min(
max(c(2 ** 4 - 1, suggested_value)),
nMaxDH_default
@@ -422,7 +434,7 @@
print_message(paste0("Using nMaxDH = ", nMaxDH))
}
distinctHapsB <- distinctHapsB[1:nMaxDH, ]
hapMatcher[hapMatcher > (nMaxDH)] <- 0
hapMatcher[hapMatcher > (nMaxDH)] <- 0L
}
##
## inflate them too, they're pretty small
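
For clarity, the new K-dependent cutoff can be read as the standalone helper below; the thresholds mirror the diff above, while the toy coverage curve and the nMaxDH_default value of 255 are made up for illustration:

choose_nMaxDH <- function(running_count, K, nMaxDH_default = 255) {
    ## larger reference panels get a stricter coverage threshold
    thresh <- if (K > 50000) {
        0.9999
    } else if (K > 10000) {
        0.9995
    } else if (K > 1000) {
        0.999
    } else {
        0.99
    }
    suggested_value <- which.max(running_count > thresh)
    min(max(2^4 - 1, suggested_value), nMaxDH_default)
}

## toy cumulative fraction of haplotype-grid cells matched by the top j distinct haplotypes
running_count <- 1 - 0.7^(1:60)
choose_nMaxDH(running_count, K = 500)    ## 0.99 cutoff, returns the 2^4 - 1 floor (15)
choose_nMaxDH(running_count, K = 60000)  ## 0.9999 cutoff, returns 26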