0.1.7 speedups and RAM decrease
rwdavies committed Apr 22, 2021
1 parent cd684f0 commit 6bd7b0b
Showing 15 changed files with 431 additions and 192 deletions.
2 changes: 2 additions & 0 deletions CHANGELOG.md
@@ -1,3 +1,5 @@
* v0.1.7
* Speedups and RAM decrease when using reference panels with fewer haplotypes
* v0.1.6
* Discovered bug where only 1 read caused an error. Fix (obviate) this by requiring 2 or more reads through parameter minimum_number_of_sample_reads with default currently 2, where samples with fewer than 2 reads have all missing output
* v0.1.5
6 changes: 3 additions & 3 deletions QUILT.R
@@ -388,9 +388,9 @@ option_list <- list(
),
make_option(
"--use_small_eHapsCurrent_tc",
type = "integer",
help = "For testing purposes only [default TRUE] ",
default = TRUE
type = "logical",
help = "For testing purposes only [default FALSE] ",
default = FALSE
)
)
opt <- suppressWarnings(parse_args(OptionParser(option_list = option_list)))
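
An aside on the option change above: with type = "logical", optparse stores a supplied TRUE/FALSE value as a logical and otherwise falls back on the new FALSE default. A minimal standalone sketch (the one-option list here is illustrative, not the full QUILT.R option list):

library("optparse")

opts <- list(
    make_option(
        "--use_small_eHapsCurrent_tc",
        type = "logical",
        help = "For testing purposes only [default FALSE] ",
        default = FALSE
    )
)

## no flag given: the FALSE default is returned
parse_args(OptionParser(option_list = opts))$use_small_eHapsCurrent_tc

## flag given as a string: coerced to logical TRUE
parse_args(
    OptionParser(option_list = opts),
    args = c("--use_small_eHapsCurrent_tc=TRUE")
)$use_small_eHapsCurrent_tc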
4 changes: 2 additions & 2 deletions QUILT/DESCRIPTION
@@ -1,8 +1,8 @@
Package: QUILT
Type: Package
Title: QUILT
Version: 0.1.6
Date: 2021-04-09
Version: 0.1.7
Date: 2021-04-22
Author: Robert William Davies
Maintainer: Robert William Davies <[email protected]>
Description: QUILT performs imputation of individuals sequenced to low coverage
4 changes: 2 additions & 2 deletions QUILT/R/RcppExports.R
@@ -182,8 +182,8 @@ rcpp_fly_weighter <- function(i, gCurrent, gAverage, ll_current, log_mult, ll_re
}

#' @export
rcpp_forwardBackwardGibbsNIPT <- function(sampleReads, priorCurrent_m, alphaMatCurrent_tc, eHapsCurrent_tc, transMatRate_tc_H, ff, blocks_for_output, alphaHat_t1, betaHat_t1, alphaHat_t2, betaHat_t2, alphaHat_t3, betaHat_t3, hapSum_tc, hapMatcher, distinctHapsIE, rhb_t, ref_error, which_haps_to_use, wif0, grid_has_read, L_grid, smooth_cm, param_list, Jmax_local = 100L, maxDifferenceBetweenReads = 1000, maxEmissionMatrixDifference = 10000000000, run_fb_subset = FALSE, run_fb_grid_offset = 0L, grid = 0L, snp_start_1_based = -1L, snp_end_1_based = -1L, generate_fb_snp_offsets = FALSE, suppressOutput = 1L, n_gibbs_starts = 1L, n_gibbs_sample_its = 1L, n_gibbs_burn_in_its = 1L, use_starting_read_labels = FALSE, verbose = FALSE, double_list_of_starting_read_labels = NULL, seed_vector = -1L, prev_list_of_alphaBetaBlocks = NULL, i_snp_block_for_alpha_beta = 1L, haploid_gibbs_equal_weighting = TRUE, gibbs_initialize_iteratively = FALSE, gibbs_initialize_at_first_read = TRUE, do_block_resampling = FALSE, artificial_relabel = -1L, pass_in_alphaBeta = FALSE, update_in_place = FALSE, update_hapSum = FALSE, record_read_set = FALSE, class_sum_cutoff = 0.06, perform_block_gibbs = FALSE, shuffle_bin_radius = 5000L, block_gibbs_iterations = as.integer( c(0)), rescale_eMatRead_t = TRUE, use_smooth_cm_in_block_gibbs = FALSE, block_gibbs_quantile_prob = 0.9, do_shard_ff0_block_gibbs = TRUE, use_small_eHapsCurrent_tc = TRUE) {
.Call('_QUILT_rcpp_forwardBackwardGibbsNIPT', PACKAGE = 'QUILT', sampleReads, priorCurrent_m, alphaMatCurrent_tc, eHapsCurrent_tc, transMatRate_tc_H, ff, blocks_for_output, alphaHat_t1, betaHat_t1, alphaHat_t2, betaHat_t2, alphaHat_t3, betaHat_t3, hapSum_tc, hapMatcher, distinctHapsIE, rhb_t, ref_error, which_haps_to_use, wif0, grid_has_read, L_grid, smooth_cm, param_list, Jmax_local, maxDifferenceBetweenReads, maxEmissionMatrixDifference, run_fb_subset, run_fb_grid_offset, grid, snp_start_1_based, snp_end_1_based, generate_fb_snp_offsets, suppressOutput, n_gibbs_starts, n_gibbs_sample_its, n_gibbs_burn_in_its, use_starting_read_labels, verbose, double_list_of_starting_read_labels, seed_vector, prev_list_of_alphaBetaBlocks, i_snp_block_for_alpha_beta, haploid_gibbs_equal_weighting, gibbs_initialize_iteratively, gibbs_initialize_at_first_read, do_block_resampling, artificial_relabel, pass_in_alphaBeta, update_in_place, update_hapSum, record_read_set, class_sum_cutoff, perform_block_gibbs, shuffle_bin_radius, block_gibbs_iterations, rescale_eMatRead_t, use_smooth_cm_in_block_gibbs, block_gibbs_quantile_prob, do_shard_ff0_block_gibbs, use_small_eHapsCurrent_tc)
rcpp_forwardBackwardGibbsNIPT <- function(sampleReads, priorCurrent_m, alphaMatCurrent_tc, eHapsCurrent_tc, transMatRate_tc_H, ff, blocks_for_output, alphaHat_t1, betaHat_t1, alphaHat_t2, betaHat_t2, alphaHat_t3, betaHat_t3, eMatGrid_t1, eMatGrid_t2, eMatGrid_t3, gammaMT_t_local, gammaMU_t_local, gammaP_t_local, hapSum_tc, hapMatcher, distinctHapsIE, rhb_t, ref_error, which_haps_to_use, wif0, grid_has_read, L_grid, smooth_cm, param_list, Jmax_local = 100L, maxDifferenceBetweenReads = 1000, maxEmissionMatrixDifference = 10000000000, run_fb_grid_offset = 0L, grid = 0L, snp_start_1_based = -1L, snp_end_1_based = -1L, generate_fb_snp_offsets = FALSE, suppressOutput = 1L, n_gibbs_starts = 1L, n_gibbs_sample_its = 1L, n_gibbs_burn_in_its = 1L, double_list_of_starting_read_labels = NULL, seed_vector = -1L, prev_list_of_alphaBetaBlocks = NULL, i_snp_block_for_alpha_beta = 1L, haploid_gibbs_equal_weighting = TRUE, gibbs_initialize_iteratively = FALSE, gibbs_initialize_at_first_read = TRUE, do_block_resampling = FALSE, artificial_relabel = -1L, pass_in_alphaBeta = FALSE, update_in_place = FALSE, update_hapSum = FALSE, record_read_set = FALSE, class_sum_cutoff = 0.06, perform_block_gibbs = FALSE, shuffle_bin_radius = 5000L, block_gibbs_iterations = as.integer( c(0)), rescale_eMatRead_t = TRUE, use_smooth_cm_in_block_gibbs = FALSE, block_gibbs_quantile_prob = 0.9, do_shard_ff0_block_gibbs = TRUE, use_small_eHapsCurrent_tc = TRUE) {
.Call('_QUILT_rcpp_forwardBackwardGibbsNIPT', PACKAGE = 'QUILT', sampleReads, priorCurrent_m, alphaMatCurrent_tc, eHapsCurrent_tc, transMatRate_tc_H, ff, blocks_for_output, alphaHat_t1, betaHat_t1, alphaHat_t2, betaHat_t2, alphaHat_t3, betaHat_t3, eMatGrid_t1, eMatGrid_t2, eMatGrid_t3, gammaMT_t_local, gammaMU_t_local, gammaP_t_local, hapSum_tc, hapMatcher, distinctHapsIE, rhb_t, ref_error, which_haps_to_use, wif0, grid_has_read, L_grid, smooth_cm, param_list, Jmax_local, maxDifferenceBetweenReads, maxEmissionMatrixDifference, run_fb_grid_offset, grid, snp_start_1_based, snp_end_1_based, generate_fb_snp_offsets, suppressOutput, n_gibbs_starts, n_gibbs_sample_its, n_gibbs_burn_in_its, double_list_of_starting_read_labels, seed_vector, prev_list_of_alphaBetaBlocks, i_snp_block_for_alpha_beta, haploid_gibbs_equal_weighting, gibbs_initialize_iteratively, gibbs_initialize_at_first_read, do_block_resampling, artificial_relabel, pass_in_alphaBeta, update_in_place, update_hapSum, record_read_set, class_sum_cutoff, perform_block_gibbs, shuffle_bin_radius, block_gibbs_iterations, rescale_eMatRead_t, use_smooth_cm_in_block_gibbs, block_gibbs_quantile_prob, do_shard_ff0_block_gibbs, use_small_eHapsCurrent_tc)
}

#' @export
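
A note on the regenerated wrapper above: run_fb_subset, use_starting_read_labels and verbose no longer appear as arguments (they now travel inside param_list), and the caller instead supplies pre-allocated working matrices (eMatGrid_t1-3 and the gammaMT/gammaMU/gammaP locals) that the C++ side can fill in place via pass_in_alphaBeta / update_in_place. A rough caller-side sketch of that allocate-once pattern, with illustrative sizes and a stand-in worker rather than the real rcpp_forwardBackwardGibbsNIPT:

K <- 400        ## haplotypes carried per sample (illustrative)
nGrids <- 3000  ## 32-SNP grids in the region (illustrative)

alphaHat_t1     <- array(0, c(K, nGrids))
betaHat_t1      <- array(0, c(K, nGrids))
eMatGrid_t1     <- array(0, c(K, nGrids))
gammaMT_t_local <- array(0, c(K, nGrids))

## stand-in for the Rcpp worker, which receives these matrices by reference
## and overwrites them; here we only check shapes so the sketch runs in plain R
run_one_sample <- function(alphaHat_t1, betaHat_t1, eMatGrid_t1, gammaMT_t_local) {
    stopifnot(identical(dim(alphaHat_t1), dim(eMatGrid_t1)))
    invisible(NULL)
}

for (iSample in 1:5) {
    ## the same buffers are reused, so no fresh K x nGrids allocations per sample
    run_one_sample(alphaHat_t1, betaHat_t1, eMatGrid_t1, gammaMT_t_local)
}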
78 changes: 35 additions & 43 deletions QUILT/R/functions.R
@@ -467,10 +467,16 @@ get_and_impute_one_sample <- function(
small_transMatRate_tc_H,
alphaHat_t1,
betaHat_t1,
eMatGrid_t1,
alphaHat_t2,
betaHat_t2,
eMatGrid_t2,
alphaHat_t3,
betaHat_t3,
eMatGrid_t3,
gammaMT_t_local,
gammaMU_t_local,
gammaP_t_local,
small_alphaMatCurrent_tc,
small_priorCurrent_m,
small_eHapsCurrent_tc,
@@ -811,10 +817,16 @@ get_and_impute_one_sample <- function(
small_transMatRate_tc_H = small_transMatRate_tc_H,
alphaHat_t1 = alphaHat_t1,
betaHat_t1 = betaHat_t1,
eMatGrid_t1 = eMatGrid_t1,
alphaHat_t2 = alphaHat_t2,
betaHat_t2 = betaHat_t2,
eMatGrid_t2 = eMatGrid_t2,
alphaHat_t3 = alphaHat_t3,
betaHat_t3 = betaHat_t3,
eMatGrid_t3 = eMatGrid_t3,
gammaMT_t_local = gammaMT_t_local,
gammaMU_t_local = gammaMU_t_local,
gammaP_t_local = gammaP_t_local,
small_alphaMatCurrent_tc = small_alphaMatCurrent_tc,
small_priorCurrent_m = small_priorCurrent_m,
smooth_cm = smooth_cm,
@@ -846,7 +858,7 @@ get_and_impute_one_sample <- function(
return_extra = FALSE,
return_genProbs = return_genProbs,
return_hapProbs = return_hapProbs,
return_gibbs_block_output = TRUE,
return_gibbs_block_output = FALSE,
gibbs_initialize_iteratively = gibbs_initialize_iteratively,
gibbs_initialize_at_first_read = FALSE,
maxDifferenceBetweenReads = maxDifferenceBetweenReads,
@@ -962,7 +974,7 @@ get_and_impute_one_sample <- function(

if (have_truth_haplotypes) {
w <- i_it + n_seek_its * (i_gibbs_sample - 1)
x <- calculate_pse_and_r2_during_gibbs(inRegion2 = inRegion2, hap1 = hap1, hap2 = hap2, truth_haps = truth_haps, af = af, verbose = FALSE)
x <- calculate_pse_and_r2_during_gibbs(inRegion2 = inRegion2, hap1 = hap1, hap2 = hap2, truth_haps = truth_haps, af = af, verbose = verbose)
pse_mat[w, ] <- c(i_gibbs_sample, i_it, as.integer(phasing_it), x)
}
}
@@ -1858,10 +1870,16 @@ impute_one_sample <- function(
small_transMatRate_tc_H,
alphaHat_t1,
betaHat_t1,
eMatGrid_t1,
alphaHat_t2,
betaHat_t2,
eMatGrid_t2,
alphaHat_t3,
betaHat_t3,
eMatGrid_t3,
gammaMT_t_local,
gammaMU_t_local,
gammaP_t_local,
small_alphaMatCurrent_tc,
small_priorCurrent_m,
smooth_cm,
@@ -1916,42 +1934,6 @@ impute_one_sample <- function(
##
K <- length(which_haps_to_use)
S <- 1

## print("TEMPORARY")
## print(paste0("/dev/shm/rwdavies/", sample_name, ".RData"))
## save(
## rhb_t,
## small_eHapsCurrent_tc,
## which_haps_to_use,
## nSNPs,
## ref_error,
## maxDifferenceBetweenReads,
## Jmax,
## hapMatcher,
## grid,
## distinctHapsIE,
## sampleReads,
## file = paste0("/dev/shm/rwdavies/", sample_name, ".RData"),
## compress = FALSE
## )
## stop("WER")
## load("~/temp.RData")
## temp_compare_two_versions(
## rhb_t,
## small_eHapsCurrent_tc,
## which_haps_to_use,
## nSNPs,
## ref_error,
## maxDifferenceBetweenReads,
## Jmax,
## hapMatcher,
## grid,
## distinctHapsIE,
## sampleReads
## )
## print("TEMPORARY")

## argh
## print(paste0("start = ", Sys.time()))
if (use_small_eHapsCurrent_tc) {
inflate_fhb_t_in_place(
@@ -1977,7 +1959,10 @@
## return_hapProbs = TRUE,
## return_gamma = TRUE,
## return_gibbs_block_output = FALSE,
## return_advanced_gibbs_block_output = FALSE
## return_advanced_gibbs_block_output = FALSE,
## use_starting_read_labels = FALSE,
## verbose = FALSE,
## run_fb_subset = FALSE
## )
param_list <- list(
return_alpha = FALSE,
@@ -1987,8 +1972,12 @@
return_hapProbs = return_hapProbs,
return_p_store = return_p_store,
return_gibbs_block_output = return_gibbs_block_output,
return_advanced_gibbs_block_output = return_advanced_gibbs_block_output
return_advanced_gibbs_block_output = return_advanced_gibbs_block_output,
use_starting_read_labels = TRUE,
verbose = verbose,
run_fb_subset = FALSE
)
##
out <- rcpp_forwardBackwardGibbsNIPT(
sampleReads = sampleReads,
priorCurrent_m = small_priorCurrent_m,
@@ -2004,7 +1993,6 @@
maxDifferenceBetweenReads = maxDifferenceBetweenReads,
Jmax = Jmax,
maxEmissionMatrixDifference = maxEmissionMatrixDifference,
run_fb_subset = FALSE,
run_fb_grid_offset = FALSE,
blocks_for_output = array(0, c(1, 1)),
grid = grid,
@@ -2015,20 +2003,24 @@
betaHat_t1 = betaHat_t1,
betaHat_t2 = betaHat_t2,
betaHat_t3 = betaHat_t3,
eMatGrid_t1 = eMatGrid_t1,
eMatGrid_t2 = eMatGrid_t2,
eMatGrid_t3 = eMatGrid_t3,
gammaMT_t_local = gammaMT_t_local,
gammaMU_t_local = gammaMU_t_local,
gammaP_t_local = gammaP_t_local,
hapSum_tc = array(0, c(1, 1, 1)),
snp_start_1_based = -1,
snp_end_1_based = -1,
generate_fb_snp_offsets = FALSE,
suppressOutput = suppressOutput,
n_gibbs_burn_in_its = n_gibbs_burn_in_its,
n_gibbs_sample_its = n_gibbs_sample_its,
use_starting_read_labels = TRUE,
n_gibbs_starts = n_gibbs_starts,
double_list_of_starting_read_labels = double_list_of_starting_read_labels,
prev_list_of_alphaBetaBlocks = as.list(c(1, 2)),
i_snp_block_for_alpha_beta = -1,
haploid_gibbs_equal_weighting = TRUE,
verbose = verbose,
gibbs_initialize_iteratively = gibbs_initialize_iteratively,
gibbs_initialize_at_first_read = gibbs_initialize_at_first_read, ## experiment with
do_block_resampling = FALSE, ## turn off for now
80 changes: 0 additions & 80 deletions QUILT/R/gibbs-small.R
@@ -70,83 +70,3 @@ R_make_eMatRead_t_for_gibbs_using_objects <- function(



temp_compare_two_versions <- function(
rhb_t,
small_eHapsCurrent_tc,
which_haps_to_use,
nSNPs,
ref_error,
maxDifferenceBetweenReads,
Jmax,
hapMatcher,
grid,
distinctHapsIE,
sampleReads
) {

K <- length(which_haps_to_use)
rescale_eMatRead_t <- TRUE
nReads <- length(sampleReads)

f1 <- function() {
##
S <- 1
## argh
## print(paste0("start = ", Sys.time()))
inflate_fhb_t_in_place(
rhb_t,
small_eHapsCurrent_tc,
haps_to_get = which_haps_to_use - 1,
nSNPs = nSNPs,
ref_error = ref_error
)
eMatRead_t_old <- array(1, c(K, nReads))
rcpp_make_eMatRead_t(
eMatRead_t = eMatRead_t_old,
sampleReads = sampleReads,
eHapsCurrent_tc = small_eHapsCurrent_tc,
s = 0,
maxDifferenceBetweenReads = maxDifferenceBetweenReads,
Jmax = Jmax,
eMatHapOri_t = matrix(0, nrow = 0, ncol = 0),
pRgivenH1 = vector(),
pRgivenH2 = vector(),
prev = 1,
suppressOutput = 1,
prev_section = "N/A",
next_section = "N/A",
run_pseudo_haploid = FALSE,
rescale_eMatRead_t = rescale_eMatRead_t
)
eMatRead_t_old
}


f2 <- function() {
eMatRead_t_new <- array(1, c(K, nReads))
Rcpp_make_eMatRead_t_for_gibbs_using_objects(
eMatRead_t = eMatRead_t_new,
sampleReads = sampleReads,
hapMatcher = hapMatcher,
grid = grid,
rhb_t = rhb_t,
distinctHapsIE = distinctHapsIE,
ref_error = ref_error,
which_haps_to_use = which_haps_to_use,
rescale_eMatRead_t = rescale_eMatRead_t,
Jmax = Jmax,
maxDifferenceBetweenReads = maxDifferenceBetweenReads
)
eMatRead_t_new
}

library("testthat")
expect_equal(f1(), f2())

library("microbenchmark")
## can be slower (on tall data, HRC sized)
## so can be faster (on wide data)
print(microbenchmark(f1(), f2(), times = 2))
## so def slower. hmm

}
21 changes: 18 additions & 3 deletions QUILT/R/quilt.R
@@ -133,7 +133,7 @@ QUILT <- function(
block_gibbs_iterations = c(3,6,9),
n_gibbs_burn_in_its = 20,
plot_per_sample_likelihoods = FALSE,
use_small_eHapsCurrent_tc = TRUE
use_small_eHapsCurrent_tc = FALSE
) {


@@ -575,10 +575,17 @@
S <- 1
alphaHat_t1 <- array(0, c(K, nGrids))
betaHat_t1 <- array(0, c(K, nGrids))
eMatGrid_t1 <- array(0, c(K, nGrids))
alphaHat_t2 <- array(0, c(K, nGrids))
betaHat_t2 <- array(0, c(K, nGrids))
eMatGrid_t2 <- array(0, c(K, nGrids))
alphaHat_t3 <- array(0, c(K, nGrids))
betaHat_t3 <- array(0, c(K, nGrids))
eMatGrid_t3 <- array(0, c(K, nGrids))
gammaMT_t_local <- array(0, c(K, nGrids))
gammaMU_t_local <- array(0, c(K, nGrids))
gammaP_t_local <- array(0, c(K, nGrids))
##
small_priorCurrent_m <- array(1 / K, c(K, S))
small_alphaMatCurrent_tc <- array(1 / K, c(K, nGrids - 1, S))
if (use_small_eHapsCurrent_tc) {
@@ -587,6 +594,9 @@
small_eHapsCurrent_tc <- array(0, c(1, 1, 1))
}




results_across_samples <- as.list(sampleRange[2] - sampleRange[1] + 1)

for(iSample in sampleRange[1]:sampleRange[2]) {
@@ -606,10 +616,16 @@
small_transMatRate_tc_H = small_transMatRate_tc_H,
alphaHat_t1 = alphaHat_t1,
betaHat_t1 = betaHat_t1,
eMatGrid_t1 = eMatGrid_t1,
alphaHat_t2 = alphaHat_t2,
betaHat_t2 = betaHat_t2,
eMatGrid_t2 = eMatGrid_t2,
alphaHat_t3 = alphaHat_t3,
betaHat_t3 = betaHat_t3,
eMatGrid_t3 = eMatGrid_t3,
gammaMT_t_local = gammaMT_t_local,
gammaMU_t_local = gammaMU_t_local,
gammaP_t_local = gammaP_t_local,
small_alphaMatCurrent_tc = small_alphaMatCurrent_tc,
small_priorCurrent_m = small_priorCurrent_m,
small_eHapsCurrent_tc = small_eHapsCurrent_tc,
@@ -683,12 +699,11 @@
}

## optionally, do some gc here, if longer running job
if (as.numeric(K) * as.numeric(nSNPs) > (1e8)) {
if (as.numeric(K) * as.numeric(nSNPs) > (1e6)) {
for(i in 1:5) {
gc(reset = TRUE)
}
}


}

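Some illustrative arithmetic on why the new use_small_eHapsCurrent_tc = FALSE default and the reused K x nGrids buffers above reduce RAM. The numbers below are made up, and assume the inflated small_eHapsCurrent_tc would be a K x nSNPs double array; they are not measurements from this commit:

K      <- 1000    ## haplotypes used per sample (illustrative)
nSNPs  <- 500000  ## SNPs in the imputation region (illustrative)
nGrids <- ceiling(nSNPs / 32)

## inflating small_eHapsCurrent_tc costs roughly one double per haplotype per SNP
eHaps_gb <- K * nSNPs * 8 / 1e9        ## about 4 GB, skipped when the flag is FALSE

## the twelve K x nGrids working matrices are allocated once and reused across samples
work_gb <- 12 * K * nGrids * 8 / 1e9   ## about 1.5 GB in total

## the relaxed trigger above now runs gc() after each sample even for modest problems
as.numeric(K) * as.numeric(nSNPs) > 1e6

c(eHaps_gb = eHaps_gb, work_gb = work_gb)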
16 changes: 14 additions & 2 deletions QUILT/R/reference-single.R
@@ -413,7 +413,19 @@ make_rhb_t_equality <- function(
##
if (infer_nMaxDH) {
running_count <- cumsum(rowSums(temp_counter) / (nrow(rhb_t) * nGrids))
suggested_value <- which.max(running_count > 0.99)
## really need to tune this better
## basically, larger K, important to set large
if (K > 50000) {
thresh <- 0.9999
} else if (K > 10000) {
thresh <- 0.9995
} else if (K > 1000) {
thresh <- 0.999
} else {
thresh <- 0.99
}
## really want to almost never need this, within reason, for large K
suggested_value <- which.max(running_count > thresh)
nMaxDH <- min(
max(c(2 ** 4 - 1, suggested_value)),
nMaxDH_default
@@ -422,7 +434,7 @@
print_message(paste0("Using nMaxDH = ", nMaxDH))
}
distinctHapsB <- distinctHapsB[1:nMaxDH, ]
hapMatcher[hapMatcher > (nMaxDH)] <- 0
hapMatcher[hapMatcher > (nMaxDH)] <- 0L
}
##
## inflate them too, they're pretty small
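
For clarity, the new K-dependent cutoff can be read as the standalone helper below; the thresholds mirror the diff above, while the toy coverage curve and the nMaxDH_default value of 255 are made up for illustration:

choose_nMaxDH <- function(running_count, K, nMaxDH_default = 255) {
    ## larger reference panels get a stricter coverage threshold
    thresh <- if (K > 50000) {
        0.9999
    } else if (K > 10000) {
        0.9995
    } else if (K > 1000) {
        0.999
    } else {
        0.99
    }
    suggested_value <- which.max(running_count > thresh)
    min(max(2^4 - 1, suggested_value), nMaxDH_default)
}

## toy cumulative fraction of haplotype-grid cells matched by the top j distinct haplotypes
running_count <- 1 - 0.7^(1:60)
choose_nMaxDH(running_count, K = 500)    ## 0.99 cutoff, returns the 2^4 - 1 floor (15)
choose_nMaxDH(running_count, K = 60000)  ## 0.9999 cutoff, returns 26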