diff --git a/DESCRIPTION b/DESCRIPTION index 69faffb..cb60ba9 100755 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: SimBu Title: Simulate Bulk RNA-seq Datasets from Single-Cell Datasets -Version: 1.5.2 +Version: 1.5.3 Authors@R: person(given = "Alexander", family = "Dietrich", diff --git a/NEWS b/NEWS index 671b03d..5ab3a9b 100755 --- a/NEWS +++ b/NEWS @@ -1,3 +1,6 @@ +Changes in version 1.5.2 (2023-11-21) ++ Added 'seed' parameter to simulation + Changes in version 1.5.2 (2023-11-08) + Updated SimBu to be compatibel with Seurat v5 diff --git a/R/simulator.R b/R/simulator.R index 997e151..c129d74 100755 --- a/R/simulator.R +++ b/R/simulator.R @@ -38,6 +38,7 @@ NULL #' @param remove_bias_in_counts boolean; if TRUE (default) the internal mRNA bias that is present in count data will be *removed* using the number of reads mapped to each cell #' @param remove_bias_in_counts_method 'read-number' (default) or 'gene-number'; method with which the mRNA bias in counts will be removed #' @param norm_counts boolean; if TRUE the samples simulated with counts will be normalized to CPMs, default is FALSE +#' @param seed numeric; fix this value if you want the same cells to be sampled #' #' @return returns two vectors (one based on counts, one based on tpm; depends on which matrices are present in data) with expression values for all genes in the provided dataset #' @@ -49,7 +50,8 @@ simulate_sample <- function(data, total_read_counts, remove_bias_in_counts, remove_bias_in_counts_method, - norm_counts) { + norm_counts, + seed) { if (!all(names(simulation_vector) %in% unique(SummarizedExperiment::colData(data)[["cell_type"]]))) { stop("Some cell-types in the provided simulation vector are not in the annotation.") } @@ -72,6 +74,10 @@ simulate_sample <- function(data, if (n == 0) { n <- 1 } + if(!is.na(seed)){ + # fix seed for random selection of cells + set.seed(seed) + } cells <- dplyr::slice_sample(cells_of_type_x, n = n, replace = TRUE) cells <- cells[["cell_ID"]] } @@ -169,6 +175,7 @@ simulate_sample <- function(data, #' @param total_read_counts numeric; sets the total read count value for each sample #' @param whitelist list; give a list of cell-types you want to keep for the simulation; if NULL, all are used #' @param blacklist list; give a list of cell-types you want to remove for the simulation; if NULL, all are used; is applied after whitelist +#' @param seed numeric; specifiy a seed for RNG. This effects cell sampling; with a fixed seed you will always sample the same cells for each sample (seed value is incrased by 1 for each sample). Default = NA (two simulation runs will sample different cells). #' @param BPPARAM BiocParallel::bpparam() by default; if specific number of threads x want to be used, insert: BiocParallel::MulticoreParam(workers = x) #' @param run_parallel boolean, decide if multi-threaded calculation will be run. FALSE by default #' @@ -270,6 +277,7 @@ simulate_bulk <- function(data, total_read_counts = NULL, whitelist = NULL, blacklist = NULL, + seed = NA, BPPARAM = BiocParallel::bpparam(), run_parallel = FALSE) { # switch multi-threading on/off @@ -428,19 +436,22 @@ simulate_bulk <- function(data, ##### generate the samples ##### # sample cells and generate pseudo-bulk profiles - all_samples <- BiocParallel::bplapply(simulation_vector_list, function(x) { - samples <- simulate_sample( + all_samples <- BiocParallel::bplapply(seq_along(simulation_vector_list), function(i) { + + simulation_vector <- simulation_vector_list[[i]] + sample <- simulate_sample( data = data, scaling_vector = scaling_vector, - simulation_vector = x, + simulation_vector = simulation_vector, total_cells = ncells, total_read_counts = total_read_counts, remove_bias_in_counts = remove_bias_in_counts, remove_bias_in_counts_method = remove_bias_in_counts_method, - norm_counts = norm_counts + norm_counts = norm_counts, + seed = ifelse(is.na(seed), seed, seed + i) # ensure that samples still are different if seed is set ) - return(samples) + return(sample) }, BPPARAM = BPPARAM) bulk_counts <- Matrix::Matrix(vapply( diff --git a/man/simulate_bulk.Rd b/man/simulate_bulk.Rd index 69723f5..41576b3 100755 --- a/man/simulate_bulk.Rd +++ b/man/simulate_bulk.Rd @@ -24,6 +24,7 @@ simulate_bulk( total_read_counts = NULL, whitelist = NULL, blacklist = NULL, + seed = NA, BPPARAM = BiocParallel::bpparam(), run_parallel = FALSE ) @@ -65,6 +66,8 @@ simulate_bulk( \item{blacklist}{list; give a list of cell-types you want to remove for the simulation; if NULL, all are used; is applied after whitelist} +\item{seed}{numeric; specifiy a seed for RNG. This effects cell sampling; with a fixed seed you will always sample the same cells for each sample (seed value is incrased by 1 for each sample). Default = NA (two simulation runs will sample different cells).} + \item{BPPARAM}{BiocParallel::bpparam() by default; if specific number of threads x want to be used, insert: BiocParallel::MulticoreParam(workers = x)} \item{run_parallel}{boolean, decide if multi-threaded calculation will be run. FALSE by default} diff --git a/man/simulate_sample.Rd b/man/simulate_sample.Rd index 20724f9..1ad32b5 100755 --- a/man/simulate_sample.Rd +++ b/man/simulate_sample.Rd @@ -12,7 +12,8 @@ simulate_sample( total_read_counts, remove_bias_in_counts, remove_bias_in_counts_method, - norm_counts + norm_counts, + seed ) } \arguments{ @@ -31,6 +32,8 @@ simulate_sample( \item{remove_bias_in_counts_method}{'read-number' (default) or 'gene-number'; method with which the mRNA bias in counts will be removed} \item{norm_counts}{boolean; if TRUE the samples simulated with counts will be normalized to CPMs, default is FALSE} + +\item{seed}{numeric; fix this value if you want the same cells to be sampled} } \value{ returns two vectors (one based on counts, one based on tpm; depends on which matrices are present in data) with expression values for all genes in the provided dataset diff --git a/tests/testthat/test_simulator.R b/tests/testthat/test_simulator.R index aad63fe..d7283c2 100755 --- a/tests/testthat/test_simulator.R +++ b/tests/testthat/test_simulator.R @@ -39,8 +39,25 @@ test_that("can generate different simulation scenarios & check multi threads", { testthat::expect_s4_class(SimBu::simulate_bulk(data = dataset, scenario = "weighted", weighted_cell_type = "T cells CD4", weighted_amount = .5, scaling_factor = "NONE", nsamples = 10, ncells = 100, run_parallel = FALSE)[["bulk"]], "SummarizedExperiment") testthat::expect_s4_class(SimBu::simulate_bulk(data = dataset, scenario = "even", scaling_factor = "NONE", nsamples = 10, ncells = 100, BPPARAM = BiocParallel::MulticoreParam(workers = 2), run_parallel = FALSE)[["bulk"]], "SummarizedExperiment") + testthat::expect_s4_class(SimBu::simulate_bulk(data = dataset, scenario = "even", scaling_factor = "NONE", nsamples = 10, ncells = 100, total_read_counts = 100000, BPPARAM = BiocParallel::MulticoreParam(workers = 2), run_parallel = FALSE)[["bulk"]], "SummarizedExperiment") }) +test_that("test RNG", { + # use even scenario with no variance between samples + # with fixed seed, both simulations have exactly the same count values + seed <- 123 + sim1 <- SimBu::simulate_bulk(data = dataset, scenario = "even", scaling_factor = "NONE", balance_even_mirror_scenario = 0, nsamples = 10, ncells = 100, run_parallel = FALSE, seed = seed) + sim2 <- SimBu::simulate_bulk(data = dataset, scenario = "even", scaling_factor = "NONE", balance_even_mirror_scenario = 0, nsamples = 10, ncells = 100, run_parallel = FALSE, seed = seed) + x1 <- rowSums(assays(sim1$bulk)[['bulk_counts']]) + x2 <- rowSums(assays(sim2$bulk)[['bulk_counts']]) + testthat::expect_equal(x1,x2) + + # test that samples inside one simulation still are different + sample1 <- sim1$bulk[,1] + sample2 <- sim1$bulk[,2] + testthat::expect_false(all(assays(sample1)[['bulk_counts']] == assays(sample2)[['bulk_counts']])) + +}) test_that("test different scaling factor calculations + mRNA bias removal from counts", { testthat::expect_s4_class(SimBu::simulate_bulk(data = dataset, scenario = "random", scaling_factor = "census", nsamples = 10, ncells = 100, run_parallel = FALSE)[["bulk"]], "SummarizedExperiment")