Merge pull request #315 from stemangiola/improve-scaling-robustness

use matrix multiplication
stemangiola · Sep 17, 2024 · 407a482 · 407a482
2 parents 9c9edcc + d27e6a0
commit 407a482
Show file tree

Hide file tree

Showing 5 changed files with 29 additions and 24 deletions.
diff --git a/R/functions_SE.R b/R/functions_SE.R
@@ -744,9 +744,9 @@ get_differential_transcript_abundance_bulk_SE <- function(.data,
 	# Replace `:` with ___ because it creates error with edgeR
 	if(design |> colnames() |> str_detect(":") |> any()) {
 	  message("tidybulk says: the interaction term `:` has been replaced with `___` in the design matrix, in order to work with edgeR.")
-	  colnames(design) = design |> colnames() |> str_replace(":", "___") 
+	  colnames(design) = design |> colnames() |> str_replace(":", "___")
 	}
-	
+
 	# Print the design column names in case I want contrasts
 	message(
 		sprintf(
@@ -1070,6 +1070,7 @@ get_differential_transcript_abundance_bulk_voom_SE <- function(.data,
 #' @param .contrasts A character vector. See edgeR makeContrasts specification for the parameter `contrasts`. If contrasts are not present the first covariate is the one the model is tested against (e.g., ~ factor_of_interest)
 #' @param method A string character. Either "edgeR_quasi_likelihood" (i.e., QLF), "edgeR_likelihood_ratio" (i.e., LRT)
 #' @param scaling_method A character string. The scaling method passed to the backend function (i.e., edgeR::calcNormFactors; "TMM","TMMwsp","RLE","upperquartile")
+#' @param .scaling_factor A tidyeval (column name) for the precalculated TMM scaling
 #' @param omit_contrast_in_colnames If just one contrast is specified you can choose to omit the contrast label in the colnames.
 #' @param ... Additional arguments for glmmSeq
 #'
@@ -1085,13 +1086,15 @@ get_differential_transcript_abundance_glmmSeq_SE <- function(.data,
                                                             test_above_log2_fold_change = NULL,
 
                                                             scaling_method = "TMM",
+                                                            .scaling_factor = NULL,
                                                             omit_contrast_in_colnames = FALSE,
                                                             prefix = "",
                                                             .dispersion = NULL,
                                                             ...) {
 
   .abundance = enquo(.abundance)
   .dispersion = enquo(.dispersion)
+  .scaling_factor = enquo(.scaling_factor)
 
   # Check if contrasts are of the same form
   if(
@@ -1146,7 +1149,7 @@ get_differential_transcript_abundance_glmmSeq_SE <- function(.data,
       data = metadata
     )
 
-  if(quo_is_symbolic(.dispersion))
+  if(.dispersion |> quo_is_symbolic())
     dispersion = rowData(.data)[,quo_name(.dispersion),drop=FALSE] |> as_tibble(rownames = feature__$name) |> deframe()
   else
     dispersion = counts |> edgeR::estimateDisp(design = design) %$% tagwise.dispersion |> setNames(rownames(counts))
@@ -1161,7 +1164,10 @@ get_differential_transcript_abundance_glmmSeq_SE <- function(.data,
   dispersion = dispersion[rownames(counts)]
 
   # Scaling
-  sizeFactors <- counts |> edgeR::calcNormFactors(method = scaling_method)
+  if(.scaling_factor |> quo_is_symbolic())
+    sizeFactors = .data |> pivot_sample() |> pull(!!.scaling_factor)
+  else
+    sizeFactors <- counts |> edgeR::calcNormFactors(method = scaling_method)
 
 
   glmmSeq_object =

diff --git a/R/methods.R b/R/methods.R
@@ -594,14 +594,14 @@ setMethod("scale_abundance", "tidybulk", .scale_abundance)
 #' @details Tranform the feature abundance across samples so to have the same quantile distribution (using preprocessCore).
 #'
 #' Underlying method
-#' 
+#'
 #' If `limma_normalize_quantiles` is chosen
-#' 
+#'
 #' .data |>limma::normalizeQuantiles()
-#'  
+#'
 #'  If `preprocesscore_normalize_quantiles_use_target` is chosen
-#'  
-#' .data |> 
+#'
+#' .data |>
 #'    preprocessCore::normalize.quantiles.use.target(
 #'       target = preprocessCore::normalize.quantiles.determine.target(.data)
 #'    )
@@ -638,7 +638,7 @@ setGeneric("quantile_normalise_abundance", function(.data,
                                           .abundance = NULL,
                                           method = "limma_normalize_quantiles",
                                           target_distribution = NULL,
-                                          
+
                                           action = "add")
 {
 
@@ -695,7 +695,7 @@ setGeneric("quantile_normalise_abundance", function(.data,
     }
 
     if(is.null(target_distribution)) target_distribution = preprocessCore::normalize.quantiles.determine.target(.data_norm)
-    
+
     .data_norm_quant =
       .data_norm |>
       preprocessCore::normalize.quantiles.use.target(
@@ -2604,7 +2604,7 @@ setMethod("ensembl_to_symbol", "tidybulk", .ensembl_to_symbol)
 #' @param significance_threshold DEPRECATED - A real between 0 and 1 (usually 0.05).
 #' @param fill_missing_values DEPRECATED - A boolean. Whether to fill missing sample/transcript values with the median of the transcript. This is rarely needed.
 #' @param .contrasts DEPRECATED - This parameter takes the format of the contrast parameter of the method of choice. For edgeR and limma-voom is a character vector. For DESeq2 is a list including a character vector of length three. The first covariate is the one the model is tested against (e.g., ~ factor_of_interest)
-#' @param ... Further arguments passed to some of the internal functions. Currently, it is needed just for internal debug.
+#' @param ... Further arguments passed to some of the internal experimental functions. For example for glmmSeq, it is possible to pass .dispersion, and .scaling_factor column tidyeval to skip the caluclation of dispersion and scaling and use precalculated values. This is helpful is you want to calculate those quantities on many genes and do DE testing on fewer genes. .scaling_factor is the TMM value that can be obtained with tidybulk::scale_abundance.
 #'
 #'
 #' @details This function provides the option to use edgeR \url{https://doi.org/10.1093/bioinformatics/btp616}, limma-voom \url{https://doi.org/10.1186/gb-2014-15-2-r29}, limma_voom_sample_weights \url{https://doi.org/10.1093/nar/gkv412} or  DESeq2 \url{https://doi.org/10.1186/s13059-014-0550-8} to perform the testing.

diff --git a/R/methods_SE.R b/R/methods_SE.R
@@ -188,17 +188,16 @@ setMethod("tidybulk", "RangedSummarizedExperiment", .tidybulk_se)
 
 	my_counts_scaled =
 		list(
-			assays(.data) %>%
-				as.list() %>%
-				.[[1]] %>%
-				multiply_by(
-					rep(multiplier, rep(nrow(.),length(multiplier)))
-				)
+		  assay(.data) %*%
+				diag(multiplier)
+
 			) %>%
 		setNames(value_scaled)
+  colnames(my_counts_scaled[[1]]) = assay(.data)  |> colnames()
+
 
 	# Add the assay
-	assays(.data) =  assays(.data) %>% c(my_counts_scaled)
+	assays(.data, withDimnames=FALSE) =  assays(.data) %>% c(my_counts_scaled)
 
 	.data %>%
 
@@ -313,7 +312,7 @@ setMethod("scale_abundance",
       as.matrix()
 
     if(is.null(target_distribution)) target_distribution = preprocessCore::normalize.quantiles.determine.target(.data_norm)
-    
+
     .data_norm =
       .data_norm |>
       preprocessCore::normalize.quantiles.use.target(

diff --git a/man/quantile_normalise_abundance-methods.Rd b/man/quantile_normalise_abundance-methods.Rd
diff --git a/man/test_differential_abundance-methods.Rd b/man/test_differential_abundance-methods.Rd