Merge pull request #103 from FertigLab/102-patternmarkers-all-returni…

…ng-empty-lists - rewrite method for `threshold="all"` - fix passing custom `lp` - improve documentation - remove `threshold="cut"` until it's clear what I does
FertigLab · Jun 5, 2024 · 45fdd42 · 45fdd42
2 parents f6287a2 + aea5f7b
commit 45fdd42
Show file tree

Hide file tree

Showing 8 changed files with 223 additions and 143 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: CoGAPS
-Version: 3.24.1
-Date: 2024-03-22
+Version: 3.25.0
+Date: 2024-06-05
 Title: Coordinated Gene Activity in Pattern Sets
 Author: Jeanette Johnson, Ashley Tsang, Jacob Mitchell, Thomas Sherman, Wai-shing Lee, Conor Kelton, Ondrej Maxian, Jacob Carey,
     Genevieve Stein-O'Brien, Michael Considine, Maggie Wodicka, John Stansfield,
@@ -52,7 +52,7 @@ biocViews: GeneExpression, Transcription, GeneSetEnrichment,
     DifferentialExpression, Bayesian, Clustering, TimeCourse, RNASeq, Microarray,
     MultipleComparison, DimensionReduction, ImmunoOncology
 NeedsCompilation: yes
-RoxygenNote: 7.2.3
+RoxygenNote: 7.3.1
 Encoding: UTF-8
 Collate:
     'class-CogapsParams.R'

diff --git a/R/class-CogapsResult.R b/R/class-CogapsResult.R
@@ -393,18 +393,17 @@ Pw=rep(1,ncol(object@featureLoadings)), PwNull=FALSE)
 #' @description calculate the most associated pattern for each gene
 #' @param object an object of type CogapsResult
 #' @param threshold the type of threshold to be used. The default "all" will
-#' distribute genes into pattern with the lowest ranking. The "cut" thresholds
-#' by the first gene to have a lower ranking, i.e. better fit to, a pattern.
-#' @param lp a vector of weights for each pattern to be used for finding
-#' markers. If NA markers for each pattern of the A matrix will be used.
-#' @param axis either 1 or 2, specifying if pattern markers should be calculated using
-#' the rows of the data (1) or the columns of the data (2)
+#' distribute genes into patterns with the lowest ranking as ranked by the
+#' increasing Euclidian distance between gene and the \code{lp}.
+#' @param lp list of vectors of weights for each pattern to be used for finding
+#' markers. If NULL, list of synthetic one-hot markers for each column of the
+#' featureLoadings matrix will be generated and matched against.
 #' @return By default a non-overlapping list of genes associated with each
 #' \code{lp}.
 #' @examples
 #' data(GIST)
 #' pm <- patternMarkers(GIST.result)
-setGeneric("patternMarkers", function(object, threshold="all", lp=NA, axis=1) standardGeneric("patternMarkers"))
+setGeneric("patternMarkers", function(object, threshold="all", lp=NULL) standardGeneric("patternMarkers"))
 
 #' MANOVA statistical test for patterns between sample groups
 #' @export

diff --git a/R/methods-CogapsResult.R b/R/methods-CogapsResult.R
@@ -390,114 +390,68 @@ function(patterngeneset, whichpattern=1, padj_threshold = 0.05)
 })
 
 
+#' @return By default a non-overlapping list of genes associated with each \code{lp}. If \code{full=TRUE} a data.frame of
+#' genes rankings with a column for each \code{lp} will also be returned.
 #' @rdname patternMarkers-methods
 #' @aliases patternMarkers
 setMethod("patternMarkers", signature(object="CogapsResult"),
-function(object, threshold, lp, axis)
-{
-    ## check inputs to the function
-    if (!(threshold %in% c("cut", "all")))
-        stop("threshold must be either 'cut' or 'all'")
-    if (!is.na(lp) & length(lp) != ncol(object@featureLoadings))
-        stop("lp length must equal the number of patterns")
-    if (!(axis %in% 1:2))
-        stop("axis must be either 1 or 2")
-    ## need to scale each row of the matrix of interest so that the maximum is 1
-    resultMatrix <- if (axis == 1) object@featureLoadings else object@sampleFactors
-    normedMatrix <- t(apply(resultMatrix, 1, function(row) row / max(row)))
-    ## handle the case where a custom linear combination of patterns was passed in
-    if (!is.na(lp))
-    {
-        markerScores <- apply(normedMatrix, 1, function(row) sqrt(sum((row-lp)^2)))
-        markersByPattern <- names(sort(markerScores, decreasing=FALSE, na.last=TRUE))
-        return(list(
-            "PatternMarkers"=markersByPattern,
-            "PatternMarkerRanks"=rank(markerScores),
-            "PatternMarkerScores"=markerScores
-        ))
-    }
-    ## default pattern marker calculation, each pattern has unit weight
-    markerScores <- sapply(1:ncol(normedMatrix), function(patternIndex)
-        apply(normedMatrix, 1, function(row)
-        {
-            lp <- unitVector(patternIndex, ncol(normedMatrix))
-            return(sqrt(sum((row-lp)^2)))
-        })
-    )
-    markerRanks <- apply(markerScores, 2, rank)
-    colnames(markerScores) <- colnames(markerRanks) <- colnames(normedMatrix)
-    ## keep only a subset of markers for each pattern depending on the type of threshold
-    if (threshold == "cut") # all markers which achieve minimum rank
-    {
-      simplicityGENES <- function(As, Ps) {
-        # rescale p's to have max 1
-        pscale <- apply(Ps,1,max)
-
-        # rescale A in accordance with p's having max 1
-        As <- sweep(As, 2, pscale, FUN="*")
-
-        # find the A with the highest magnitude
-        Arowmax <- t(apply(As, 1, function(x) x/max(x)))
-        pmax <- apply(As, 1, max)
-
-        # determine which genes are most associated with each pattern
-        ssl <- matrix(NA, nrow=nrow(As), ncol=ncol(As),
-                      dimnames=dimnames(As))
-        for (i in 1:ncol(As)) {
-          lp <- rep(0, ncol(As))
-          lp[i] <- 1
-          ssl.stat <- apply(Arowmax, 1, function(x) sqrt(t(x-lp)%*%(x-lp)))
-          ssl[order(ssl.stat),i] <- 1:length(ssl.stat)
+function(object, threshold, lp){
+    Amatrix <- object@featureLoadings
+    Pmatrix <- t(object@sampleFactors)
+
+    # determine norm for A if Ps were rescaled to have max 1
+    pscale <- apply(Pmatrix,1,max)
+
+    # rescale A in accordance with p's having max 1
+    Amatrix <- sweep(Amatrix, 2, pscale, FUN="*")
+
+    # normalize each row of A to have max 1
+    Arowmax <- t(apply(Amatrix, 1, function(x) x/max(x)))
+
+    nP=dim(Amatrix)[2]
+
+    if(!is.null(lp)){
+        if(!(unique(lengths(lp)) > 1 && unique(lengths(lp)) == nP)){
+            warning("lp length must equal the number of columns of the Amatrix")
         }
-
-        return(ssl)
-
-      }
-      simGenes <- simplicityGENES(As=object@featureLoadings,
-                                  Ps=object@sampleFactors)
-
-      patternMarkers <- list()
-
-      nP <- ncol(simGenes)
-
-      for (i in 1:nP) {
-
-        sortSim <- names(sort(simGenes[,i],decreasing=F))
-
-        geneThresh <- min(which(simGenes[sortSim,i] > 
-                                  apply(simGenes[sortSim,],1,min)))
-
-        markerGenes <- sortSim[1:geneThresh]
-
-        markerGenes <- unique(markerGenes)
-
-        patternMarkers[[i]] <- markerGenes
-
-        markersByPattern <- patternMarkers
-
-      }
-    }
-    else if (threshold == "all") # only the markers with the lowest scores
-    {
-        thresholdTest <- apply(markerScores, 1, which.min)
-        patternsByMarker <- rep(0, length(markerScores))
-        i <- 0
-        for (gene in thresholdTest) {
-          if (length(names(gene))){
-            patternsByMarker[i] <- names(gene)
-          }          
-          i <- i + 1
+        # container for feature ranks by L2 distance from lp in case of provided lp
+        ssranks<-matrix(NA, nrow=nrow(Amatrix),ncol=length(lp),
+                        dimnames=list(rownames(Amatrix), names(lp)))
+        if(max(sapply(lp,max))>1){
+            stop("lp should be a list of vectors with max value of 1")
         }
+    } else {
+        lp <- list()
+        for(i in 1:nP){
+            lp_mock <- rep(0,nP)
+            lp_mock[i] <- 1
+            lp[[i]] <- lp_mock
+            }
+        names(lp) <- colnames(Amatrix)
+        # container for feature ranks by L2 distance from lp in case of one-hot encoded lp
+        ssranks<-matrix(NA, nrow=nrow(Amatrix), ncol=ncol(Amatrix),dimnames=dimnames(Amatrix))
+    }
 
-        markersByPattern <- sapply(colnames(markerScores), USE.NAMES=TRUE, simplify=FALSE,
-            function(pattern) rownames(markerScores)[which(patternsByMarker==pattern)])
-
+    #calculate the L2 distance from each row of A to lp
+    for (i in seq_along(lp)){
+        sstat <- apply(Arowmax, 1, function(x) sqrt(t(x-lp[[i]])%*%(x-lp[[i]])))
+        ssranks[,i] <- rank(sstat, ties.method="first")
     }
-    return(list(
-        "PatternMarkers"=markersByPattern,
-        "PatternMarkerRanks"=markerRanks,
-        "PatternMarkerScores"=markerScores
-    ))
+
+    if(threshold=="all"){
+            pIndx<-apply(ssranks,1,which.min)
+            pNames<-setNames(seq_along(lp), names(lp))
+            ssgenes.th <- lapply(pNames,function(x) names(pIndx[pIndx==x]))
+
+            #sort genes by rank for output
+            for (i in seq_along(ssgenes.th)){
+                order <- names(sort(ssranks[,i]))
+                ssgenes.th[[i]] <- intersect(order, ssgenes.th[[i]])
+            }
+    }
+
+    return(list("PatternMarkers"=ssgenes.th,"PatternRanks"=ssranks))
+
 })
 
 #' @rdname calcCoGAPSStat-methods

diff --git a/man/patternMarkers-methods.Rd b/man/patternMarkers-methods.Rd
diff --git a/tests/testthat/test_getPatternGeneSet.R b/tests/testthat/test_getPatternGeneSet.R
@@ -47,7 +47,7 @@ test_that("plotPatternGeneSet renders a plot for overrepresentation test", {
     "gs1" = c("Hs.2", "Hs.4", "Hs.36", "Hs.96", "Hs.202"),
     "gs2" = c("Hs.699463", "Hs.699288", "Hs.699280", "Hs.699154", "Hs.697294")
   )
-  gpgs_res <- getPatternGeneSet(object = GIST.result, gene.sets = gs.test, method = "overrepresentation", threshold = "cut")
+  gpgs_res <- getPatternGeneSet(object = GIST.result, gene.sets = gs.test, method = "overrepresentation")
   significant_result <- gpgs_res[[1]][gpgs_res[[1]]$gene.set == "sig_p1",]
 
   expect_true(significant_result$padj < 0.05)

diff --git a/tests/testthat/test_output_across_modes.R b/tests/testthat/test_output_across_modes.R
@@ -0,0 +1,35 @@
+library('testthat')
+
+test_that("equal A and P dimensions in sparse vs standard", {
+    data(GIST)
+    res_standard <- CoGAPS(GIST.data_frame, nPatterns=2, nIterations=100,
+                           seed=1, messages=FALSE)
+    res_sparse <- CoGAPS(GIST.data_frame, nPatterns=2, nIterations=100, seed=1,
+                         messages=FALSE, sparseOptimization=TRUE)
+    expect_equal(dim(res_standard@featureLoadings), dim(res_sparse@featureLoadings))
+    expect_equal(dim(res_standard@sampleFactors), dim(res_sparse@sampleFactors))
+})
+
+test_that("equal A and P dimensions in sc vs standard", {
+    data(GIST)
+    res_standard <- CoGAPS(GIST.data_frame, nPatterns=2, nIterations=100,
+                           seed=1, messages=FALSE)
+    params <- CogapsParams()
+    params <- setDistributedParams(params, nSets=2)
+    res_sc <- CoGAPS(GIST.data_frame, nPatterns=2, nIterations=100, seed=1,
+                     messages=FALSE, distributed="single-cell")
+    expect_equal(dim(res_standard@featureLoadings), dim(res_sc@featureLoadings))
+    expect_equal(dim(res_standard@sampleFactors), dim(res_sc@sampleFactors))
+})
+
+test_that("equal A and P dimensions in gw vs standard", {
+    data(GIST)
+    res_standard <- CoGAPS(GIST.data_frame, nPatterns=2, nIterations=100,
+                           seed=1, messages=FALSE)
+    params <- CogapsParams()
+    params <- setDistributedParams(params, nSets=2)
+    res_gw <- CoGAPS(GIST.data_frame, nPatterns=2, nIterations=100, seed=1,
+                     messages=FALSE, distributed="genome-wide")
+    expect_equal(dim(res_standard@featureLoadings), dim(res_gw@featureLoadings))
+    expect_equal(dim(res_standard@sampleFactors), dim(res_gw@sampleFactors))
+})