Skip to content

Commit

Permalink
Merge pull request #103 from FertigLab/102-patternmarkers-all-returni…
Browse files Browse the repository at this point in the history
…ng-empty-lists

- rewrite method for `threshold="all"`
- fix passing custom `lp`
- improve documentation
- remove `threshold="cut"` until it's clear what I does
  • Loading branch information
dimalvovs authored Jun 5, 2024
2 parents f6287a2 + aea5f7b commit 45fdd42
Show file tree
Hide file tree
Showing 8 changed files with 223 additions and 143 deletions.
6 changes: 3 additions & 3 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
Package: CoGAPS
Version: 3.24.1
Date: 2024-03-22
Version: 3.25.0
Date: 2024-06-05
Title: Coordinated Gene Activity in Pattern Sets
Author: Jeanette Johnson, Ashley Tsang, Jacob Mitchell, Thomas Sherman, Wai-shing Lee, Conor Kelton, Ondrej Maxian, Jacob Carey,
Genevieve Stein-O'Brien, Michael Considine, Maggie Wodicka, John Stansfield,
Expand Down Expand Up @@ -52,7 +52,7 @@ biocViews: GeneExpression, Transcription, GeneSetEnrichment,
DifferentialExpression, Bayesian, Clustering, TimeCourse, RNASeq, Microarray,
MultipleComparison, DimensionReduction, ImmunoOncology
NeedsCompilation: yes
RoxygenNote: 7.2.3
RoxygenNote: 7.3.1
Encoding: UTF-8
Collate:
'class-CogapsParams.R'
Expand Down
13 changes: 6 additions & 7 deletions R/class-CogapsResult.R
Original file line number Diff line number Diff line change
Expand Up @@ -393,18 +393,17 @@ Pw=rep(1,ncol(object@featureLoadings)), PwNull=FALSE)
#' @description calculate the most associated pattern for each gene
#' @param object an object of type CogapsResult
#' @param threshold the type of threshold to be used. The default "all" will
#' distribute genes into pattern with the lowest ranking. The "cut" thresholds
#' by the first gene to have a lower ranking, i.e. better fit to, a pattern.
#' @param lp a vector of weights for each pattern to be used for finding
#' markers. If NA markers for each pattern of the A matrix will be used.
#' @param axis either 1 or 2, specifying if pattern markers should be calculated using
#' the rows of the data (1) or the columns of the data (2)
#' distribute genes into patterns with the lowest ranking as ranked by the
#' increasing Euclidian distance between gene and the \code{lp}.
#' @param lp list of vectors of weights for each pattern to be used for finding
#' markers. If NULL, list of synthetic one-hot markers for each column of the
#' featureLoadings matrix will be generated and matched against.
#' @return By default a non-overlapping list of genes associated with each
#' \code{lp}.
#' @examples
#' data(GIST)
#' pm <- patternMarkers(GIST.result)
setGeneric("patternMarkers", function(object, threshold="all", lp=NA, axis=1) standardGeneric("patternMarkers"))
setGeneric("patternMarkers", function(object, threshold="all", lp=NULL) standardGeneric("patternMarkers"))

#' MANOVA statistical test for patterns between sample groups
#' @export
Expand Down
156 changes: 55 additions & 101 deletions R/methods-CogapsResult.R
Original file line number Diff line number Diff line change
Expand Up @@ -390,114 +390,68 @@ function(patterngeneset, whichpattern=1, padj_threshold = 0.05)
})


#' @return By default a non-overlapping list of genes associated with each \code{lp}. If \code{full=TRUE} a data.frame of
#' genes rankings with a column for each \code{lp} will also be returned.
#' @rdname patternMarkers-methods
#' @aliases patternMarkers
setMethod("patternMarkers", signature(object="CogapsResult"),
function(object, threshold, lp, axis)
{
## check inputs to the function
if (!(threshold %in% c("cut", "all")))
stop("threshold must be either 'cut' or 'all'")
if (!is.na(lp) & length(lp) != ncol(object@featureLoadings))
stop("lp length must equal the number of patterns")
if (!(axis %in% 1:2))
stop("axis must be either 1 or 2")
## need to scale each row of the matrix of interest so that the maximum is 1
resultMatrix <- if (axis == 1) object@featureLoadings else object@sampleFactors
normedMatrix <- t(apply(resultMatrix, 1, function(row) row / max(row)))
## handle the case where a custom linear combination of patterns was passed in
if (!is.na(lp))
{
markerScores <- apply(normedMatrix, 1, function(row) sqrt(sum((row-lp)^2)))
markersByPattern <- names(sort(markerScores, decreasing=FALSE, na.last=TRUE))
return(list(
"PatternMarkers"=markersByPattern,
"PatternMarkerRanks"=rank(markerScores),
"PatternMarkerScores"=markerScores
))
}
## default pattern marker calculation, each pattern has unit weight
markerScores <- sapply(1:ncol(normedMatrix), function(patternIndex)
apply(normedMatrix, 1, function(row)
{
lp <- unitVector(patternIndex, ncol(normedMatrix))
return(sqrt(sum((row-lp)^2)))
})
)
markerRanks <- apply(markerScores, 2, rank)
colnames(markerScores) <- colnames(markerRanks) <- colnames(normedMatrix)
## keep only a subset of markers for each pattern depending on the type of threshold
if (threshold == "cut") # all markers which achieve minimum rank
{
simplicityGENES <- function(As, Ps) {
# rescale p's to have max 1
pscale <- apply(Ps,1,max)

# rescale A in accordance with p's having max 1
As <- sweep(As, 2, pscale, FUN="*")

# find the A with the highest magnitude
Arowmax <- t(apply(As, 1, function(x) x/max(x)))
pmax <- apply(As, 1, max)

# determine which genes are most associated with each pattern
ssl <- matrix(NA, nrow=nrow(As), ncol=ncol(As),
dimnames=dimnames(As))
for (i in 1:ncol(As)) {
lp <- rep(0, ncol(As))
lp[i] <- 1
ssl.stat <- apply(Arowmax, 1, function(x) sqrt(t(x-lp)%*%(x-lp)))
ssl[order(ssl.stat),i] <- 1:length(ssl.stat)
function(object, threshold, lp){
Amatrix <- object@featureLoadings
Pmatrix <- t(object@sampleFactors)

# determine norm for A if Ps were rescaled to have max 1
pscale <- apply(Pmatrix,1,max)

# rescale A in accordance with p's having max 1
Amatrix <- sweep(Amatrix, 2, pscale, FUN="*")

# normalize each row of A to have max 1
Arowmax <- t(apply(Amatrix, 1, function(x) x/max(x)))

nP=dim(Amatrix)[2]

if(!is.null(lp)){
if(!(unique(lengths(lp)) > 1 && unique(lengths(lp)) == nP)){
warning("lp length must equal the number of columns of the Amatrix")
}

return(ssl)

}
simGenes <- simplicityGENES(As=object@featureLoadings,
Ps=object@sampleFactors)

patternMarkers <- list()

nP <- ncol(simGenes)

for (i in 1:nP) {

sortSim <- names(sort(simGenes[,i],decreasing=F))

geneThresh <- min(which(simGenes[sortSim,i] >
apply(simGenes[sortSim,],1,min)))

markerGenes <- sortSim[1:geneThresh]

markerGenes <- unique(markerGenes)

patternMarkers[[i]] <- markerGenes

markersByPattern <- patternMarkers

}
}
else if (threshold == "all") # only the markers with the lowest scores
{
thresholdTest <- apply(markerScores, 1, which.min)
patternsByMarker <- rep(0, length(markerScores))
i <- 0
for (gene in thresholdTest) {
if (length(names(gene))){
patternsByMarker[i] <- names(gene)
}
i <- i + 1
# container for feature ranks by L2 distance from lp in case of provided lp
ssranks<-matrix(NA, nrow=nrow(Amatrix),ncol=length(lp),
dimnames=list(rownames(Amatrix), names(lp)))
if(max(sapply(lp,max))>1){
stop("lp should be a list of vectors with max value of 1")
}
} else {
lp <- list()
for(i in 1:nP){
lp_mock <- rep(0,nP)
lp_mock[i] <- 1
lp[[i]] <- lp_mock
}
names(lp) <- colnames(Amatrix)
# container for feature ranks by L2 distance from lp in case of one-hot encoded lp
ssranks<-matrix(NA, nrow=nrow(Amatrix), ncol=ncol(Amatrix),dimnames=dimnames(Amatrix))
}

markersByPattern <- sapply(colnames(markerScores), USE.NAMES=TRUE, simplify=FALSE,
function(pattern) rownames(markerScores)[which(patternsByMarker==pattern)])

#calculate the L2 distance from each row of A to lp
for (i in seq_along(lp)){
sstat <- apply(Arowmax, 1, function(x) sqrt(t(x-lp[[i]])%*%(x-lp[[i]])))
ssranks[,i] <- rank(sstat, ties.method="first")
}
return(list(
"PatternMarkers"=markersByPattern,
"PatternMarkerRanks"=markerRanks,
"PatternMarkerScores"=markerScores
))

if(threshold=="all"){
pIndx<-apply(ssranks,1,which.min)
pNames<-setNames(seq_along(lp), names(lp))
ssgenes.th <- lapply(pNames,function(x) names(pIndx[pIndx==x]))

#sort genes by rank for output
for (i in seq_along(ssgenes.th)){
order <- names(sort(ssranks[,i]))
ssgenes.th[[i]] <- intersect(order, ssgenes.th[[i]])
}
}

return(list("PatternMarkers"=ssgenes.th,"PatternRanks"=ssranks))

})

#' @rdname calcCoGAPSStat-methods
Expand Down
19 changes: 10 additions & 9 deletions man/patternMarkers-methods.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion tests/testthat/test_getPatternGeneSet.R
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ test_that("plotPatternGeneSet renders a plot for overrepresentation test", {
"gs1" = c("Hs.2", "Hs.4", "Hs.36", "Hs.96", "Hs.202"),
"gs2" = c("Hs.699463", "Hs.699288", "Hs.699280", "Hs.699154", "Hs.697294")
)
gpgs_res <- getPatternGeneSet(object = GIST.result, gene.sets = gs.test, method = "overrepresentation", threshold = "cut")
gpgs_res <- getPatternGeneSet(object = GIST.result, gene.sets = gs.test, method = "overrepresentation")
significant_result <- gpgs_res[[1]][gpgs_res[[1]]$gene.set == "sig_p1",]

expect_true(significant_result$padj < 0.05)
Expand Down
35 changes: 35 additions & 0 deletions tests/testthat/test_output_across_modes.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
library('testthat')

test_that("equal A and P dimensions in sparse vs standard", {
data(GIST)
res_standard <- CoGAPS(GIST.data_frame, nPatterns=2, nIterations=100,
seed=1, messages=FALSE)
res_sparse <- CoGAPS(GIST.data_frame, nPatterns=2, nIterations=100, seed=1,
messages=FALSE, sparseOptimization=TRUE)
expect_equal(dim(res_standard@featureLoadings), dim(res_sparse@featureLoadings))
expect_equal(dim(res_standard@sampleFactors), dim(res_sparse@sampleFactors))
})

test_that("equal A and P dimensions in sc vs standard", {
data(GIST)
res_standard <- CoGAPS(GIST.data_frame, nPatterns=2, nIterations=100,
seed=1, messages=FALSE)
params <- CogapsParams()
params <- setDistributedParams(params, nSets=2)
res_sc <- CoGAPS(GIST.data_frame, nPatterns=2, nIterations=100, seed=1,
messages=FALSE, distributed="single-cell")
expect_equal(dim(res_standard@featureLoadings), dim(res_sc@featureLoadings))
expect_equal(dim(res_standard@sampleFactors), dim(res_sc@sampleFactors))
})

test_that("equal A and P dimensions in gw vs standard", {
data(GIST)
res_standard <- CoGAPS(GIST.data_frame, nPatterns=2, nIterations=100,
seed=1, messages=FALSE)
params <- CogapsParams()
params <- setDistributedParams(params, nSets=2)
res_gw <- CoGAPS(GIST.data_frame, nPatterns=2, nIterations=100, seed=1,
messages=FALSE, distributed="genome-wide")
expect_equal(dim(res_standard@featureLoadings), dim(res_gw@featureLoadings))
expect_equal(dim(res_standard@sampleFactors), dim(res_gw@sampleFactors))
})
Loading

0 comments on commit 45fdd42

Please sign in to comment.