From a38897869fe2a02dd08085c96c7bd23964148f73 Mon Sep 17 00:00:00 2001 From: Sean Hackett Date: Mon, 9 Sep 2024 12:59:20 -0700 Subject: [PATCH 1/3] added sample mahalanobis distance fxn --- NAMESPACE | 1 + R/dim_reduction.R | 74 +++++++++++++++++++ R/mutates.R | 8 +- man/calculate_sample_mahalanobis_distances.Rd | 41 ++++++++++ 4 files changed, 121 insertions(+), 3 deletions(-) create mode 100644 man/calculate_sample_mahalanobis_distances.Rd diff --git a/NAMESPACE b/NAMESPACE index 8a83565..5e8e3e2 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -4,6 +4,7 @@ export(add_pcs) export(app_flow) export(app_heatmap) export(app_pcs) +export(calculate_sample_mahalanobis_distances) export(center_tomic) export(check_design) export(check_tomic) diff --git a/R/dim_reduction.R b/R/dim_reduction.R index 878a3bd..a59f6d6 100644 --- a/R/dim_reduction.R +++ b/R/dim_reduction.R @@ -456,3 +456,77 @@ find_triple_omic_missing_values <- function( return(output) } + +#' Calculate Sample Mahalanobis Distances +#' +#' Determine each samples distance from the center of the data using Mahalanobis distance. +#' +#' @inheritParams tomic_to +#' @param value_var the measurement variable to use for calculating distances +#' @param max_pcs the maximum number of principal components to used for +#' representing the covariance matrix. +#' @param scale if TRUE then the data will be scaled before calculating distances +#' +#' @returns The samples tibble with a new column `pc_distance` which contains the +#' Mahalanobis distances of individual samples from the PC elipsoid +#' +#' @details +#' Since `romic` is built around using tall data where there are more features than +#' samples calculating Mahalanobis distance off of the covariance matrix is not +#' possible. Instead, we use SVD to create a low-dimensional representation of the +#' covariance matrix and calculate distances from the center of the data in this +#' space. This essentially involves weighting the principal components by their +#' loadings. +#' +#' @examples +#' calculate_sample_mahalanobis_distances(brauer_2008_tidy) +#' @export +calculate_sample_mahalanobis_distances <- function ( + tomic, + value_var = NULL, + max_pcs = 10, + scale = FALSE + ) { + + checkmate::assertClass(tomic, "tomic") + value_var <- romic:::value_var_handler(value_var, tomic$design) + checkmate::assertIntegerish(max_pcs, len = 1) + + n_features <- nrow(romic::get_tomic_table(tomic, "features")) + n_samples <- nrow(romic::get_tomic_table(tomic, "samples")) + npcs <- pmin(max_pcs, n_features, n_samples) + + X_std <- tomic %>% + # remove features with missing values + romic::remove_missing_values(value_var, "drop_features", verbose = FALSE) %>% + # convert to a matrix + romic::tomic_to_matrix(value_var) %>% + {scale(t(.), scale = scale) %>% t()} + + # if X is constant for a feature (such all reads are 0) then + # scaling will put it to NaN. We need to remove these features + valid_X_std <- X_std[rowSums(is.nan(X_std)) == 0,] + + # perform a singular value decomposition + svd_results <- svd(valid_X_std, nv = npcs) + + # re-weight principal components by eigenvalues + X_std_v <- svd_results$v * matrix(svd_results$d[1:npcs], ncol = npcs, nrow = ncol(valid_X_std), byrow = TRUE) + rownames(X_std_v) <- colnames(valid_X_std) + + pc_distances_by_sample <- tibble::tibble( + !!rlang::sym(tomic$design$sample_pk) := rownames(X_std_v), + pc_distance = rowSums(X_std_v^2) + ) %>% + arrange(desc(pc_distance)) + + # validate calculation + varex_leading_components <- sum(svd_results$d^2) * sum((svd_results$d^2 / sum(svd_results$d^2))[1:npcs]) + + stopifnot((sum(pc_distances_by_sample$pc_distance) - varex_leading_components)/varex_leading_components < 0.01) + + samples_w_pc_distances <- romic::get_tomic_table(tomic, "samples") %>% + dplyr::left_join(pc_distances_by_sample, by = tomic$design$sample_pk) + + return(samples_w_pc_distances) +} diff --git a/R/mutates.R b/R/mutates.R index 0bdf04f..54b0195 100644 --- a/R/mutates.R +++ b/R/mutates.R @@ -151,9 +151,11 @@ center <- function(x) { #' select(-DR) #' new_variable_tables <- c("new_sample_var" = "samples") #' @export -update_tidy_omic <- function(tidy_omic, - updated_tidy_data, - new_variable_tables = c()) { +update_tidy_omic <- function( + tidy_omic, + updated_tidy_data, + new_variable_tables = c() + ) { checkmate::assertClass(tidy_omic, "tomic") checkmate::assertClass(tidy_omic, "tidy_omic") checkmate::assertDataFrame(updated_tidy_data) diff --git a/man/calculate_sample_mahalanobis_distances.Rd b/man/calculate_sample_mahalanobis_distances.Rd new file mode 100644 index 0000000..dd11675 --- /dev/null +++ b/man/calculate_sample_mahalanobis_distances.Rd @@ -0,0 +1,41 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/dim_reduction.R +\name{calculate_sample_mahalanobis_distances} +\alias{calculate_sample_mahalanobis_distances} +\title{Calculate Sample Mahalanobis Distances} +\usage{ +calculate_sample_mahalanobis_distances( + tomic, + value_var = NULL, + max_pcs = 10, + scale = FALSE +) +} +\arguments{ +\item{tomic}{Either a \code{tidy_omic} or \code{triple_omic} object} + +\item{value_var}{the measurement variable to use for calculating distances} + +\item{max_pcs}{the maximum number of principal components to used for +representing the covariance matrix.} + +\item{scale}{if TRUE then the data will be scaled before calculating distances} +} +\value{ +The samples tibble with a new column `pc_distance` which contains the + Mahalanobis distances of individual samples from the PC elipsoid +} +\description{ +Determine each samples distance from the center of the data using Mahalanobis distance. +} +\details{ +Since `romic` is built around using tall data where there are more features than +samples calculating Mahalanobis distance off of the covariance matrix is not +possible. Instead, we use SVD to create a low-dimensional representation of the +covariance matrix and calculate distances from the center of the data in this +space. This essentially involves weighting the principal components by their +loadings. +} +\examples{ +calculate_sample_mahalanobis_distances(brauer_2008_tidy) +} From a217c1ab896046412819776da6b8c2b644b3c790 Mon Sep 17 00:00:00 2001 From: Sean Hackett Date: Mon, 9 Sep 2024 13:03:03 -0700 Subject: [PATCH 2/3] updated get_design_tbl to work with just the design --- R/design.R | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/R/design.R b/R/design.R index ef5d0d1..7657a5a 100644 --- a/R/design.R +++ b/R/design.R @@ -2,15 +2,25 @@ #' #' Get a tabular summary of all variables. #' -#' @inheritParams tomic_to +#' @param tomic_or_design Either a \code{tomic} object or its embedded design list #' #' @returns a tibble reflecting the \code{tomic} object's design. #' #' @examples #' get_design_tbl(brauer_2008_triple) +#' get_design_tbl(brauer_2008_triple$design) +#' #' @export -get_design_tbl <- function(tomic) { - tomic$design[c("features", "samples", "measurements")] %>% +get_design_tbl <- function(tomic_or_design) { + + if (inherits(tomic_or_design, "tomic")) { + design <- tomic_or_design$design + } else { + check_design(tomic_or_design) + design <- tomic_or_design + } + + design[c("features", "samples", "measurements")] %>% { purrr::map2(unname(.), names(.), function(x, y) { x %>% From 5cc2f4431e01740c24f89a0df5948d27d5a0cdc7 Mon Sep 17 00:00:00 2001 From: Sean Hackett Date: Mon, 9 Sep 2024 13:11:38 -0700 Subject: [PATCH 3/3] passing r cmd check --- DESCRIPTION | 2 +- R/dim_reduction.R | 4 ++-- R/romic-package.R | 1 + man/get_design_tbl.Rd | 6 ++++-- 4 files changed, 8 insertions(+), 5 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 3cf63ae..4fbc607 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Package: romic Type: Package Title: R for High-Dimensional Omic Data -Version: 1.2.2 +Version: 1.2.3 Authors@R: c( person( given = "Sean", diff --git a/R/dim_reduction.R b/R/dim_reduction.R index a59f6d6..2a4cff8 100644 --- a/R/dim_reduction.R +++ b/R/dim_reduction.R @@ -489,7 +489,7 @@ calculate_sample_mahalanobis_distances <- function ( ) { checkmate::assertClass(tomic, "tomic") - value_var <- romic:::value_var_handler(value_var, tomic$design) + value_var <- value_var_handler(value_var, tomic$design) checkmate::assertIntegerish(max_pcs, len = 1) n_features <- nrow(romic::get_tomic_table(tomic, "features")) @@ -518,7 +518,7 @@ calculate_sample_mahalanobis_distances <- function ( !!rlang::sym(tomic$design$sample_pk) := rownames(X_std_v), pc_distance = rowSums(X_std_v^2) ) %>% - arrange(desc(pc_distance)) + dplyr::arrange(dplyr::desc(pc_distance)) # validate calculation varex_leading_components <- sum(svd_results$d^2) * sum((svd_results$d^2 / sum(svd_results$d^2))[1:npcs]) diff --git a/R/romic-package.R b/R/romic-package.R index e603a48..8895225 100644 --- a/R/romic-package.R +++ b/R/romic-package.R @@ -40,6 +40,7 @@ utils::globalVariables(c( "sample_label", "ordered_sampleId", "orderedId", + "pc_distance", "valid_tables", "NAME", "name", diff --git a/man/get_design_tbl.Rd b/man/get_design_tbl.Rd index ee2e387..de27b68 100644 --- a/man/get_design_tbl.Rd +++ b/man/get_design_tbl.Rd @@ -4,10 +4,10 @@ \alias{get_design_tbl} \title{Get Design Table} \usage{ -get_design_tbl(tomic) +get_design_tbl(tomic_or_design) } \arguments{ -\item{tomic}{Either a \code{tidy_omic} or \code{triple_omic} object} +\item{tomic_or_design}{Either a \code{tomic} object or its embedded design list} } \value{ a tibble reflecting the \code{tomic} object's design. @@ -17,4 +17,6 @@ Get a tabular summary of all variables. } \examples{ get_design_tbl(brauer_2008_triple) +get_design_tbl(brauer_2008_triple$design) + }