From d06202189b38924402c2ed493a718c04ffab298d Mon Sep 17 00:00:00 2001 From: Sean Hackett Date: Fri, 6 Sep 2024 16:47:04 -0700 Subject: [PATCH] update sample ordering and filtering nans and infs --- NAMESPACE | 1 + R/dim_reduction.R | 65 ++++++++++++++++++------- R/mutates.R | 92 ++++++++++++++++++++++++++++++++++++ man/remove_missing_values.Rd | 6 ++- man/update_sample_factors.Rd | 26 ++++++++++ 5 files changed, 173 insertions(+), 17 deletions(-) create mode 100644 man/update_sample_factors.Rd diff --git a/NAMESPACE b/NAMESPACE index d7ac5e7..8a83565 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -56,6 +56,7 @@ export(tomic_sort_status) export(tomic_to) export(tomic_to_matrix) export(triple_to_tidy) +export(update_sample_factors) export(update_tidy_omic) export(update_tomic) import(ggplot2) diff --git a/R/dim_reduction.R b/R/dim_reduction.R index 8d6f9ee..878a3bd 100644 --- a/R/dim_reduction.R +++ b/R/dim_reduction.R @@ -133,6 +133,8 @@ add_pcs <- function( #' then drop features} #' \item{impute}{Impute missing values} #' } +#' @param missing_value_types Types of values to treat as missing. One or more +#' of "NA", "NaN" and "Inf" #' @inheritParams create_tidy_omic #' #' @returns A \code{tomic} object where missing values have been accounted @@ -143,18 +145,24 @@ add_pcs <- function( #' #' @export remove_missing_values <- function( - tomic, - value_var = NULL, - missing_val_method = "drop_samples", - verbose = TRUE - ) { + tomic, + value_var = NULL, + missing_val_method = "drop_samples", + missing_value_types = c("NA", "NaN", "Inf"), + verbose = FALSE + ) { + checkmate::assertClass(tomic, "tomic") checkmate::assertChoice( missing_val_method, c("drop_features", "drop_samples") ) + checkmate::assertCharacter(missing_value_types, min.len = 1) checkmate::assertLogical(verbose, len = 1) + VALID_MISSING_VALUE_TYPES <- c("NA", "NaN", "Inf") + purrr::walk(missing_value_types, checkmate::assertChoice, VALID_MISSING_VALUE_TYPES) + triple_omic <- tomic_to(tomic, "triple_omic") design <- tomic$design @@ -167,7 +175,8 @@ remove_missing_values <- function( # find missing values of value_var found_missing_values <- find_triple_omic_missing_values( triple_omic, - value_var + value_var, + missing_value_types ) observed_measurements <- found_missing_values$observed_measurements missing_values <- found_missing_values$missing_values @@ -248,11 +257,11 @@ remove_missing_values <- function( #' #' @export impute_missing_values <- function( - tomic, - impute_var_name = "imputed", - value_var = NULL, - ... - ){ + tomic, + impute_var_name = "imputed", + value_var = NULL, + ... + ){ if (!("impute" %in% rownames(utils::installed.packages()))) { stop("Install \"impute\" using remotes::install_bioc(\"impute\") to use this function") @@ -296,7 +305,7 @@ impute_missing_values <- function( omic_matrix <- triple_omic$measurements %>% reshape2::acast(formula = cast_formula, value.var = value_var) - # imput data + # impute imputed_measurements <- impute::impute.knn( omic_matrix, ... @@ -397,15 +406,39 @@ value_var_handler <- function(value_var = NULL, design) { return(value_var) } -find_triple_omic_missing_values <- function(triple_omic, value_var) { +find_triple_omic_missing_values <- function( + triple_omic, + value_var, + missing_values_types = c("NA", "NaN", "Inf") + ) { + all_expected_obs <- tidyr::expand_grid( triple_omic$features[triple_omic$design$feature_pk], triple_omic$samples[triple_omic$design$sample_pk] ) - observed_measurements <- triple_omic$measurements %>% - # drop missing values - dplyr::filter_at(value_var, function(x) !is.na(x)) + observed_measurements <- triple_omic$measurements + + if ("NA" %in% missing_values_types) { + observed_measurements <- observed_measurements %>% + dplyr::filter( + !is.na(!!rlang::sym(value_var)) + ) + } + + if ("NaN" %in% missing_values_types) { + observed_measurements <- observed_measurements %>% + dplyr::filter( + !is.nan(!!rlang::sym(value_var)) + ) + } + + if ("Inf" %in% missing_values_types) { + observed_measurements <- observed_measurements %>% + dplyr::filter( + is.finite(!!rlang::sym(value_var)) + ) + } missing_values <- all_expected_obs %>% dplyr::anti_join( diff --git a/R/mutates.R b/R/mutates.R index 657853d..0bdf04f 100644 --- a/R/mutates.R +++ b/R/mutates.R @@ -495,3 +495,95 @@ tomic_sort_status <- function(tomic) { return(status) } + +#' Update Sample Factors +#' +#' Update sample metadata to order categorical variables based on a +#' specified factor order. +#' +#' @inheritParams tomic_to +#' @param factor_levels a character vector specifying the ordering of factor levels. +#' +#' @returns a tomic object with updated sample metadata +#' +#' @examples +#' update_sample_factors( +#' brauer_2008_tidy, list(nutrient = c("G", "N", "P", "S", "L", "U")) +#' ) +#' +#' @export +update_sample_factors <- function (tomic, factor_levels) { + + checkmate::assertClass(tomic, "tomic") + checkmate::assertNamed(factor_levels) + checkmate::assertList(factor_levels) + + samples <- romic::get_tomic_table(tomic, "samples") + purrr::walk(names(factor_levels), checkmate::assertChoice, colnames(samples)) + + # update all categorical variables with specified factor orders + for (fct in names(factor_levels)) { + samples[[fct]] <- set_factor_levels(samples[[fct]], factor_levels[[fct]], fct) + } + + out <- romic::update_tomic(tomic, samples) + + return(out) +} + +set_factor_levels <- function(samples_vec, fct_levels, fct_label = "?") { + + # validate factor orders + if (!("character" %in% class(fct_levels))) { + cli::cli_abort( + "The factor levels for {fct_label} were {.val {class(fct_levels)}}. + This should be a character vector." + ) + } + + duplicated_levels <- unique(fct_levels[duplicated(fct_levels)]) + if (length(duplicated_levels) > 0) { + cli::cli_abort( + "{length(duplicated_levels)} factor levels {?was/were} duplicated in the `factor_levels` specification for + {.val {fct_label}}: {duplicated_levels}" + ) + } + + if ("character" %in% class(samples_vec)) { + + extra_sample_vars <- setdiff(samples_vec, fct_levels) + if (length(extra_sample_vars)) { + cli::cli_alert_warning( + "{.val {extra_sample_vars}} {?was/were} present in the sample metadata's {.field {fct_label}} field but did not have a corresponding factor level in the {.arg factor_levels} list. They will be added to the end of the specified factor levels" + ) + + fct_levels <- c(fct_levels, extra_sample_vars) + } + + missing_sample_vars <- setdiff(fct_levels, samples_vec) + if (length(missing_sample_vars)) { + cli::cli_alert_warning( + "{.val {missing_sample_vars}} {?was/were} present in {.arg factor_levels} for {.field {fct_label}} but did not have a corresponding entry in the sample metadata." + ) + } + + if (any(is.na(samples_vec))) { + cli::cli_alert_warning( + "The {.field {fct_label}} field in the sample metadata contains {sum(is.na(samples_vec))} NA values. These entries will be replaced with an {.val unspecified} level.") + + samples_vec[is.na(samples_vec)] <- "unspecified" + fct_levels <- c(fct_levels, "unspecified") + } + + samples_fct_vec <- factor( + samples_vec, + levels = fct_levels + ) + } else { + cli::cli_abort( + "The factor levels for fct were {.val {class(fct)}} and cannot be converted + to factors using the specified factor orders.") + } + + return(samples_fct_vec) +} diff --git a/man/remove_missing_values.Rd b/man/remove_missing_values.Rd index 20327ad..6d9ba02 100644 --- a/man/remove_missing_values.Rd +++ b/man/remove_missing_values.Rd @@ -8,7 +8,8 @@ remove_missing_values( tomic, value_var = NULL, missing_val_method = "drop_samples", - verbose = TRUE + missing_value_types = c("NA", "NaN", "Inf"), + verbose = FALSE ) } \arguments{ @@ -24,6 +25,9 @@ remove_missing_values( \item{impute}{Impute missing values} }} +\item{missing_value_types}{Types of values to treat as missing. One or more +of "NA", "NaN" and "Inf"} + \item{verbose}{extra reporting messages} } \value{ diff --git a/man/update_sample_factors.Rd b/man/update_sample_factors.Rd new file mode 100644 index 0000000..bc679bd --- /dev/null +++ b/man/update_sample_factors.Rd @@ -0,0 +1,26 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/mutates.R +\name{update_sample_factors} +\alias{update_sample_factors} +\title{Update Sample Factors} +\usage{ +update_sample_factors(tomic, factor_levels) +} +\arguments{ +\item{tomic}{Either a \code{tidy_omic} or \code{triple_omic} object} + +\item{factor_levels}{a character vector specifying the ordering of factor levels.} +} +\value{ +a tomic object with updated sample metadata +} +\description{ +Update sample metadata to order categorical variables based on a +specified factor order. +} +\examples{ +update_sample_factors( + brauer_2008_tidy, list(nutrient = c("G", "N", "P", "S", "L", "U")) +) + +}