From d1bc235c8240622f70bce6b3f39e1355e020cf55 Mon Sep 17 00:00:00 2001 From: Stefano Mangiola Date: Thu, 19 Dec 2024 23:57:15 +1100 Subject: [PATCH 1/2] add method and test --- DESCRIPTION | 2 +- NAMESPACE | 5 ++ R/dplyr_methods.R | 30 ++++++++- R/utilities.R | 41 ++++++++++++ man/anti_join.Rd | 97 +++++++++++++++++++++++++++++ tests/testthat/test-dplyr_methods.R | 11 ++++ 6 files changed, 182 insertions(+), 4 deletions(-) create mode 100644 man/anti_join.Rd diff --git a/DESCRIPTION b/DESCRIPTION index cb937a6..54c0045 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Type: Package Package: tidySingleCellExperiment Title: Brings SingleCellExperiment to the Tidyverse -Version: 1.15.5 +Version: 1.15.6 Authors@R: c(person("Stefano", "Mangiola", comment=c(ORCID="0000-0001-7474-836X"), email="mangiolastefano@gmail.com", diff --git a/NAMESPACE b/NAMESPACE index 9eaa0db..653f685 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,6 +1,7 @@ # Generated by roxygen2: do not edit by hand S3method(add_count,SingleCellExperiment) +S3method(anti_join,SingleCellExperiment) S3method(arrange,SingleCellExperiment) S3method(as_tibble,SingleCellExperiment) S3method(bind_cols,SingleCellExperiment) @@ -63,7 +64,11 @@ importFrom(SummarizedExperiment,assayNames) importFrom(SummarizedExperiment,assays) importFrom(SummarizedExperiment,colData) importFrom(SummarizedExperiment,rowData) +importFrom(base,get) +importFrom(base,identical) +importFrom(base,ls) importFrom(dplyr,add_count) +importFrom(dplyr,anti_join) importFrom(dplyr,any_of) importFrom(dplyr,arrange) importFrom(dplyr,contains) diff --git a/R/dplyr_methods.R b/R/dplyr_methods.R index 8a18f54..5ac1400 100755 --- a/R/dplyr_methods.R +++ b/R/dplyr_methods.R @@ -329,16 +329,22 @@ rowwise.SingleCellExperiment <- function(data, ...) { .join_factory <- function(fun, change_x) { function(x, y, by=NULL, copy=FALSE, suffix=c(".x", ".y"), ...) { - + # Deprecation of special column names .cols <- if (!is.null(by)) by else colnames(y) if (is_sample_feature_deprecated_used(x, .cols)) { x <- ping_old_special_column_into_metadata(x) } if (is(y, "DataFrame")) y <- as.data.frame(y) - z <- x |> + + if(get_function_name(fun) == "anti_join") + z <- x |> as_tibble() |> - fun(y, by=by, copy=copy, suffix=suffix, ...) + fun(y, by=by, copy=copy, ...) + else + z <- x |> + as_tibble() |> + fun(y, by=by, copy=copy, suffix=suffix, ...) # If duplicated cells returns tibble if (any(duplicated(z[[c_(x)$name]]))) { @@ -397,6 +403,24 @@ left_join.SingleCellExperiment <- .join_factory(dplyr::left_join, FALSE) #' @export inner_join.SingleCellExperiment <- .join_factory(dplyr::inner_join, TRUE) +#' @name anti_join +#' @rdname anti_join +#' @inherit dplyr::anti_join +#' +#' @examples +#' data(pbmc_small) +#' tt <- pbmc_small +#' tt |> anti_join(tt |> +#' distinct(groups) |> +#' mutate(new_column=1:2) |> +#' slice(1)) +#' +#' @importFrom SummarizedExperiment colData +#' @importFrom dplyr anti_join +#' @importFrom dplyr pull +#' @export +anti_join.SingleCellExperiment <- .join_factory(dplyr::anti_join, TRUE) + #' @name right_join #' @rdname right_join #' @inherit dplyr::right_join diff --git a/R/utilities.R b/R/utilities.R index 76d5ee1..2b96427 100755 --- a/R/utilities.R +++ b/R/utilities.R @@ -521,6 +521,47 @@ splitColData <- function(x, f) { } +#' Retrieve the Original Name of a Function +#' +#' This function identifies the original name of a function object by searching +#' its environment. It works for functions defined in packages, global +#' environments, or custom user-defined functions. +#' +#' @param f A function object whose original name you want to retrieve. +#' +#' @return A character string with the name of the function if found, or `NULL` +#' if the name cannot be determined (e.g., for anonymous functions). +#' +#' @details The function works by inspecting the environment where the function +#' is defined. It searches for objects that are identical to the provided +#' function and returns the name of the first match. For package functions, +#' the search is limited to the package's namespace. +#' +#' @examples +#' library(dplyr) +#' fun <- anti_join +#' get_function_name(fun) # Returns "anti_join" +#' +#' custom_fun <- function(x) x^2 +#' get_function_name(custom_fun) # Returns "custom_fun" +#' +#' # Anonymous function +#' get_function_name(function(x) x^2) # Returns NULL +#' +#' @importFrom base identical ls get +#' @noRd +get_function_name <- function(f) { + if (is.function(f)) { + env <- environment(f) # The environment where the function is defined + all_names <- ls(env) # List all names in that environment + matches <- all_names[sapply(all_names, function(n) identical(get(n, envir = env), f))] + if (length(matches) > 0) { + return(matches[1]) # Return the first match + } + } + return(NULL) +} + cell__ <- get_special_column_name_symbol(".cell") feature__ <- get_special_column_name_symbol(".feature") sample__ <- get_special_column_name_symbol(".sample") diff --git a/man/anti_join.Rd b/man/anti_join.Rd new file mode 100644 index 0000000..522dd0e --- /dev/null +++ b/man/anti_join.Rd @@ -0,0 +1,97 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/dplyr_methods.R +\name{anti_join} +\alias{anti_join} +\alias{anti_join.SingleCellExperiment} +\title{Filtering joins} +\usage{ +\method{anti_join}{SingleCellExperiment}(x, y, by = NULL, copy = FALSE, suffix = c(".x", ".y"), ...) +} +\arguments{ +\item{x, y}{A pair of data frames, data frame extensions (e.g. a tibble), or +lazy data frames (e.g. from dbplyr or dtplyr). See \emph{Methods}, below, for +more details.} + +\item{by}{A join specification created with \code{\link[dplyr:join_by]{join_by()}}, or a character +vector of variables to join by. + +If \code{NULL}, the default, \verb{*_join()} will perform a natural join, using all +variables in common across \code{x} and \code{y}. A message lists the variables so +that you can check they're correct; suppress the message by supplying \code{by} +explicitly. + +To join on different variables between \code{x} and \code{y}, use a \code{\link[dplyr:join_by]{join_by()}} +specification. For example, \code{join_by(a == b)} will match \code{x$a} to \code{y$b}. + +To join by multiple variables, use a \code{\link[dplyr:join_by]{join_by()}} specification with +multiple expressions. For example, \code{join_by(a == b, c == d)} will match +\code{x$a} to \code{y$b} and \code{x$c} to \code{y$d}. If the column names are the same between +\code{x} and \code{y}, you can shorten this by listing only the variable names, like +\code{join_by(a, c)}. + +\code{\link[dplyr:join_by]{join_by()}} can also be used to perform inequality, rolling, and overlap +joins. See the documentation at \link[dplyr:join_by]{?join_by} for details on +these types of joins. + +For simple equality joins, you can alternatively specify a character vector +of variable names to join by. For example, \code{by = c("a", "b")} joins \code{x$a} +to \code{y$a} and \code{x$b} to \code{y$b}. If variable names differ between \code{x} and \code{y}, +use a named character vector like \code{by = c("x_a" = "y_a", "x_b" = "y_b")}. + +To perform a cross-join, generating all combinations of \code{x} and \code{y}, see +\code{\link[dplyr:cross_join]{cross_join()}}.} + +\item{copy}{If \code{x} and \code{y} are not from the same data source, +and \code{copy} is \code{TRUE}, then \code{y} will be copied into the +same src as \code{x}. This allows you to join tables across srcs, but +it is a potentially expensive operation so you must opt into it.} + +\item{...}{Other parameters passed onto methods.} +} +\value{ +An object of the same type as \code{x}. The output has the following properties: +\itemize{ +\item Rows are a subset of the input, but appear in the same order. +\item Columns are not modified. +\item Data frame attributes are preserved. +\item Groups are taken from \code{x}. The number of groups may be reduced. +} +} +\description{ +Filtering joins filter rows from \code{x} based on the presence or absence +of matches in \code{y}: +\itemize{ +\item \code{semi_join()} return all rows from \code{x} with a match in \code{y}. +\item \code{anti_join()} return all rows from \code{x} with\strong{out} a match in \code{y}. +} +} +\section{Methods}{ + + +These function are \strong{generic}s, which means that packages can provide +implementations (methods) for other classes. See the documentation of +individual methods for extra arguments and differences in behaviour. + +Methods available in currently loaded packages: +\itemize{ +\item \code{semi_join()}: \Sexpr[stage=render,results=rd]{dplyr:::methods_rd("semi_join")}. +\item \code{anti_join()}: \Sexpr[stage=render,results=rd]{dplyr:::methods_rd("anti_join")}. +} + +} + +\examples{ +data(pbmc_small) +tt <- pbmc_small +tt |> anti_join(tt |> + distinct(groups) |> + mutate(new_column=1:2) |> + slice(1)) + +} +\seealso{ +Other joins: +\code{\link[dplyr]{cross_join}()}, +\code{\link[dplyr]{mutate-joins}}, +\code{\link[dplyr]{nest_join}()} +} diff --git a/tests/testthat/test-dplyr_methods.R b/tests/testthat/test-dplyr_methods.R index 93f0541..46cb83f 100755 --- a/tests/testthat/test-dplyr_methods.R +++ b/tests/testthat/test-dplyr_methods.R @@ -224,6 +224,17 @@ test_that("full_join(), with DataFrame y", { # mutate(df, factor=paste(factor))) }) +test_that("anti_join()", { + y <- df |> + distinct(factor) |> + mutate(string=letters[seq(nlevels(df$factor))]) |> + filter(factor !="g1") + fd <- anti_join(df, y, by="factor") + expect_s4_class(fd, "SingleCellExperiment") + expect_equal(n <- ncol(colData(fd)), ncol(colData(df))) + expect_lt(ncol(fd), ncol(df)) +}) + test_that("slice()", { # I DON'T KNOW WHY THESE TESTS GIVES WARNING # Please use `all_of()` or `any_of()` instead. From e3615b0ed623ad59653c001f4e560d5071cb2483 Mon Sep 17 00:00:00 2001 From: Stefano Mangiola Date: Fri, 20 Dec 2024 00:38:01 +1100 Subject: [PATCH 2/2] fix CHECKs --- NAMESPACE | 3 --- R/dplyr_methods.R | 37 +++++++++++++++++++++++++++++++------ R/utilities.R | 41 +---------------------------------------- man/anti_join.Rd | 2 +- 4 files changed, 33 insertions(+), 50 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index 653f685..c8e6f8e 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -64,9 +64,6 @@ importFrom(SummarizedExperiment,assayNames) importFrom(SummarizedExperiment,assays) importFrom(SummarizedExperiment,colData) importFrom(SummarizedExperiment,rowData) -importFrom(base,get) -importFrom(base,identical) -importFrom(base,ls) importFrom(dplyr,add_count) importFrom(dplyr,anti_join) importFrom(dplyr,any_of) diff --git a/R/dplyr_methods.R b/R/dplyr_methods.R index 5ac1400..3542986 100755 --- a/R/dplyr_methods.R +++ b/R/dplyr_methods.R @@ -337,11 +337,6 @@ rowwise.SingleCellExperiment <- function(data, ...) { } if (is(y, "DataFrame")) y <- as.data.frame(y) - if(get_function_name(fun) == "anti_join") - z <- x |> - as_tibble() |> - fun(y, by=by, copy=copy, ...) - else z <- x |> as_tibble() |> fun(y, by=by, copy=copy, suffix=suffix, ...) @@ -361,6 +356,36 @@ rowwise.SingleCellExperiment <- function(data, ...) { } } +.join_factory_anti_join <- function(fun, change_x) { + function(x, y, + by=NULL, copy=FALSE, ...) { + + # Deprecation of special column names + .cols <- if (!is.null(by)) by else colnames(y) + if (is_sample_feature_deprecated_used(x, .cols)) { + x <- ping_old_special_column_into_metadata(x) + } + if (is(y, "DataFrame")) y <- as.data.frame(y) + + z <- x |> + as_tibble() |> + fun(y, by=by, copy=copy, ...) + + # If duplicated cells returns tibble + if (any(duplicated(z[[c_(x)$name]]))) { + message(duplicated_cell_names) + return(z) + } + + # Otherwise return updated tidySingleCellExperiment + if (change_x) + new_obj <- x[, pull(z, c_(x)$name)] + else new_obj <- x + colData(new_obj) <- z |> as_meta_data(new_obj) + return(new_obj) + } +} + #' @name left_join #' @rdname left_join #' @inherit dplyr::left_join @@ -419,7 +444,7 @@ inner_join.SingleCellExperiment <- .join_factory(dplyr::inner_join, TRUE) #' @importFrom dplyr anti_join #' @importFrom dplyr pull #' @export -anti_join.SingleCellExperiment <- .join_factory(dplyr::anti_join, TRUE) +anti_join.SingleCellExperiment <- .join_factory_anti_join(dplyr::anti_join, TRUE) #' @name right_join #' @rdname right_join diff --git a/R/utilities.R b/R/utilities.R index 2b96427..954c579 100755 --- a/R/utilities.R +++ b/R/utilities.R @@ -521,46 +521,7 @@ splitColData <- function(x, f) { } -#' Retrieve the Original Name of a Function -#' -#' This function identifies the original name of a function object by searching -#' its environment. It works for functions defined in packages, global -#' environments, or custom user-defined functions. -#' -#' @param f A function object whose original name you want to retrieve. -#' -#' @return A character string with the name of the function if found, or `NULL` -#' if the name cannot be determined (e.g., for anonymous functions). -#' -#' @details The function works by inspecting the environment where the function -#' is defined. It searches for objects that are identical to the provided -#' function and returns the name of the first match. For package functions, -#' the search is limited to the package's namespace. -#' -#' @examples -#' library(dplyr) -#' fun <- anti_join -#' get_function_name(fun) # Returns "anti_join" -#' -#' custom_fun <- function(x) x^2 -#' get_function_name(custom_fun) # Returns "custom_fun" -#' -#' # Anonymous function -#' get_function_name(function(x) x^2) # Returns NULL -#' -#' @importFrom base identical ls get -#' @noRd -get_function_name <- function(f) { - if (is.function(f)) { - env <- environment(f) # The environment where the function is defined - all_names <- ls(env) # List all names in that environment - matches <- all_names[sapply(all_names, function(n) identical(get(n, envir = env), f))] - if (length(matches) > 0) { - return(matches[1]) # Return the first match - } - } - return(NULL) -} + cell__ <- get_special_column_name_symbol(".cell") feature__ <- get_special_column_name_symbol(".feature") diff --git a/man/anti_join.Rd b/man/anti_join.Rd index 522dd0e..49a5226 100644 --- a/man/anti_join.Rd +++ b/man/anti_join.Rd @@ -5,7 +5,7 @@ \alias{anti_join.SingleCellExperiment} \title{Filtering joins} \usage{ -\method{anti_join}{SingleCellExperiment}(x, y, by = NULL, copy = FALSE, suffix = c(".x", ".y"), ...) +\method{anti_join}{SingleCellExperiment}(x, y, by = NULL, copy = FALSE, ...) } \arguments{ \item{x, y}{A pair of data frames, data frame extensions (e.g. a tibble), or