JRaviLab · the-mayer · Oct 30, 2024 · Oct 7, 2024 · Oct 8, 2024 · Oct 10, 2024
diff --git a/R/pre-msa-tree.R b/R/pre-msa-tree.R
@@ -45,13 +45,20 @@ api_key <- Sys.getenv("ENTREZ_API_KEY", unset = "YOUR_KEY_HERE")
 #' @param x Character vector.
 #' @param y Delimitter. Default is space (" ").
 #'
+#' @importFrom rlang abort
+#' 
 #' @return A character vector in title case.
 #' @export
 #'
 #' @examples
 #' convert2TitleCase("hello world")
 #' convert2TitleCase("this is a test", "_")
+
 convert2TitleCase <- function(x, y = " ") {
+    # Check if the input is NULL or not a character
+    if (is.null(x) || !is.character(x)) {
+        abort("Error: Input must be a non-null character string.")
+    }
     s <- strsplit(x, y)[[1]]
     paste(toupper(substring(s, 1, 1)), substring(s, 2),
         sep = "", collapse = y
@@ -88,6 +95,7 @@ convert2TitleCase <- function(x, y = " ") {
 #' @importFrom stats complete.cases
 #' @importFrom stringr str_sub
 #' @importFrom tidyr replace_na separate
+#' @importFrom rlang abort
 #'
 #' @return A data frame containing the combined alignment and lineage
 #' information.
@@ -104,6 +112,25 @@ addLeaves2Alignment <- function(aln_file = "",
     lin_file = "data/rawdata_tsv/all_semiclean.txt", # !! finally change to all_clean.txt!!
     # lin_file="data/rawdata_tsv/PspA.txt",
     reduced = FALSE) {
+
+    #Check if the alignment file is provided and exists
+    if (nchar(aln_file) == 0) {
+        abort("Error: Alignment file path must be provided.")
+    }
+
+    if (!file.exists(aln_file)) {
+        abort(paste("Error: The alignment file '", aln_file, "' does not exist."))
+    }
+
+    # Check if the lineage file exists
+    if (!file.exists(lin_file)) {
+        abort(paste("Error: The lineage file '", lin_file, "' does not exist."))
+    }
+
+    # Check that the 'reduced' parameter is logical
+    if (!is.logical(reduced) || length(reduced) != 1) {
+        abort("Error: 'reduced' must be a single logical value (TRUE or FALSE).")
+    }
     ## SAMPLE ARGS
     # aln_file <- "data/rawdata_aln/pspc.gismo.aln"
     # lin_file <- "data/rawdata_tsv/all_semiclean.txt"
@@ -206,7 +233,7 @@ addLeaves2Alignment <- function(aln_file = "",
 #' @importFrom dplyr mutate pull select
 #' @importFrom stringi stri_replace_all_regex
 #' @importFrom tidyr separate
-#' @importFrom rlang sym
+#' @importFrom rlang abort sym
 #'
 #' @return Original data with a 'Name' column
 #' @export
@@ -218,6 +245,19 @@ addLeaves2Alignment <- function(aln_file = "",
 addName <- function(data,
     accnum_col = "AccNum", spec_col = "Species", lin_col = "Lineage",
     lin_sep = ">", out_col = "Name") {
+    # Check if the data is a data fram
+    if (!is.data.frame(data)) {
+        abort("Error: The input 'data' must be a data frame")
+    }
+
+    # Check that the specified columns exist in the data
+    required_cols <- c(accnum_col, spec_col, lin_col)
+    missing_cols <- setdiff(required_cols, names(data))
+    if (length(missing_cols) > 0) {
+        abort(paste("Error: The following columns are missing from the data:", 
+                   paste(missing_cols, collapse = ", ")))
+    }
+
     cols <- c(accnum_col, "Kingdom", "Phylum", "Genus", "Spp")
     split_data <- data %>%
         separate(
@@ -288,6 +328,7 @@ addName <- function(data,
 #' file formats and/or column names.
 #'
 #' @importFrom readr write_file
+#' @importFrom rlang abort
 #'
 #' @return Character string containing the Fasta formatted sequences.
 #' If `fa_outpath` is specified, the function also writes the sequences to the
@@ -302,6 +343,24 @@ convertAlignment2FA <- function(aln_file = "",
     lin_file = "data/rawdata_tsv/all_semiclean.txt", # !! finally change to all_clean.txt!!
     fa_outpath = "",
     reduced = FALSE) {
+    #Check if the alignment file is provided and exists
+    if (nchar(aln_file) == 0) {
+        abort("Error: Alignment file path must be provided.")
+    }
+
+    if (!file.exists(aln_file)) {
+        abort(paste("Error: The alignment file '", aln_file, "' does not exist."))
+    }
+
+    # Check if the lineage file exists
+    if (!file.exists(lin_file)) {
+        abort(paste("Error: The lineage file '", lin_file, "' does not exist."))
+    }
+
+    # Check that the 'reduced' parameter is logical
+    if (!is.logical(reduced) || length(reduced) != 1) {
+        abort("Error: 'reduced' must be a single logical value (TRUE or FALSE).")
+    }
     ## SAMPLE ARGS
     # aln_file <- "data/rawdata_aln/pspc.gismo.aln"
     # lin_file <- "data/rawdata_tsv/all_semiclean.txt"
@@ -345,7 +404,7 @@ convertAlignment2FA <- function(aln_file = "",
 #'
 #' @importFrom dplyr filter pull
 #' @importFrom stringr str_locate
-#' @importFrom rlang sym
+#' @importFrom rlang abort sym
 #'
 #' @return Character string. The modified line from the Fasta file header with
 #' the name instead of the accession number.
@@ -359,7 +418,23 @@ convertAlignment2FA <- function(aln_file = "",
 #' mapped_line <- mapAcc2Name(line, acc2name_table)
 #' mapped_line  # Expected output: ">Species A"
 #' }
+
 mapAcc2Name <- function(line, acc2name, acc_col = "AccNum", name_col = "Name") {
+    # Check if acc2name is a data frame
+    if (!is.data.frame(acc2name)) {
+        abort("Error: acc2name must be a data frame.")
+    }
+
+    # Check if the specified columns exist in the data frame
+    if (!(acc_col %in% colnames(acc2name))) {
+        abort("Error: The specified acc_col '", acc_col, "' does not exist in 
+             acc2name.")
+    }
+    if (!(name_col %in% colnames(acc2name))) {
+        abort("Error: The specified name_col '", name_col, "' does not exist in 
+             acc2name.")
+    }
+
     # change to be the name equivalent to an add_names column
     # Find the first ' '
     end_acc <- str_locate(line, " ")[[1]]
@@ -383,6 +458,7 @@ mapAcc2Name <- function(line, acc2name, acc_col = "AccNum", name_col = "Name") {
 #'
 #' @importFrom purrr map
 #' @importFrom readr read_lines write_lines
+#' @importFrom rlang abort
 #'
 #' @return Character vector containing the modified lines of the Fasta file.
 #' @export
@@ -394,6 +470,17 @@ mapAcc2Name <- function(line, acc2name, acc_col = "AccNum", name_col = "Name") {
 #' }
 rename_fasta <- function(fa_path, outpath,
     replacement_function = map_acc2name, ...) {
+    # Check if the input FASTA file exists
+    if (!file.exists(fa_path)) {
+        abort("Error: The input FASTA file does not exist at the specified
+             path: ", fa_path)
+    }
+
+    # Check if the output path is writable
+    outdir <- dirname(outpath)
+    if (!dir.exists(outdir)) {
+        abort("Error: The output directory does not exist: ", outdir)
+    }
     lines <- read_lines(fa_path)
     res <- map(lines, function(x) {
         if (strtrim(x, 1) == ">") {
@@ -431,6 +518,7 @@ rename_fasta <- function(fa_path, outpath,
 #'
 #' @importFrom purrr pmap
 #' @importFrom stringr str_replace_all
+#' @importFrom rlang abort
 #'
 #' @return NULL. The function saves the output FASTA files to the specified
 #' directory.
@@ -451,6 +539,24 @@ generateAllAlignments2FA <- function(aln_path = here("data/rawdata_aln/"),
     fa_outpath = here("data/alns/"),
     lin_file = here("data/rawdata_tsv/all_semiclean.txt"),
     reduced = F) {
+    # Check if the alignment path exists
+    if (!dir.exists(aln_path)) {
+        abort("Error: The alignment directory does not exist at the specified 
+             path: ", aln_path)
+    }
+
+    # Check if the output path exists; if not, attempt to create it
+    if (!dir.exists(fa_outpath)) {
+        dir.create(fa_outpath, recursive = TRUE)
+        message("Note: The output directory did not exist and has been created: ", 
+                fa_outpath)
+    }
+
+    # Check if the linear file exists
+    if (!file.exists(lin_file)) {
+        abort("Error: The linear file does not exist at the specified path: ", 
+             lin_file)
+    }
     # library(here)
     # library(tidyverse)
     # aln_path <- here("data/rawdata_aln/")
@@ -502,6 +608,7 @@ generateAllAlignments2FA <- function(aln_path = here("data/rawdata_aln/"),
 #' @importFrom future future plan
 #' @importFrom purrr map
 #' @importFrom rentrez entrez_fetch
+#' @importFrom rlang abort
 #'
 #' @return A Fasta file is written to the specified `outpath`.
 #' @export
@@ -514,7 +621,16 @@ generateAllAlignments2FA <- function(aln_path = here("data/rawdata_aln/"),
 #' EBI:accessions <- c("P12345", "Q9UHC1", "O15530", "Q14624", "P0DTD1") |>
 #' acc2FA(outpath = "ebi.fa")
 #' }
+
 acc2FA <- function(accessions, outpath, plan = "sequential") {
+    if (!is.character(accessions) || length(accessions) == 0) {
+        abort("Error: 'accessions' must be a non-empty character vector.")
+    }
+
+    if (!dir.exists(dirname(outpath))) {
+        abort("Error: The output directory does not exist: ", dirname(outpath))
+    }
+
     # validation
     stopifnot(length(accessions) > 0)
 
@@ -599,7 +715,7 @@ acc2FA <- function(accessions, outpath, plan = "sequential") {
 #' @param accnum_col Column from prot_data that contains Accession Numbers
 #'
 #' @importFrom dplyr filter pull
-#' @importFrom rlang sym
+#' @importFrom rlang abort sym
 #'
 #' @return A character vector containing representative accession numbers,
 #' one for each distinct observation in the specified 'reduced' column.
@@ -616,8 +732,25 @@ acc2FA <- function(accessions, outpath, plan = "sequential") {
 createRepresentativeAccNum <- function(prot_data,
     reduced = "Lineage",
     accnum_col = "AccNum") {
-    # Get Unique reduced column and then bind the AccNums back to get one
-    # AccNum per reduced column
+
+    # Validate input
+    if (!is.data.frame(prot_data)) {
+        abort("Error: 'prot_data' must be a data frame.")
+    }
+
+    # Check if the reduced column exists in prot_data
+    if (!(reduced %in% colnames(prot_data))) {
+        abort("Error: The specified reduced column '", reduced, "' does not 
+             exist in the data frame.")
+    }
+
+    # Check if the accnum_col exists in prot_data
+    if (!(accnum_col %in% colnames(prot_data))) {
+        abort("Error: The specified accession number column '", accnum_col, "' 
+             does not exist in the data frame.")
+    }
+    # Get Unique reduced column and then bind the AccNums back to get one AccNum per reduced column
+
     reduced_sym <- sym(reduced)
     accnum_sym <- sym(accnum_col)
 
@@ -658,6 +791,7 @@ createRepresentativeAccNum <- function(prot_data,
 #'
 #' @importFrom Biostrings readAAStringSet
 #' @importFrom msa msaMuscle msaClustalOmega msaClustalW
+#' @importFrom rlang abort
 #'
 #' @return aligned fasta sequence as a MsaAAMultipleAlignment object
 #' @export
@@ -670,6 +804,14 @@ createRepresentativeAccNum <- function(prot_data,
 #' aligned_sequences
 #' }
 alignFasta <- function(fasta_file, tool = "Muscle", outpath = NULL) {
+    # Validate the input FASTA file
+    if (!file.exists(fasta_file)) {
+        abort("Error: The FASTA file does not exist: ", fasta_file)
+    }
+
+    if (file_ext(fasta_file) != "fasta" && file_ext(fasta_file) != "fa") {
+        abort("Error: The specified file is not a valid FASTA file: ", fasta_file)
+    }
     fasta <- readAAStringSet(fasta_file)
 
     aligned <- switch(tool,
@@ -698,6 +840,7 @@ alignFasta <- function(fasta_file, tool = "Muscle", outpath = NULL) {
 #'
 #' @importFrom Biostrings unmasked
 #' @importFrom readr write_file
+#' @importFrom rlang abort
 #'
 #' @return Character string of the FASTA content that was written to the file.
 #' @export
@@ -708,7 +851,24 @@ alignFasta <- function(fasta_file, tool = "Muscle", outpath = NULL) {
 #' alignment <- alignFasta("path/to/sequences.fasta")
 #' writeMSA_AA2FA(alignment, "path/to/aligned_sequences.fasta")
 #' }
+
 writeMSA_AA2FA <- function(alignment, outpath) {
+    # Validate input alignment
+    if (!inherits(alignment, "AAMultipleAlignment")) {
+        abort("Error: The alignment must be of type 'AAMultipleAlignment'.")
+    }
+
+    # Check the output path is a character string
+    if (!is.character(outpath) || nchar(outpath) == 0) {
+        abort("Error: Invalid output path specified.")
+    }
+
+    # Check if the output directory exists
+    outdir <- dirname(outpath)
+    if (!dir.exists(outdir)) {
+        abort("Error: The output directory does not exist: ", outdir)
+    }
+
     l <- length(rownames(alignment))
     fasta <- ""
     for (i in 1:l)