Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add input checks in pre-msa-tree #74

Merged
merged 5 commits into from
Oct 30, 2024
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
170 changes: 165 additions & 5 deletions R/pre-msa-tree.R
Original file line number Diff line number Diff line change
Expand Up @@ -45,13 +45,20 @@ api_key <- Sys.getenv("ENTREZ_API_KEY", unset = "YOUR_KEY_HERE")
#' @param x Character vector.
#' @param y Delimitter. Default is space (" ").
#'
#' @importFrom rlang abort
#'
#' @return A character vector in title case.
#' @export
#'
#' @examples
#' convert2TitleCase("hello world")
#' convert2TitleCase("this is a test", "_")

convert2TitleCase <- function(x, y = " ") {
# Check if the input is NULL or not a character
if (is.null(x) || !is.character(x)) {
abort("Error: Input must be a non-null character string.")
}
s <- strsplit(x, y)[[1]]
paste(toupper(substring(s, 1, 1)), substring(s, 2),
sep = "", collapse = y
Expand Down Expand Up @@ -88,6 +95,7 @@ convert2TitleCase <- function(x, y = " ") {
#' @importFrom stats complete.cases
#' @importFrom stringr str_sub
#' @importFrom tidyr replace_na separate
#' @importFrom rlang abort
#'
#' @return A data frame containing the combined alignment and lineage
#' information.
Expand All @@ -104,6 +112,25 @@ addLeaves2Alignment <- function(aln_file = "",
lin_file = "data/rawdata_tsv/all_semiclean.txt", # !! finally change to all_clean.txt!!
# lin_file="data/rawdata_tsv/PspA.txt",
reduced = FALSE) {

#Check if the alignment file is provided and exists
if (nchar(aln_file) == 0) {
abort("Error: Alignment file path must be provided.")
}

if (!file.exists(aln_file)) {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should we remove extra spaces or use paste0 if we're introducing spaces? [or do the tests work out OK?]

abort(paste("Error: The alignment file '", aln_file, "' does not exist."))
}

# Check if the lineage file exists
if (!file.exists(lin_file)) {
abort(paste("Error: The lineage file '", lin_file, "' does not exist."))
}

# Check that the 'reduced' parameter is logical
if (!is.logical(reduced) || length(reduced) != 1) {
abort("Error: 'reduced' must be a single logical value (TRUE or FALSE).")
}
## SAMPLE ARGS
# aln_file <- "data/rawdata_aln/pspc.gismo.aln"
# lin_file <- "data/rawdata_tsv/all_semiclean.txt"
Expand Down Expand Up @@ -206,7 +233,7 @@ addLeaves2Alignment <- function(aln_file = "",
#' @importFrom dplyr mutate pull select
#' @importFrom stringi stri_replace_all_regex
#' @importFrom tidyr separate
#' @importFrom rlang sym
#' @importFrom rlang abort sym
#'
#' @return Original data with a 'Name' column
#' @export
Expand All @@ -218,6 +245,19 @@ addLeaves2Alignment <- function(aln_file = "",
addName <- function(data,
accnum_col = "AccNum", spec_col = "Species", lin_col = "Lineage",
lin_sep = ">", out_col = "Name") {
# Check if the data is a data fram
if (!is.data.frame(data)) {
abort("Error: The input 'data' must be a data frame")
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

missing full stop

}

# Check that the specified columns exist in the data
required_cols <- c(accnum_col, spec_col, lin_col)
missing_cols <- setdiff(required_cols, names(data))
if (length(missing_cols) > 0) {
abort(paste("Error: The following columns are missing from the data:",
paste(missing_cols, collapse = ", ")))
}

cols <- c(accnum_col, "Kingdom", "Phylum", "Genus", "Spp")
split_data <- data %>%
separate(
Expand Down Expand Up @@ -288,6 +328,7 @@ addName <- function(data,
#' file formats and/or column names.
#'
#' @importFrom readr write_file
#' @importFrom rlang abort
#'
#' @return Character string containing the Fasta formatted sequences.
#' If `fa_outpath` is specified, the function also writes the sequences to the
Expand All @@ -302,6 +343,24 @@ convertAlignment2FA <- function(aln_file = "",
lin_file = "data/rawdata_tsv/all_semiclean.txt", # !! finally change to all_clean.txt!!
fa_outpath = "",
reduced = FALSE) {
#Check if the alignment file is provided and exists
if (nchar(aln_file) == 0) {
abort("Error: Alignment file path must be provided.")
}

if (!file.exists(aln_file)) {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

space Qs here, too.

abort(paste("Error: The alignment file '", aln_file, "' does not exist."))
}

# Check if the lineage file exists
if (!file.exists(lin_file)) {
abort(paste("Error: The lineage file '", lin_file, "' does not exist."))
}

# Check that the 'reduced' parameter is logical
if (!is.logical(reduced) || length(reduced) != 1) {
abort("Error: 'reduced' must be a single logical value (TRUE or FALSE).")
}
## SAMPLE ARGS
# aln_file <- "data/rawdata_aln/pspc.gismo.aln"
# lin_file <- "data/rawdata_tsv/all_semiclean.txt"
Expand Down Expand Up @@ -345,7 +404,7 @@ convertAlignment2FA <- function(aln_file = "",
#'
#' @importFrom dplyr filter pull
#' @importFrom stringr str_locate
#' @importFrom rlang sym
#' @importFrom rlang abort sym
#'
#' @return Character string. The modified line from the Fasta file header with
#' the name instead of the accession number.
Expand All @@ -359,7 +418,23 @@ convertAlignment2FA <- function(aln_file = "",
#' mapped_line <- mapAcc2Name(line, acc2name_table)
#' mapped_line # Expected output: ">Species A"
#' }

mapAcc2Name <- function(line, acc2name, acc_col = "AccNum", name_col = "Name") {
# Check if acc2name is a data frame
if (!is.data.frame(acc2name)) {
abort("Error: acc2name must be a data frame.")
}

# Check if the specified columns exist in the data frame
if (!(acc_col %in% colnames(acc2name))) {
abort("Error: The specified acc_col '", acc_col, "' does not exist in
acc2name.")
}
if (!(name_col %in% colnames(acc2name))) {
abort("Error: The specified name_col '", name_col, "' does not exist in
acc2name.")
}

# change to be the name equivalent to an add_names column
# Find the first ' '
end_acc <- str_locate(line, " ")[[1]]
Expand All @@ -383,6 +458,7 @@ mapAcc2Name <- function(line, acc2name, acc_col = "AccNum", name_col = "Name") {
#'
#' @importFrom purrr map
#' @importFrom readr read_lines write_lines
#' @importFrom rlang abort
#'
#' @return Character vector containing the modified lines of the Fasta file.
#' @export
Expand All @@ -394,6 +470,17 @@ mapAcc2Name <- function(line, acc2name, acc_col = "AccNum", name_col = "Name") {
#' }
rename_fasta <- function(fa_path, outpath,
replacement_function = map_acc2name, ...) {
# Check if the input FASTA file exists
if (!file.exists(fa_path)) {
abort("Error: The input FASTA file does not exist at the specified
path: ", fa_path)
}

# Check if the output path is writable
outdir <- dirname(outpath)
if (!dir.exists(outdir)) {
abort("Error: The output directory does not exist: ", outdir)
}
lines <- read_lines(fa_path)
res <- map(lines, function(x) {
if (strtrim(x, 1) == ">") {
Expand Down Expand Up @@ -431,6 +518,7 @@ rename_fasta <- function(fa_path, outpath,
#'
#' @importFrom purrr pmap
#' @importFrom stringr str_replace_all
#' @importFrom rlang abort
#'
#' @return NULL. The function saves the output FASTA files to the specified
#' directory.
Expand All @@ -451,6 +539,24 @@ generateAllAlignments2FA <- function(aln_path = here("data/rawdata_aln/"),
fa_outpath = here("data/alns/"),
lin_file = here("data/rawdata_tsv/all_semiclean.txt"),
reduced = F) {
# Check if the alignment path exists
if (!dir.exists(aln_path)) {
abort("Error: The alignment directory does not exist at the specified
path: ", aln_path)
}

# Check if the output path exists; if not, attempt to create it
if (!dir.exists(fa_outpath)) {
dir.create(fa_outpath, recursive = TRUE)
message("Note: The output directory did not exist and has been created: ",
fa_outpath)
}

# Check if the linear file exists
if (!file.exists(lin_file)) {
abort("Error: The linear file does not exist at the specified path: ",
lin_file)
}
# library(here)
# library(tidyverse)
# aln_path <- here("data/rawdata_aln/")
Expand Down Expand Up @@ -502,6 +608,7 @@ generateAllAlignments2FA <- function(aln_path = here("data/rawdata_aln/"),
#' @importFrom future future plan
#' @importFrom purrr map
#' @importFrom rentrez entrez_fetch
#' @importFrom rlang abort
#'
#' @return A Fasta file is written to the specified `outpath`.
#' @export
Expand All @@ -514,7 +621,16 @@ generateAllAlignments2FA <- function(aln_path = here("data/rawdata_aln/"),
#' EBI:accessions <- c("P12345", "Q9UHC1", "O15530", "Q14624", "P0DTD1") |>
#' acc2FA(outpath = "ebi.fa")
#' }

acc2FA <- function(accessions, outpath, plan = "sequential") {
if (!is.character(accessions) || length(accessions) == 0) {
abort("Error: 'accessions' must be a non-empty character vector.")
}

if (!dir.exists(dirname(outpath))) {
abort("Error: The output directory does not exist: ", dirname(outpath))
}

# validation
stopifnot(length(accessions) > 0)

Expand Down Expand Up @@ -599,7 +715,7 @@ acc2FA <- function(accessions, outpath, plan = "sequential") {
#' @param accnum_col Column from prot_data that contains Accession Numbers
#'
#' @importFrom dplyr filter pull
#' @importFrom rlang sym
#' @importFrom rlang abort sym
#'
#' @return A character vector containing representative accession numbers,
#' one for each distinct observation in the specified 'reduced' column.
Expand All @@ -616,8 +732,25 @@ acc2FA <- function(accessions, outpath, plan = "sequential") {
createRepresentativeAccNum <- function(prot_data,
reduced = "Lineage",
accnum_col = "AccNum") {
# Get Unique reduced column and then bind the AccNums back to get one
# AccNum per reduced column

# Validate input
if (!is.data.frame(prot_data)) {
abort("Error: 'prot_data' must be a data frame.")
}

# Check if the reduced column exists in prot_data
if (!(reduced %in% colnames(prot_data))) {
abort("Error: The specified reduced column '", reduced, "' does not
exist in the data frame.")
}

# Check if the accnum_col exists in prot_data
if (!(accnum_col %in% colnames(prot_data))) {
abort("Error: The specified accession number column '", accnum_col, "'
does not exist in the data frame.")
}
# Get Unique reduced column and then bind the AccNums back to get one AccNum per reduced column

reduced_sym <- sym(reduced)
accnum_sym <- sym(accnum_col)

Expand Down Expand Up @@ -658,6 +791,7 @@ createRepresentativeAccNum <- function(prot_data,
#'
#' @importFrom Biostrings readAAStringSet
#' @importFrom msa msaMuscle msaClustalOmega msaClustalW
#' @importFrom rlang abort
#'
#' @return aligned fasta sequence as a MsaAAMultipleAlignment object
#' @export
Expand All @@ -670,6 +804,14 @@ createRepresentativeAccNum <- function(prot_data,
#' aligned_sequences
#' }
alignFasta <- function(fasta_file, tool = "Muscle", outpath = NULL) {
# Validate the input FASTA file
if (!file.exists(fasta_file)) {
abort("Error: The FASTA file does not exist: ", fasta_file)
}

if (file_ext(fasta_file) != "fasta" && file_ext(fasta_file) != "fa") {
abort("Error: The specified file is not a valid FASTA file: ", fasta_file)
}
fasta <- readAAStringSet(fasta_file)

aligned <- switch(tool,
Expand Down Expand Up @@ -698,6 +840,7 @@ alignFasta <- function(fasta_file, tool = "Muscle", outpath = NULL) {
#'
#' @importFrom Biostrings unmasked
#' @importFrom readr write_file
#' @importFrom rlang abort
#'
#' @return Character string of the FASTA content that was written to the file.
#' @export
Expand All @@ -708,7 +851,24 @@ alignFasta <- function(fasta_file, tool = "Muscle", outpath = NULL) {
#' alignment <- alignFasta("path/to/sequences.fasta")
#' writeMSA_AA2FA(alignment, "path/to/aligned_sequences.fasta")
#' }

writeMSA_AA2FA <- function(alignment, outpath) {
# Validate input alignment
if (!inherits(alignment, "AAMultipleAlignment")) {
abort("Error: The alignment must be of type 'AAMultipleAlignment'.")
}

# Check the output path is a character string
if (!is.character(outpath) || nchar(outpath) == 0) {
abort("Error: Invalid output path specified.")
}

# Check if the output directory exists
outdir <- dirname(outpath)
if (!dir.exists(outdir)) {
abort("Error: The output directory does not exist: ", outdir)
}

l <- length(rownames(alignment))
fasta <- ""
for (i in 1:l)
Expand Down
Loading