-
Notifications
You must be signed in to change notification settings - Fork 15
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
add input checks in pre-msa-tree #74
Merged
Merged
Changes from all commits
Commits
Show all changes
5 commits
Select commit
Hold shift + click to select a range
a683ac0
add input checks in pre-msa-tree
awasyn 11d01f8
Merge branch 'main' into premsaerror
awasyn e7e8212
Merge branch 'JRaviLab:main' into premsaerror
awasyn 77ca376
Merge branch 'main' into premsaerror
the-mayer 6949a68
swap rlang::abort() for stop()
the-mayer File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -45,13 +45,20 @@ api_key <- Sys.getenv("ENTREZ_API_KEY", unset = "YOUR_KEY_HERE") | |
#' @param x Character vector. | ||
#' @param y Delimitter. Default is space (" "). | ||
#' | ||
#' @importFrom rlang abort | ||
#' | ||
#' @return A character vector in title case. | ||
#' @export | ||
#' | ||
#' @examples | ||
#' convert2TitleCase("hello world") | ||
#' convert2TitleCase("this is a test", "_") | ||
|
||
convert2TitleCase <- function(x, y = " ") { | ||
# Check if the input is NULL or not a character | ||
if (is.null(x) || !is.character(x)) { | ||
abort("Error: Input must be a non-null character string.") | ||
} | ||
s <- strsplit(x, y)[[1]] | ||
paste(toupper(substring(s, 1, 1)), substring(s, 2), | ||
sep = "", collapse = y | ||
|
@@ -88,6 +95,7 @@ convert2TitleCase <- function(x, y = " ") { | |
#' @importFrom stats complete.cases | ||
#' @importFrom stringr str_sub | ||
#' @importFrom tidyr replace_na separate | ||
#' @importFrom rlang abort | ||
#' | ||
#' @return A data frame containing the combined alignment and lineage | ||
#' information. | ||
|
@@ -104,6 +112,25 @@ addLeaves2Alignment <- function(aln_file = "", | |
lin_file = "data/rawdata_tsv/all_semiclean.txt", # !! finally change to all_clean.txt!! | ||
# lin_file="data/rawdata_tsv/PspA.txt", | ||
reduced = FALSE) { | ||
|
||
#Check if the alignment file is provided and exists | ||
if (nchar(aln_file) == 0) { | ||
abort("Error: Alignment file path must be provided.") | ||
} | ||
|
||
if (!file.exists(aln_file)) { | ||
abort(paste("Error: The alignment file '", aln_file, "' does not exist.")) | ||
} | ||
|
||
# Check if the lineage file exists | ||
if (!file.exists(lin_file)) { | ||
abort(paste("Error: The lineage file '", lin_file, "' does not exist.")) | ||
} | ||
|
||
# Check that the 'reduced' parameter is logical | ||
if (!is.logical(reduced) || length(reduced) != 1) { | ||
abort("Error: 'reduced' must be a single logical value (TRUE or FALSE).") | ||
} | ||
## SAMPLE ARGS | ||
# aln_file <- "data/rawdata_aln/pspc.gismo.aln" | ||
# lin_file <- "data/rawdata_tsv/all_semiclean.txt" | ||
|
@@ -206,7 +233,7 @@ addLeaves2Alignment <- function(aln_file = "", | |
#' @importFrom dplyr mutate pull select | ||
#' @importFrom stringi stri_replace_all_regex | ||
#' @importFrom tidyr separate | ||
#' @importFrom rlang sym | ||
#' @importFrom rlang abort sym | ||
#' | ||
#' @return Original data with a 'Name' column | ||
#' @export | ||
|
@@ -218,6 +245,19 @@ addLeaves2Alignment <- function(aln_file = "", | |
addName <- function(data, | ||
accnum_col = "AccNum", spec_col = "Species", lin_col = "Lineage", | ||
lin_sep = ">", out_col = "Name") { | ||
# Check if the data is a data fram | ||
if (!is.data.frame(data)) { | ||
abort("Error: The input 'data' must be a data frame") | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. missing full stop |
||
} | ||
|
||
# Check that the specified columns exist in the data | ||
required_cols <- c(accnum_col, spec_col, lin_col) | ||
missing_cols <- setdiff(required_cols, names(data)) | ||
if (length(missing_cols) > 0) { | ||
abort(paste("Error: The following columns are missing from the data:", | ||
paste(missing_cols, collapse = ", "))) | ||
} | ||
|
||
cols <- c(accnum_col, "Kingdom", "Phylum", "Genus", "Spp") | ||
split_data <- data %>% | ||
separate( | ||
|
@@ -288,6 +328,7 @@ addName <- function(data, | |
#' file formats and/or column names. | ||
#' | ||
#' @importFrom readr write_file | ||
#' @importFrom rlang abort | ||
#' | ||
#' @return Character string containing the Fasta formatted sequences. | ||
#' If `fa_outpath` is specified, the function also writes the sequences to the | ||
|
@@ -302,6 +343,24 @@ convertAlignment2FA <- function(aln_file = "", | |
lin_file = "data/rawdata_tsv/all_semiclean.txt", # !! finally change to all_clean.txt!! | ||
fa_outpath = "", | ||
reduced = FALSE) { | ||
#Check if the alignment file is provided and exists | ||
if (nchar(aln_file) == 0) { | ||
abort("Error: Alignment file path must be provided.") | ||
} | ||
|
||
if (!file.exists(aln_file)) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. space Qs here, too. |
||
abort(paste("Error: The alignment file '", aln_file, "' does not exist.")) | ||
} | ||
|
||
# Check if the lineage file exists | ||
if (!file.exists(lin_file)) { | ||
abort(paste("Error: The lineage file '", lin_file, "' does not exist.")) | ||
} | ||
|
||
# Check that the 'reduced' parameter is logical | ||
if (!is.logical(reduced) || length(reduced) != 1) { | ||
abort("Error: 'reduced' must be a single logical value (TRUE or FALSE).") | ||
} | ||
## SAMPLE ARGS | ||
# aln_file <- "data/rawdata_aln/pspc.gismo.aln" | ||
# lin_file <- "data/rawdata_tsv/all_semiclean.txt" | ||
|
@@ -345,7 +404,7 @@ convertAlignment2FA <- function(aln_file = "", | |
#' | ||
#' @importFrom dplyr filter pull | ||
#' @importFrom stringr str_locate | ||
#' @importFrom rlang sym | ||
#' @importFrom rlang abort sym | ||
#' | ||
#' @return Character string. The modified line from the Fasta file header with | ||
#' the name instead of the accession number. | ||
|
@@ -359,7 +418,23 @@ convertAlignment2FA <- function(aln_file = "", | |
#' mapped_line <- mapAcc2Name(line, acc2name_table) | ||
#' mapped_line # Expected output: ">Species A" | ||
#' } | ||
|
||
mapAcc2Name <- function(line, acc2name, acc_col = "AccNum", name_col = "Name") { | ||
# Check if acc2name is a data frame | ||
if (!is.data.frame(acc2name)) { | ||
abort("Error: acc2name must be a data frame.") | ||
} | ||
|
||
# Check if the specified columns exist in the data frame | ||
if (!(acc_col %in% colnames(acc2name))) { | ||
abort("Error: The specified acc_col '", acc_col, "' does not exist in | ||
acc2name.") | ||
} | ||
if (!(name_col %in% colnames(acc2name))) { | ||
abort("Error: The specified name_col '", name_col, "' does not exist in | ||
acc2name.") | ||
} | ||
|
||
# change to be the name equivalent to an add_names column | ||
# Find the first ' ' | ||
end_acc <- str_locate(line, " ")[[1]] | ||
|
@@ -383,6 +458,7 @@ mapAcc2Name <- function(line, acc2name, acc_col = "AccNum", name_col = "Name") { | |
#' | ||
#' @importFrom purrr map | ||
#' @importFrom readr read_lines write_lines | ||
#' @importFrom rlang abort | ||
#' | ||
#' @return Character vector containing the modified lines of the Fasta file. | ||
#' @export | ||
|
@@ -394,6 +470,17 @@ mapAcc2Name <- function(line, acc2name, acc_col = "AccNum", name_col = "Name") { | |
#' } | ||
rename_fasta <- function(fa_path, outpath, | ||
replacement_function = map_acc2name, ...) { | ||
# Check if the input FASTA file exists | ||
if (!file.exists(fa_path)) { | ||
abort("Error: The input FASTA file does not exist at the specified | ||
path: ", fa_path) | ||
} | ||
|
||
# Check if the output path is writable | ||
outdir <- dirname(outpath) | ||
if (!dir.exists(outdir)) { | ||
abort("Error: The output directory does not exist: ", outdir) | ||
} | ||
lines <- read_lines(fa_path) | ||
res <- map(lines, function(x) { | ||
if (strtrim(x, 1) == ">") { | ||
|
@@ -431,6 +518,7 @@ rename_fasta <- function(fa_path, outpath, | |
#' | ||
#' @importFrom purrr pmap | ||
#' @importFrom stringr str_replace_all | ||
#' @importFrom rlang abort | ||
#' | ||
#' @return NULL. The function saves the output FASTA files to the specified | ||
#' directory. | ||
|
@@ -451,6 +539,24 @@ generateAllAlignments2FA <- function(aln_path = here("data/rawdata_aln/"), | |
fa_outpath = here("data/alns/"), | ||
lin_file = here("data/rawdata_tsv/all_semiclean.txt"), | ||
reduced = F) { | ||
# Check if the alignment path exists | ||
if (!dir.exists(aln_path)) { | ||
abort("Error: The alignment directory does not exist at the specified | ||
path: ", aln_path) | ||
} | ||
|
||
# Check if the output path exists; if not, attempt to create it | ||
if (!dir.exists(fa_outpath)) { | ||
dir.create(fa_outpath, recursive = TRUE) | ||
message("Note: The output directory did not exist and has been created: ", | ||
fa_outpath) | ||
} | ||
|
||
# Check if the linear file exists | ||
if (!file.exists(lin_file)) { | ||
abort("Error: The linear file does not exist at the specified path: ", | ||
lin_file) | ||
} | ||
# library(here) | ||
# library(tidyverse) | ||
# aln_path <- here("data/rawdata_aln/") | ||
|
@@ -502,6 +608,7 @@ generateAllAlignments2FA <- function(aln_path = here("data/rawdata_aln/"), | |
#' @importFrom future future plan | ||
#' @importFrom purrr map | ||
#' @importFrom rentrez entrez_fetch | ||
#' @importFrom rlang abort | ||
#' | ||
#' @return A Fasta file is written to the specified `outpath`. | ||
#' @export | ||
|
@@ -514,7 +621,16 @@ generateAllAlignments2FA <- function(aln_path = here("data/rawdata_aln/"), | |
#' EBI:accessions <- c("P12345", "Q9UHC1", "O15530", "Q14624", "P0DTD1") |> | ||
#' acc2FA(outpath = "ebi.fa") | ||
#' } | ||
|
||
acc2FA <- function(accessions, outpath, plan = "sequential") { | ||
if (!is.character(accessions) || length(accessions) == 0) { | ||
abort("Error: 'accessions' must be a non-empty character vector.") | ||
} | ||
|
||
if (!dir.exists(dirname(outpath))) { | ||
abort("Error: The output directory does not exist: ", dirname(outpath)) | ||
} | ||
|
||
# validation | ||
stopifnot(length(accessions) > 0) | ||
|
||
|
@@ -599,7 +715,7 @@ acc2FA <- function(accessions, outpath, plan = "sequential") { | |
#' @param accnum_col Column from prot_data that contains Accession Numbers | ||
#' | ||
#' @importFrom dplyr filter pull | ||
#' @importFrom rlang sym | ||
#' @importFrom rlang abort sym | ||
#' | ||
#' @return A character vector containing representative accession numbers, | ||
#' one for each distinct observation in the specified 'reduced' column. | ||
|
@@ -616,8 +732,25 @@ acc2FA <- function(accessions, outpath, plan = "sequential") { | |
createRepresentativeAccNum <- function(prot_data, | ||
reduced = "Lineage", | ||
accnum_col = "AccNum") { | ||
# Get Unique reduced column and then bind the AccNums back to get one | ||
# AccNum per reduced column | ||
|
||
# Validate input | ||
if (!is.data.frame(prot_data)) { | ||
abort("Error: 'prot_data' must be a data frame.") | ||
} | ||
|
||
# Check if the reduced column exists in prot_data | ||
if (!(reduced %in% colnames(prot_data))) { | ||
abort("Error: The specified reduced column '", reduced, "' does not | ||
exist in the data frame.") | ||
} | ||
|
||
# Check if the accnum_col exists in prot_data | ||
if (!(accnum_col %in% colnames(prot_data))) { | ||
abort("Error: The specified accession number column '", accnum_col, "' | ||
does not exist in the data frame.") | ||
} | ||
# Get Unique reduced column and then bind the AccNums back to get one AccNum per reduced column | ||
|
||
reduced_sym <- sym(reduced) | ||
accnum_sym <- sym(accnum_col) | ||
|
||
|
@@ -658,6 +791,7 @@ createRepresentativeAccNum <- function(prot_data, | |
#' | ||
#' @importFrom Biostrings readAAStringSet | ||
#' @importFrom msa msaMuscle msaClustalOmega msaClustalW | ||
#' @importFrom rlang abort | ||
#' | ||
#' @return aligned fasta sequence as a MsaAAMultipleAlignment object | ||
#' @export | ||
|
@@ -670,6 +804,14 @@ createRepresentativeAccNum <- function(prot_data, | |
#' aligned_sequences | ||
#' } | ||
alignFasta <- function(fasta_file, tool = "Muscle", outpath = NULL) { | ||
# Validate the input FASTA file | ||
if (!file.exists(fasta_file)) { | ||
abort("Error: The FASTA file does not exist: ", fasta_file) | ||
} | ||
|
||
if (file_ext(fasta_file) != "fasta" && file_ext(fasta_file) != "fa") { | ||
abort("Error: The specified file is not a valid FASTA file: ", fasta_file) | ||
} | ||
fasta <- readAAStringSet(fasta_file) | ||
|
||
aligned <- switch(tool, | ||
|
@@ -698,6 +840,7 @@ alignFasta <- function(fasta_file, tool = "Muscle", outpath = NULL) { | |
#' | ||
#' @importFrom Biostrings unmasked | ||
#' @importFrom readr write_file | ||
#' @importFrom rlang abort | ||
#' | ||
#' @return Character string of the FASTA content that was written to the file. | ||
#' @export | ||
|
@@ -708,7 +851,24 @@ alignFasta <- function(fasta_file, tool = "Muscle", outpath = NULL) { | |
#' alignment <- alignFasta("path/to/sequences.fasta") | ||
#' writeMSA_AA2FA(alignment, "path/to/aligned_sequences.fasta") | ||
#' } | ||
|
||
writeMSA_AA2FA <- function(alignment, outpath) { | ||
# Validate input alignment | ||
if (!inherits(alignment, "AAMultipleAlignment")) { | ||
abort("Error: The alignment must be of type 'AAMultipleAlignment'.") | ||
} | ||
|
||
# Check the output path is a character string | ||
if (!is.character(outpath) || nchar(outpath) == 0) { | ||
abort("Error: Invalid output path specified.") | ||
} | ||
|
||
# Check if the output directory exists | ||
outdir <- dirname(outpath) | ||
if (!dir.exists(outdir)) { | ||
abort("Error: The output directory does not exist: ", outdir) | ||
} | ||
|
||
l <- length(rownames(alignment)) | ||
fasta <- "" | ||
for (i in 1:l) | ||
|
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Should we remove extra spaces or use paste0 if we're introducing spaces? [or do the tests work out OK?]