From 30d4bf3ab57c6296a81d6f792911c87586ca896e Mon Sep 17 00:00:00 2001 From: Seyi Kuforiji Date: Sat, 5 Oct 2024 12:29:37 +0100 Subject: [PATCH 01/19] usethis::pr_init("Implement error handling in acc2lin.R functions - Added validation checks for input parameters (accessions, ipg_file, assembly_path, lineagelookup_path). - Included error messages for missing or invalid inputs and file existence checks. - Wrapped main logic in tryCatch for graceful error handling during execution. ") --- R/acc2lin.R | 267 ++++++++++++++++++++++++++++++++++------------ man/acc2lin.Rd | 3 +- man/efetch_ipg.Rd | 3 +- man/ipg2lin.Rd | 3 +- man/sink.reset.Rd | 1 + 5 files changed, 207 insertions(+), 70 deletions(-) diff --git a/R/acc2lin.R b/R/acc2lin.R index f8d71949..dfb33da9 100644 --- a/R/acc2lin.R +++ b/R/acc2lin.R @@ -10,6 +10,8 @@ #' Sink Reset #' #' @return No return, but run to close all outstanding `sink()`s +#' and handles any errors or warnings that occur during the process. +#' #' @export #' #' @examples @@ -17,9 +19,19 @@ #' sink.reset() #' } sink.reset <- function() { + # Handle all errors and warnings + tryCatch({ for (i in seq_len(sink.number())) { - sink(NULL) + sink(NULL) } + print("All sinks closed") + }, error = function(e) { + print(paste("Error: ", e$message)) + }, warning = function(w) { + print(paste("Warning: ", w$message)) + }, finally = { + print("resetSink function execution completed.") + }) } @@ -44,23 +56,61 @@ sink.reset <- function() { #' add_lins() #' } add_lins <- function(df, acc_col = "AccNum", assembly_path, - lineagelookup_path, ipgout_path = NULL, plan = "sequential") { - s_acc_col <- sym(acc_col) - accessions <- df %>% pull(acc_col) - lins <- acc2lin(accessions, assembly_path, lineagelookup_path, ipgout_path, plan) + lineagelookup_path, ipgout_path = NULL, + plan = "sequential") { + # check for validate inputs + if (!is.data.frame(df)) { + stop("Input 'df' must be a data frame.") + } + + if (!acc_col %in% colnames(df)) { + stop(paste("Column", acc_col, "not found in data frame.")) + } + + # Ensure paths are character strings + if (!is.character(assembly_path) || !is.character(lineagelookup_path)) { + stop("Both 'assembly_path' and + 'lineagelookup_path' must be character strings.") + } + + # Ensure paths exist + if (!file.exists(assembly_path)) { + stop(paste("Assembly file not found at:", assembly_path)) + } - # Drop a lot of the unimportant columns for now? will make merging much easier - lins <- lins[, c( + if (!file.exists(lineagelookup_path)) { + stop(paste("Lineage lookup file not found at:", lineagelookup_path)) + } + tryCatch({ + # Attempt to add lineages + acc_col <- sym(acc_col) + accessions <- df %>% pull(acc_col) + lins <- acc2lin( + accessions, assembly_path, lineagelookup_path, ipgout_path, plan + ) + + # Drop a lot of the unimportant columns for now? + # will make merging much easier + lins <- lins[, c( "Strand", "Start", "Stop", "Nucleotide Accession", "Source", "Id", "Strain" - ) := NULL] - lins <- unique(lins) + ) := NULL] + lins <- unique(lins) + + # dup <- lins %>% group_by(Protein) %>% + # summarize(count = n()) %>% filter(count > 1) %>% + # pull(Protein) - # dup <- lins %>% group_by(Protein) %>% summarize(count = n()) %>% filter(count > 1) %>% - # pull(Protein) + merged <- merge(df, lins, by.x = acc_col, by.y = "Protein", all.x = TRUE) + return(merged) + }, error = function(e) { + print(paste("Error: ", e$message)) + }, warning = function(w) { + print(paste("Warning: ", w$message)) + }, finally = { + print("addLineages function execution completed.") + }) - merged <- merge(df, lins, by.x = acc_col, by.y = "Protein", all.x = TRUE) - return(merged) } @@ -68,7 +118,8 @@ add_lins <- function(df, acc_col = "AccNum", assembly_path, #' #' @author Samuel Chen, Janani Ravi #' -#' @description This function combines 'efetch_ipg()' and 'ipg2lin()' to map a set +#' @description This function combines 'efetch_ipg()' +#' and 'ipg2lin()' to map a set #' of protein accessions to their assembly (GCA_ID), tax ID, and lineage. #' #' @param accessions Character vector of protein accessions @@ -76,7 +127,8 @@ add_lins <- function(df, acc_col = "AccNum", assembly_path, #' This file can be generated using the "DownloadAssemblySummary()" function #' @param lineagelookup_path String of the path to the lineage lookup file #' (taxid to lineage mapping). This file can be generated using the -#' @param ipgout_path Path to write the results of the efetch run of the accessions +#' @param ipgout_path Path to write the results +#' of the efetch run of the accessions #' on the ipg database. If NULL, the file will not be written. Defaults to NULL #' @param plan #' @@ -87,27 +139,43 @@ add_lins <- function(df, acc_col = "AccNum", assembly_path, #' \dontrun{ #' acc2lin() #' } -acc2lin <- function(accessions, assembly_path, lineagelookup_path, ipgout_path = NULL, plan = "sequential") { - tmp_ipg <- F - if (is.null(ipgout_path)) { - tmp_ipg <- T - ipgout_path <- tempfile("ipg", fileext = ".txt") - } +acc2lin <- function(accessions, assembly_path, + lineagelookup_path, ipgout_path = NULL, + plan = "sequential") { + tmp_ipg <- F + if (is.null(ipgout_path)) { + tmp_ipg <- T + ipgout_path <- tempfile("ipg", fileext = ".txt") + } + + lins <- NULL + tryCatch({ + # Attempt to fetch IPG efetch_ipg(accessions, out_path = ipgout_path, plan) + # Attempt to process IPG to lineages lins <- ipg2lin(accessions, ipgout_path, assembly_path, lineagelookup_path) + }, error = function(e) { + print(paste("An error occurred: ", e$message)) + }, warning = function(w) { + print(paste("Warning: ", w$message)) + }, finally = { + print("acc2lin function execution completed.") + }) - if (tmp_ipg) { - unlink(tempdir(), recursive = T) - } - return(lins) + if (tmp_ipg) { + unlink(tempdir(), recursive = T) + } + return(lins) } + #' efetch_ipg #' #' @author Samuel Chen, Janani Ravi #' -#' @description Perform efetch on the ipg database and write the results to out_path +#' @description Perform efetch on the ipg database +#' and write the results to out_path #' #' @param accnums Character vector containing the accession numbers to query on #' the ipg database @@ -126,57 +194,84 @@ acc2lin <- function(accessions, assembly_path, lineagelookup_path, ipgout_path = #' efetch_ipg() #' } efetch_ipg <- function(accnums, out_path, plan = "sequential") { - if (length(accnums) > 0) { - partition <- function(in_data, groups) { - # \\TODO This function should be defined outside of efetch_ipg(). It can be non-exported/internal - # Partition data to limit number of queries per second for rentrez fetch: - # limit of 10/second w/ key - l <- length(in_data) - - partitioned <- list() - for (i in 1:groups) - { - partitioned[[i]] <- in_data[seq.int(i, l, groups)] - } - - return(partitioned) - } + # Argument validation + if (!is.character(accnums) || length(accnums) == 0) { + stop("Error: 'accnums' must be a non-empty character vector.") + } + + if (!is.character(out_path) || nchar(out_path) == 0) { + stop("Error: 'out_path' must be a non-empty string.") + } + + if (!is.function(plan)) { + stop("Error: 'plan' must be a valid plan function.") + } + if (length(accnums) > 0) { + partition <- function(in_data, groups) { + # \\TODO This function should be defined outside of efetch_ipg(). + # It can be non-exported/internal + # Partition data to limit number of queries per second for rentrez fetch: + # limit of 10/second w/ key + l <- length(in_data) - plan(strategy = plan, .skip = T) - - - min_groups <- length(accnums) / 200 - groups <- min(max(min_groups, 15), length(accnums)) - partitioned_acc <- partition(accnums, groups) - sink(out_path) - - a <- future_map(1:length(partitioned_acc), function(x) { - # Avoid hitting the rate API limit - if (x %% 9 == 0) { - Sys.sleep(1) - } - cat( - entrez_fetch( - id = partitioned_acc[[x]], - db = "ipg", - rettype = "xml", - api_key = "YOUR_KEY_HERE" ## Can this be included in public package? - ) - ) - }) - sink(NULL) + partitioned <- list() + for (i in 1:groups){ + partitioned[[i]] <- in_data[seq.int(i, l, groups)] + } + + return(partitioned) } + tryCatch({ + # Set the future plan strategy + plan(strategy = plan, .skip = T) + + + min_groups <- length(accnums) / 200 + groups <- min(max(min_groups, 15), length(accnums)) + partitioned_acc <- partition(accnums, groups) + + # Open the sink to the output path + sink(out_path) + + a <- future_map(1:length(partitioned_acc), function(x) { + # Avoid hitting the rate API limit + if (x %% 9 == 0) { + Sys.sleep(1) + } + cat( + entrez_fetch( + id = partitioned_acc[[x]], + db = "ipg", + rettype = "xml", + api_key = "YOUR_KEY_HERE" ## Can this be included in public package? + ) + ) + }) + sink(NULL) + }, error = function(e) { + print(paste("An error occurred: ", e$message)) + }, warning = function(w) { + print(paste("Warning: ", w$message)) + }, finally = { + print("efetch_ipg function execution completed.") + }) + } } + + #' ipg2lin #' #' @author Samuel Chen, Janani Ravi #' -#' @description Takes the resulting file of an efetch run on the ipg database and +#' @description Takes the resulting file +#' of an efetch run on the ipg database and #' #' @param accessions Character vector of protein accessions -#' @param ipg_file Filepath to the file containing results of an efetch run on the -#' ipg database. The protein accession in 'accessions' should be contained in this +#' @param ipg_file Filepath to the file +#' containing results of an efetch run on the +#' ipg database. The protein accession in +#' 'accessions' should be contained in this #' file #' @param assembly_path String of the path to the assembly_summary path #' This file can be generated using the "DownloadAssemblySummary()" function @@ -195,16 +290,54 @@ efetch_ipg <- function(accnums, out_path, plan = "sequential") { #' } #' ipg2lin <- function(accessions, ipg_file, assembly_path, lineagelookup_path) { + # Argument validation for accessions + if (!is.character(accessions) || length(accessions) == 0) { + stop("Input 'accessions' must be a non-empty character vector.") + } + + # check for validate inputs + if (!is.character(ipg_file)) { + stop("Input 'ipg_file' must be a character string.") + } + # Ensure paths are character strings + if (!is.character(assembly_path) || !is.character(lineagelookup_path)) { + stop("Both 'assembly_path' and + 'lineagelookup_path' must be character strings.") + } + + # Ensure paths exist + if (!file.exists(assembly_path)) { + stop(paste("Assembly file not found at:", assembly_path)) + } + + if (!file.exists(lineagelookup_path)) { + stop(paste("Lineage lookup file not found at:", lineagelookup_path)) + } + + try({ + # Attempt to read the IPG file ipg_dt <- fread(ipg_file, sep = "\t", fill = T) + # Filter the IPG data table to only include the accessions ipg_dt <- ipg_dt[Protein %in% accessions] + # Rename the 'Assembly' column to 'GCA_ID' ipg_dt <- setnames(ipg_dt, "Assembly", "GCA_ID") + # Convert the IPG data table to a lineage data table lins <- GCA2Lins(prot_data = ipg_dt, assembly_path, lineagelookup_path) + + # Filter out rows with missing lineage information lins <- lins[!is.na(Lineage)] %>% unique() return(lins) + }, error = function(e) { + print(paste("An error occurred: ", e$message)) + }, warning = function(w) { + print(paste("Warning: ", w$message)) + }, finally = { + print("ipg2lin function execution completed.") + }) } diff --git a/man/acc2lin.Rd b/man/acc2lin.Rd index 6255b290..d3f2468b 100644 --- a/man/acc2lin.Rd +++ b/man/acc2lin.Rd @@ -38,7 +38,8 @@ on the ipg database. If NULL, the file will not be written. Defaults to NULL} Describe return, in detail } \description{ -This function combines 'efetch_ipg()' and 'ipg2lin()' to map a set +This function combines 'efetch_ipg()' +and 'ipg2lin()' to map a set of protein accessions to their assembly (GCA_ID), tax ID, and lineage. Function to map protein accession numbers to lineage diff --git a/man/efetch_ipg.Rd b/man/efetch_ipg.Rd index ec5b6bcb..1fbb9d92 100644 --- a/man/efetch_ipg.Rd +++ b/man/efetch_ipg.Rd @@ -23,7 +23,8 @@ the ipg database} Describe return, in detail } \description{ -Perform efetch on the ipg database and write the results to out_path +Perform efetch on the ipg database +and write the results to out_path Perform efetch on the ipg database and write the results to out_path } diff --git a/man/ipg2lin.Rd b/man/ipg2lin.Rd index 3a14eada..453668b0 100644 --- a/man/ipg2lin.Rd +++ b/man/ipg2lin.Rd @@ -38,7 +38,8 @@ This file can be generated using the "DownloadAssemblySummary()" function} Describe return, in detail } \description{ -Takes the resulting file of an efetch run on the ipg database and +Takes the resulting file +of an efetch run on the ipg database and Takes the resulting file of an efetch run on the ipg database and append lineage, and taxid columns diff --git a/man/sink.reset.Rd b/man/sink.reset.Rd index a31b841d..64087c49 100644 --- a/man/sink.reset.Rd +++ b/man/sink.reset.Rd @@ -8,6 +8,7 @@ sink.reset() } \value{ No return, but run to close all outstanding \code{sink()}s +and handles any errors or warnings that occur during the process. } \description{ Sink Reset From 4aeaa113927b6f94b21c9f0dd0956bb7e48004a5 Mon Sep 17 00:00:00 2001 From: Seyi Kuforiji Date: Mon, 7 Oct 2024 22:50:16 +0100 Subject: [PATCH 02/19] Add error handling to multiple functions - Implement error handling for mapOption2Process, get_proc_medians, write_proc_medians_table, get_proc_weights, advanced_opts2est_walltime, assign_job_queue, and plot_estimated_walltimes . - Validate input arguments for each function to ensure they meet expected criteria. - Use tryCatch blocks to gracefully handle errors and warnings. - Provide informative error messages and detailed logging where appropriate. - Ensure functions fail gracefully and provide useful feedback. Also renamed the functions to the following; assign_job_queue -> assignJobQueue make_opts2procs -> mapOption2Process map_advanced_opts2procs -> mapAdvOption2Process get_proc_medians - calculateProcessRuntime write_proc_medians_table -> writeProcessRuntime2TSV write_proc_medians_yml -> writeProcessRuntime2YML get_proc_weights -> getProcessRuntimeWeights advanced_opts2est_walltime -> calculateEstimatedWallTimeFromOpts plot_estimated_walltimes -> plotEstimatedWallTimes --- NAMESPACE | 26 +- R/assign_job_queue.R | 484 ++++++++++++------ R/clean_clust_file.R | 4 +- R/combine_analysis.R | 4 +- R/combine_files.R | 10 +- R/create_lineage_lookup.R | 6 +- ...{assign_job_queue.Rd => assignJobQueue.Rd} | 13 +- ... calculateEstimatedWallTimeFromOptions.Rd} | 12 +- ..._medians.Rd => calculateProcessRuntime.Rd} | 10 +- ...lean_clust_file.Rd => cleanClusterFile.Rd} | 8 +- man/{combine_files.Rd => combineFiles.Rd} | 6 +- ...combine_full.Rd => combineFullAnalysis.Rd} | 6 +- man/{combine_ipr.Rd => combineIPR.Rd} | 6 +- ...neage_lookup.Rd => createLineageLookup.Rd} | 6 +- ...weights.Rd => getProcessRuntimeWeights.Rd} | 8 +- ..._opts2procs.Rd => mapAdvOption2Process.Rd} | 8 +- ...ake_opts2procs.Rd => mapOption2Process.Rd} | 8 +- ...walltimes.Rd => plotEstimatedWallTimes.Rd} | 11 +- ...ns_table.Rd => writeProcessRuntime2TSV.Rd} | 8 +- ...ans_yml.Rd => writeProcessRuntimeToYML.Rd} | 13 +- 20 files changed, 416 insertions(+), 241 deletions(-) rename man/{assign_job_queue.Rd => assignJobQueue.Rd} (64%) rename man/{advanced_opts2est_walltime.Rd => calculateEstimatedWallTimeFromOptions.Rd} (68%) rename man/{get_proc_medians.Rd => calculateProcessRuntime.Rd} (76%) rename man/{clean_clust_file.Rd => cleanClusterFile.Rd} (82%) rename man/{combine_files.Rd => combineFiles.Rd} (92%) rename man/{combine_full.Rd => combineFullAnalysis.Rd} (69%) rename man/{combine_ipr.Rd => combineIPR.Rd} (74%) rename man/{create_lineage_lookup.Rd => createLineageLookup.Rd} (91%) rename man/{get_proc_weights.Rd => getProcessRuntimeWeights.Rd} (73%) rename man/{map_advanced_opts2procs.Rd => mapAdvOption2Process.Rd} (76%) rename man/{make_opts2procs.Rd => mapOption2Process.Rd} (75%) rename man/{plot_estimated_walltimes.Rd => plotEstimatedWallTimes.Rd} (55%) rename man/{write_proc_medians_table.Rd => writeProcessRuntime2TSV.Rd} (77%) rename man/{write_proc_medians_yml.Rd => writeProcessRuntimeToYML.Rd} (61%) diff --git a/NAMESPACE b/NAMESPACE index 16cf0813..9c038631 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -12,26 +12,27 @@ export(add_leaves) export(add_lins) export(add_name) export(add_tax) -export(advanced_opts2est_walltime) export(alignFasta) export(assert_count_df) -export(assign_job_queue) +export(assignJobQueue) +export(calculateEstimatedWallTimeFromOptions) +export(calculateProcessRuntime) export(cleanup_GeneDesc) export(cleanup_clust) export(cleanup_domarch) export(cleanup_gencontext) export(cleanup_lineage) export(cleanup_species) -export(combine_files) -export(combine_full) -export(combine_ipr) +export(combineFiles) +export(combineFullAnalysis) +export(combineIPR) export(convert_aln2fa) export(convert_fa2tre) export(count_bycol) export(count_to_sunburst) export(count_to_treemap) +export(createLineageLookup) export(create_all_col_params) -export(create_lineage_lookup) export(create_one_col_params) export(domain_network) export(efetch_ipg) @@ -45,10 +46,9 @@ export(generate_all_aln2fa) export(generate_fa2tre) export(generate_msa) export(generate_trees) +export(getProcessRuntimeWeights) export(get_accnums_from_fasta_file) export(get_job_message) -export(get_proc_medians) -export(get_proc_weights) export(ipg2lin) export(ipr2viz) export(ipr2viz_web) @@ -58,12 +58,12 @@ export(lineage.domain_repeats.plot) export(lineage.neighbors.plot) export(lineage_sunburst) export(make_job_results_url) -export(make_opts2procs) +export(mapAdvOption2Process) +export(mapOption2Process) export(map_acc2name) -export(map_advanced_opts2procs) export(msa_pdf) export(pick_longer_duplicate) -export(plot_estimated_walltimes) +export(plotEstimatedWallTimes) export(prot2tax) export(prot2tax_old) export(remove_astrk) @@ -95,8 +95,8 @@ export(wordcloud2_element) export(wordcloud3) export(wordcloud_element) export(write.MsaAAMultipleAlignment) -export(write_proc_medians_table) -export(write_proc_medians_yml) +export(writeProcessRuntime2TSV) +export(writeProcessRuntimeToYML) importFrom(Biostrings,AAStringSet) importFrom(Biostrings,readAAStringSet) importFrom(Biostrings,toString) diff --git a/R/assign_job_queue.R b/R/assign_job_queue.R index bc5253d4..f1fcb6db 100644 --- a/R/assign_job_queue.R +++ b/R/assign_job_queue.R @@ -3,22 +3,32 @@ # pipeline. # to use this, construct paths like so: file.path(common_root, "path", "to", "file.R") # for example, the reference for this file would be: -# file.path(common_root, "molevol_scripts", "R", "assign_job_queue.R") +# file.path(common_root, "molevol_scripts", "R", "assignJobQueue.R") common_root <- Sys.getenv("COMMON_SRC_ROOT") #' Construct list where names (MolEvolvR advanced options) point to processes #' #' @return list where names (MolEvolvR advanced options) point to processes #' -#' example: list_opts2procs <- make_opts2procs +#' example: list_opts2procs <- mapOption2Process #' @export -make_opts2procs <- function() { +mapOption2Process <- function() { + tryCatch({ opts2processes <- list( - "homology_search" = c("dblast", "dblast_cleanup"), - "domain_architecture" = c("iprscan", "ipr2lineage", "ipr2da"), - "always" = c("blast_clust", "clust2table") # processes always present agnostic of advanced options + "homology_search" = c("dblast", "dblast_cleanup"), + "domain_architecture" = c("iprscan", "ipr2lineage", "ipr2da"), + # processes always present agnostic of advanced options + "always" = c("blast_clust", "clust2table") ) return(opts2processes) + }, error = function(e) { + message(paste("Encountered an error: ", e$message)) + }, warning = function(w) { + message(paste("Warning: ", w$message)) + }, finally = { + message("mapOption2Process function execution completed.") + }) + } #' Use MolEvolvR advanced options to get associated processes @@ -30,17 +40,29 @@ make_opts2procs <- function() { #' #' example: #' advanced_opts <- c("homology_search", "domain_architecture") -#' procs <- map_advanced_opts2procs(advanced_opts) +#' procs <- mapAdvOption2Process(advanced_opts) #' @export -map_advanced_opts2procs <- function(advanced_opts) { +mapAdvOption2Process <- function(advanced_opts) { + if (!is.character(advanced_opts)) { + stop("Argument must be a character vector!") + } + tryCatch({ # append 'always' to add procs that always run advanced_opts <- c(advanced_opts, "always") - opts2proc <- make_opts2procs() + opts2proc <- mapOption2Process() # setup index for opts2proc based on advanced options idx <- which(names(opts2proc) %in% advanced_opts) # extract processes that will run procs <- opts2proc[idx] |> unlist() return(procs) + }, error = function(e) { + message(paste("Encountered an error: ", e$message)) + }, warning = function(w) { + message(paste("Warning: ", w$message)) + }, finally = { + message("mapOption2Process function execution completed.") + }) + } #' Scrape MolEvolvR logs and calculate median processes @@ -58,47 +80,68 @@ map_advanced_opts2procs <- function(advanced_opts) { #' #' 1) #' dir_job_results <- "/data/scratch/janani/molevolvr_out" -#' list_proc_medians <- get_proc_medians(dir_job_results) +#' list_proc_medians <- calculateProcessRuntime(dir_job_results) #' #' 2) from outside container environment #' common_root <- "/data/molevolvr_transfer/molevolvr_dev" #' dir_job_results <- "/data/molevolvr_transfer/molevolvr_dev/job_results" -#' list_proc_medians <- get_proc_medians(dir_job_results) +#' list_proc_medians <- calculateProcessRuntime(dir_job_results) #' @export -get_proc_medians <- function(dir_job_results) { +calculateProcessRuntime <- function(dir_job_results) { + tryCatch({ + # Check if dir_job_results is a character string + if (!is.character(dir_job_results) || length(dir_job_results) != 1) { + stop("Input 'dir_job_results' must be a single character string.") + } + + # Check if dir_job_results exists + if (!dir.exists(dir_job_results)) { + stop(paste("The directory", dir_job_results, "does not exist.")) + } + source(file.path(common_root, "molevol_scripts", "R", "metrics.R")) # aggregate logs from - path_log_data <- file.path(common_root, "molevol_scripts", "log_data", "prod_logs.rda") + path_log_data <- file.path(common_root, + "molevol_scripts", "log_data", "prod_logs.rda") # ensure the folder exists to the location if (!dir.exists(path_log_data)) { - dir.create(dirname(path_log_data), recursive = TRUE, showWarnings = FALSE) + dir.create(dirname(path_log_data), + recursive = TRUE, showWarnings = FALSE) } # attempt to load pre-generated logdata if (!file.exists(path_log_data)) { - logs <- aggregate_logs(dir_job_results, latest_date = Sys.Date() - 60) - save(logs, file = path_log_data) + logs <- aggregate_logs(dir_job_results, latest_date = Sys.Date() - 60) + save(logs, file = path_log_data) } else { - load(path_log_data) # loads the logs object + load(path_log_data) # loads the logs object } df_log <- logs$df_log procs <- c( - "dblast", "dblast_cleanup", "iprscan", - "ipr2lineage", "ipr2da", "blast_clust", - "clust2table" + "dblast", "dblast_cleanup", "iprscan", + "ipr2lineage", "ipr2da", "blast_clust", + "clust2table" ) list_proc_medians <- df_log |> - dplyr::select(dplyr::all_of(procs)) |> - dplyr::summarise( - dplyr::across( - dplyr::everything(), - \(x) median(x, na.rm = TRUE) - ) - ) |> - as.list() + dplyr::select(dplyr::all_of(procs)) |> + dplyr::summarise( + dplyr::across( + dplyr::everything(), + \(x) median(x, na.rm = TRUE) + ) + ) |> + as.list() return(list_proc_medians) + }, error = function(e) { + message(paste("Encountered an error: ", e$message)) + }, warning = function(w) { + message(paste("Warning: ", w$message)) + }, finally = { + message("calculateProcessRuntime function execution completed.") + }) + } #' Write a table of 2 columns: 1) process and 2) median seconds @@ -113,51 +156,99 @@ get_proc_medians <- function(dir_job_results) { #' #' @return [tbl_df] 2 columns: 1) process and 2) median seconds #' -#' example: write_proc_medians_table( +#' example: writeProcessRuntime2TSV( #' "/data/scratch/janani/molevolvr_out/", #' "/data/scratch/janani/molevolvr_out/log_tbl.tsv" #' ) #' @export -write_proc_medians_table <- function(dir_job_results, filepath) { - df_proc_medians <- get_proc_medians(dir_job_results) |> - tibble::as_tibble() |> - tidyr::pivot_longer( - dplyr::everything(), - names_to = "process", - values_to = "median_seconds" - ) |> - dplyr::arrange(dplyr::desc(median_seconds)) +writeProcessRuntime2TSV <- function(dir_job_results, filepath) { + tryCatch({ + # Error handling for input arguments + if (!is.character(dir_job_results) || length(dir_job_results) != 1) { + stop("Input 'dir_job_results' must be a single character string.") + } + + if (!dir.exists(dir_job_results)) { + stop(paste("The directory", dir_job_results, "does not exist.")) + } + + if (!is.character(filepath) || length(filepath) != 1) { + stop("Input 'filepath' must be a single character string.") + } + df_proc_medians <- calculateProcessRuntime(dir_job_results) |> + tibble::as_tibble() |> + tidyr::pivot_longer( + dplyr::everything(), + names_to = "process", + values_to = "median_seconds" + ) |> + dplyr::arrange(dplyr::desc(median_seconds)) + + # Write the resulting tibble to a TSV file readr::write_tsv(df_proc_medians, file = filepath) return(df_proc_medians) + }, error = function(e) { + message(paste("Encountered an error: ", e$message)) + }, warning = function(w) { + message(paste("Warning: ", w$message)) + }, finally = { + message("writeProcessRuntime2TSV function execution completed.") + }) + } #' Compute median process runtimes, then write a YAML list of the processes and #' their median runtimes in seconds to the path specified by 'filepath'. #' #' The default value of filepath is the value of the env var -#' MOLEVOLVR_PROC_WEIGHTS, which get_proc_weights() also uses as its default +#' MOLEVOLVR_PROC_WEIGHTS, which writeProcessRuntimeToYML() also uses as its default #' read location. #' #' @param dir_job_results [chr] path to MolEvolvR job_results directory -#' @param filepath [chr] path to save YAML file; if NULL, uses ./molevol_scripts/log_data/job_proc_weights.yml +#' @param filepath [chr] path to save YAML file; if NULL, +#' uses ./molevol_scripts/log_data/job_proc_weights.yml #' #' @importFrom yaml write_yaml #' #' @examples #' \dontrun{ -#' write_proc_medians_yml( +#' writeProcessRuntimeToYML( #' "/data/scratch/janani/molevolvr_out/", #' "/data/scratch/janani/molevolvr_out/log_tbl.yml" #' ) #' } #' @export -write_proc_medians_yml <- function(dir_job_results, filepath = NULL) { +writeProcessRuntimeToYML <- function(dir_job_results, filepath = NULL) { + tryCatch({ + # Error handling for dir_job_results arguments + if (!is.character(dir_job_results) || length(dir_job_results) != 1) { + stop("Input 'dir_job_results' must be a single character string.") + } + + if (!dir.exists(dir_job_results)) { + stop(paste("The directory", dir_job_results, "does not exist.")) + } if (is.null(filepath)) { - filepath <- file.path(common_root, "molevol_scripts", "log_data", "job_proc_weights.yml") + filepath <- file.path(common_root, + "molevol_scripts", + "log_data", + "job_proc_weights.yml") + } + if (!is.character(filepath) || length(filepath) != 1) { + stop("Input 'filepath' must be a single character string.") } - medians <- get_proc_medians(dir_job_results) + medians <- calculateProcessRuntime(dir_job_results) yaml::write_yaml(medians, filepath) + }, error = function(e) { + message(paste("Encountered an error: "), e$message) + }, warning = function(w) { + message(paste("Warning: "), w$message) + }, finally = { + message("write_proc_medians_table function execution completed.") + } + ) + } #' Quickly get the runtime weights for MolEvolvR backend processes @@ -170,50 +261,52 @@ write_proc_medians_yml <- function(dir_job_results, filepath = NULL) { #' #' @return [list] names: processes; values: median runtime (seconds) #' -#' example: get_proc_weights() +#' example: writeProcessRuntimeToYML() #' @export -get_proc_weights <- function(medians_yml_path = NULL) { - if (is.null(medians_yml_path)) { - medians_yml_path <- file.path(common_root, "molevol_scripts", "log_data", "job_proc_weights.yml") +getProcessRuntimeWeights <- function(medians_yml_path = NULL) { + if (is.null(medians_yml_path)) { + medians_yml_path <- file.path(common_root, + "molevol_scripts", + "log_data", + "job_proc_weights.yml") + } + + proc_weights <- tryCatch({ + # attempt to read the weights from the YAML file produced by + # writeProcessRuntimeToYML() + if (stringr::str_trim(medians_yml_path) == "") { + stop( + stringr::str_glue("medians_yml_path is empty + ({medians_yml_path}), returning default weights") + ) } - proc_weights <- tryCatch( - { - # attempt to read the weights from the YAML file produced by - # write_proc_medians_yml() - if (stringr::str_trim(medians_yml_path) == "") { - stop( - stringr::str_glue("medians_yml_path is empty ({medians_yml_path}), returning default weights") - ) - } - - proc_weights <- yaml::read_yaml(medians_yml_path) - }, - # to avoid fatal errors in reading the proc weights yaml, - # some median process runtimes have been hardcoded based on - # the result of get_proc_medians() from Jan 2024 - error = function(cond) { - proc_weights <- list( - "dblast" = 2810, - "iprscan" = 1016, - "dblast_cleanup" = 79, - "ipr2lineage" = 18, - "ipr2da" = 12, - "blast_clust" = 2, - "clust2table" = 2 - ) - proc_weights - } + proc_weights <- yaml::read_yaml(medians_yml_path) + }, + # to avoid fatal errors in reading the proc weights yaml, + # some median process runtimes have been hardcoded based on + # the result of calculateProcessRuntime() from Jan 2024 + error = function(cond) { + proc_weights <- list( + "dblast" = 2810, + "iprscan" = 1016, + "dblast_cleanup" = 79, + "ipr2lineage" = 18, + "ipr2da" = 12, + "blast_clust" = 2, + "clust2table" = 2 ) + proc_weights + }) - return(proc_weights) + return(proc_weights) } #' Given MolEvolvR advanced options and number of inputs, #' calculate the total estimated walltime for the job #' #' @param advanced_opts character vector of MolEvolvR advanced options -#' (see make_opts2procs for the options) +#' (see mapOption2Process for the options) #' @param n_inputs total number of input proteins #' #' @importFrom dplyr if_else @@ -221,68 +314,129 @@ get_proc_weights <- function(medians_yml_path = NULL) { #' #' @return total estimated number of seconds a job will process (walltime) #' -#' example: advanced_opts2est_walltime(c("homology_search", "domain_architecture"), n_inputs = 3, n_hits = 50L) +#' example: calculateEstimatedWallTimeFromOptions(c("homology_search", +#' "domain_architecture"), +#' n_inputs = 3, n_hits = 50L) #' @export -advanced_opts2est_walltime <- function(advanced_opts, n_inputs = 1L, n_hits = NULL, verbose = FALSE) { +calculateEstimatedWallTimeFromOptions <- function(advanced_opts, + n_inputs = 1L, + n_hits = NULL, + verbose = FALSE) { + + tryCatch({ # to calculate est walltime for a homology search job, the number of hits # must be provided validation_fail <- is.null(n_hits) && "homology_search" %in% advanced_opts stopifnot(!validation_fail) - proc_weights <- get_proc_weights() + # Validate advanced_opts + if (!is.character(advanced_opts)) { + stop("Argument 'advanced_opts' must be a character vector.") + } + + # Validate n_inputs + if (!is.numeric(n_inputs) || length(n_inputs) != 1 || n_inputs <= 0) { + stop("Argument 'n_inputs' must be a single positive numeric value.") + } + + # Validate n_hits if homology_search is in advanced_opts + if ("homology_search" %in% advanced_opts && + (is.null(n_hits)|| !is.numeric(n_hits) + || length(n_hits) != 1 || n_hits < 0)) { + stop("Argument 'n_hits' must be a single non-negative numeric value when + 'homology_search' is in 'advanced_opts'.") + } + + # Get process weights + proc_weights <- writeProcessRuntimeToYML() + if (!is.list(proc_weights)) { + stop("Process weights could not be retrieved correctly.") + } + # sort process weights by names and convert to vec proc_weights <- proc_weights[order(names(proc_weights))] |> unlist() all_procs <- names(proc_weights) |> sort() # get processes from advanced options and sort by names - procs_from_opts <- map_advanced_opts2procs(advanced_opts) + procs_from_opts <- mapAdvOption2Process(advanced_opts) procs_from_opts <- sort(procs_from_opts) # binary encode: yes proc will run (1); else 0 binary_proc_vec <- dplyr::if_else(all_procs %in% procs_from_opts, 1L, 0L) # dot product of weights and procs to run; scaled by the number of inputs est_walltime <- (n_inputs * (binary_proc_vec %*% proc_weights)) |> - as.numeric() + as.numeric() # calculate the additional processes to run for the homologous hits if ("homology_search" %in% advanced_opts) { - opts2procs <- make_opts2procs() - # exclude the homology search processes for the homologous hits - procs2exclude_for_homologs <- opts2procs[["homology_search"]] - procs_homologs <- procs_from_opts[!(procs_from_opts %in% procs2exclude_for_homologs)] - binary_proc_vec_homolog <- dplyr::if_else(all_procs %in% procs_homologs, 1L, 0L) - # add the estimated walltime for processes run on the homologous hits - est_walltime <- est_walltime + - (n_hits * (binary_proc_vec_homolog %*% proc_weights) |> as.numeric()) + opts2procs <- mapOption2Process() + # exclude the homology search processes for the homologous hits + procs2exclude_for_homologs <- opts2procs[["homology_search"]] + procs_homologs <- procs_from_opts[!(procs_from_opts + %in% procs2exclude_for_homologs)] + binary_proc_vec_homolog <- dplyr::if_else(all_procs + %in% procs_homologs, 1L, 0L) + # add the estimated walltime for processes run on the homologous hits + est_walltime <- est_walltime + + (n_hits * (binary_proc_vec_homolog + %*% proc_weights) |> as.numeric()) } if (verbose) { - msg <- stringr::str_glue( - "warnings from advanced_opts2est_walltime():\n", - "\tn_inputs={n_inputs}\n", - "\tn_hits={ifelse(is.null(n_hits), 'null', n_hits)}\n", - "\test_walltime={est_walltime}\n\n" - ) - cat(file = stderr(), msg) + msg <- stringr::str_glue( + "warnings from calculateEstimatedWallTimeFromOptions():\n", + "\tn_inputs={n_inputs}\n", + "\tn_hits={ifelse(is.null(n_hits), 'null', n_hits)}\n", + "\test_walltime={est_walltime}\n\n" + ) + cat(file = stderr(), msg) } return(est_walltime) + }, error = function(e) { + message(paste("Encountered an error: ", e$message)) + }, warning = function(w) { + message(paste("Warning: ", w$message)) + }, finally = { + message("calculateEstimatedWallTimeFromOptions + function execution completed.") + }) + } + #' Decision function to assign job queue #' #' @param t_sec_estimate estimated number of seconds a job will process -#' (from advanced_opts2est_walltime()) +#' (from calculateEstimatedWallTimeFromOptions()) #' @param t_long threshold value that defines the lower bound for assigning a #' job to the "long queue" #' #' @return a string of "short" or "long" #' #' example: -#' advanced_opts2est_walltime(c("homology_search", "domain_architecture"), 3) |> -#' assign_job_queue() +#' calculateEstimatedWallTimeFromOptions(c("homology_search", +#' "domain_architecture"), 3) |> +#' assignJobQueue() #' @export -assign_job_queue <- function( - t_sec_estimate, - t_cutoff = 21600 # 6 hours - ) { +assignJobQueue <- function( + t_sec_estimate, + t_cutoff = 21600 # 6 hours +) { + tryCatch({ + if (!is.numeric(t_sec_estimate) || length(t_sec_estimate) != 1) { + stop("Argument 't_sec_estimate' must be a single numeric value.") + } + + if (!is.numeric(t_cutoff) || length(t_cutoff) != 1 || t_cutoff < 0) { + stop("Argument 't_cutoff' must be a single non-negative numeric value.") + } + queue <- ifelse(t_sec_estimate > t_cutoff, "long", "short") return(queue) + }, error = function(e) { + message(paste("Encountered an error: ", e$message)) + }, warning = function(w) { + message(paste("Warning: ", w$message)) + }, finally = { + message("assignJobQueue function execution completed.") + }) + } #' Plot the estimated runtimes for different advanced options and number @@ -297,81 +451,97 @@ assign_job_queue <- function( #' @return line plot object #' #' example: -#' p <- plot_estimated_walltimes() -#' ggplot2::ggsave(filename = "/data/molevolvr_transfer/molevolvr_dev/molevol_scripts/docs/estimate_walltimes.png", plot = p) +#' p <- plotEstimatedWallTimes() +#' ggplot2::ggsave(filename = "/data/molevolvr_transfer/molevolvr_ +#' dev/molevol_scripts/docs/estimate_walltimes.png", plot = p) #' @export -plot_estimated_walltimes <- function() { - opts <- make_opts2procs() |> names() +plotEstimatedWallTimes <- function() { + tryCatch({ + opts <- mapOption2Process() |> names() # get all possible submission permutations (powerset) get_powerset <- function(vec) { - # generate powerset (do not include empty set) - n <- length(vec) - indices <- 1:n - powerset <- lapply(1:n, function(x) combn(indices, x, simplify = FALSE)) - powerset <- unlist(powerset, recursive = FALSE) - powerset <- lapply(powerset, function(index) vec[index]) - powerset + # generate powerset (do not include empty set) + n <- length(vec) + indices <- 1:n + powerset <- lapply(1:n, function(x) combn(indices, x, simplify = FALSE)) + powerset <- unlist(powerset, recursive = FALSE) + powerset <- lapply(powerset, function(index) vec[index]) + powerset } opts_power_set <- get_powerset(opts) est_walltimes <- list() for (i in 1:20) { - est_walltimes <- append( - x = est_walltimes, - values = sapply( - opts_power_set, - FUN = function(advanced_opts) { - # for simplicity, assume the default number of homologus hits (100) - n_hits <- if ("homology_search" %in% advanced_opts) { - 100 - } else { - NULL - } - est_walltime <- advanced_opts2est_walltime( - advanced_opts, - n_inputs = i, - n_hits = n_hits, - verbose = TRUE - ) - names(est_walltime) <- paste0(advanced_opts, collapse = "_") - est_walltime - } + est_walltimes <- append( + x = est_walltimes, + values = sapply( + opts_power_set, + FUN = function(advanced_opts) { + # for simplicity, assume the default number of homologus hits (100) + n_hits <- if ("homology_search" %in% advanced_opts) { + 100 + } else { + NULL + } + est_walltime <- calculateEstimatedWallTimeFromOptions( + advanced_opts, + n_inputs = i, + n_hits = n_hits, + verbose = TRUE ) + names(est_walltime) <- paste0(advanced_opts, collapse = "_") + est_walltime + } ) + ) } # concat all results to their unique names est_walltimes <- tapply( - unlist( - est_walltimes, - use.names = FALSE - ), - rep( - names(est_walltimes), - lengths(est_walltimes) - ), - FUN = c + unlist( + est_walltimes, + use.names = FALSE + ), + rep( + names(est_walltimes), + lengths(est_walltimes) + ), + FUN = c ) df_walltimes <- est_walltimes |> - unlist() |> - matrix(nrow = length(est_walltimes[[1]]), ncol = length(names(est_walltimes))) + unlist() |> + matrix(nrow = length(est_walltimes[[1]]), + ncol = length(names(est_walltimes))) colnames(df_walltimes) <- names(est_walltimes) df_walltimes <- df_walltimes |> tibble::as_tibble() # rm always col or powerset outcome without the "always" processes col_idx_keep <- grep(pattern = "always$", x = names(df_walltimes)) df_walltimes <- df_walltimes |> - dplyr::select(col_idx_keep) + dplyr::select(col_idx_keep) # bind n_inputs df_walltimes <- df_walltimes |> - dplyr::mutate(n_inputs = 1:20) - df_walltimes <- tidyr::gather(df_walltimes, key = "advanced_opts", value = "est_walltime", -n_inputs) + dplyr::mutate(n_inputs = 1:20) + df_walltimes <- tidyr::gather(df_walltimes, + key = "advanced_opts", + value = "est_walltime", + n_inputs) # sec to hrs df_walltimes <- df_walltimes |> - dplyr::mutate(est_walltime = est_walltime / 3600) - p <- ggplot2::ggplot(df_walltimes, ggplot2::aes(x = n_inputs, y = est_walltime, color = advanced_opts)) + - ggplot2::geom_line() + - ggplot2::labs( - title = "MolEvolvR estimated runtimes", - x = "Number of inputs", - y = "Estimated walltime (hours)" - ) + dplyr::mutate(est_walltime = est_walltime / 3600) + p <- ggplot2::ggplot(df_walltimes, ggplot2::aes(x = n_inputs, + y = est_walltime, + color = advanced_opts)) + + ggplot2::geom_line() + + ggplot2::labs( + title = "MolEvolvR estimated runtimes", + x = "Number of inputs", + y = "Estimated walltime (hours)" + ) return(p) + }, error = function(e) { + message(paste("Encountered an error: ", e$message)) + }, warning = function(w) { + message(paste("Warning: ", w$message)) + }, finally = { + message("plotEstimatedWallTimes function execution completed.") + }) + } diff --git a/R/clean_clust_file.R b/R/clean_clust_file.R index d3f813e5..87dcde70 100755 --- a/R/clean_clust_file.R +++ b/R/clean_clust_file.R @@ -55,9 +55,9 @@ #' #' @examples #' \dontrun{ -#' clean_clust_file("data/pspa.op_ins_cls", writepath = NULL, query = "pspa") +#' cleanClusterFile("data/pspa.op_ins_cls", writepath = NULL, query = "pspa") #' } -clean_clust_file <- function(path, writepath = NULL, query) { +cleanClusterFile <- function(path, writepath = NULL, query) { # ?? does the following line need to be changed to read_lines()? prot <- read_tsv(path, col_names = F) diff --git a/R/combine_analysis.R b/R/combine_analysis.R index bb3b3ce2..58ce1f14 100755 --- a/R/combine_analysis.R +++ b/R/combine_analysis.R @@ -17,7 +17,7 @@ #' @export #' #' @examples -combine_full <- function(inpath, ret = FALSE) { +combineFullAnalysis <- function(inpath, ret = FALSE) { ## Combining full_analysis files full_combnd <- combine_files(inpath, pattern = "*.full_analysis.tsv", skip = 0, @@ -44,7 +44,7 @@ combine_full <- function(inpath, ret = FALSE) { #' @export #' #' @examples -combine_ipr <- function(inpath, ret = FALSE) { +combineIPR <- function(inpath, ret = FALSE) { ## Combining clean ipr files ipr_combnd <- combine_files(inpath, pattern = "*.iprscan_cln.tsv", skip = 0, diff --git a/R/combine_files.R b/R/combine_files.R index 76c5fa09..455ddd53 100755 --- a/R/combine_files.R +++ b/R/combine_files.R @@ -38,7 +38,7 @@ #' @export #' #' @examples -combine_files <- function(inpath = c("../molevol_data/project_data/phage_defense/"), +combineFiles <- function(inpath = c("../molevol_data/project_data/phage_defense/"), pattern = "*full_analysis.tsv", delim = "\t", skip = 0, col_names = T) { @@ -67,7 +67,7 @@ combine_files <- function(inpath = c("../molevol_data/project_data/phage_defense ## Sample Runs ## ################# # ## Combining full_analysis files -# full_combnd <- combine_files(inpath, +# full_combnd <- combineFiles(inpath, # pattern="*full_analysis.txt", skip=0, # col_names=T) # @@ -75,7 +75,7 @@ combine_files <- function(inpath = c("../molevol_data/project_data/phage_defense # path="../molevol_data/project_data/slps/full_combined.tsv") # # ## Combining clean files -# cln_combnd <- combine_files(inpath, +# cln_combnd <- combineFiles(inpath, # pattern="^.*cln.txt", skip=0, # col_names=T) # @@ -86,14 +86,14 @@ combine_files <- function(inpath = c("../molevol_data/project_data/phage_defense # ## Less helpful examples! # ## Combining BLAST files # ## Likely makes no sense since clustering is done per query -# cl_blast_combnd <- combine_files(inpath, +# cl_blast_combnd <- combineFiles(inpath, # pattern="^.*refseq.1e-5.txt", skip=0, # col_names=cl_blast_colnames) %>% # select(-PcPositive, -ClusterID) # # ## Combining IPR files # ## Likely makes no sense since there may be repeated AccNum from indiv. files! -# ipr_combnd <- combine_files(inpath, +# ipr_combnd <- combineFiles(inpath, # pattern="*iprscan.lins*", skip=0, # col_names=ipr_colnames) # diff --git a/R/create_lineage_lookup.R b/R/create_lineage_lookup.R index e7374df3..d911934a 100644 --- a/R/create_lineage_lookup.R +++ b/R/create_lineage_lookup.R @@ -26,9 +26,9 @@ #' @export #' #' @examples -create_lineage_lookup <- function(lineage_file = here("data/rankedlineage.dmp"), +createLineageLookup <- function(lineage_file = here("data/rankedlineage.dmp"), outfile, taxonomic_rank = "phylum") { - shorten_NA <- function(Lineage) { + .shortenNA <- function(Lineage) { first_NA <- str_locate(Lineage, "NA")[1] if (is.na(first_NA)) { # No NAs @@ -92,7 +92,7 @@ create_lineage_lookup <- function(lineage_file = here("data/rankedlineage.dmp"), # Takes a while (2million rows after all) rankedLinsCombined <- rankedLins %>% unite(col = "Lineage", all_of(combined_taxonomy), sep = ">") %>% - mutate(Lineage = unlist(map(Lineage, shorten_NA))) + mutate(Lineage = unlist(map(Lineage, .shortenNA))) diff --git a/man/assign_job_queue.Rd b/man/assignJobQueue.Rd similarity index 64% rename from man/assign_job_queue.Rd rename to man/assignJobQueue.Rd index ceb6fa77..27511b6a 100644 --- a/man/assign_job_queue.Rd +++ b/man/assignJobQueue.Rd @@ -1,14 +1,14 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/assign_job_queue.R -\name{assign_job_queue} -\alias{assign_job_queue} +\name{assignJobQueue} +\alias{assignJobQueue} \title{Decision function to assign job queue} \usage{ -assign_job_queue(t_sec_estimate, t_cutoff = 21600) +assignJobQueue(t_sec_estimate, t_cutoff = 21600) } \arguments{ \item{t_sec_estimate}{estimated number of seconds a job will process -(from advanced_opts2est_walltime())} +(from calculateEstimatedWallTimeFromOptions())} \item{t_long}{threshold value that defines the lower bound for assigning a job to the "long queue"} @@ -17,8 +17,9 @@ job to the "long queue"} a string of "short" or "long" example: -advanced_opts2est_walltime(c("homology_search", "domain_architecture"), 3) |> -assign_job_queue() +calculateEstimatedWallTimeFromOptions(c("homology_search", +"domain_architecture"), 3) |> +assignJobQueue() } \description{ Decision function to assign job queue diff --git a/man/advanced_opts2est_walltime.Rd b/man/calculateEstimatedWallTimeFromOptions.Rd similarity index 68% rename from man/advanced_opts2est_walltime.Rd rename to man/calculateEstimatedWallTimeFromOptions.Rd index ea4b29e6..e4eec3fd 100644 --- a/man/advanced_opts2est_walltime.Rd +++ b/man/calculateEstimatedWallTimeFromOptions.Rd @@ -1,11 +1,11 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/assign_job_queue.R -\name{advanced_opts2est_walltime} -\alias{advanced_opts2est_walltime} +\name{calculateEstimatedWallTimeFromOptions} +\alias{calculateEstimatedWallTimeFromOptions} \title{Given MolEvolvR advanced options and number of inputs, calculate the total estimated walltime for the job} \usage{ -advanced_opts2est_walltime( +calculateEstimatedWallTimeFromOptions( advanced_opts, n_inputs = 1L, n_hits = NULL, @@ -14,14 +14,16 @@ advanced_opts2est_walltime( } \arguments{ \item{advanced_opts}{character vector of MolEvolvR advanced options -(see make_opts2procs for the options)} +(see mapOption2Process for the options)} \item{n_inputs}{total number of input proteins} } \value{ total estimated number of seconds a job will process (walltime) -example: advanced_opts2est_walltime(c("homology_search", "domain_architecture"), n_inputs = 3, n_hits = 50L) +example: calculateEstimatedWallTimeFromOptions(c("homology_search", +"domain_architecture"), +n_inputs = 3, n_hits = 50L) } \description{ Given MolEvolvR advanced options and number of inputs, diff --git a/man/get_proc_medians.Rd b/man/calculateProcessRuntime.Rd similarity index 76% rename from man/get_proc_medians.Rd rename to man/calculateProcessRuntime.Rd index b6db0b56..bb6dd1ed 100644 --- a/man/get_proc_medians.Rd +++ b/man/calculateProcessRuntime.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/assign_job_queue.R -\name{get_proc_medians} -\alias{get_proc_medians} +\name{calculateProcessRuntime} +\alias{calculateProcessRuntime} \title{Scrape MolEvolvR logs and calculate median processes} \usage{ -get_proc_medians(dir_job_results) +calculateProcessRuntime(dir_job_results) } \arguments{ \item{dir_job_results}{\link{chr} path to MolEvolvR job_results @@ -21,12 +21,12 @@ examples: } dir_job_results <- "/data/scratch/janani/molevolvr_out" -list_proc_medians <- get_proc_medians(dir_job_results) +list_proc_medians <- calculateProcessRuntime(dir_job_results) \enumerate{ \item from outside container environment common_root <- "/data/molevolvr_transfer/molevolvr_dev" dir_job_results <- "/data/molevolvr_transfer/molevolvr_dev/job_results" -list_proc_medians <- get_proc_medians(dir_job_results) +list_proc_medians <- calculateProcessRuntime(dir_job_results) } } \description{ diff --git a/man/clean_clust_file.Rd b/man/cleanClusterFile.Rd similarity index 82% rename from man/clean_clust_file.Rd rename to man/cleanClusterFile.Rd index bba3072e..d2818662 100644 --- a/man/clean_clust_file.Rd +++ b/man/cleanClusterFile.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/clean_clust_file.R -\name{clean_clust_file} -\alias{clean_clust_file} +\name{cleanClusterFile} +\alias{cleanClusterFile} \title{Clean Cluster File} \usage{ -clean_clust_file(path, writepath = NULL, query) +cleanClusterFile(path, writepath = NULL, query) } \arguments{ \item{path}{A character to the path of the cluster file to be cleaned} @@ -24,6 +24,6 @@ This function reads a space-separated cluster file and converts it to a cleaned } \examples{ \dontrun{ -clean_clust_file("data/pspa.op_ins_cls", writepath = NULL, query = "pspa") +cleanClusterFile("data/pspa.op_ins_cls", writepath = NULL, query = "pspa") } } diff --git a/man/combine_files.Rd b/man/combineFiles.Rd similarity index 92% rename from man/combine_files.Rd rename to man/combineFiles.Rd index 4126eb9e..3b56b923 100644 --- a/man/combine_files.Rd +++ b/man/combineFiles.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/combine_files.R -\name{combine_files} -\alias{combine_files} +\name{combineFiles} +\alias{combineFiles} \title{Download the combined assembly summaries of genbank and refseq} \usage{ -combine_files( +combineFiles( inpath = c("../molevol_data/project_data/phage_defense/"), pattern = "*full_analysis.tsv", delim = "\\t", diff --git a/man/combine_full.Rd b/man/combineFullAnalysis.Rd similarity index 69% rename from man/combine_full.Rd rename to man/combineFullAnalysis.Rd index f4e6597b..35925e86 100644 --- a/man/combine_full.Rd +++ b/man/combineFullAnalysis.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/combine_analysis.R -\name{combine_full} -\alias{combine_full} +\name{combineFullAnalysis} +\alias{combineFullAnalysis} \title{Combining full_analysis files} \usage{ -combine_full(inpath, ret = FALSE) +combineFullAnalysis(inpath, ret = FALSE) } \arguments{ \item{ret}{} diff --git a/man/combine_ipr.Rd b/man/combineIPR.Rd similarity index 74% rename from man/combine_ipr.Rd rename to man/combineIPR.Rd index 52aa3057..035c4274 100644 --- a/man/combine_ipr.Rd +++ b/man/combineIPR.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/combine_analysis.R -\name{combine_ipr} -\alias{combine_ipr} +\name{combineIPR} +\alias{combineIPR} \title{Combining clean ipr files} \usage{ -combine_ipr(inpath, ret = FALSE) +combineIPR(inpath, ret = FALSE) } \arguments{ \item{ret}{} diff --git a/man/create_lineage_lookup.Rd b/man/createLineageLookup.Rd similarity index 91% rename from man/create_lineage_lookup.Rd rename to man/createLineageLookup.Rd index 51670f35..5dbab978 100644 --- a/man/create_lineage_lookup.Rd +++ b/man/createLineageLookup.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/create_lineage_lookup.R -\name{create_lineage_lookup} -\alias{create_lineage_lookup} +\name{createLineageLookup} +\alias{createLineageLookup} \title{Create a look up table that goes from TaxID, to Lineage} \usage{ -create_lineage_lookup( +createLineageLookup( lineage_file = here("data/rankedlineage.dmp"), outfile, taxonomic_rank = "phylum" diff --git a/man/get_proc_weights.Rd b/man/getProcessRuntimeWeights.Rd similarity index 73% rename from man/get_proc_weights.Rd rename to man/getProcessRuntimeWeights.Rd index 0f4beb57..8eff0347 100644 --- a/man/get_proc_weights.Rd +++ b/man/getProcessRuntimeWeights.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/assign_job_queue.R -\name{get_proc_weights} -\alias{get_proc_weights} +\name{getProcessRuntimeWeights} +\alias{getProcessRuntimeWeights} \title{Quickly get the runtime weights for MolEvolvR backend processes} \usage{ -get_proc_weights(medians_yml_path = NULL) +getProcessRuntimeWeights(medians_yml_path = NULL) } \arguments{ \item{dir_job_results}{\link{chr} path to MolEvolvR job_results @@ -13,7 +13,7 @@ directory} \value{ \link{list} names: processes; values: median runtime (seconds) -example: get_proc_weights() +example: writeProcessRuntimeToYML() } \description{ Quickly get the runtime weights for MolEvolvR backend processes diff --git a/man/map_advanced_opts2procs.Rd b/man/mapAdvOption2Process.Rd similarity index 76% rename from man/map_advanced_opts2procs.Rd rename to man/mapAdvOption2Process.Rd index 631708b4..5bd9ee65 100644 --- a/man/map_advanced_opts2procs.Rd +++ b/man/mapAdvOption2Process.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/assign_job_queue.R -\name{map_advanced_opts2procs} -\alias{map_advanced_opts2procs} +\name{mapAdvOption2Process} +\alias{mapAdvOption2Process} \title{Use MolEvolvR advanced options to get associated processes} \usage{ -map_advanced_opts2procs(advanced_opts) +mapAdvOption2Process(advanced_opts) } \arguments{ \item{advanced_opts}{character vector of MolEvolvR advanced options} @@ -15,7 +15,7 @@ the advanced options example: advanced_opts <- c("homology_search", "domain_architecture") -procs <- map_advanced_opts2procs(advanced_opts) +procs <- mapAdvOption2Process(advanced_opts) } \description{ Use MolEvolvR advanced options to get associated processes diff --git a/man/make_opts2procs.Rd b/man/mapOption2Process.Rd similarity index 75% rename from man/make_opts2procs.Rd rename to man/mapOption2Process.Rd index 07e208b2..ff6905c5 100644 --- a/man/make_opts2procs.Rd +++ b/man/mapOption2Process.Rd @@ -1,15 +1,15 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/assign_job_queue.R -\name{make_opts2procs} -\alias{make_opts2procs} +\name{mapOption2Process} +\alias{mapOption2Process} \title{Construct list where names (MolEvolvR advanced options) point to processes} \usage{ -make_opts2procs() +mapOption2Process() } \value{ list where names (MolEvolvR advanced options) point to processes -example: list_opts2procs <- make_opts2procs +example: list_opts2procs <- mapOption2Process } \description{ Construct list where names (MolEvolvR advanced options) point to processes diff --git a/man/plot_estimated_walltimes.Rd b/man/plotEstimatedWallTimes.Rd similarity index 55% rename from man/plot_estimated_walltimes.Rd rename to man/plotEstimatedWallTimes.Rd index 3669e0e0..0d53cb32 100644 --- a/man/plot_estimated_walltimes.Rd +++ b/man/plotEstimatedWallTimes.Rd @@ -1,18 +1,19 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/assign_job_queue.R -\name{plot_estimated_walltimes} -\alias{plot_estimated_walltimes} +\name{plotEstimatedWallTimes} +\alias{plotEstimatedWallTimes} \title{Plot the estimated runtimes for different advanced options and number of inputs} \usage{ -plot_estimated_walltimes() +plotEstimatedWallTimes() } \value{ line plot object example: -p <- plot_estimated_walltimes() -ggplot2::ggsave(filename = "/data/molevolvr_transfer/molevolvr_dev/molevol_scripts/docs/estimate_walltimes.png", plot = p) +p <- plotEstimatedWallTimes() +ggplot2::ggsave(filename = "/data/molevolvr_transfer/molevolvr_ +dev/molevol_scripts/docs/estimate_walltimes.png", plot = p) } \description{ this function was just for fun; very, very messy code diff --git a/man/write_proc_medians_table.Rd b/man/writeProcessRuntime2TSV.Rd similarity index 77% rename from man/write_proc_medians_table.Rd rename to man/writeProcessRuntime2TSV.Rd index 2ae7a97b..03cbbd68 100644 --- a/man/write_proc_medians_table.Rd +++ b/man/writeProcessRuntime2TSV.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/assign_job_queue.R -\name{write_proc_medians_table} -\alias{write_proc_medians_table} +\name{writeProcessRuntime2TSV} +\alias{writeProcessRuntime2TSV} \title{Write a table of 2 columns: 1) process and 2) median seconds} \usage{ -write_proc_medians_table(dir_job_results, filepath) +writeProcessRuntime2TSV(dir_job_results, filepath) } \arguments{ \item{dir_job_results}{\link{chr} path to MolEvolvR job_results} @@ -14,7 +14,7 @@ write_proc_medians_table(dir_job_results, filepath) \value{ \link{tbl_df} 2 columns: 1) process and 2) median seconds -example: write_proc_medians_table( +example: writeProcessRuntime2TSV( "/data/scratch/janani/molevolvr_out/", "/data/scratch/janani/molevolvr_out/log_tbl.tsv" ) diff --git a/man/write_proc_medians_yml.Rd b/man/writeProcessRuntimeToYML.Rd similarity index 61% rename from man/write_proc_medians_yml.Rd rename to man/writeProcessRuntimeToYML.Rd index a3d8ee5f..e4a5c8ad 100644 --- a/man/write_proc_medians_yml.Rd +++ b/man/writeProcessRuntimeToYML.Rd @@ -1,25 +1,26 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/assign_job_queue.R -\name{write_proc_medians_yml} -\alias{write_proc_medians_yml} +\name{writeProcessRuntimeToYML} +\alias{writeProcessRuntimeToYML} \title{Compute median process runtimes, then write a YAML list of the processes and their median runtimes in seconds to the path specified by 'filepath'.} \usage{ -write_proc_medians_yml(dir_job_results, filepath = NULL) +writeProcessRuntimeToYML(dir_job_results, filepath = NULL) } \arguments{ \item{dir_job_results}{\link{chr} path to MolEvolvR job_results directory} -\item{filepath}{\link{chr} path to save YAML file; if NULL, uses ./molevol_scripts/log_data/job_proc_weights.yml} +\item{filepath}{\link{chr} path to save YAML file; if NULL, +uses ./molevol_scripts/log_data/job_proc_weights.yml} } \description{ The default value of filepath is the value of the env var -MOLEVOLVR_PROC_WEIGHTS, which get_proc_weights() also uses as its default +MOLEVOLVR_PROC_WEIGHTS, which writeProcessRuntimeToYML() also uses as its default read location. } \examples{ \dontrun{ -write_proc_medians_yml( +writeProcessRuntimeToYML( "/data/scratch/janani/molevolvr_out/", "/data/scratch/janani/molevolvr_out/log_tbl.yml" ) From 091d32ebb31b6f295268b4e0a38ef0fab1066358 Mon Sep 17 00:00:00 2001 From: Seyi Kuforiji Date: Tue, 8 Oct 2024 07:17:56 +0100 Subject: [PATCH 03/19] fixing merge issue in NAMESPACE --- NAMESPACE | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/NAMESPACE b/NAMESPACE index 739c76d7..d2ef5463 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -29,6 +29,9 @@ export(cleanSpecies) export(combineFiles) export(combineFullAnalysis) export(combineIPR) +export(condenseRepeatedDomains) +export(convert2TitleCase) +export(convertAlignment2FA) export(convert_aln2fa) export(convert_fa2tre) export(count_bycol) @@ -63,13 +66,15 @@ export(lineage.domain_repeats.plot) export(lineage.neighbors.plot) export(lineage_sunburst) export(make_job_results_url) +export(mapAcc2Name) export(mapAdvOption2Process) export(mapOption2Process) -export(mapAcc2Name) +export(map_acc2name) export(msa_pdf) export(pick_longer_duplicate) export(plotEstimatedWallTimes) export(prot2tax) +export(prot2tax_old) export(removeAsterisks) export(removeEmptyRows) export(removeTails) From fc63187c4985d8a9fad15582691b4ee4f9c273e6 Mon Sep 17 00:00:00 2001 From: Seyi Kuforiji Date: Tue, 8 Oct 2024 08:18:42 +0100 Subject: [PATCH 04/19] Added updated function name to NAMESPACE and removed unused argument in readAAStringSet --- NAMESPACE | 3 +-- R/msa.R | 24 ++++++++++++------------ 2 files changed, 13 insertions(+), 14 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index d2ef5463..cd135cc8 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -20,9 +20,9 @@ export(assert_count_df) export(assignJobQueue) export(calculateEstimatedWallTimeFromOptions) export(calculateProcessRuntime) -export(cleanGeneDescription) export(cleanClusters) export(cleanDomainArchitecture) +export(cleanGeneDescription) export(cleanGenomicContext) export(cleanLineage) export(cleanSpecies) @@ -71,7 +71,6 @@ export(mapAdvOption2Process) export(mapOption2Process) export(map_acc2name) export(msa_pdf) -export(pick_longer_duplicate) export(plotEstimatedWallTimes) export(prot2tax) export(prot2tax_old) diff --git a/R/msa.R b/R/msa.R index e56cc32c..0b1b6e34 100644 --- a/R/msa.R +++ b/R/msa.R @@ -197,21 +197,21 @@ msa_pdf <- function(fasta_path, out_path = NULL, #' #' @examples generate_msa <- function(fa_file = "", outfile = "") { - prot_aa <- readAAStringSet( - path = fa_file, - format = "fasta" - ) - prot_aa + prot_aa <- readAAStringSet( + fa_file, + format = "fasta" + ) + prot_aa - ## Install kalign ?rMSA_INSTALL - ## Messed up! Reimplement from kalign.R - ## https://github.com/mhahsler/rMSA/blob/master/R/kalign.R + ## Install kalign ?rMSA_INSTALL + ## Messed up! Reimplement from kalign.R + ## https://github.com/mhahsler/rMSA/blob/master/R/kalign.R - # source("scripts/c2r.R") + # source("scripts/c2r.R") - ## align the sequences - al <- kalign(prot_aa) # !! won't work! - al + ## align the sequences + al <- kalign(prot_aa) # !! won't work! + al } ############################ From 38f3cb000ddf35028c1e7c940920dd051db1a2dc Mon Sep 17 00:00:00 2001 From: Seyi Kuforiji Date: Wed, 9 Oct 2024 11:32:03 +0100 Subject: [PATCH 05/19] added error handling functionality for the run_deltablast and run_rpsblast functions. This includes arguments check before wrapping code logic in a tryCatch block. --- R/blastWrappers.R | 109 +++++++++++++++++++++++++++++++++++----------- 1 file changed, 84 insertions(+), 25 deletions(-) diff --git a/R/blastWrappers.R b/R/blastWrappers.R index 552b1ff6..15484a1b 100755 --- a/R/blastWrappers.R +++ b/R/blastWrappers.R @@ -18,25 +18,56 @@ #' #' @examples run_deltablast <- function(deltablast_path, db_search_path, - db = "refseq", query, evalue = "1e-5", - out, num_alignments, num_threads = 1) { - start <- Sys.time() + db = "refseq", query, evalue = "1e-5", + out, num_alignments, num_threads = 1) { + # Argument validation + if (!file.exists(deltablast_path)) { + stop("The DELTABLAST executable path is invalid: ", deltablast_path) + } + if (!dir.exists(db_search_path)) { + stop("The database search path is invalid: ", db_search_path) + } + if (!file.exists(query)) { + stop("The query file path is invalid: ", query) + } + if (!is.numeric(as.numeric(evalue)) || as.numeric(evalue) <= 0) { + stop("The evalue must be a positive number: ", evalue) + } + if (!is.numeric(num_alignments) || num_alignments <= 0) { + stop("The number of alignments must be a + positive integer: ", num_alignments) + } + if (!is.numeric(num_threads) || num_threads <= 0) { + stop("The number of threads must be a positive integer: ", num_threads) + } + + start <- Sys.time() + + tryCatch({ system(paste0("export BLASTDB=/", db_search_path)) system2( - command = deltablast_path, - args = c( - "-db", db, - "-query", query, - "-evalue", evalue, - "-out", out, - "-num_threads", num_threads, - "-num_alignments", num_alignments - # ,"-outfmt", outfmt - ) + command = deltablast_path, + args = c( + "-db", db, + "-query", query, + "-evalue", evalue, + "-out", out, + "-num_threads", num_threads, + "-num_alignments", num_alignments + # ,"-outfmt", outfmt + ) ) print(Sys.time() - start) + }, error = function(e) { + message(paste("Error in run_deltablast: ", e)) + }, warning = function(w) { + message(paste("Warning in run_deltablast: ", w)) + }, finally = { + message("run_deltablast completed") + }) + } @@ -55,20 +86,48 @@ run_deltablast <- function(deltablast_path, db_search_path, #' #' @examples run_rpsblast <- function(rpsblast_path, db_search_path, - db = "refseq", query, evalue = "1e-5", - out, num_threads = 1) { - start <- Sys.time() + db = "refseq", query, evalue = "1e-5", + out, num_threads = 1) { + # Argument validation + if (!file.exists(rpsblast_path)) { + stop("The RPSBLAST executable path is invalid: ", rpsblast_path) + } + if (!dir.exists(db_search_path)) { + stop("The database search path is invalid: ", db_search_path) + } + if (!file.exists(query)) { + stop("The query file path is invalid: ", query) + } + if (!is.numeric(as.numeric(evalue)) || as.numeric(evalue) <= 0) { + stop("The evalue must be a positive number: ", evalue) + } + if (!is.numeric(num_threads) || num_threads <= 0) { + stop("The number of threads must be a positive integer: ", num_threads) + } + + start <- Sys.time() + + tryCatch({ + system(paste0("export BLASTDB=/", db_search_path)) + system2( - command = rpsblast_path, - args = c( - "-db", db, - "-query", query, - "-evalue", evalue, - "-out", out, - "-num_threads", num_threads - # , "-outfmt", outfmt - ) + command = rpsblast_path, + args = c( + "-db", db, + "-query", query, + "-evalue", evalue, + "-out", out, + "-num_threads", num_threads + ) ) print(Sys.time() - start) + }, error = function(e) { + message(paste("Error in run_rpsblast: ", e)) + }, warning = function(w) { + message(paste("Warning in run_rpsblast: ", w)) + }, finally = { + message("run_rpsblast completed") + }) + } From 4ff68fb06395842093879dea47e45aaae1967225 Mon Sep 17 00:00:00 2001 From: Seyi Kuforiji Date: Thu, 10 Oct 2024 08:27:02 +0100 Subject: [PATCH 06/19] Reverting to old function names for the following functions to create a separate pr for their updates and on a different branch: R/combine_analysis.R combine_full combine_ipr R/combine_files.R combine_files R/create_lineage_lookup.R create_lineage_lookup shorten_NA --- R/combine_analysis.R | 4 ++-- R/combine_files.R | 10 +++++----- R/create_lineage_lookup.R | 8 ++++---- 3 files changed, 11 insertions(+), 11 deletions(-) diff --git a/R/combine_analysis.R b/R/combine_analysis.R index 58ce1f14..bb3b3ce2 100755 --- a/R/combine_analysis.R +++ b/R/combine_analysis.R @@ -17,7 +17,7 @@ #' @export #' #' @examples -combineFullAnalysis <- function(inpath, ret = FALSE) { +combine_full <- function(inpath, ret = FALSE) { ## Combining full_analysis files full_combnd <- combine_files(inpath, pattern = "*.full_analysis.tsv", skip = 0, @@ -44,7 +44,7 @@ combineFullAnalysis <- function(inpath, ret = FALSE) { #' @export #' #' @examples -combineIPR <- function(inpath, ret = FALSE) { +combine_ipr <- function(inpath, ret = FALSE) { ## Combining clean ipr files ipr_combnd <- combine_files(inpath, pattern = "*.iprscan_cln.tsv", skip = 0, diff --git a/R/combine_files.R b/R/combine_files.R index 455ddd53..76c5fa09 100755 --- a/R/combine_files.R +++ b/R/combine_files.R @@ -38,7 +38,7 @@ #' @export #' #' @examples -combineFiles <- function(inpath = c("../molevol_data/project_data/phage_defense/"), +combine_files <- function(inpath = c("../molevol_data/project_data/phage_defense/"), pattern = "*full_analysis.tsv", delim = "\t", skip = 0, col_names = T) { @@ -67,7 +67,7 @@ combineFiles <- function(inpath = c("../molevol_data/project_data/phage_defense/ ## Sample Runs ## ################# # ## Combining full_analysis files -# full_combnd <- combineFiles(inpath, +# full_combnd <- combine_files(inpath, # pattern="*full_analysis.txt", skip=0, # col_names=T) # @@ -75,7 +75,7 @@ combineFiles <- function(inpath = c("../molevol_data/project_data/phage_defense/ # path="../molevol_data/project_data/slps/full_combined.tsv") # # ## Combining clean files -# cln_combnd <- combineFiles(inpath, +# cln_combnd <- combine_files(inpath, # pattern="^.*cln.txt", skip=0, # col_names=T) # @@ -86,14 +86,14 @@ combineFiles <- function(inpath = c("../molevol_data/project_data/phage_defense/ # ## Less helpful examples! # ## Combining BLAST files # ## Likely makes no sense since clustering is done per query -# cl_blast_combnd <- combineFiles(inpath, +# cl_blast_combnd <- combine_files(inpath, # pattern="^.*refseq.1e-5.txt", skip=0, # col_names=cl_blast_colnames) %>% # select(-PcPositive, -ClusterID) # # ## Combining IPR files # ## Likely makes no sense since there may be repeated AccNum from indiv. files! -# ipr_combnd <- combineFiles(inpath, +# ipr_combnd <- combine_files(inpath, # pattern="*iprscan.lins*", skip=0, # col_names=ipr_colnames) # diff --git a/R/create_lineage_lookup.R b/R/create_lineage_lookup.R index d911934a..8e365cbb 100644 --- a/R/create_lineage_lookup.R +++ b/R/create_lineage_lookup.R @@ -26,9 +26,9 @@ #' @export #' #' @examples -createLineageLookup <- function(lineage_file = here("data/rankedlineage.dmp"), +create_lineage_lookup <- function(lineage_file = here("data/rankedlineage.dmp"), outfile, taxonomic_rank = "phylum") { - .shortenNA <- function(Lineage) { + shorten_NA <- function(Lineage) { first_NA <- str_locate(Lineage, "NA")[1] if (is.na(first_NA)) { # No NAs @@ -92,7 +92,7 @@ createLineageLookup <- function(lineage_file = here("data/rankedlineage.dmp"), # Takes a while (2million rows after all) rankedLinsCombined <- rankedLins %>% unite(col = "Lineage", all_of(combined_taxonomy), sep = ">") %>% - mutate(Lineage = unlist(map(Lineage, .shortenNA))) + mutate(Lineage = unlist(map(Lineage, shorten_NA))) @@ -101,7 +101,7 @@ createLineageLookup <- function(lineage_file = here("data/rankedlineage.dmp"), -#' CreateLineageLookup <- function(assembly_path, updateAssembly = FALSE, file_type = "tsv") +#' create_lineage_lookup <- function(assembly_path, updateAssembly = FALSE, file_type = "tsv") #' { #' #' Create a look up table that goes from GCA_ID, to TaxID, to Lineage #' #' @author Samuel Chen From 035c5e13b4cfe54b4ba7ff1d5c7618ade13720d1 Mon Sep 17 00:00:00 2001 From: Seyi Kuforiji Date: Thu, 10 Oct 2024 08:41:47 +0100 Subject: [PATCH 07/19] minor updates to namespace and Rd files after running devtool::check() --- NAMESPACE | 8 ++++---- man/{combineFiles.Rd => combine_files.Rd} | 6 +++--- man/{combineFullAnalysis.Rd => combine_full.Rd} | 6 +++--- man/{combineIPR.Rd => combine_ipr.Rd} | 6 +++--- man/{createLineageLookup.Rd => create_lineage_lookup.Rd} | 6 +++--- 5 files changed, 16 insertions(+), 16 deletions(-) rename man/{combineFiles.Rd => combine_files.Rd} (92%) rename man/{combineFullAnalysis.Rd => combine_full.Rd} (69%) rename man/{combineIPR.Rd => combine_ipr.Rd} (74%) rename man/{createLineageLookup.Rd => create_lineage_lookup.Rd} (91%) diff --git a/NAMESPACE b/NAMESPACE index cd135cc8..f49975b4 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -26,9 +26,9 @@ export(cleanGeneDescription) export(cleanGenomicContext) export(cleanLineage) export(cleanSpecies) -export(combineFiles) -export(combineFullAnalysis) -export(combineIPR) +export(combine_files) +export(combine_full) +export(combine_ipr) export(condenseRepeatedDomains) export(convert2TitleCase) export(convertAlignment2FA) @@ -37,8 +37,8 @@ export(convert_fa2tre) export(count_bycol) export(count_to_sunburst) export(count_to_treemap) -export(createLineageLookup) export(create_all_col_params) +export(create_lineage_lookup) export(create_one_col_params) export(domain_network) export(efetch_ipg) diff --git a/man/combineFiles.Rd b/man/combine_files.Rd similarity index 92% rename from man/combineFiles.Rd rename to man/combine_files.Rd index 3b56b923..4126eb9e 100644 --- a/man/combineFiles.Rd +++ b/man/combine_files.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/combine_files.R -\name{combineFiles} -\alias{combineFiles} +\name{combine_files} +\alias{combine_files} \title{Download the combined assembly summaries of genbank and refseq} \usage{ -combineFiles( +combine_files( inpath = c("../molevol_data/project_data/phage_defense/"), pattern = "*full_analysis.tsv", delim = "\\t", diff --git a/man/combineFullAnalysis.Rd b/man/combine_full.Rd similarity index 69% rename from man/combineFullAnalysis.Rd rename to man/combine_full.Rd index 35925e86..f4e6597b 100644 --- a/man/combineFullAnalysis.Rd +++ b/man/combine_full.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/combine_analysis.R -\name{combineFullAnalysis} -\alias{combineFullAnalysis} +\name{combine_full} +\alias{combine_full} \title{Combining full_analysis files} \usage{ -combineFullAnalysis(inpath, ret = FALSE) +combine_full(inpath, ret = FALSE) } \arguments{ \item{ret}{} diff --git a/man/combineIPR.Rd b/man/combine_ipr.Rd similarity index 74% rename from man/combineIPR.Rd rename to man/combine_ipr.Rd index 035c4274..52aa3057 100644 --- a/man/combineIPR.Rd +++ b/man/combine_ipr.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/combine_analysis.R -\name{combineIPR} -\alias{combineIPR} +\name{combine_ipr} +\alias{combine_ipr} \title{Combining clean ipr files} \usage{ -combineIPR(inpath, ret = FALSE) +combine_ipr(inpath, ret = FALSE) } \arguments{ \item{ret}{} diff --git a/man/createLineageLookup.Rd b/man/create_lineage_lookup.Rd similarity index 91% rename from man/createLineageLookup.Rd rename to man/create_lineage_lookup.Rd index 5dbab978..51670f35 100644 --- a/man/createLineageLookup.Rd +++ b/man/create_lineage_lookup.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/create_lineage_lookup.R -\name{createLineageLookup} -\alias{createLineageLookup} +\name{create_lineage_lookup} +\alias{create_lineage_lookup} \title{Create a look up table that goes from TaxID, to Lineage} \usage{ -createLineageLookup( +create_lineage_lookup( lineage_file = here("data/rankedlineage.dmp"), outfile, taxonomic_rank = "phylum" From fb5ac23f8a3e8e5709498aa24308a950802d1c29 Mon Sep 17 00:00:00 2001 From: Seyi Kuforiji Date: Thu, 10 Oct 2024 09:20:22 +0100 Subject: [PATCH 08/19] Renamed the following function; R/combine_analysis.R combine_full combine_ipr R/combine_files.R combine_files R/create_lineage_lookup.R create_lineage_lookup shorten_NA with approved names from #44 --- NAMESPACE | 8 ++++---- R/acc2lin.R | 2 +- R/combine_analysis.R | 8 ++++---- R/combine_files.R | 10 +++++----- R/create_lineage_lookup.R | 8 ++++---- R/lineage.R | 4 ++-- man/GCA2lin.Rd | 2 +- man/{combine_files.Rd => combineFiles.Rd} | 6 +++--- man/{combine_full.Rd => combineFullAnalysis.Rd} | 6 +++--- man/{combine_ipr.Rd => combineIPR.Rd} | 6 +++--- ...create_lineage_lookup.Rd => createLineageLookup.Rd} | 6 +++--- man/ipg2lin.Rd | 2 +- 12 files changed, 34 insertions(+), 34 deletions(-) rename man/{combine_files.Rd => combineFiles.Rd} (92%) rename man/{combine_full.Rd => combineFullAnalysis.Rd} (69%) rename man/{combine_ipr.Rd => combineIPR.Rd} (74%) rename man/{create_lineage_lookup.Rd => createLineageLookup.Rd} (91%) diff --git a/NAMESPACE b/NAMESPACE index f49975b4..cd135cc8 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -26,9 +26,9 @@ export(cleanGeneDescription) export(cleanGenomicContext) export(cleanLineage) export(cleanSpecies) -export(combine_files) -export(combine_full) -export(combine_ipr) +export(combineFiles) +export(combineFullAnalysis) +export(combineIPR) export(condenseRepeatedDomains) export(convert2TitleCase) export(convertAlignment2FA) @@ -37,8 +37,8 @@ export(convert_fa2tre) export(count_bycol) export(count_to_sunburst) export(count_to_treemap) +export(createLineageLookup) export(create_all_col_params) -export(create_lineage_lookup) export(create_one_col_params) export(domain_network) export(efetch_ipg) diff --git a/R/acc2lin.R b/R/acc2lin.R index dfb33da9..a6551247 100644 --- a/R/acc2lin.R +++ b/R/acc2lin.R @@ -277,7 +277,7 @@ efetch_ipg <- function(accnums, out_path, plan = "sequential") { #' This file can be generated using the "DownloadAssemblySummary()" function #' @param lineagelookup_path String of the path to the lineage lookup file #' (taxid to lineage mapping). This file can be generated using the -#' "create_lineage_lookup()" function +#' "createLineageLookup()" function #' #' @importFrom data.table fread #' diff --git a/R/combine_analysis.R b/R/combine_analysis.R index bb3b3ce2..55e36925 100755 --- a/R/combine_analysis.R +++ b/R/combine_analysis.R @@ -17,9 +17,9 @@ #' @export #' #' @examples -combine_full <- function(inpath, ret = FALSE) { +combineFullAnalysis <- function(inpath, ret = FALSE) { ## Combining full_analysis files - full_combnd <- combine_files(inpath, + full_combnd <- combineFiles(inpath, pattern = "*.full_analysis.tsv", skip = 0, col_names = T ) @@ -44,9 +44,9 @@ combine_full <- function(inpath, ret = FALSE) { #' @export #' #' @examples -combine_ipr <- function(inpath, ret = FALSE) { +combineIPR <- function(inpath, ret = FALSE) { ## Combining clean ipr files - ipr_combnd <- combine_files(inpath, + ipr_combnd <- combineFiles(inpath, pattern = "*.iprscan_cln.tsv", skip = 0, col_names = T ) diff --git a/R/combine_files.R b/R/combine_files.R index 76c5fa09..455ddd53 100755 --- a/R/combine_files.R +++ b/R/combine_files.R @@ -38,7 +38,7 @@ #' @export #' #' @examples -combine_files <- function(inpath = c("../molevol_data/project_data/phage_defense/"), +combineFiles <- function(inpath = c("../molevol_data/project_data/phage_defense/"), pattern = "*full_analysis.tsv", delim = "\t", skip = 0, col_names = T) { @@ -67,7 +67,7 @@ combine_files <- function(inpath = c("../molevol_data/project_data/phage_defense ## Sample Runs ## ################# # ## Combining full_analysis files -# full_combnd <- combine_files(inpath, +# full_combnd <- combineFiles(inpath, # pattern="*full_analysis.txt", skip=0, # col_names=T) # @@ -75,7 +75,7 @@ combine_files <- function(inpath = c("../molevol_data/project_data/phage_defense # path="../molevol_data/project_data/slps/full_combined.tsv") # # ## Combining clean files -# cln_combnd <- combine_files(inpath, +# cln_combnd <- combineFiles(inpath, # pattern="^.*cln.txt", skip=0, # col_names=T) # @@ -86,14 +86,14 @@ combine_files <- function(inpath = c("../molevol_data/project_data/phage_defense # ## Less helpful examples! # ## Combining BLAST files # ## Likely makes no sense since clustering is done per query -# cl_blast_combnd <- combine_files(inpath, +# cl_blast_combnd <- combineFiles(inpath, # pattern="^.*refseq.1e-5.txt", skip=0, # col_names=cl_blast_colnames) %>% # select(-PcPositive, -ClusterID) # # ## Combining IPR files # ## Likely makes no sense since there may be repeated AccNum from indiv. files! -# ipr_combnd <- combine_files(inpath, +# ipr_combnd <- combineFiles(inpath, # pattern="*iprscan.lins*", skip=0, # col_names=ipr_colnames) # diff --git a/R/create_lineage_lookup.R b/R/create_lineage_lookup.R index 8e365cbb..78e79048 100644 --- a/R/create_lineage_lookup.R +++ b/R/create_lineage_lookup.R @@ -26,9 +26,9 @@ #' @export #' #' @examples -create_lineage_lookup <- function(lineage_file = here("data/rankedlineage.dmp"), +createLineageLookup <- function(lineage_file = here("data/rankedlineage.dmp"), outfile, taxonomic_rank = "phylum") { - shorten_NA <- function(Lineage) { + .shortenNA <- function(Lineage) { first_NA <- str_locate(Lineage, "NA")[1] if (is.na(first_NA)) { # No NAs @@ -92,7 +92,7 @@ create_lineage_lookup <- function(lineage_file = here("data/rankedlineage.dmp"), # Takes a while (2million rows after all) rankedLinsCombined <- rankedLins %>% unite(col = "Lineage", all_of(combined_taxonomy), sep = ">") %>% - mutate(Lineage = unlist(map(Lineage, shorten_NA))) + mutate(Lineage = unlist(map(Lineage, .shortenNA))) @@ -101,7 +101,7 @@ create_lineage_lookup <- function(lineage_file = here("data/rankedlineage.dmp"), -#' create_lineage_lookup <- function(assembly_path, updateAssembly = FALSE, file_type = "tsv") +#' createLineageLookup <- function(assembly_path, updateAssembly = FALSE, file_type = "tsv") #' { #' #' Create a look up table that goes from GCA_ID, to TaxID, to Lineage #' #' @author Samuel Chen diff --git a/R/lineage.R b/R/lineage.R index 20acec04..7ceed847 100644 --- a/R/lineage.R +++ b/R/lineage.R @@ -77,7 +77,7 @@ DownloadAssemblySummary <- function(outpath, #' This file can be generated using the "DownloadAssemblySummary()" function #' @param lineagelookup_path String of the path to the lineage lookup file #' (taxid to lineage mapping). This file can be generated using the -#' "create_lineage_lookup()" function +#' "createLineageLookup()" function #' @param acc_col #' #' @importFrom dplyr pull @@ -309,7 +309,7 @@ efetch_ipg <- function(accessions, out_path, plan = "multicore") { #' @param genbank_assembly_path #' @param lineagelookup_path String of the path to the lineage lookup file #' (taxid to lineage mapping). This file can be generated using the -#' "create_lineage_lookup()" function +#' "createLineageLookup()" function #' #' @importFrom data.table fread setnames #' diff --git a/man/GCA2lin.Rd b/man/GCA2lin.Rd index ad83ca39..47acc3d7 100644 --- a/man/GCA2lin.Rd +++ b/man/GCA2lin.Rd @@ -19,7 +19,7 @@ This file can be generated using the "DownloadAssemblySummary()" function} \item{lineagelookup_path}{String of the path to the lineage lookup file (taxid to lineage mapping). This file can be generated using the -"create_lineage_lookup()" function} +"createLineageLookup()" function} \item{acc_col}{} } diff --git a/man/combine_files.Rd b/man/combineFiles.Rd similarity index 92% rename from man/combine_files.Rd rename to man/combineFiles.Rd index 4126eb9e..3b56b923 100644 --- a/man/combine_files.Rd +++ b/man/combineFiles.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/combine_files.R -\name{combine_files} -\alias{combine_files} +\name{combineFiles} +\alias{combineFiles} \title{Download the combined assembly summaries of genbank and refseq} \usage{ -combine_files( +combineFiles( inpath = c("../molevol_data/project_data/phage_defense/"), pattern = "*full_analysis.tsv", delim = "\\t", diff --git a/man/combine_full.Rd b/man/combineFullAnalysis.Rd similarity index 69% rename from man/combine_full.Rd rename to man/combineFullAnalysis.Rd index f4e6597b..35925e86 100644 --- a/man/combine_full.Rd +++ b/man/combineFullAnalysis.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/combine_analysis.R -\name{combine_full} -\alias{combine_full} +\name{combineFullAnalysis} +\alias{combineFullAnalysis} \title{Combining full_analysis files} \usage{ -combine_full(inpath, ret = FALSE) +combineFullAnalysis(inpath, ret = FALSE) } \arguments{ \item{ret}{} diff --git a/man/combine_ipr.Rd b/man/combineIPR.Rd similarity index 74% rename from man/combine_ipr.Rd rename to man/combineIPR.Rd index 52aa3057..035c4274 100644 --- a/man/combine_ipr.Rd +++ b/man/combineIPR.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/combine_analysis.R -\name{combine_ipr} -\alias{combine_ipr} +\name{combineIPR} +\alias{combineIPR} \title{Combining clean ipr files} \usage{ -combine_ipr(inpath, ret = FALSE) +combineIPR(inpath, ret = FALSE) } \arguments{ \item{ret}{} diff --git a/man/create_lineage_lookup.Rd b/man/createLineageLookup.Rd similarity index 91% rename from man/create_lineage_lookup.Rd rename to man/createLineageLookup.Rd index 51670f35..5dbab978 100644 --- a/man/create_lineage_lookup.Rd +++ b/man/createLineageLookup.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/create_lineage_lookup.R -\name{create_lineage_lookup} -\alias{create_lineage_lookup} +\name{createLineageLookup} +\alias{createLineageLookup} \title{Create a look up table that goes from TaxID, to Lineage} \usage{ -create_lineage_lookup( +createLineageLookup( lineage_file = here("data/rankedlineage.dmp"), outfile, taxonomic_rank = "phylum" diff --git a/man/ipg2lin.Rd b/man/ipg2lin.Rd index 453668b0..5850e86c 100644 --- a/man/ipg2lin.Rd +++ b/man/ipg2lin.Rd @@ -29,7 +29,7 @@ file} \item{lineagelookup_path}{String of the path to the lineage lookup file (taxid to lineage mapping). This file can be generated using the -"create_lineage_lookup()" function} +"createLineageLookup()" function} \item{assembly_path}{String of the path to the assembly_summary path This file can be generated using the "DownloadAssemblySummary()" function} From 106eb14b4e2eace66737a07cf5840011e490d116 Mon Sep 17 00:00:00 2001 From: Seyi Kuforiji Date: Thu, 10 Oct 2024 10:24:49 +0100 Subject: [PATCH 09/19] reverting to old function names; make_opts2procs, map_advanced_opts2procs, get_proc_medians, write_proc_medians_table, write_proc_medians_yml, get_proc_weights, advanced_opts2est_walltime in R/assign_job_queue.R to be updated in a separate full request --- NAMESPACE | 18 ++-- R/assign_job_queue.R | 84 +++++++++---------- ...tions.Rd => advanced_opts2est_walltime.Rd} | 10 +-- ...{assignJobQueue.Rd => assign_job_queue.Rd} | 12 +-- ...eProcessRuntime.Rd => get_proc_medians.Rd} | 10 +-- ...sRuntimeWeights.Rd => get_proc_weights.Rd} | 8 +- ...apOption2Process.Rd => make_opts2procs.Rd} | 8 +- ...2Process.Rd => map_advanced_opts2procs.Rd} | 8 +- ...llTimes.Rd => plot_estimated_walltimes.Rd} | 8 +- ...ime2TSV.Rd => write_proc_medians_table.Rd} | 8 +- ...timeToYML.Rd => write_proc_medians_yml.Rd} | 10 +-- 11 files changed, 92 insertions(+), 92 deletions(-) rename man/{calculateEstimatedWallTimeFromOptions.Rd => advanced_opts2est_walltime.Rd} (73%) rename man/{assignJobQueue.Rd => assign_job_queue.Rd} (68%) rename man/{calculateProcessRuntime.Rd => get_proc_medians.Rd} (76%) rename man/{getProcessRuntimeWeights.Rd => get_proc_weights.Rd} (73%) rename man/{mapOption2Process.Rd => make_opts2procs.Rd} (75%) rename man/{mapAdvOption2Process.Rd => map_advanced_opts2procs.Rd} (76%) rename man/{plotEstimatedWallTimes.Rd => plot_estimated_walltimes.Rd} (77%) rename man/{writeProcessRuntime2TSV.Rd => write_proc_medians_table.Rd} (77%) rename man/{writeProcessRuntimeToYML.Rd => write_proc_medians_yml.Rd} (74%) diff --git a/NAMESPACE b/NAMESPACE index f49975b4..b4be51ec 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -15,11 +15,10 @@ export(add_leaves) export(add_lins) export(add_name) export(add_tax) +export(advanced_opts2est_walltime) export(alignFasta) export(assert_count_df) -export(assignJobQueue) -export(calculateEstimatedWallTimeFromOptions) -export(calculateProcessRuntime) +export(assign_job_queue) export(cleanClusters) export(cleanDomainArchitecture) export(cleanGeneDescription) @@ -54,9 +53,10 @@ export(generate_all_aln2fa) export(generate_fa2tre) export(generate_msa) export(generate_trees) -export(getProcessRuntimeWeights) export(get_accnums_from_fasta_file) export(get_job_message) +export(get_proc_medians) +export(get_proc_weights) export(ipg2lin) export(ipr2viz) export(ipr2viz_web) @@ -66,12 +66,12 @@ export(lineage.domain_repeats.plot) export(lineage.neighbors.plot) export(lineage_sunburst) export(make_job_results_url) +export(make_opts2procs) export(mapAcc2Name) -export(mapAdvOption2Process) -export(mapOption2Process) export(map_acc2name) +export(map_advanced_opts2procs) export(msa_pdf) -export(plotEstimatedWallTimes) +export(plot_estimated_walltimes) export(prot2tax) export(prot2tax_old) export(removeAsterisks) @@ -103,8 +103,8 @@ export(wordcloud2_element) export(wordcloud3) export(wordcloud_element) export(write.MsaAAMultipleAlignment) -export(writeProcessRuntime2TSV) -export(writeProcessRuntimeToYML) +export(write_proc_medians_table) +export(write_proc_medians_yml) importFrom(Biostrings,AAStringSet) importFrom(Biostrings,readAAStringSet) importFrom(Biostrings,toString) diff --git a/R/assign_job_queue.R b/R/assign_job_queue.R index f1fcb6db..c531fb09 100644 --- a/R/assign_job_queue.R +++ b/R/assign_job_queue.R @@ -3,16 +3,16 @@ # pipeline. # to use this, construct paths like so: file.path(common_root, "path", "to", "file.R") # for example, the reference for this file would be: -# file.path(common_root, "molevol_scripts", "R", "assignJobQueue.R") +# file.path(common_root, "molevol_scripts", "R", "assign_job_queue.R") common_root <- Sys.getenv("COMMON_SRC_ROOT") #' Construct list where names (MolEvolvR advanced options) point to processes #' #' @return list where names (MolEvolvR advanced options) point to processes #' -#' example: list_opts2procs <- mapOption2Process +#' example: list_opts2procs <- make_opts2procs #' @export -mapOption2Process <- function() { +make_opts2procs <- function() { tryCatch({ opts2processes <- list( "homology_search" = c("dblast", "dblast_cleanup"), @@ -26,7 +26,7 @@ mapOption2Process <- function() { }, warning = function(w) { message(paste("Warning: ", w$message)) }, finally = { - message("mapOption2Process function execution completed.") + message("make_opts2procs function execution completed.") }) } @@ -40,16 +40,16 @@ mapOption2Process <- function() { #' #' example: #' advanced_opts <- c("homology_search", "domain_architecture") -#' procs <- mapAdvOption2Process(advanced_opts) +#' procs <- map_advanced_opts2procs(advanced_opts) #' @export -mapAdvOption2Process <- function(advanced_opts) { +map_advanced_opts2procs <- function(advanced_opts) { if (!is.character(advanced_opts)) { stop("Argument must be a character vector!") } tryCatch({ # append 'always' to add procs that always run advanced_opts <- c(advanced_opts, "always") - opts2proc <- mapOption2Process() + opts2proc <- make_opts2procs() # setup index for opts2proc based on advanced options idx <- which(names(opts2proc) %in% advanced_opts) # extract processes that will run @@ -60,7 +60,7 @@ mapAdvOption2Process <- function(advanced_opts) { }, warning = function(w) { message(paste("Warning: ", w$message)) }, finally = { - message("mapOption2Process function execution completed.") + message("make_opts2procs function execution completed.") }) } @@ -80,14 +80,14 @@ mapAdvOption2Process <- function(advanced_opts) { #' #' 1) #' dir_job_results <- "/data/scratch/janani/molevolvr_out" -#' list_proc_medians <- calculateProcessRuntime(dir_job_results) +#' list_proc_medians <- get_proc_medians(dir_job_results) #' #' 2) from outside container environment #' common_root <- "/data/molevolvr_transfer/molevolvr_dev" #' dir_job_results <- "/data/molevolvr_transfer/molevolvr_dev/job_results" -#' list_proc_medians <- calculateProcessRuntime(dir_job_results) +#' list_proc_medians <- get_proc_medians(dir_job_results) #' @export -calculateProcessRuntime <- function(dir_job_results) { +get_proc_medians <- function(dir_job_results) { tryCatch({ # Check if dir_job_results is a character string if (!is.character(dir_job_results) || length(dir_job_results) != 1) { @@ -139,7 +139,7 @@ calculateProcessRuntime <- function(dir_job_results) { }, warning = function(w) { message(paste("Warning: ", w$message)) }, finally = { - message("calculateProcessRuntime function execution completed.") + message("get_proc_medians function execution completed.") }) } @@ -156,12 +156,12 @@ calculateProcessRuntime <- function(dir_job_results) { #' #' @return [tbl_df] 2 columns: 1) process and 2) median seconds #' -#' example: writeProcessRuntime2TSV( +#' example: write_proc_medians_table( #' "/data/scratch/janani/molevolvr_out/", #' "/data/scratch/janani/molevolvr_out/log_tbl.tsv" #' ) #' @export -writeProcessRuntime2TSV <- function(dir_job_results, filepath) { +write_proc_medians_table <- function(dir_job_results, filepath) { tryCatch({ # Error handling for input arguments if (!is.character(dir_job_results) || length(dir_job_results) != 1) { @@ -175,7 +175,7 @@ writeProcessRuntime2TSV <- function(dir_job_results, filepath) { if (!is.character(filepath) || length(filepath) != 1) { stop("Input 'filepath' must be a single character string.") } - df_proc_medians <- calculateProcessRuntime(dir_job_results) |> + df_proc_medians <- get_proc_medians(dir_job_results) |> tibble::as_tibble() |> tidyr::pivot_longer( dplyr::everything(), @@ -192,7 +192,7 @@ writeProcessRuntime2TSV <- function(dir_job_results, filepath) { }, warning = function(w) { message(paste("Warning: ", w$message)) }, finally = { - message("writeProcessRuntime2TSV function execution completed.") + message("write_proc_medians_table function execution completed.") }) } @@ -201,7 +201,7 @@ writeProcessRuntime2TSV <- function(dir_job_results, filepath) { #' their median runtimes in seconds to the path specified by 'filepath'. #' #' The default value of filepath is the value of the env var -#' MOLEVOLVR_PROC_WEIGHTS, which writeProcessRuntimeToYML() also uses as its default +#' MOLEVOLVR_PROC_WEIGHTS, which write_proc_medians_yml() also uses as its default #' read location. #' #' @param dir_job_results [chr] path to MolEvolvR job_results directory @@ -212,13 +212,13 @@ writeProcessRuntime2TSV <- function(dir_job_results, filepath) { #' #' @examples #' \dontrun{ -#' writeProcessRuntimeToYML( +#' write_proc_medians_yml( #' "/data/scratch/janani/molevolvr_out/", #' "/data/scratch/janani/molevolvr_out/log_tbl.yml" #' ) #' } #' @export -writeProcessRuntimeToYML <- function(dir_job_results, filepath = NULL) { +write_proc_medians_yml <- function(dir_job_results, filepath = NULL) { tryCatch({ # Error handling for dir_job_results arguments if (!is.character(dir_job_results) || length(dir_job_results) != 1) { @@ -238,7 +238,7 @@ writeProcessRuntimeToYML <- function(dir_job_results, filepath = NULL) { stop("Input 'filepath' must be a single character string.") } - medians <- calculateProcessRuntime(dir_job_results) + medians <- get_proc_medians(dir_job_results) yaml::write_yaml(medians, filepath) }, error = function(e) { message(paste("Encountered an error: "), e$message) @@ -261,9 +261,9 @@ writeProcessRuntimeToYML <- function(dir_job_results, filepath = NULL) { #' #' @return [list] names: processes; values: median runtime (seconds) #' -#' example: writeProcessRuntimeToYML() +#' example: write_proc_medians_yml() #' @export -getProcessRuntimeWeights <- function(medians_yml_path = NULL) { +get_proc_weights <- function(medians_yml_path = NULL) { if (is.null(medians_yml_path)) { medians_yml_path <- file.path(common_root, "molevol_scripts", @@ -273,7 +273,7 @@ getProcessRuntimeWeights <- function(medians_yml_path = NULL) { proc_weights <- tryCatch({ # attempt to read the weights from the YAML file produced by - # writeProcessRuntimeToYML() + # write_proc_medians_yml() if (stringr::str_trim(medians_yml_path) == "") { stop( stringr::str_glue("medians_yml_path is empty @@ -285,7 +285,7 @@ getProcessRuntimeWeights <- function(medians_yml_path = NULL) { }, # to avoid fatal errors in reading the proc weights yaml, # some median process runtimes have been hardcoded based on - # the result of calculateProcessRuntime() from Jan 2024 + # the result of get_proc_medians() from Jan 2024 error = function(cond) { proc_weights <- list( "dblast" = 2810, @@ -306,7 +306,7 @@ getProcessRuntimeWeights <- function(medians_yml_path = NULL) { #' calculate the total estimated walltime for the job #' #' @param advanced_opts character vector of MolEvolvR advanced options -#' (see mapOption2Process for the options) +#' (see make_opts2procs for the options) #' @param n_inputs total number of input proteins #' #' @importFrom dplyr if_else @@ -314,11 +314,11 @@ getProcessRuntimeWeights <- function(medians_yml_path = NULL) { #' #' @return total estimated number of seconds a job will process (walltime) #' -#' example: calculateEstimatedWallTimeFromOptions(c("homology_search", +#' example: advanced_opts2est_walltime (c("homology_search", #' "domain_architecture"), #' n_inputs = 3, n_hits = 50L) #' @export -calculateEstimatedWallTimeFromOptions <- function(advanced_opts, +advanced_opts2est_walltime <- function(advanced_opts, n_inputs = 1L, n_hits = NULL, verbose = FALSE) { @@ -348,7 +348,7 @@ calculateEstimatedWallTimeFromOptions <- function(advanced_opts, } # Get process weights - proc_weights <- writeProcessRuntimeToYML() + proc_weights <- write_proc_medians_yml() if (!is.list(proc_weights)) { stop("Process weights could not be retrieved correctly.") } @@ -357,7 +357,7 @@ calculateEstimatedWallTimeFromOptions <- function(advanced_opts, proc_weights <- proc_weights[order(names(proc_weights))] |> unlist() all_procs <- names(proc_weights) |> sort() # get processes from advanced options and sort by names - procs_from_opts <- mapAdvOption2Process(advanced_opts) + procs_from_opts <- map_advanced_opts2procs(advanced_opts) procs_from_opts <- sort(procs_from_opts) # binary encode: yes proc will run (1); else 0 binary_proc_vec <- dplyr::if_else(all_procs %in% procs_from_opts, 1L, 0L) @@ -366,7 +366,7 @@ calculateEstimatedWallTimeFromOptions <- function(advanced_opts, as.numeric() # calculate the additional processes to run for the homologous hits if ("homology_search" %in% advanced_opts) { - opts2procs <- mapOption2Process() + opts2procs <- make_opts2procs() # exclude the homology search processes for the homologous hits procs2exclude_for_homologs <- opts2procs[["homology_search"]] procs_homologs <- procs_from_opts[!(procs_from_opts @@ -380,7 +380,7 @@ calculateEstimatedWallTimeFromOptions <- function(advanced_opts, } if (verbose) { msg <- stringr::str_glue( - "warnings from calculateEstimatedWallTimeFromOptions():\n", + "warnings from advanced_opts2est_walltime ():\n", "\tn_inputs={n_inputs}\n", "\tn_hits={ifelse(is.null(n_hits), 'null', n_hits)}\n", "\test_walltime={est_walltime}\n\n" @@ -393,7 +393,7 @@ calculateEstimatedWallTimeFromOptions <- function(advanced_opts, }, warning = function(w) { message(paste("Warning: ", w$message)) }, finally = { - message("calculateEstimatedWallTimeFromOptions + message("advanced_opts2est_walltime function execution completed.") }) @@ -403,18 +403,18 @@ calculateEstimatedWallTimeFromOptions <- function(advanced_opts, #' Decision function to assign job queue #' #' @param t_sec_estimate estimated number of seconds a job will process -#' (from calculateEstimatedWallTimeFromOptions()) +#' (from advanced_opts2est_walltime ()) #' @param t_long threshold value that defines the lower bound for assigning a #' job to the "long queue" #' #' @return a string of "short" or "long" #' #' example: -#' calculateEstimatedWallTimeFromOptions(c("homology_search", +#' advanced_opts2est_walltime (c("homology_search", #' "domain_architecture"), 3) |> -#' assignJobQueue() +#' assign_job_queue() #' @export -assignJobQueue <- function( +assign_job_queue <- function( t_sec_estimate, t_cutoff = 21600 # 6 hours ) { @@ -434,7 +434,7 @@ assignJobQueue <- function( }, warning = function(w) { message(paste("Warning: ", w$message)) }, finally = { - message("assignJobQueue function execution completed.") + message("assign_job_queue function execution completed.") }) } @@ -451,13 +451,13 @@ assignJobQueue <- function( #' @return line plot object #' #' example: -#' p <- plotEstimatedWallTimes() +#' p <- plot_estimated_walltimes() #' ggplot2::ggsave(filename = "/data/molevolvr_transfer/molevolvr_ #' dev/molevol_scripts/docs/estimate_walltimes.png", plot = p) #' @export -plotEstimatedWallTimes <- function() { +plot_estimated_walltimes <- function() { tryCatch({ - opts <- mapOption2Process() |> names() + opts <- make_opts2procs() |> names() # get all possible submission permutations (powerset) get_powerset <- function(vec) { # generate powerset (do not include empty set) @@ -482,7 +482,7 @@ plotEstimatedWallTimes <- function() { } else { NULL } - est_walltime <- calculateEstimatedWallTimeFromOptions( + est_walltime <- advanced_opts2est_walltime ( advanced_opts, n_inputs = i, n_hits = n_hits, @@ -541,7 +541,7 @@ plotEstimatedWallTimes <- function() { }, warning = function(w) { message(paste("Warning: ", w$message)) }, finally = { - message("plotEstimatedWallTimes function execution completed.") + message("plot_estimated_walltimes function execution completed.") }) } diff --git a/man/calculateEstimatedWallTimeFromOptions.Rd b/man/advanced_opts2est_walltime.Rd similarity index 73% rename from man/calculateEstimatedWallTimeFromOptions.Rd rename to man/advanced_opts2est_walltime.Rd index e4eec3fd..02ae9621 100644 --- a/man/calculateEstimatedWallTimeFromOptions.Rd +++ b/man/advanced_opts2est_walltime.Rd @@ -1,11 +1,11 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/assign_job_queue.R -\name{calculateEstimatedWallTimeFromOptions} -\alias{calculateEstimatedWallTimeFromOptions} +\name{advanced_opts2est_walltime} +\alias{advanced_opts2est_walltime} \title{Given MolEvolvR advanced options and number of inputs, calculate the total estimated walltime for the job} \usage{ -calculateEstimatedWallTimeFromOptions( +advanced_opts2est_walltime( advanced_opts, n_inputs = 1L, n_hits = NULL, @@ -14,14 +14,14 @@ calculateEstimatedWallTimeFromOptions( } \arguments{ \item{advanced_opts}{character vector of MolEvolvR advanced options -(see mapOption2Process for the options)} +(see make_opts2procs for the options)} \item{n_inputs}{total number of input proteins} } \value{ total estimated number of seconds a job will process (walltime) -example: calculateEstimatedWallTimeFromOptions(c("homology_search", +example: advanced_opts2est_walltime (c("homology_search", "domain_architecture"), n_inputs = 3, n_hits = 50L) } diff --git a/man/assignJobQueue.Rd b/man/assign_job_queue.Rd similarity index 68% rename from man/assignJobQueue.Rd rename to man/assign_job_queue.Rd index 27511b6a..d2650fed 100644 --- a/man/assignJobQueue.Rd +++ b/man/assign_job_queue.Rd @@ -1,14 +1,14 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/assign_job_queue.R -\name{assignJobQueue} -\alias{assignJobQueue} +\name{assign_job_queue} +\alias{assign_job_queue} \title{Decision function to assign job queue} \usage{ -assignJobQueue(t_sec_estimate, t_cutoff = 21600) +assign_job_queue(t_sec_estimate, t_cutoff = 21600) } \arguments{ \item{t_sec_estimate}{estimated number of seconds a job will process -(from calculateEstimatedWallTimeFromOptions())} +(from advanced_opts2est_walltime ())} \item{t_long}{threshold value that defines the lower bound for assigning a job to the "long queue"} @@ -17,9 +17,9 @@ job to the "long queue"} a string of "short" or "long" example: -calculateEstimatedWallTimeFromOptions(c("homology_search", +advanced_opts2est_walltime (c("homology_search", "domain_architecture"), 3) |> -assignJobQueue() +assign_job_queue() } \description{ Decision function to assign job queue diff --git a/man/calculateProcessRuntime.Rd b/man/get_proc_medians.Rd similarity index 76% rename from man/calculateProcessRuntime.Rd rename to man/get_proc_medians.Rd index bb6dd1ed..b6db0b56 100644 --- a/man/calculateProcessRuntime.Rd +++ b/man/get_proc_medians.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/assign_job_queue.R -\name{calculateProcessRuntime} -\alias{calculateProcessRuntime} +\name{get_proc_medians} +\alias{get_proc_medians} \title{Scrape MolEvolvR logs and calculate median processes} \usage{ -calculateProcessRuntime(dir_job_results) +get_proc_medians(dir_job_results) } \arguments{ \item{dir_job_results}{\link{chr} path to MolEvolvR job_results @@ -21,12 +21,12 @@ examples: } dir_job_results <- "/data/scratch/janani/molevolvr_out" -list_proc_medians <- calculateProcessRuntime(dir_job_results) +list_proc_medians <- get_proc_medians(dir_job_results) \enumerate{ \item from outside container environment common_root <- "/data/molevolvr_transfer/molevolvr_dev" dir_job_results <- "/data/molevolvr_transfer/molevolvr_dev/job_results" -list_proc_medians <- calculateProcessRuntime(dir_job_results) +list_proc_medians <- get_proc_medians(dir_job_results) } } \description{ diff --git a/man/getProcessRuntimeWeights.Rd b/man/get_proc_weights.Rd similarity index 73% rename from man/getProcessRuntimeWeights.Rd rename to man/get_proc_weights.Rd index 8eff0347..f48585cc 100644 --- a/man/getProcessRuntimeWeights.Rd +++ b/man/get_proc_weights.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/assign_job_queue.R -\name{getProcessRuntimeWeights} -\alias{getProcessRuntimeWeights} +\name{get_proc_weights} +\alias{get_proc_weights} \title{Quickly get the runtime weights for MolEvolvR backend processes} \usage{ -getProcessRuntimeWeights(medians_yml_path = NULL) +get_proc_weights(medians_yml_path = NULL) } \arguments{ \item{dir_job_results}{\link{chr} path to MolEvolvR job_results @@ -13,7 +13,7 @@ directory} \value{ \link{list} names: processes; values: median runtime (seconds) -example: writeProcessRuntimeToYML() +example: write_proc_medians_yml() } \description{ Quickly get the runtime weights for MolEvolvR backend processes diff --git a/man/mapOption2Process.Rd b/man/make_opts2procs.Rd similarity index 75% rename from man/mapOption2Process.Rd rename to man/make_opts2procs.Rd index ff6905c5..07e208b2 100644 --- a/man/mapOption2Process.Rd +++ b/man/make_opts2procs.Rd @@ -1,15 +1,15 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/assign_job_queue.R -\name{mapOption2Process} -\alias{mapOption2Process} +\name{make_opts2procs} +\alias{make_opts2procs} \title{Construct list where names (MolEvolvR advanced options) point to processes} \usage{ -mapOption2Process() +make_opts2procs() } \value{ list where names (MolEvolvR advanced options) point to processes -example: list_opts2procs <- mapOption2Process +example: list_opts2procs <- make_opts2procs } \description{ Construct list where names (MolEvolvR advanced options) point to processes diff --git a/man/mapAdvOption2Process.Rd b/man/map_advanced_opts2procs.Rd similarity index 76% rename from man/mapAdvOption2Process.Rd rename to man/map_advanced_opts2procs.Rd index 5bd9ee65..631708b4 100644 --- a/man/mapAdvOption2Process.Rd +++ b/man/map_advanced_opts2procs.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/assign_job_queue.R -\name{mapAdvOption2Process} -\alias{mapAdvOption2Process} +\name{map_advanced_opts2procs} +\alias{map_advanced_opts2procs} \title{Use MolEvolvR advanced options to get associated processes} \usage{ -mapAdvOption2Process(advanced_opts) +map_advanced_opts2procs(advanced_opts) } \arguments{ \item{advanced_opts}{character vector of MolEvolvR advanced options} @@ -15,7 +15,7 @@ the advanced options example: advanced_opts <- c("homology_search", "domain_architecture") -procs <- mapAdvOption2Process(advanced_opts) +procs <- map_advanced_opts2procs(advanced_opts) } \description{ Use MolEvolvR advanced options to get associated processes diff --git a/man/plotEstimatedWallTimes.Rd b/man/plot_estimated_walltimes.Rd similarity index 77% rename from man/plotEstimatedWallTimes.Rd rename to man/plot_estimated_walltimes.Rd index 0d53cb32..884fed50 100644 --- a/man/plotEstimatedWallTimes.Rd +++ b/man/plot_estimated_walltimes.Rd @@ -1,17 +1,17 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/assign_job_queue.R -\name{plotEstimatedWallTimes} -\alias{plotEstimatedWallTimes} +\name{plot_estimated_walltimes} +\alias{plot_estimated_walltimes} \title{Plot the estimated runtimes for different advanced options and number of inputs} \usage{ -plotEstimatedWallTimes() +plot_estimated_walltimes() } \value{ line plot object example: -p <- plotEstimatedWallTimes() +p <- plot_estimated_walltimes() ggplot2::ggsave(filename = "/data/molevolvr_transfer/molevolvr_ dev/molevol_scripts/docs/estimate_walltimes.png", plot = p) } diff --git a/man/writeProcessRuntime2TSV.Rd b/man/write_proc_medians_table.Rd similarity index 77% rename from man/writeProcessRuntime2TSV.Rd rename to man/write_proc_medians_table.Rd index 03cbbd68..2ae7a97b 100644 --- a/man/writeProcessRuntime2TSV.Rd +++ b/man/write_proc_medians_table.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/assign_job_queue.R -\name{writeProcessRuntime2TSV} -\alias{writeProcessRuntime2TSV} +\name{write_proc_medians_table} +\alias{write_proc_medians_table} \title{Write a table of 2 columns: 1) process and 2) median seconds} \usage{ -writeProcessRuntime2TSV(dir_job_results, filepath) +write_proc_medians_table(dir_job_results, filepath) } \arguments{ \item{dir_job_results}{\link{chr} path to MolEvolvR job_results} @@ -14,7 +14,7 @@ writeProcessRuntime2TSV(dir_job_results, filepath) \value{ \link{tbl_df} 2 columns: 1) process and 2) median seconds -example: writeProcessRuntime2TSV( +example: write_proc_medians_table( "/data/scratch/janani/molevolvr_out/", "/data/scratch/janani/molevolvr_out/log_tbl.tsv" ) diff --git a/man/writeProcessRuntimeToYML.Rd b/man/write_proc_medians_yml.Rd similarity index 74% rename from man/writeProcessRuntimeToYML.Rd rename to man/write_proc_medians_yml.Rd index e4a5c8ad..74757f1f 100644 --- a/man/writeProcessRuntimeToYML.Rd +++ b/man/write_proc_medians_yml.Rd @@ -1,11 +1,11 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/assign_job_queue.R -\name{writeProcessRuntimeToYML} -\alias{writeProcessRuntimeToYML} +\name{write_proc_medians_yml} +\alias{write_proc_medians_yml} \title{Compute median process runtimes, then write a YAML list of the processes and their median runtimes in seconds to the path specified by 'filepath'.} \usage{ -writeProcessRuntimeToYML(dir_job_results, filepath = NULL) +write_proc_medians_yml(dir_job_results, filepath = NULL) } \arguments{ \item{dir_job_results}{\link{chr} path to MolEvolvR job_results directory} @@ -15,12 +15,12 @@ uses ./molevol_scripts/log_data/job_proc_weights.yml} } \description{ The default value of filepath is the value of the env var -MOLEVOLVR_PROC_WEIGHTS, which writeProcessRuntimeToYML() also uses as its default +MOLEVOLVR_PROC_WEIGHTS, which write_proc_medians_yml() also uses as its default read location. } \examples{ \dontrun{ -writeProcessRuntimeToYML( +write_proc_medians_yml( "/data/scratch/janani/molevolvr_out/", "/data/scratch/janani/molevolvr_out/log_tbl.yml" ) From a543898c8579065cbe3125f40b8cdf66200fc06f Mon Sep 17 00:00:00 2001 From: Seyi Kuforiji Date: Thu, 10 Oct 2024 11:00:41 +0100 Subject: [PATCH 10/19] Renamed the following functions in R/assign_job_queue.R; MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit | Original | Modified | User Facing | |---------------------------------|----------------------------------|----------------------------------| | assign_job_queue | assignJobQueue | ✔️ | | make_opts2procs | mapOption2Process | ✔️ | | map_advanced_opts2procs | mapAdvOption2Process | ✔️ | | get_proc_medians | calculateProcessRuntime | ✔️ | | write_proc_medians_table | writeProcessRuntime2TSV | ✔️ | | write_proc_medians_yml | writeProcessRuntime2YML | ✔️ | | get_proc_weights | getProcessRuntimeWeights | ✔️ | | advanced_opts2est_walltime | calculateEstimatedWallTimeFromOpts| ✔️ | | plot_estimated_walltimes | plotEstimatedWallTimes | ✔️ | --- NAMESPACE | 18 ++-- R/assign_job_queue.R | 86 +++++++++---------- ...{assign_job_queue.Rd => assignJobQueue.Rd} | 12 +-- ... => calculateEstimatedWallTimeFromOpts.Rd} | 10 +-- ..._medians.Rd => calculateProcessRuntime.Rd} | 10 +-- ...weights.Rd => getProcessRuntimeWeights.Rd} | 8 +- ..._opts2procs.Rd => mapAdvOption2Process.Rd} | 8 +- ...ake_opts2procs.Rd => mapOption2Process.Rd} | 8 +- ...walltimes.Rd => plotEstimatedWallTimes.Rd} | 8 +- ...ns_table.Rd => writeProcessRuntime2TSV.Rd} | 8 +- ...ians_yml.Rd => writeProcessRuntime2YML.Rd} | 10 +-- 11 files changed, 93 insertions(+), 93 deletions(-) rename man/{assign_job_queue.Rd => assignJobQueue.Rd} (68%) rename man/{advanced_opts2est_walltime.Rd => calculateEstimatedWallTimeFromOpts.Rd} (74%) rename man/{get_proc_medians.Rd => calculateProcessRuntime.Rd} (76%) rename man/{get_proc_weights.Rd => getProcessRuntimeWeights.Rd} (73%) rename man/{map_advanced_opts2procs.Rd => mapAdvOption2Process.Rd} (76%) rename man/{make_opts2procs.Rd => mapOption2Process.Rd} (75%) rename man/{plot_estimated_walltimes.Rd => plotEstimatedWallTimes.Rd} (77%) rename man/{write_proc_medians_table.Rd => writeProcessRuntime2TSV.Rd} (77%) rename man/{write_proc_medians_yml.Rd => writeProcessRuntime2YML.Rd} (74%) diff --git a/NAMESPACE b/NAMESPACE index c811bac3..65cc791e 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -15,10 +15,11 @@ export(add_leaves) export(add_lins) export(add_name) export(add_tax) -export(advanced_opts2est_walltime) export(alignFasta) export(assert_count_df) -export(assign_job_queue) +export(assignJobQueue) +export(calculateEstimatedWallTimeFromOpts) +export(calculateProcessRuntime) export(cleanClusters) export(cleanDomainArchitecture) export(cleanGeneDescription) @@ -53,10 +54,9 @@ export(generate_all_aln2fa) export(generate_fa2tre) export(generate_msa) export(generate_trees) +export(getProcessRuntimeWeights) export(get_accnums_from_fasta_file) export(get_job_message) -export(get_proc_medians) -export(get_proc_weights) export(ipg2lin) export(ipr2viz) export(ipr2viz_web) @@ -66,12 +66,12 @@ export(lineage.domain_repeats.plot) export(lineage.neighbors.plot) export(lineage_sunburst) export(make_job_results_url) -export(make_opts2procs) export(mapAcc2Name) +export(mapAdvOption2Process) +export(mapOption2Process) export(map_acc2name) -export(map_advanced_opts2procs) export(msa_pdf) -export(plot_estimated_walltimes) +export(plotEstimatedWallTimes) export(prot2tax) export(prot2tax_old) export(removeAsterisks) @@ -103,8 +103,8 @@ export(wordcloud2_element) export(wordcloud3) export(wordcloud_element) export(write.MsaAAMultipleAlignment) -export(write_proc_medians_table) -export(write_proc_medians_yml) +export(writeProcessRuntime2TSV) +export(writeProcessRuntime2YML) importFrom(Biostrings,AAStringSet) importFrom(Biostrings,readAAStringSet) importFrom(Biostrings,toString) diff --git a/R/assign_job_queue.R b/R/assign_job_queue.R index c531fb09..10df1e3a 100644 --- a/R/assign_job_queue.R +++ b/R/assign_job_queue.R @@ -3,16 +3,16 @@ # pipeline. # to use this, construct paths like so: file.path(common_root, "path", "to", "file.R") # for example, the reference for this file would be: -# file.path(common_root, "molevol_scripts", "R", "assign_job_queue.R") +# file.path(common_root, "molevol_scripts", "R", "assignJobQueue.R") common_root <- Sys.getenv("COMMON_SRC_ROOT") #' Construct list where names (MolEvolvR advanced options) point to processes #' #' @return list where names (MolEvolvR advanced options) point to processes #' -#' example: list_opts2procs <- make_opts2procs +#' example: list_opts2procs <- mapOption2Process #' @export -make_opts2procs <- function() { +mapOption2Process <- function() { tryCatch({ opts2processes <- list( "homology_search" = c("dblast", "dblast_cleanup"), @@ -26,7 +26,7 @@ make_opts2procs <- function() { }, warning = function(w) { message(paste("Warning: ", w$message)) }, finally = { - message("make_opts2procs function execution completed.") + message("mapOption2Process function execution completed.") }) } @@ -40,16 +40,16 @@ make_opts2procs <- function() { #' #' example: #' advanced_opts <- c("homology_search", "domain_architecture") -#' procs <- map_advanced_opts2procs(advanced_opts) +#' procs <- mapAdvOption2Process(advanced_opts) #' @export -map_advanced_opts2procs <- function(advanced_opts) { +mapAdvOption2Process <- function(advanced_opts) { if (!is.character(advanced_opts)) { stop("Argument must be a character vector!") } tryCatch({ # append 'always' to add procs that always run advanced_opts <- c(advanced_opts, "always") - opts2proc <- make_opts2procs() + opts2proc <- mapOption2Process() # setup index for opts2proc based on advanced options idx <- which(names(opts2proc) %in% advanced_opts) # extract processes that will run @@ -60,7 +60,7 @@ map_advanced_opts2procs <- function(advanced_opts) { }, warning = function(w) { message(paste("Warning: ", w$message)) }, finally = { - message("make_opts2procs function execution completed.") + message("mapOption2Process function execution completed.") }) } @@ -80,14 +80,14 @@ map_advanced_opts2procs <- function(advanced_opts) { #' #' 1) #' dir_job_results <- "/data/scratch/janani/molevolvr_out" -#' list_proc_medians <- get_proc_medians(dir_job_results) +#' list_proc_medians <- calculateProcessRuntime(dir_job_results) #' #' 2) from outside container environment #' common_root <- "/data/molevolvr_transfer/molevolvr_dev" #' dir_job_results <- "/data/molevolvr_transfer/molevolvr_dev/job_results" -#' list_proc_medians <- get_proc_medians(dir_job_results) +#' list_proc_medians <- calculateProcessRuntime(dir_job_results) #' @export -get_proc_medians <- function(dir_job_results) { +calculateProcessRuntime <- function(dir_job_results) { tryCatch({ # Check if dir_job_results is a character string if (!is.character(dir_job_results) || length(dir_job_results) != 1) { @@ -139,7 +139,7 @@ get_proc_medians <- function(dir_job_results) { }, warning = function(w) { message(paste("Warning: ", w$message)) }, finally = { - message("get_proc_medians function execution completed.") + message("calculateProcessRuntime function execution completed.") }) } @@ -156,12 +156,12 @@ get_proc_medians <- function(dir_job_results) { #' #' @return [tbl_df] 2 columns: 1) process and 2) median seconds #' -#' example: write_proc_medians_table( +#' example: writeProcessRuntime2TSV( #' "/data/scratch/janani/molevolvr_out/", #' "/data/scratch/janani/molevolvr_out/log_tbl.tsv" #' ) #' @export -write_proc_medians_table <- function(dir_job_results, filepath) { +writeProcessRuntime2TSV <- function(dir_job_results, filepath) { tryCatch({ # Error handling for input arguments if (!is.character(dir_job_results) || length(dir_job_results) != 1) { @@ -175,7 +175,7 @@ write_proc_medians_table <- function(dir_job_results, filepath) { if (!is.character(filepath) || length(filepath) != 1) { stop("Input 'filepath' must be a single character string.") } - df_proc_medians <- get_proc_medians(dir_job_results) |> + df_proc_medians <- calculateProcessRuntime(dir_job_results) |> tibble::as_tibble() |> tidyr::pivot_longer( dplyr::everything(), @@ -192,7 +192,7 @@ write_proc_medians_table <- function(dir_job_results, filepath) { }, warning = function(w) { message(paste("Warning: ", w$message)) }, finally = { - message("write_proc_medians_table function execution completed.") + message("writeProcessRuntime2TSV function execution completed.") }) } @@ -201,7 +201,7 @@ write_proc_medians_table <- function(dir_job_results, filepath) { #' their median runtimes in seconds to the path specified by 'filepath'. #' #' The default value of filepath is the value of the env var -#' MOLEVOLVR_PROC_WEIGHTS, which write_proc_medians_yml() also uses as its default +#' MOLEVOLVR_PROC_WEIGHTS, which writeProcessRuntime2YML() also uses as its default #' read location. #' #' @param dir_job_results [chr] path to MolEvolvR job_results directory @@ -212,13 +212,13 @@ write_proc_medians_table <- function(dir_job_results, filepath) { #' #' @examples #' \dontrun{ -#' write_proc_medians_yml( +#' writeProcessRuntime2YML( #' "/data/scratch/janani/molevolvr_out/", #' "/data/scratch/janani/molevolvr_out/log_tbl.yml" #' ) #' } #' @export -write_proc_medians_yml <- function(dir_job_results, filepath = NULL) { +writeProcessRuntime2YML <- function(dir_job_results, filepath = NULL) { tryCatch({ # Error handling for dir_job_results arguments if (!is.character(dir_job_results) || length(dir_job_results) != 1) { @@ -238,14 +238,14 @@ write_proc_medians_yml <- function(dir_job_results, filepath = NULL) { stop("Input 'filepath' must be a single character string.") } - medians <- get_proc_medians(dir_job_results) + medians <- calculateProcessRuntime(dir_job_results) yaml::write_yaml(medians, filepath) }, error = function(e) { message(paste("Encountered an error: "), e$message) }, warning = function(w) { message(paste("Warning: "), w$message) }, finally = { - message("write_proc_medians_table function execution completed.") + message("writeProcessRuntime2TSV function execution completed.") } ) @@ -261,9 +261,9 @@ write_proc_medians_yml <- function(dir_job_results, filepath = NULL) { #' #' @return [list] names: processes; values: median runtime (seconds) #' -#' example: write_proc_medians_yml() +#' example: writeProcessRuntime2YML() #' @export -get_proc_weights <- function(medians_yml_path = NULL) { +getProcessRuntimeWeights <- function(medians_yml_path = NULL) { if (is.null(medians_yml_path)) { medians_yml_path <- file.path(common_root, "molevol_scripts", @@ -273,7 +273,7 @@ get_proc_weights <- function(medians_yml_path = NULL) { proc_weights <- tryCatch({ # attempt to read the weights from the YAML file produced by - # write_proc_medians_yml() + # writeProcessRuntime2YML() if (stringr::str_trim(medians_yml_path) == "") { stop( stringr::str_glue("medians_yml_path is empty @@ -285,7 +285,7 @@ get_proc_weights <- function(medians_yml_path = NULL) { }, # to avoid fatal errors in reading the proc weights yaml, # some median process runtimes have been hardcoded based on - # the result of get_proc_medians() from Jan 2024 + # the result of calculateProcessRuntime() from Jan 2024 error = function(cond) { proc_weights <- list( "dblast" = 2810, @@ -306,7 +306,7 @@ get_proc_weights <- function(medians_yml_path = NULL) { #' calculate the total estimated walltime for the job #' #' @param advanced_opts character vector of MolEvolvR advanced options -#' (see make_opts2procs for the options) +#' (see mapOption2Process for the options) #' @param n_inputs total number of input proteins #' #' @importFrom dplyr if_else @@ -314,11 +314,11 @@ get_proc_weights <- function(medians_yml_path = NULL) { #' #' @return total estimated number of seconds a job will process (walltime) #' -#' example: advanced_opts2est_walltime (c("homology_search", +#' example: calculateEstimatedWallTimeFromOpts (c("homology_search", #' "domain_architecture"), #' n_inputs = 3, n_hits = 50L) #' @export -advanced_opts2est_walltime <- function(advanced_opts, +calculateEstimatedWallTimeFromOpts <- function(advanced_opts, n_inputs = 1L, n_hits = NULL, verbose = FALSE) { @@ -348,7 +348,7 @@ advanced_opts2est_walltime <- function(advanced_opts, } # Get process weights - proc_weights <- write_proc_medians_yml() + proc_weights <- writeProcessRuntime2YML() if (!is.list(proc_weights)) { stop("Process weights could not be retrieved correctly.") } @@ -357,7 +357,7 @@ advanced_opts2est_walltime <- function(advanced_opts, proc_weights <- proc_weights[order(names(proc_weights))] |> unlist() all_procs <- names(proc_weights) |> sort() # get processes from advanced options and sort by names - procs_from_opts <- map_advanced_opts2procs(advanced_opts) + procs_from_opts <- mapAdvOption2Process(advanced_opts) procs_from_opts <- sort(procs_from_opts) # binary encode: yes proc will run (1); else 0 binary_proc_vec <- dplyr::if_else(all_procs %in% procs_from_opts, 1L, 0L) @@ -366,7 +366,7 @@ advanced_opts2est_walltime <- function(advanced_opts, as.numeric() # calculate the additional processes to run for the homologous hits if ("homology_search" %in% advanced_opts) { - opts2procs <- make_opts2procs() + opts2procs <- mapOption2Process() # exclude the homology search processes for the homologous hits procs2exclude_for_homologs <- opts2procs[["homology_search"]] procs_homologs <- procs_from_opts[!(procs_from_opts @@ -380,7 +380,7 @@ advanced_opts2est_walltime <- function(advanced_opts, } if (verbose) { msg <- stringr::str_glue( - "warnings from advanced_opts2est_walltime ():\n", + "warnings from calculateEstimatedWallTimeFromOpts ():\n", "\tn_inputs={n_inputs}\n", "\tn_hits={ifelse(is.null(n_hits), 'null', n_hits)}\n", "\test_walltime={est_walltime}\n\n" @@ -393,7 +393,7 @@ advanced_opts2est_walltime <- function(advanced_opts, }, warning = function(w) { message(paste("Warning: ", w$message)) }, finally = { - message("advanced_opts2est_walltime + message("calculateEstimatedWallTimeFromOpts function execution completed.") }) @@ -403,18 +403,18 @@ advanced_opts2est_walltime <- function(advanced_opts, #' Decision function to assign job queue #' #' @param t_sec_estimate estimated number of seconds a job will process -#' (from advanced_opts2est_walltime ()) +#' (from calculateEstimatedWallTimeFromOpts ()) #' @param t_long threshold value that defines the lower bound for assigning a #' job to the "long queue" #' #' @return a string of "short" or "long" #' #' example: -#' advanced_opts2est_walltime (c("homology_search", +#' calculateEstimatedWallTimeFromOpts (c("homology_search", #' "domain_architecture"), 3) |> -#' assign_job_queue() +#' assignJobQueue() #' @export -assign_job_queue <- function( +assignJobQueue <- function( t_sec_estimate, t_cutoff = 21600 # 6 hours ) { @@ -434,7 +434,7 @@ assign_job_queue <- function( }, warning = function(w) { message(paste("Warning: ", w$message)) }, finally = { - message("assign_job_queue function execution completed.") + message("assignJobQueue function execution completed.") }) } @@ -451,13 +451,13 @@ assign_job_queue <- function( #' @return line plot object #' #' example: -#' p <- plot_estimated_walltimes() +#' p <- plotEstimatedWallTimes() #' ggplot2::ggsave(filename = "/data/molevolvr_transfer/molevolvr_ #' dev/molevol_scripts/docs/estimate_walltimes.png", plot = p) #' @export -plot_estimated_walltimes <- function() { +plotEstimatedWallTimes <- function() { tryCatch({ - opts <- make_opts2procs() |> names() + opts <- mapOption2Process() |> names() # get all possible submission permutations (powerset) get_powerset <- function(vec) { # generate powerset (do not include empty set) @@ -482,7 +482,7 @@ plot_estimated_walltimes <- function() { } else { NULL } - est_walltime <- advanced_opts2est_walltime ( + est_walltime <- calculateEstimatedWallTimeFromOpts ( advanced_opts, n_inputs = i, n_hits = n_hits, @@ -541,7 +541,7 @@ plot_estimated_walltimes <- function() { }, warning = function(w) { message(paste("Warning: ", w$message)) }, finally = { - message("plot_estimated_walltimes function execution completed.") + message("plotEstimatedWallTimes function execution completed.") }) } diff --git a/man/assign_job_queue.Rd b/man/assignJobQueue.Rd similarity index 68% rename from man/assign_job_queue.Rd rename to man/assignJobQueue.Rd index d2650fed..3663ce56 100644 --- a/man/assign_job_queue.Rd +++ b/man/assignJobQueue.Rd @@ -1,14 +1,14 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/assign_job_queue.R -\name{assign_job_queue} -\alias{assign_job_queue} +\name{assignJobQueue} +\alias{assignJobQueue} \title{Decision function to assign job queue} \usage{ -assign_job_queue(t_sec_estimate, t_cutoff = 21600) +assignJobQueue(t_sec_estimate, t_cutoff = 21600) } \arguments{ \item{t_sec_estimate}{estimated number of seconds a job will process -(from advanced_opts2est_walltime ())} +(from calculateEstimatedWallTimeFromOpts ())} \item{t_long}{threshold value that defines the lower bound for assigning a job to the "long queue"} @@ -17,9 +17,9 @@ job to the "long queue"} a string of "short" or "long" example: -advanced_opts2est_walltime (c("homology_search", +calculateEstimatedWallTimeFromOpts (c("homology_search", "domain_architecture"), 3) |> -assign_job_queue() +assignJobQueue() } \description{ Decision function to assign job queue diff --git a/man/advanced_opts2est_walltime.Rd b/man/calculateEstimatedWallTimeFromOpts.Rd similarity index 74% rename from man/advanced_opts2est_walltime.Rd rename to man/calculateEstimatedWallTimeFromOpts.Rd index 02ae9621..c09cf6a6 100644 --- a/man/advanced_opts2est_walltime.Rd +++ b/man/calculateEstimatedWallTimeFromOpts.Rd @@ -1,11 +1,11 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/assign_job_queue.R -\name{advanced_opts2est_walltime} -\alias{advanced_opts2est_walltime} +\name{calculateEstimatedWallTimeFromOpts} +\alias{calculateEstimatedWallTimeFromOpts} \title{Given MolEvolvR advanced options and number of inputs, calculate the total estimated walltime for the job} \usage{ -advanced_opts2est_walltime( +calculateEstimatedWallTimeFromOpts( advanced_opts, n_inputs = 1L, n_hits = NULL, @@ -14,14 +14,14 @@ advanced_opts2est_walltime( } \arguments{ \item{advanced_opts}{character vector of MolEvolvR advanced options -(see make_opts2procs for the options)} +(see mapOption2Process for the options)} \item{n_inputs}{total number of input proteins} } \value{ total estimated number of seconds a job will process (walltime) -example: advanced_opts2est_walltime (c("homology_search", +example: calculateEstimatedWallTimeFromOpts (c("homology_search", "domain_architecture"), n_inputs = 3, n_hits = 50L) } diff --git a/man/get_proc_medians.Rd b/man/calculateProcessRuntime.Rd similarity index 76% rename from man/get_proc_medians.Rd rename to man/calculateProcessRuntime.Rd index b6db0b56..bb6dd1ed 100644 --- a/man/get_proc_medians.Rd +++ b/man/calculateProcessRuntime.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/assign_job_queue.R -\name{get_proc_medians} -\alias{get_proc_medians} +\name{calculateProcessRuntime} +\alias{calculateProcessRuntime} \title{Scrape MolEvolvR logs and calculate median processes} \usage{ -get_proc_medians(dir_job_results) +calculateProcessRuntime(dir_job_results) } \arguments{ \item{dir_job_results}{\link{chr} path to MolEvolvR job_results @@ -21,12 +21,12 @@ examples: } dir_job_results <- "/data/scratch/janani/molevolvr_out" -list_proc_medians <- get_proc_medians(dir_job_results) +list_proc_medians <- calculateProcessRuntime(dir_job_results) \enumerate{ \item from outside container environment common_root <- "/data/molevolvr_transfer/molevolvr_dev" dir_job_results <- "/data/molevolvr_transfer/molevolvr_dev/job_results" -list_proc_medians <- get_proc_medians(dir_job_results) +list_proc_medians <- calculateProcessRuntime(dir_job_results) } } \description{ diff --git a/man/get_proc_weights.Rd b/man/getProcessRuntimeWeights.Rd similarity index 73% rename from man/get_proc_weights.Rd rename to man/getProcessRuntimeWeights.Rd index f48585cc..ff3c8e5d 100644 --- a/man/get_proc_weights.Rd +++ b/man/getProcessRuntimeWeights.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/assign_job_queue.R -\name{get_proc_weights} -\alias{get_proc_weights} +\name{getProcessRuntimeWeights} +\alias{getProcessRuntimeWeights} \title{Quickly get the runtime weights for MolEvolvR backend processes} \usage{ -get_proc_weights(medians_yml_path = NULL) +getProcessRuntimeWeights(medians_yml_path = NULL) } \arguments{ \item{dir_job_results}{\link{chr} path to MolEvolvR job_results @@ -13,7 +13,7 @@ directory} \value{ \link{list} names: processes; values: median runtime (seconds) -example: write_proc_medians_yml() +example: writeProcessRuntime2YML() } \description{ Quickly get the runtime weights for MolEvolvR backend processes diff --git a/man/map_advanced_opts2procs.Rd b/man/mapAdvOption2Process.Rd similarity index 76% rename from man/map_advanced_opts2procs.Rd rename to man/mapAdvOption2Process.Rd index 631708b4..5bd9ee65 100644 --- a/man/map_advanced_opts2procs.Rd +++ b/man/mapAdvOption2Process.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/assign_job_queue.R -\name{map_advanced_opts2procs} -\alias{map_advanced_opts2procs} +\name{mapAdvOption2Process} +\alias{mapAdvOption2Process} \title{Use MolEvolvR advanced options to get associated processes} \usage{ -map_advanced_opts2procs(advanced_opts) +mapAdvOption2Process(advanced_opts) } \arguments{ \item{advanced_opts}{character vector of MolEvolvR advanced options} @@ -15,7 +15,7 @@ the advanced options example: advanced_opts <- c("homology_search", "domain_architecture") -procs <- map_advanced_opts2procs(advanced_opts) +procs <- mapAdvOption2Process(advanced_opts) } \description{ Use MolEvolvR advanced options to get associated processes diff --git a/man/make_opts2procs.Rd b/man/mapOption2Process.Rd similarity index 75% rename from man/make_opts2procs.Rd rename to man/mapOption2Process.Rd index 07e208b2..ff6905c5 100644 --- a/man/make_opts2procs.Rd +++ b/man/mapOption2Process.Rd @@ -1,15 +1,15 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/assign_job_queue.R -\name{make_opts2procs} -\alias{make_opts2procs} +\name{mapOption2Process} +\alias{mapOption2Process} \title{Construct list where names (MolEvolvR advanced options) point to processes} \usage{ -make_opts2procs() +mapOption2Process() } \value{ list where names (MolEvolvR advanced options) point to processes -example: list_opts2procs <- make_opts2procs +example: list_opts2procs <- mapOption2Process } \description{ Construct list where names (MolEvolvR advanced options) point to processes diff --git a/man/plot_estimated_walltimes.Rd b/man/plotEstimatedWallTimes.Rd similarity index 77% rename from man/plot_estimated_walltimes.Rd rename to man/plotEstimatedWallTimes.Rd index 884fed50..0d53cb32 100644 --- a/man/plot_estimated_walltimes.Rd +++ b/man/plotEstimatedWallTimes.Rd @@ -1,17 +1,17 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/assign_job_queue.R -\name{plot_estimated_walltimes} -\alias{plot_estimated_walltimes} +\name{plotEstimatedWallTimes} +\alias{plotEstimatedWallTimes} \title{Plot the estimated runtimes for different advanced options and number of inputs} \usage{ -plot_estimated_walltimes() +plotEstimatedWallTimes() } \value{ line plot object example: -p <- plot_estimated_walltimes() +p <- plotEstimatedWallTimes() ggplot2::ggsave(filename = "/data/molevolvr_transfer/molevolvr_ dev/molevol_scripts/docs/estimate_walltimes.png", plot = p) } diff --git a/man/write_proc_medians_table.Rd b/man/writeProcessRuntime2TSV.Rd similarity index 77% rename from man/write_proc_medians_table.Rd rename to man/writeProcessRuntime2TSV.Rd index 2ae7a97b..03cbbd68 100644 --- a/man/write_proc_medians_table.Rd +++ b/man/writeProcessRuntime2TSV.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/assign_job_queue.R -\name{write_proc_medians_table} -\alias{write_proc_medians_table} +\name{writeProcessRuntime2TSV} +\alias{writeProcessRuntime2TSV} \title{Write a table of 2 columns: 1) process and 2) median seconds} \usage{ -write_proc_medians_table(dir_job_results, filepath) +writeProcessRuntime2TSV(dir_job_results, filepath) } \arguments{ \item{dir_job_results}{\link{chr} path to MolEvolvR job_results} @@ -14,7 +14,7 @@ write_proc_medians_table(dir_job_results, filepath) \value{ \link{tbl_df} 2 columns: 1) process and 2) median seconds -example: write_proc_medians_table( +example: writeProcessRuntime2TSV( "/data/scratch/janani/molevolvr_out/", "/data/scratch/janani/molevolvr_out/log_tbl.tsv" ) diff --git a/man/write_proc_medians_yml.Rd b/man/writeProcessRuntime2YML.Rd similarity index 74% rename from man/write_proc_medians_yml.Rd rename to man/writeProcessRuntime2YML.Rd index 74757f1f..b43f39ee 100644 --- a/man/write_proc_medians_yml.Rd +++ b/man/writeProcessRuntime2YML.Rd @@ -1,11 +1,11 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/assign_job_queue.R -\name{write_proc_medians_yml} -\alias{write_proc_medians_yml} +\name{writeProcessRuntime2YML} +\alias{writeProcessRuntime2YML} \title{Compute median process runtimes, then write a YAML list of the processes and their median runtimes in seconds to the path specified by 'filepath'.} \usage{ -write_proc_medians_yml(dir_job_results, filepath = NULL) +writeProcessRuntime2YML(dir_job_results, filepath = NULL) } \arguments{ \item{dir_job_results}{\link{chr} path to MolEvolvR job_results directory} @@ -15,12 +15,12 @@ uses ./molevol_scripts/log_data/job_proc_weights.yml} } \description{ The default value of filepath is the value of the env var -MOLEVOLVR_PROC_WEIGHTS, which write_proc_medians_yml() also uses as its default +MOLEVOLVR_PROC_WEIGHTS, which writeProcessRuntime2YML() also uses as its default read location. } \examples{ \dontrun{ -write_proc_medians_yml( +writeProcessRuntime2YML( "/data/scratch/janani/molevolvr_out/", "/data/scratch/janani/molevolvr_out/log_tbl.yml" ) From e9460610fb054c1c3109cf728561efe2e6619104 Mon Sep 17 00:00:00 2001 From: David Mayer Date: Sat, 12 Oct 2024 14:09:40 -0600 Subject: [PATCH 11/19] remove outdated .Rd --- man/GCA2lin.Rd | 0 man/acc2lin.Rd | 57 ----------------------------------------------- man/efetch_ipg.Rd | 0 man/ipg2lin.Rd | 0 man/sink.reset.Rd | 0 5 files changed, 57 deletions(-) delete mode 100644 man/GCA2lin.Rd delete mode 100644 man/acc2lin.Rd delete mode 100644 man/efetch_ipg.Rd delete mode 100644 man/ipg2lin.Rd delete mode 100644 man/sink.reset.Rd diff --git a/man/GCA2lin.Rd b/man/GCA2lin.Rd deleted file mode 100644 index e69de29b..00000000 diff --git a/man/acc2lin.Rd b/man/acc2lin.Rd deleted file mode 100644 index d3f2468b..00000000 --- a/man/acc2lin.Rd +++ /dev/null @@ -1,57 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/acc2lin.R, R/lineage.R -\name{acc2lin} -\alias{acc2lin} -\title{acc2lin} -\usage{ -acc2lin( - accessions, - assembly_path, - lineagelookup_path, - ipgout_path = NULL, - plan = "multicore" -) - -acc2lin( - accessions, - assembly_path, - lineagelookup_path, - ipgout_path = NULL, - plan = "multicore" -) -} -\arguments{ -\item{accessions}{Character vector of protein accessions} - -\item{assembly_path}{String of the path to the assembly_summary path -This file can be generated using the "DownloadAssemblySummary()" function} - -\item{lineagelookup_path}{String of the path to the lineage lookup file -(taxid to lineage mapping). This file can be generated using the} - -\item{ipgout_path}{Path to write the results of the efetch run of the accessions -on the ipg database. If NULL, the file will not be written. Defaults to NULL} - -\item{plan}{} -} -\value{ -Describe return, in detail -} -\description{ -This function combines 'efetch_ipg()' -and 'ipg2lin()' to map a set -of protein accessions to their assembly (GCA_ID), tax ID, and lineage. - -Function to map protein accession numbers to lineage - -This function combines 'efetch_ipg()' and 'ipg2lin()' to map a set -of protein accessions to their assembly (GCA_ID), tax ID, and lineage. -} -\examples{ -\dontrun{ -acc2lin() -} -} -\author{ -Samuel Chen, Janani Ravi -} diff --git a/man/efetch_ipg.Rd b/man/efetch_ipg.Rd deleted file mode 100644 index e69de29b..00000000 diff --git a/man/ipg2lin.Rd b/man/ipg2lin.Rd deleted file mode 100644 index e69de29b..00000000 diff --git a/man/sink.reset.Rd b/man/sink.reset.Rd deleted file mode 100644 index e69de29b..00000000 From 9571333c44ac879d9b2b6bc1a38d454fdda69a39 Mon Sep 17 00:00:00 2001 From: David Mayer Date: Sat, 12 Oct 2024 14:10:10 -0600 Subject: [PATCH 12/19] let R sort NAMESPACE --- NAMESPACE | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index 60bec5b1..c448ff13 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -11,9 +11,7 @@ export(addLeaves2Alignment) export(addLineage) export(addName) export(addTaxID) -export(advanced_opts2est_walltime) export(alignFasta) -export(assert_count_df) export(assignJobQueue) export(calculateEstimatedWallTimeFromOpts) export(calculateProcessRuntime) @@ -35,9 +33,9 @@ export(countByColumn) export(createFA2Tree) export(createJobResultsURL) export(createJobStatusEmailMessage) +export(createLineageLookup) export(createRepresentativeAccNum) export(createWordCloud2Element) -export(createLineageLookup) export(createWordCloudElement) export(domain_network) export(downloadAssemblySummary) @@ -50,14 +48,14 @@ export(formatJobArgumentsHTML) export(gc_undirected_network) export(generateAllAlignments2FA) export(generate_msa) -export(getProcessRuntimeWeights) export(getAccNumFromFA) +export(getProcessRuntimeWeights) export(getTopAccByLinDomArch) export(mapAcc2Name) export(mapAdvOption2Process) export(mapOption2Process) -export(map_acc2name) export(msa_pdf) +export(plotEstimatedWallTimes) export(plotIPR2Viz) export(plotIPR2VizWeb) export(plotLineageDA) @@ -70,12 +68,10 @@ export(plotStackedLineage) export(plotSunburst) export(plotTreemap) export(plotUpSet) -export(plotEstimatedWallTimes) export(prepareColumnParams) export(prepareSingleColumnParams) export(proteinAcc2TaxID) export(proteinAcc2TaxID_old) -export(prot2tax_old) export(removeAsterisks) export(removeEmptyRows) export(removeTails) From 8c573693b92f2aa216b269e24244d2d63fe0d3a9 Mon Sep 17 00:00:00 2001 From: David Mayer Date: Sat, 12 Oct 2024 14:10:26 -0600 Subject: [PATCH 13/19] regen new .Rd --- man/GCA2Lineage.Rd | 2 +- man/IPG2Lineage.Rd | 5 +++-- man/efetchIPG.Rd | 3 ++- man/sinkReset.Rd | 1 + 4 files changed, 7 insertions(+), 4 deletions(-) diff --git a/man/GCA2Lineage.Rd b/man/GCA2Lineage.Rd index 9ec0ce56..9a2a7a30 100644 --- a/man/GCA2Lineage.Rd +++ b/man/GCA2Lineage.Rd @@ -19,7 +19,7 @@ This file can be generated using the "downloadAssemblySummary()" function} \item{lineagelookup_path}{String of the path to the lineage lookup file (taxid to lineage mapping). This file can be generated using the -"create_lineage_lookup()" function} +"createLineageLookup()" function} \item{acc_col}{} } diff --git a/man/IPG2Lineage.Rd b/man/IPG2Lineage.Rd index 282d5cbf..118812ab 100644 --- a/man/IPG2Lineage.Rd +++ b/man/IPG2Lineage.Rd @@ -29,7 +29,7 @@ file} \item{lineagelookup_path}{String of the path to the lineage lookup file (taxid to lineage mapping). This file can be generated using the -"create_lineage_lookup()" function} +"createLineageLookup()" function} \item{assembly_path}{String of the path to the assembly_summary path This file can be generated using the \link[MolEvolvR]{downloadAssemblySummary} function} @@ -39,7 +39,8 @@ A \code{data.table} with the lineage information for the provided protein accessions. } \description{ -Takes the resulting file of an efetch run on the ipg database and +Takes the resulting file +of an efetch run on the ipg database and Takes the resulting file of an efetch run on the ipg database and append lineage, and taxid columns diff --git a/man/efetchIPG.Rd b/man/efetchIPG.Rd index 047e2652..db63024f 100644 --- a/man/efetchIPG.Rd +++ b/man/efetchIPG.Rd @@ -23,7 +23,8 @@ the ipg database} No return value. The function writes the fetched results to \code{out_path}. } \description{ -Perform efetch on the ipg database and write the results to out_path +Perform efetch on the ipg database +and write the results to out_path Perform efetch on the ipg database and write the results to out_path } diff --git a/man/sinkReset.Rd b/man/sinkReset.Rd index 0285c0b2..e3fc7ce4 100644 --- a/man/sinkReset.Rd +++ b/man/sinkReset.Rd @@ -8,6 +8,7 @@ sinkReset() } \value{ No return, but run to close all outstanding \code{sink()}s +and handles any errors or warnings that occur during the process. } \description{ Sink Reset From 2061d7a24b7a699bfeac72270817ae7225365ffa Mon Sep 17 00:00:00 2001 From: David Mayer Date: Sat, 12 Oct 2024 14:10:48 -0600 Subject: [PATCH 14/19] remove old tryCatch code (for now) --- R/acc2lin.R | 15 --------------- 1 file changed, 15 deletions(-) diff --git a/R/acc2lin.R b/R/acc2lin.R index 42315ece..a0a95033 100644 --- a/R/acc2lin.R +++ b/R/acc2lin.R @@ -72,14 +72,6 @@ addLineage <- function(df, acc_col = "AccNum", assembly_path, merged <- merge(df, lins, by.x = acc_col, by.y = "Protein", all.x = TRUE) return(merged) - }, error = function(e) { - print(paste("Error: ", e$message)) - }, warning = function(w) { - print(paste("Warning: ", w$message)) - }, finally = { - print("addLineages function execution completed.") - }) - } @@ -247,13 +239,6 @@ IPG2Lineage <- function(accessions, ipg_file, assembly_path, lineagelookup_path, lins <- lins[!is.na(Lineage)] %>% unique() return(lins) - }, error = function(e) { - print(paste("An error occurred: ", e$message)) - }, warning = function(w) { - print(paste("Warning: ", w$message)) - }, finally = { - print("ipg2lin function execution completed.") - }) } From 70f0de8c57d610eaad122e59d4bf1e96fc455963 Mon Sep 17 00:00:00 2001 From: David Mayer Date: Sun, 13 Oct 2024 19:21:41 -0600 Subject: [PATCH 15/19] remove code not relevant to PR --- R/acc2lin.R | 50 +++--- R/assign_job_queue.R | 359 +++++++++++++------------------------------ R/blastWrappers.R | 105 +++---------- 3 files changed, 153 insertions(+), 361 deletions(-) diff --git a/R/acc2lin.R b/R/acc2lin.R index a0a95033..61aae87c 100644 --- a/R/acc2lin.R +++ b/R/acc2lin.R @@ -157,40 +157,34 @@ efetchIPG <- function(accnums, out_path, plan = "sequential", ...) { return(partitioned) } - tryCatch({ - # Set the future plan strategy - plan(strategy = plan, .skip = T) + # Set the future plan strategy + plan(strategy = plan, .skip = T) - min_groups <- length(accnums) / 200 - groups <- min(max(min_groups, 15), length(accnums)) - partitioned_acc <- partition(accnums, groups) - # Open the sink to the output path - sink(out_path) + min_groups <- length(accnums) / 200 + groups <- min(max(min_groups, 15), length(accnums)) + partitioned_acc <- partition(accnums, groups) - a <- future_map(1:length(partitioned_acc), function(x) { - # Avoid hitting the rate API limit - if (x %% 9 == 0) { - Sys.sleep(1) - } - cat( - entrez_fetch( - id = partitioned_acc[[x]], - db = "ipg", - rettype = "xml", - api_key = "YOUR_KEY_HERE" ## Can this be included in public package? - ) + # Open the sink to the output path + sink(out_path) + + a <- future_map(1:length(partitioned_acc), function(x) { + # Avoid hitting the rate API limit + if (x %% 9 == 0) { + Sys.sleep(1) + } + cat( + entrez_fetch( + id = partitioned_acc[[x]], + db = "ipg", + rettype = "xml", + api_key = "YOUR_KEY_HERE" ## Can this be included in public package? ) - }) - sink(NULL) - }, error = function(e) { - print(paste("An error occurred: ", e$message)) - }, warning = function(w) { - print(paste("Warning: ", w$message)) - }, finally = { - print("efetch_ipg function execution completed.") + ) }) + sink(NULL) + } } diff --git a/R/assign_job_queue.R b/R/assign_job_queue.R index 10df1e3a..4791b4a1 100644 --- a/R/assign_job_queue.R +++ b/R/assign_job_queue.R @@ -13,22 +13,13 @@ common_root <- Sys.getenv("COMMON_SRC_ROOT") #' example: list_opts2procs <- mapOption2Process #' @export mapOption2Process <- function() { - tryCatch({ - opts2processes <- list( - "homology_search" = c("dblast", "dblast_cleanup"), - "domain_architecture" = c("iprscan", "ipr2lineage", "ipr2da"), - # processes always present agnostic of advanced options - "always" = c("blast_clust", "clust2table") - ) - return(opts2processes) - }, error = function(e) { - message(paste("Encountered an error: ", e$message)) - }, warning = function(w) { - message(paste("Warning: ", w$message)) - }, finally = { - message("mapOption2Process function execution completed.") - }) - + opts2processes <- list( + "homology_search" = c("dblast", "dblast_cleanup"), + "domain_architecture" = c("iprscan", "ipr2lineage", "ipr2da"), + # processes always present agnostic of advanced options + "always" = c("blast_clust", "clust2table") + ) + return(opts2processes) } #' Use MolEvolvR advanced options to get associated processes @@ -43,26 +34,14 @@ mapOption2Process <- function() { #' procs <- mapAdvOption2Process(advanced_opts) #' @export mapAdvOption2Process <- function(advanced_opts) { - if (!is.character(advanced_opts)) { - stop("Argument must be a character vector!") - } - tryCatch({ - # append 'always' to add procs that always run - advanced_opts <- c(advanced_opts, "always") - opts2proc <- mapOption2Process() - # setup index for opts2proc based on advanced options - idx <- which(names(opts2proc) %in% advanced_opts) - # extract processes that will run - procs <- opts2proc[idx] |> unlist() - return(procs) - }, error = function(e) { - message(paste("Encountered an error: ", e$message)) - }, warning = function(w) { - message(paste("Warning: ", w$message)) - }, finally = { - message("mapOption2Process function execution completed.") - }) - + # append 'always' to add procs that always run + advanced_opts <- c(advanced_opts, "always") + opts2proc <- mapOption2Process() + # setup index for opts2proc based on advanced options + idx <- which(names(opts2proc) %in% advanced_opts) + # extract processes that will run + procs <- opts2proc[idx] |> unlist() + return(procs) } #' Scrape MolEvolvR logs and calculate median processes @@ -88,60 +67,41 @@ mapAdvOption2Process <- function(advanced_opts) { #' list_proc_medians <- calculateProcessRuntime(dir_job_results) #' @export calculateProcessRuntime <- function(dir_job_results) { - tryCatch({ - # Check if dir_job_results is a character string - if (!is.character(dir_job_results) || length(dir_job_results) != 1) { - stop("Input 'dir_job_results' must be a single character string.") - } + source(file.path(common_root, "molevol_scripts", "R", "metrics.R")) - # Check if dir_job_results exists - if (!dir.exists(dir_job_results)) { - stop(paste("The directory", dir_job_results, "does not exist.")) - } + # aggregate logs from + path_log_data <- file.path(common_root, + "molevol_scripts", "log_data", "prod_logs.rda") - source(file.path(common_root, "molevol_scripts", "R", "metrics.R")) - - # aggregate logs from - path_log_data <- file.path(common_root, - "molevol_scripts", "log_data", "prod_logs.rda") - - # ensure the folder exists to the location - if (!dir.exists(path_log_data)) { - dir.create(dirname(path_log_data), - recursive = TRUE, showWarnings = FALSE) - } - - # attempt to load pre-generated logdata - if (!file.exists(path_log_data)) { - logs <- aggregate_logs(dir_job_results, latest_date = Sys.Date() - 60) - save(logs, file = path_log_data) - } else { - load(path_log_data) # loads the logs object - } - df_log <- logs$df_log - procs <- c( - "dblast", "dblast_cleanup", "iprscan", - "ipr2lineage", "ipr2da", "blast_clust", - "clust2table" - ) - list_proc_medians <- df_log |> - dplyr::select(dplyr::all_of(procs)) |> - dplyr::summarise( - dplyr::across( - dplyr::everything(), - \(x) median(x, na.rm = TRUE) - ) - ) |> - as.list() - return(list_proc_medians) - }, error = function(e) { - message(paste("Encountered an error: ", e$message)) - }, warning = function(w) { - message(paste("Warning: ", w$message)) - }, finally = { - message("calculateProcessRuntime function execution completed.") - }) + # ensure the folder exists to the location + if (!dir.exists(path_log_data)) { + dir.create(dirname(path_log_data), + recursive = TRUE, showWarnings = FALSE) + } + # attempt to load pre-generated logdata + if (!file.exists(path_log_data)) { + logs <- aggregate_logs(dir_job_results, latest_date = Sys.Date() - 60) + save(logs, file = path_log_data) + } else { + load(path_log_data) # loads the logs object + } + df_log <- logs$df_log + procs <- c( + "dblast", "dblast_cleanup", "iprscan", + "ipr2lineage", "ipr2da", "blast_clust", + "clust2table" + ) + list_proc_medians <- df_log |> + dplyr::select(dplyr::all_of(procs)) |> + dplyr::summarise( + dplyr::across( + dplyr::everything(), + \(x) median(x, na.rm = TRUE) + ) + ) |> + as.list() + return(list_proc_medians) } #' Write a table of 2 columns: 1) process and 2) median seconds @@ -162,39 +122,18 @@ calculateProcessRuntime <- function(dir_job_results) { #' ) #' @export writeProcessRuntime2TSV <- function(dir_job_results, filepath) { - tryCatch({ - # Error handling for input arguments - if (!is.character(dir_job_results) || length(dir_job_results) != 1) { - stop("Input 'dir_job_results' must be a single character string.") - } - - if (!dir.exists(dir_job_results)) { - stop(paste("The directory", dir_job_results, "does not exist.")) - } - - if (!is.character(filepath) || length(filepath) != 1) { - stop("Input 'filepath' must be a single character string.") - } - df_proc_medians <- calculateProcessRuntime(dir_job_results) |> - tibble::as_tibble() |> - tidyr::pivot_longer( - dplyr::everything(), - names_to = "process", - values_to = "median_seconds" - ) |> - dplyr::arrange(dplyr::desc(median_seconds)) - - # Write the resulting tibble to a TSV file - readr::write_tsv(df_proc_medians, file = filepath) - return(df_proc_medians) - }, error = function(e) { - message(paste("Encountered an error: ", e$message)) - }, warning = function(w) { - message(paste("Warning: ", w$message)) - }, finally = { - message("writeProcessRuntime2TSV function execution completed.") - }) - + df_proc_medians <- calculateProcessRuntime(dir_job_results) |> + tibble::as_tibble() |> + tidyr::pivot_longer( + dplyr::everything(), + names_to = "process", + values_to = "median_seconds" + ) |> + dplyr::arrange(dplyr::desc(median_seconds)) + + # Write the resulting tibble to a TSV file + readr::write_tsv(df_proc_medians, file = filepath) + return(df_proc_medians) } #' Compute median process runtimes, then write a YAML list of the processes and @@ -219,36 +158,8 @@ writeProcessRuntime2TSV <- function(dir_job_results, filepath) { #' } #' @export writeProcessRuntime2YML <- function(dir_job_results, filepath = NULL) { - tryCatch({ - # Error handling for dir_job_results arguments - if (!is.character(dir_job_results) || length(dir_job_results) != 1) { - stop("Input 'dir_job_results' must be a single character string.") - } - - if (!dir.exists(dir_job_results)) { - stop(paste("The directory", dir_job_results, "does not exist.")) - } - if (is.null(filepath)) { - filepath <- file.path(common_root, - "molevol_scripts", - "log_data", - "job_proc_weights.yml") - } - if (!is.character(filepath) || length(filepath) != 1) { - stop("Input 'filepath' must be a single character string.") - } - - medians <- calculateProcessRuntime(dir_job_results) - yaml::write_yaml(medians, filepath) - }, error = function(e) { - message(paste("Encountered an error: "), e$message) - }, warning = function(w) { - message(paste("Warning: "), w$message) - }, finally = { - message("writeProcessRuntime2TSV function execution completed.") - } - ) - + medians <- calculateProcessRuntime(dir_job_results) + yaml::write_yaml(medians, filepath) } #' Quickly get the runtime weights for MolEvolvR backend processes @@ -322,81 +233,49 @@ calculateEstimatedWallTimeFromOpts <- function(advanced_opts, n_inputs = 1L, n_hits = NULL, verbose = FALSE) { - - tryCatch({ - # to calculate est walltime for a homology search job, the number of hits - # must be provided - validation_fail <- is.null(n_hits) && "homology_search" %in% advanced_opts - stopifnot(!validation_fail) - - # Validate advanced_opts - if (!is.character(advanced_opts)) { - stop("Argument 'advanced_opts' must be a character vector.") - } - - # Validate n_inputs - if (!is.numeric(n_inputs) || length(n_inputs) != 1 || n_inputs <= 0) { - stop("Argument 'n_inputs' must be a single positive numeric value.") - } - - # Validate n_hits if homology_search is in advanced_opts - if ("homology_search" %in% advanced_opts && - (is.null(n_hits)|| !is.numeric(n_hits) - || length(n_hits) != 1 || n_hits < 0)) { - stop("Argument 'n_hits' must be a single non-negative numeric value when - 'homology_search' is in 'advanced_opts'.") - } - - # Get process weights - proc_weights <- writeProcessRuntime2YML() - if (!is.list(proc_weights)) { - stop("Process weights could not be retrieved correctly.") - } - - # sort process weights by names and convert to vec - proc_weights <- proc_weights[order(names(proc_weights))] |> unlist() - all_procs <- names(proc_weights) |> sort() - # get processes from advanced options and sort by names - procs_from_opts <- mapAdvOption2Process(advanced_opts) - procs_from_opts <- sort(procs_from_opts) - # binary encode: yes proc will run (1); else 0 - binary_proc_vec <- dplyr::if_else(all_procs %in% procs_from_opts, 1L, 0L) - # dot product of weights and procs to run; scaled by the number of inputs - est_walltime <- (n_inputs * (binary_proc_vec %*% proc_weights)) |> - as.numeric() - # calculate the additional processes to run for the homologous hits - if ("homology_search" %in% advanced_opts) { - opts2procs <- mapOption2Process() - # exclude the homology search processes for the homologous hits - procs2exclude_for_homologs <- opts2procs[["homology_search"]] - procs_homologs <- procs_from_opts[!(procs_from_opts - %in% procs2exclude_for_homologs)] - binary_proc_vec_homolog <- dplyr::if_else(all_procs - %in% procs_homologs, 1L, 0L) - # add the estimated walltime for processes run on the homologous hits - est_walltime <- est_walltime + - (n_hits * (binary_proc_vec_homolog - %*% proc_weights) |> as.numeric()) - } - if (verbose) { - msg <- stringr::str_glue( - "warnings from calculateEstimatedWallTimeFromOpts ():\n", - "\tn_inputs={n_inputs}\n", - "\tn_hits={ifelse(is.null(n_hits), 'null', n_hits)}\n", - "\test_walltime={est_walltime}\n\n" - ) - cat(file = stderr(), msg) - } - return(est_walltime) - }, error = function(e) { - message(paste("Encountered an error: ", e$message)) - }, warning = function(w) { - message(paste("Warning: ", w$message)) - }, finally = { - message("calculateEstimatedWallTimeFromOpts - function execution completed.") - }) - + # to calculate est walltime for a homology search job, the number of hits + # must be provided + validation_fail <- is.null(n_hits) && "homology_search" %in% advanced_opts + stopifnot(!validation_fail) + + # Get process weights + proc_weights <- writeProcessRuntime2YML() + + # sort process weights by names and convert to vec + proc_weights <- proc_weights[order(names(proc_weights))] |> unlist() + all_procs <- names(proc_weights) |> sort() + # get processes from advanced options and sort by names + procs_from_opts <- mapAdvOption2Process(advanced_opts) + procs_from_opts <- sort(procs_from_opts) + # binary encode: yes proc will run (1); else 0 + binary_proc_vec <- dplyr::if_else(all_procs %in% procs_from_opts, 1L, 0L) + # dot product of weights and procs to run; scaled by the number of inputs + est_walltime <- (n_inputs * (binary_proc_vec %*% proc_weights)) |> + as.numeric() + # calculate the additional processes to run for the homologous hits + if ("homology_search" %in% advanced_opts) { + opts2procs <- mapOption2Process() + # exclude the homology search processes for the homologous hits + procs2exclude_for_homologs <- opts2procs[["homology_search"]] + procs_homologs <- procs_from_opts[!(procs_from_opts + %in% procs2exclude_for_homologs)] + binary_proc_vec_homolog <- dplyr::if_else(all_procs + %in% procs_homologs, 1L, 0L) + # add the estimated walltime for processes run on the homologous hits + est_walltime <- est_walltime + + (n_hits * (binary_proc_vec_homolog + %*% proc_weights) |> as.numeric()) + } + if (verbose) { + msg <- stringr::str_glue( + "warnings from calculateEstimatedWallTimeFromOpts ():\n", + "\tn_inputs={n_inputs}\n", + "\tn_hits={ifelse(is.null(n_hits), 'null', n_hits)}\n", + "\test_walltime={est_walltime}\n\n" + ) + cat(file = stderr(), msg) + } + return(est_walltime) } @@ -418,25 +297,8 @@ assignJobQueue <- function( t_sec_estimate, t_cutoff = 21600 # 6 hours ) { - tryCatch({ - if (!is.numeric(t_sec_estimate) || length(t_sec_estimate) != 1) { - stop("Argument 't_sec_estimate' must be a single numeric value.") - } - - if (!is.numeric(t_cutoff) || length(t_cutoff) != 1 || t_cutoff < 0) { - stop("Argument 't_cutoff' must be a single non-negative numeric value.") - } - - queue <- ifelse(t_sec_estimate > t_cutoff, "long", "short") - return(queue) - }, error = function(e) { - message(paste("Encountered an error: ", e$message)) - }, warning = function(w) { - message(paste("Warning: ", w$message)) - }, finally = { - message("assignJobQueue function execution completed.") - }) - + queue <- ifelse(t_sec_estimate > t_cutoff, "long", "short") + return(queue) } #' Plot the estimated runtimes for different advanced options and number @@ -456,7 +318,6 @@ assignJobQueue <- function( #' dev/molevol_scripts/docs/estimate_walltimes.png", plot = p) #' @export plotEstimatedWallTimes <- function() { - tryCatch({ opts <- mapOption2Process() |> names() # get all possible submission permutations (powerset) get_powerset <- function(vec) { @@ -536,12 +397,4 @@ plotEstimatedWallTimes <- function() { y = "Estimated walltime (hours)" ) return(p) - }, error = function(e) { - message(paste("Encountered an error: ", e$message)) - }, warning = function(w) { - message(paste("Warning: ", w$message)) - }, finally = { - message("plotEstimatedWallTimes function execution completed.") - }) - } diff --git a/R/blastWrappers.R b/R/blastWrappers.R index 15484a1b..9b55f3ee 100755 --- a/R/blastWrappers.R +++ b/R/blastWrappers.R @@ -21,52 +21,24 @@ run_deltablast <- function(deltablast_path, db_search_path, db = "refseq", query, evalue = "1e-5", out, num_alignments, num_threads = 1) { - # Argument validation - if (!file.exists(deltablast_path)) { - stop("The DELTABLAST executable path is invalid: ", deltablast_path) - } - if (!dir.exists(db_search_path)) { - stop("The database search path is invalid: ", db_search_path) - } - if (!file.exists(query)) { - stop("The query file path is invalid: ", query) - } - if (!is.numeric(as.numeric(evalue)) || as.numeric(evalue) <= 0) { - stop("The evalue must be a positive number: ", evalue) - } - if (!is.numeric(num_alignments) || num_alignments <= 0) { - stop("The number of alignments must be a - positive integer: ", num_alignments) - } - if (!is.numeric(num_threads) || num_threads <= 0) { - stop("The number of threads must be a positive integer: ", num_threads) - } - start <- Sys.time() - tryCatch({ - system(paste0("export BLASTDB=/", db_search_path)) - system2( - command = deltablast_path, - args = c( - "-db", db, - "-query", query, - "-evalue", evalue, - "-out", out, - "-num_threads", num_threads, - "-num_alignments", num_alignments - # ,"-outfmt", outfmt - ) + system(paste0("export BLASTDB=/", db_search_path)) + + system2( + command = deltablast_path, + args = c( + "-db", db, + "-query", query, + "-evalue", evalue, + "-out", out, + "-num_threads", num_threads, + "-num_alignments", num_alignments + # ,"-outfmt", outfmt ) - print(Sys.time() - start) - }, error = function(e) { - message(paste("Error in run_deltablast: ", e)) - }, warning = function(w) { - message(paste("Warning in run_deltablast: ", w)) - }, finally = { - message("run_deltablast completed") - }) + ) + print(Sys.time() - start) } @@ -88,46 +60,19 @@ run_deltablast <- function(deltablast_path, db_search_path, run_rpsblast <- function(rpsblast_path, db_search_path, db = "refseq", query, evalue = "1e-5", out, num_threads = 1) { - # Argument validation - if (!file.exists(rpsblast_path)) { - stop("The RPSBLAST executable path is invalid: ", rpsblast_path) - } - if (!dir.exists(db_search_path)) { - stop("The database search path is invalid: ", db_search_path) - } - if (!file.exists(query)) { - stop("The query file path is invalid: ", query) - } - if (!is.numeric(as.numeric(evalue)) || as.numeric(evalue) <= 0) { - stop("The evalue must be a positive number: ", evalue) - } - if (!is.numeric(num_threads) || num_threads <= 0) { - stop("The number of threads must be a positive integer: ", num_threads) - } start <- Sys.time() + system(paste0("export BLASTDB=/", db_search_path)) - tryCatch({ - - system(paste0("export BLASTDB=/", db_search_path)) - - system2( - command = rpsblast_path, - args = c( - "-db", db, - "-query", query, - "-evalue", evalue, - "-out", out, - "-num_threads", num_threads - ) + system2( + command = rpsblast_path, + args = c( + "-db", db, + "-query", query, + "-evalue", evalue, + "-out", out, + "-num_threads", num_threads ) - print(Sys.time() - start) - }, error = function(e) { - message(paste("Error in run_rpsblast: ", e)) - }, warning = function(w) { - message(paste("Warning in run_rpsblast: ", w)) - }, finally = { - message("run_rpsblast completed") - }) - + ) + print(Sys.time() - start) } From 392775de92dfc33b198b41a5a2843f5313dd2e0d Mon Sep 17 00:00:00 2001 From: David Mayer Date: Sun, 13 Oct 2024 19:43:58 -0600 Subject: [PATCH 16/19] adjust .Rd title tags for renamed functions --- R/assign_job_queue.R | 27 +++++++++++++++++++++++ R/create_lineage_lookup.R | 3 +++ man/assignJobQueue.Rd | 2 +- man/calculateEstimatedWallTimeFromOpts.Rd | 3 +-- man/calculateProcessRuntime.Rd | 2 +- man/createLineageLookup.Rd | 2 +- man/getProcessRuntimeWeights.Rd | 2 +- man/mapAdvOption2Process.Rd | 2 +- man/mapOption2Process.Rd | 2 +- man/plotEstimatedWallTimes.Rd | 6 +++-- man/writeProcessRuntime2TSV.Rd | 2 +- man/writeProcessRuntime2YML.Rd | 6 +++-- 12 files changed, 46 insertions(+), 13 deletions(-) diff --git a/R/assign_job_queue.R b/R/assign_job_queue.R index 4791b4a1..20ba841f 100644 --- a/R/assign_job_queue.R +++ b/R/assign_job_queue.R @@ -6,6 +6,9 @@ # file.path(common_root, "molevol_scripts", "R", "assignJobQueue.R") common_root <- Sys.getenv("COMMON_SRC_ROOT") +#' mapOption2Process +#' +#' @description #' Construct list where names (MolEvolvR advanced options) point to processes #' #' @return list where names (MolEvolvR advanced options) point to processes @@ -22,6 +25,9 @@ mapOption2Process <- function() { return(opts2processes) } +#' mapAdvOption2Process +#' +#' @description #' Use MolEvolvR advanced options to get associated processes #' #' @param advanced_opts character vector of MolEvolvR advanced options @@ -44,6 +50,9 @@ mapAdvOption2Process <- function(advanced_opts) { return(procs) } +#' calculateProcessRuntime +#' +#' @description #' Scrape MolEvolvR logs and calculate median processes #' #' @param dir_job_results [chr] path to MolEvolvR job_results @@ -104,6 +113,9 @@ calculateProcessRuntime <- function(dir_job_results) { return(list_proc_medians) } +#' writeProcessRuntime2TSV +#' +#' @description #' Write a table of 2 columns: 1) process and 2) median seconds #' #' @param dir_job_results [chr] path to MolEvolvR job_results @@ -136,6 +148,9 @@ writeProcessRuntime2TSV <- function(dir_job_results, filepath) { return(df_proc_medians) } +#' writeProcessRuntime2YML +#' +#' @description #' Compute median process runtimes, then write a YAML list of the processes and #' their median runtimes in seconds to the path specified by 'filepath'. #' @@ -162,6 +177,9 @@ writeProcessRuntime2YML <- function(dir_job_results, filepath = NULL) { yaml::write_yaml(medians, filepath) } +#' getProcessRuntimeWeights +#' +#' @description #' Quickly get the runtime weights for MolEvolvR backend processes #' #' @param dir_job_results [chr] path to MolEvolvR job_results @@ -213,6 +231,9 @@ getProcessRuntimeWeights <- function(medians_yml_path = NULL) { return(proc_weights) } +#' calculateEstimatedWallTimeFromOpts +#' +#' @description #' Given MolEvolvR advanced options and number of inputs, #' calculate the total estimated walltime for the job #' @@ -279,6 +300,9 @@ calculateEstimatedWallTimeFromOpts <- function(advanced_opts, } +#' assignJobQueue +#' +#' @description #' Decision function to assign job queue #' #' @param t_sec_estimate estimated number of seconds a job will process @@ -301,6 +325,9 @@ assignJobQueue <- function( return(queue) } +#' plotEstimatedWallTimes +#' +#' @description #' Plot the estimated runtimes for different advanced options and number #' of inputs #' diff --git a/R/create_lineage_lookup.R b/R/create_lineage_lookup.R index 78e79048..2408c5e6 100644 --- a/R/create_lineage_lookup.R +++ b/R/create_lineage_lookup.R @@ -3,6 +3,9 @@ # library(biomartr) +#' createLineageLookup +#' +#' @description #' Create a look up table that goes from TaxID, to Lineage #' #' @author Samuel Chen diff --git a/man/assignJobQueue.Rd b/man/assignJobQueue.Rd index 3663ce56..de646a82 100644 --- a/man/assignJobQueue.Rd +++ b/man/assignJobQueue.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/assign_job_queue.R \name{assignJobQueue} \alias{assignJobQueue} -\title{Decision function to assign job queue} +\title{assignJobQueue} \usage{ assignJobQueue(t_sec_estimate, t_cutoff = 21600) } diff --git a/man/calculateEstimatedWallTimeFromOpts.Rd b/man/calculateEstimatedWallTimeFromOpts.Rd index c09cf6a6..d5361001 100644 --- a/man/calculateEstimatedWallTimeFromOpts.Rd +++ b/man/calculateEstimatedWallTimeFromOpts.Rd @@ -2,8 +2,7 @@ % Please edit documentation in R/assign_job_queue.R \name{calculateEstimatedWallTimeFromOpts} \alias{calculateEstimatedWallTimeFromOpts} -\title{Given MolEvolvR advanced options and number of inputs, -calculate the total estimated walltime for the job} +\title{calculateEstimatedWallTimeFromOpts} \usage{ calculateEstimatedWallTimeFromOpts( advanced_opts, diff --git a/man/calculateProcessRuntime.Rd b/man/calculateProcessRuntime.Rd index bb6dd1ed..579ea2b6 100644 --- a/man/calculateProcessRuntime.Rd +++ b/man/calculateProcessRuntime.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/assign_job_queue.R \name{calculateProcessRuntime} \alias{calculateProcessRuntime} -\title{Scrape MolEvolvR logs and calculate median processes} +\title{calculateProcessRuntime} \usage{ calculateProcessRuntime(dir_job_results) } diff --git a/man/createLineageLookup.Rd b/man/createLineageLookup.Rd index 5dbab978..132019ce 100644 --- a/man/createLineageLookup.Rd +++ b/man/createLineageLookup.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/create_lineage_lookup.R \name{createLineageLookup} \alias{createLineageLookup} -\title{Create a look up table that goes from TaxID, to Lineage} +\title{createLineageLookup} \usage{ createLineageLookup( lineage_file = here("data/rankedlineage.dmp"), diff --git a/man/getProcessRuntimeWeights.Rd b/man/getProcessRuntimeWeights.Rd index ff3c8e5d..de0e2ea6 100644 --- a/man/getProcessRuntimeWeights.Rd +++ b/man/getProcessRuntimeWeights.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/assign_job_queue.R \name{getProcessRuntimeWeights} \alias{getProcessRuntimeWeights} -\title{Quickly get the runtime weights for MolEvolvR backend processes} +\title{getProcessRuntimeWeights} \usage{ getProcessRuntimeWeights(medians_yml_path = NULL) } diff --git a/man/mapAdvOption2Process.Rd b/man/mapAdvOption2Process.Rd index 5bd9ee65..6a210a20 100644 --- a/man/mapAdvOption2Process.Rd +++ b/man/mapAdvOption2Process.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/assign_job_queue.R \name{mapAdvOption2Process} \alias{mapAdvOption2Process} -\title{Use MolEvolvR advanced options to get associated processes} +\title{mapAdvOption2Process} \usage{ mapAdvOption2Process(advanced_opts) } diff --git a/man/mapOption2Process.Rd b/man/mapOption2Process.Rd index ff6905c5..9645617b 100644 --- a/man/mapOption2Process.Rd +++ b/man/mapOption2Process.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/assign_job_queue.R \name{mapOption2Process} \alias{mapOption2Process} -\title{Construct list where names (MolEvolvR advanced options) point to processes} +\title{mapOption2Process} \usage{ mapOption2Process() } diff --git a/man/plotEstimatedWallTimes.Rd b/man/plotEstimatedWallTimes.Rd index 0d53cb32..36b0ecd5 100644 --- a/man/plotEstimatedWallTimes.Rd +++ b/man/plotEstimatedWallTimes.Rd @@ -2,8 +2,7 @@ % Please edit documentation in R/assign_job_queue.R \name{plotEstimatedWallTimes} \alias{plotEstimatedWallTimes} -\title{Plot the estimated runtimes for different advanced options and number -of inputs} +\title{plotEstimatedWallTimes} \usage{ plotEstimatedWallTimes() } @@ -16,5 +15,8 @@ ggplot2::ggsave(filename = "/data/molevolvr_transfer/molevolvr_ dev/molevol_scripts/docs/estimate_walltimes.png", plot = p) } \description{ +Plot the estimated runtimes for different advanced options and number +of inputs + this function was just for fun; very, very messy code } diff --git a/man/writeProcessRuntime2TSV.Rd b/man/writeProcessRuntime2TSV.Rd index 03cbbd68..0e045a5c 100644 --- a/man/writeProcessRuntime2TSV.Rd +++ b/man/writeProcessRuntime2TSV.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/assign_job_queue.R \name{writeProcessRuntime2TSV} \alias{writeProcessRuntime2TSV} -\title{Write a table of 2 columns: 1) process and 2) median seconds} +\title{writeProcessRuntime2TSV} \usage{ writeProcessRuntime2TSV(dir_job_results, filepath) } diff --git a/man/writeProcessRuntime2YML.Rd b/man/writeProcessRuntime2YML.Rd index b43f39ee..865f23f7 100644 --- a/man/writeProcessRuntime2YML.Rd +++ b/man/writeProcessRuntime2YML.Rd @@ -2,8 +2,7 @@ % Please edit documentation in R/assign_job_queue.R \name{writeProcessRuntime2YML} \alias{writeProcessRuntime2YML} -\title{Compute median process runtimes, then write a YAML list of the processes and -their median runtimes in seconds to the path specified by 'filepath'.} +\title{writeProcessRuntime2YML} \usage{ writeProcessRuntime2YML(dir_job_results, filepath = NULL) } @@ -14,6 +13,9 @@ writeProcessRuntime2YML(dir_job_results, filepath = NULL) uses ./molevol_scripts/log_data/job_proc_weights.yml} } \description{ +Compute median process runtimes, then write a YAML list of the processes and +their median runtimes in seconds to the path specified by 'filepath'. + The default value of filepath is the value of the env var MOLEVOLVR_PROC_WEIGHTS, which writeProcessRuntime2YML() also uses as its default read location. From df602dfd63cbab0d84dbcc8229e3da9c7646b9d5 Mon Sep 17 00:00:00 2001 From: David Mayer Date: Tue, 22 Oct 2024 13:52:56 -0600 Subject: [PATCH 17/19] https://github.com/JRaviLab/MolEvolvR/pull/95/files#r1805272251 - re-implement dropped check - fix .Rd --- R/assign_job_queue.R | 5 ++++- man/writeProcessRuntime2YML.Rd | 2 +- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/R/assign_job_queue.R b/R/assign_job_queue.R index 20ba841f..69609417 100644 --- a/R/assign_job_queue.R +++ b/R/assign_job_queue.R @@ -155,7 +155,7 @@ writeProcessRuntime2TSV <- function(dir_job_results, filepath) { #' their median runtimes in seconds to the path specified by 'filepath'. #' #' The default value of filepath is the value of the env var -#' MOLEVOLVR_PROC_WEIGHTS, which writeProcessRuntime2YML() also uses as its default +#' MOLEVOLVR_PROC_WEIGHTS, which getProcessRuntimeWeights() also uses as its default #' read location. #' #' @param dir_job_results [chr] path to MolEvolvR job_results directory @@ -173,6 +173,9 @@ writeProcessRuntime2TSV <- function(dir_job_results, filepath) { #' } #' @export writeProcessRuntime2YML <- function(dir_job_results, filepath = NULL) { + if (is.null(filepath)) { + filepath <- file.path(common_root, "molevol_scripts", "log_data", "job_proc_weights.yml") + } medians <- calculateProcessRuntime(dir_job_results) yaml::write_yaml(medians, filepath) } diff --git a/man/writeProcessRuntime2YML.Rd b/man/writeProcessRuntime2YML.Rd index 865f23f7..5e0a05a4 100644 --- a/man/writeProcessRuntime2YML.Rd +++ b/man/writeProcessRuntime2YML.Rd @@ -17,7 +17,7 @@ Compute median process runtimes, then write a YAML list of the processes and their median runtimes in seconds to the path specified by 'filepath'. The default value of filepath is the value of the env var -MOLEVOLVR_PROC_WEIGHTS, which writeProcessRuntime2YML() also uses as its default +MOLEVOLVR_PROC_WEIGHTS, which getProcessRuntimeWeights() also uses as its default read location. } \examples{ From 1a0b66358eac637736a18868ae27e4049aa22628 Mon Sep 17 00:00:00 2001 From: David Mayer Date: Tue, 22 Oct 2024 14:43:47 -0600 Subject: [PATCH 18/19] https://github.com/JRaviLab/MolEvolvR/pull/95#discussion_r1805166466 - adjust roxygen skeleton readability --- R/acc2lin.R | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/R/acc2lin.R b/R/acc2lin.R index 61aae87c..7b6f570c 100644 --- a/R/acc2lin.R +++ b/R/acc2lin.R @@ -198,10 +198,8 @@ efetchIPG <- function(accnums, out_path, plan = "sequential", ...) { #' of an efetch run on the ipg database and #' #' @param accessions Character vector of protein accessions -#' @param ipg_file Filepath to the file -#' containing results of an efetch run on the -#' ipg database. The protein accession in -#' 'accessions' should be contained in this +#' @param ipg_file Filepath to the file containing results of an efetch run on the +#' ipg database. The protein accession in 'accessions' should be contained in this #' file #' @param assembly_path String of the path to the assembly_summary path #' This file can be generated using the \link[MolEvolvR]{downloadAssemblySummary} function From 13e70c75a197c02c395cbef2d7b3c5b991ea7649 Mon Sep 17 00:00:00 2001 From: David Mayer Date: Tue, 22 Oct 2024 15:02:39 -0600 Subject: [PATCH 19/19] formatting --- R/acc2lin.R | 8 ++------ man/efetchIPG.Rd | 3 +-- man/sinkReset.Rd | 1 - 3 files changed, 3 insertions(+), 9 deletions(-) diff --git a/R/acc2lin.R b/R/acc2lin.R index 7b6f570c..5f25afe2 100644 --- a/R/acc2lin.R +++ b/R/acc2lin.R @@ -10,7 +10,6 @@ #' Sink Reset #' #' @return No return, but run to close all outstanding `sink()`s -#' and handles any errors or warnings that occur during the process. #' #' @export #' @@ -87,8 +86,7 @@ addLineage <- function(df, acc_col = "AccNum", assembly_path, #' This file can be generated using the \link[MolEvolvR]{downloadAssemblySummary} function #' @param lineagelookup_path String of the path to the lineage lookup file #' (taxid to lineage mapping). This file can be generated using the -#' @param ipgout_path Path to write the results -#' of the efetch run of the accessions +#' @param ipgout_path Path to write the results of the efetch run of the accessions #' on the ipg database. If NULL, the file will not be written. Defaults to NULL #' @param plan A string specifying the parallelization strategy for the future #' package, such as `"sequential"` or `"multisession"`. @@ -122,9 +120,7 @@ acc2Lineage <- function(accessions, assembly_path, lineagelookup_path, ipgout_pa #' #' @author Samuel Chen, Janani Ravi #' -#' @description Perform efetch on the ipg database -#' and write the results to out_path -#' +#' @description Perform efetch on the ipg database and write the results to out_path #' @param accnums Character vector containing the accession numbers to query on #' the ipg database #' @param out_path Path to write the efetch results to diff --git a/man/efetchIPG.Rd b/man/efetchIPG.Rd index db63024f..047e2652 100644 --- a/man/efetchIPG.Rd +++ b/man/efetchIPG.Rd @@ -23,8 +23,7 @@ the ipg database} No return value. The function writes the fetched results to \code{out_path}. } \description{ -Perform efetch on the ipg database -and write the results to out_path +Perform efetch on the ipg database and write the results to out_path Perform efetch on the ipg database and write the results to out_path } diff --git a/man/sinkReset.Rd b/man/sinkReset.Rd index e3fc7ce4..0285c0b2 100644 --- a/man/sinkReset.Rd +++ b/man/sinkReset.Rd @@ -8,7 +8,6 @@ sinkReset() } \value{ No return, but run to close all outstanding \code{sink()}s -and handles any errors or warnings that occur during the process. } \description{ Sink Reset