JRaviLab · the-mayer · Oct 22, 2024 · Oct 5, 2024 · Oct 7, 2024 · Oct 7, 2024
diff --git a/NAMESPACE b/NAMESPACE
@@ -11,18 +11,19 @@ export(addLeaves2Alignment)
 export(addLineage)
 export(addName)
 export(addTaxID)
-export(advanced_opts2est_walltime)
 export(alignFasta)
-export(assign_job_queue)
+export(assignJobQueue)
+export(calculateEstimatedWallTimeFromOpts)
+export(calculateProcessRuntime)
 export(cleanClusters)
 export(cleanDomainArchitecture)
 export(cleanGeneDescription)
 export(cleanGenomicContext)
 export(cleanLineage)
 export(cleanSpecies)
-export(combine_files)
-export(combine_full)
-export(combine_ipr)
+export(combineFiles)
+export(combineFullAnalysis)
+export(combineIPR)
 export(condenseRepeatedDomains)
 export(convert2TitleCase)
 export(convertAlignment2FA)
@@ -32,10 +33,10 @@ export(countByColumn)
 export(createFA2Tree)
 export(createJobResultsURL)
 export(createJobStatusEmailMessage)
+export(createLineageLookup)
 export(createRepresentativeAccNum)
 export(createWordCloud2Element)
 export(createWordCloudElement)
-export(create_lineage_lookup)
 export(domain_network)
 export(downloadAssemblySummary)
 export(efetchIPG)
@@ -48,13 +49,13 @@ export(gc_undirected_network)
 export(generateAllAlignments2FA)
 export(generate_msa)
 export(getAccNumFromFA)
+export(getProcessRuntimeWeights)
 export(getTopAccByLinDomArch)
-export(get_proc_medians)
-export(get_proc_weights)
-export(make_opts2procs)
 export(mapAcc2Name)
-export(map_advanced_opts2procs)
+export(mapAdvOption2Process)
+export(mapOption2Process)
 export(msa_pdf)
+export(plotEstimatedWallTimes)
 export(plotIPR2Viz)
 export(plotIPR2VizWeb)
 export(plotLineageDA)
@@ -67,7 +68,6 @@ export(plotStackedLineage)
 export(plotSunburst)
 export(plotTreemap)
 export(plotUpSet)
-export(plot_estimated_walltimes)
 export(prepareColumnParams)
 export(prepareSingleColumnParams)
 export(proteinAcc2TaxID)
@@ -97,8 +97,8 @@ export(totalGenContextOrDomArchCounts)
 export(validateCountDF)
 export(wordcloud3)
 export(writeMSA_AA2FA)
-export(write_proc_medians_table)
-export(write_proc_medians_yml)
+export(writeProcessRuntime2TSV)
+export(writeProcessRuntime2YML)
 importFrom(Biostrings,AAStringSet)
 importFrom(Biostrings,readAAStringSet)
 importFrom(Biostrings,toString)

diff --git a/R/acc2lin.R b/R/acc2lin.R
@@ -10,6 +10,8 @@
 #' Sink Reset
 #'
 #' @return No return, but run to close all outstanding `sink()`s
+#'         by calling `sink(NULL)` once for each active sink.
+#'
 #' @export
 #'
 #' @examples
@@ -18,7 +20,7 @@
 #' }
 sinkReset <- function() {
     for (i in seq_len(sink.number())) {
-        sink(NULL)
+      sink(NULL)
     }
 }
 
@@ -56,18 +58,20 @@ addLineage <- function(df, acc_col = "AccNum", assembly_path,
     accessions <- df %>% pull(acc_col)
     lins <- acc2Lineage(accessions, assembly_path, lineagelookup_path, ipgout_path, plan)
 
-    # Drop a lot of the unimportant columns for now? will make merging much easier
-    lins <- lins[, c(
+      # Drop a lot of the unimportant columns for now? 
+      # will make merging much easier
+      lins <- lins[, c(
         "Strand", "Start", "Stop", "Nucleotide Accession", "Source",
         "Id", "Strain"
-    ) := NULL]
-    lins <- unique(lins)
+      ) := NULL]
+      lins <- unique(lins)
 
-    # dup <- lins %>% group_by(Protein) %>% summarize(count = n()) %>% filter(count > 1) %>%
-    #   pull(Protein)
+      # dup <- lins %>% group_by(Protein) %>% 
+      # summarize(count = n()) %>% filter(count > 1) %>%
+      # pull(Protein)
 
-    merged <- merge(df, lins, by.x = acc_col, by.y = "Protein", all.x = TRUE)
-    return(merged)
+      merged <- merge(df, lins, by.x = acc_col, by.y = "Protein", all.x = TRUE)
+      return(merged)
 }
 
 
@@ -83,7 +87,8 @@ addLineage <- function(df, acc_col = "AccNum", assembly_path,
 #' This file can be generated using the \link[MolEvolvR]{downloadAssemblySummary} function
 #' @param lineagelookup_path String of the path to the lineage lookup file
 #' (taxid to lineage mapping). This file can be generated using the
-#' @param ipgout_path Path to write the results of the efetch run of the accessions
+#' @param ipgout_path Path to write the results 
+#'                    of the efetch run of the accessions
 #' on the ipg database. If NULL, the file will not be written. Defaults to NULL
 #' @param plan A string specifying the parallelization strategy for the future
 #' package, such as `"sequential"` or `"multisession"`.
@@ -107,17 +112,18 @@ acc2Lineage <- function(accessions, assembly_path, lineagelookup_path, ipgout_pa
 
     lins <- IPG2Lineage(accessions, ipgout_path, assembly_path, lineagelookup_path)
 
-    if (tmp_ipg) {
-        unlink(tempdir(), recursive = T)
-    }
-    return(lins)
+  if (tmp_ipg) {
+    unlink(tempdir(), recursive = T)
+  }
+  return(lins)
 }
 
 #' efetchIPG
 #'
 #' @author Samuel Chen, Janani Ravi
 #'
-#' @description Perform efetch on the ipg database and write the results to out_path
+#' @description Perform efetch on the ipg database
+#'              and write the results to out_path
 #'
 #' @param accnums Character vector containing the accession numbers to query on
 #' the ipg database
@@ -144,56 +150,64 @@ efetchIPG <- function(accnums, out_path, plan = "sequential", ...) {
             # limit of 10/second w/ key
             l <- length(in_data)
 
-            partitioned <- list()
-            for (i in 1:groups)
-            {
-                partitioned[[i]] <- in_data[seq.int(i, l, groups)]
-            }
-
-            return(partitioned)
-        }
-
-        plan(strategy = plan, .skip = T)
-
-
-        min_groups <- length(accnums) / 200
-        groups <- min(max(min_groups, 15), length(accnums))
-        partitioned_acc <- partition(accnums, groups)
-        sink(out_path)
-
-        a <- future_map(1:length(partitioned_acc), function(x) {
-            # Avoid hitting the rate API limit
-            if (x %% 9 == 0) {
-                Sys.sleep(1)
-            }
-            cat(
-                entrez_fetch(
-                    id = partitioned_acc[[x]],
-                    db = "ipg",
-                    rettype = "xml",
-                    api_key = "YOUR_KEY_HERE" ## Can this be included in public package?
-                )
-            )
-        })
-        sink(NULL)
+      partitioned <- list()
+      for (i in 1:groups){
+        partitioned[[i]] <- in_data[seq.int(i, l, groups)]
+      }
+
+      return(partitioned)
     }
+
+    # Set the future plan strategy
+    plan(strategy = plan, .skip = T)
+
+
+    min_groups <- length(accnums) / 200
+    groups <- min(max(min_groups, 15), length(accnums))
+    partitioned_acc <- partition(accnums, groups)
+
+    # Open the sink to the output path
+    sink(out_path)
+
+    a <- future_map(1:length(partitioned_acc), function(x) {
+      # Avoid hitting the rate API limit
+      if (x %% 9 == 0) {
+        Sys.sleep(1)
+      }
+      cat(
+        entrez_fetch(
+          id = partitioned_acc[[x]],
+          db = "ipg",
+          rettype = "xml",
+          api_key = "YOUR_KEY_HERE" ## Can this be included in public package?
+        )
+      )
+    })
+    sink(NULL)
+
+  }
 }
 
+
+
 #' IPG2Lineage
 #'
 #' @author Samuel Chen, Janani Ravi
 #'
-#' @description Takes the resulting file of an efetch run on the ipg database and
+#' @description Reads the result file of an efetch run on the ipg database,
+#'              keeps the rows for `accessions`, and passes them to `GCA2Lineage()`.
 #'
 #' @param accessions Character vector of protein accessions
-#' @param ipg_file Filepath to the file containing results of an efetch run on the
-#' ipg database. The protein accession in 'accessions' should be contained in this
+#' @param ipg_file Filepath to the file containing the results of an
+#'                 efetch run on the ipg database.
+#'                 The protein accessions in `accessions`
+#'                 should be contained in this
 #' file
 #' @param assembly_path String of the path to the assembly_summary path
 #' This file can be generated using the \link[MolEvolvR]{downloadAssemblySummary} function
 #' @param lineagelookup_path String of the path to the lineage lookup file
 #' (taxid to lineage mapping). This file can be generated using the
-#' "create_lineage_lookup()" function
+#' "createLineageLookup()" function
 #'
 #' @importFrom data.table fread
 #'
@@ -209,8 +223,10 @@ efetchIPG <- function(accnums, out_path, plan = "sequential", ...) {
 IPG2Lineage <- function(accessions, ipg_file, assembly_path, lineagelookup_path, ...) {
     ipg_dt <- fread(ipg_file, sep = "\t", fill = T)
 
+    # Filter the IPG data table to only include the accessions
     ipg_dt <- ipg_dt[Protein %in% accessions]
 
+    # Rename the 'Assembly' column to 'GCA_ID'
     ipg_dt <- setnames(ipg_dt, "Assembly", "GCA_ID")
 
     lins <- GCA2Lineage(prot_data = ipg_dt, assembly_path, lineagelookup_path)