Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Function names rename for R/combine_analysis.R, R/combine_files.R, R/create_lineage_lookup.R, R/assign_job_queue.R #95

Merged
merged 22 commits into from
Oct 22, 2024
Merged
Show file tree
Hide file tree
Changes from 19 commits
Commits
Show all changes
22 commits
Select commit Hold shift + click to select a range
30d4bf3
usethis::pr_init("Implement error handling in acc2lin.R functions
Seyi007 Oct 5, 2024
4aeaa11
Add error handling to multiple functions
Seyi007 Oct 7, 2024
8798c2f
Merge branch 'main' into rename_functions
Seyi007 Oct 7, 2024
091d32e
fixing merge issue in NAMESPACE
Seyi007 Oct 8, 2024
fc63187
Added updated function name to NAMESPACE and removed unused argument …
Seyi007 Oct 8, 2024
38f3cb0
added error handling functionality for the run_deltablast and run_rps…
Seyi007 Oct 9, 2024
4ff68fb
Reverting to old function names for the following functions to create…
Seyi007 Oct 10, 2024
035c5e1
minor updates to namespace and Rd files after running devtool::check()
Seyi007 Oct 10, 2024
fb5ac23
Renamed the following function;
Seyi007 Oct 10, 2024
106eb14
reverting to old function names; make_opts2procs, map_advanced_opts2p…
Seyi007 Oct 10, 2024
e8e96bd
Getting reverted function names changes
Seyi007 Oct 10, 2024
a543898
Renamed the following functions in R/assign_job_queue.R;
Seyi007 Oct 10, 2024
2781858
Merge commit '1c43150ce12157e3a2caf64178ce437f1db82a1c'
the-mayer Oct 12, 2024
e946061
remove outdated .Rd
the-mayer Oct 12, 2024
9571333
let R sort NAMESPACE
the-mayer Oct 12, 2024
8c57369
regen new .Rd
the-mayer Oct 12, 2024
2061d7a
remove old tryCatch code (for now)
the-mayer Oct 12, 2024
70f0de8
remove code not relevant to PR
the-mayer Oct 14, 2024
392775d
adjust .Rd title tags for renamed functions
the-mayer Oct 14, 2024
df602df
https://github.com/JRaviLab/MolEvolvR/pull/95/files#r1805272251
the-mayer Oct 22, 2024
1a0b663
https://github.com/JRaviLab/MolEvolvR/pull/95#discussion_r1805166466
the-mayer Oct 22, 2024
13e70c7
formatting
the-mayer Oct 22, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 13 additions & 13 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -11,18 +11,19 @@ export(addLeaves2Alignment)
export(addLineage)
export(addName)
export(addTaxID)
export(advanced_opts2est_walltime)
export(alignFasta)
export(assign_job_queue)
export(assignJobQueue)
export(calculateEstimatedWallTimeFromOpts)
export(calculateProcessRuntime)
export(cleanClusters)
export(cleanDomainArchitecture)
export(cleanGeneDescription)
export(cleanGenomicContext)
export(cleanLineage)
export(cleanSpecies)
export(combine_files)
export(combine_full)
export(combine_ipr)
export(combineFiles)
export(combineFullAnalysis)
export(combineIPR)
export(condenseRepeatedDomains)
export(convert2TitleCase)
export(convertAlignment2FA)
Expand All @@ -32,10 +33,10 @@ export(countByColumn)
export(createFA2Tree)
export(createJobResultsURL)
export(createJobStatusEmailMessage)
export(createLineageLookup)
export(createRepresentativeAccNum)
export(createWordCloud2Element)
export(createWordCloudElement)
export(create_lineage_lookup)
export(domain_network)
export(downloadAssemblySummary)
export(efetchIPG)
Expand All @@ -48,13 +49,13 @@ export(gc_undirected_network)
export(generateAllAlignments2FA)
export(generate_msa)
export(getAccNumFromFA)
export(getProcessRuntimeWeights)
export(getTopAccByLinDomArch)
export(get_proc_medians)
export(get_proc_weights)
export(make_opts2procs)
export(mapAcc2Name)
export(map_advanced_opts2procs)
export(mapAdvOption2Process)
export(mapOption2Process)
export(msa_pdf)
export(plotEstimatedWallTimes)
export(plotIPR2Viz)
export(plotIPR2VizWeb)
export(plotLineageDA)
Expand All @@ -67,7 +68,6 @@ export(plotStackedLineage)
export(plotSunburst)
export(plotTreemap)
export(plotUpSet)
export(plot_estimated_walltimes)
export(prepareColumnParams)
export(prepareSingleColumnParams)
export(proteinAcc2TaxID)
Expand Down Expand Up @@ -97,8 +97,8 @@ export(totalGenContextOrDomArchCounts)
export(validateCountDF)
export(wordcloud3)
export(writeMSA_AA2FA)
export(write_proc_medians_table)
export(write_proc_medians_yml)
export(writeProcessRuntime2TSV)
export(writeProcessRuntime2YML)
importFrom(Biostrings,AAStringSet)
importFrom(Biostrings,readAAStringSet)
importFrom(Biostrings,toString)
Expand Down
118 changes: 67 additions & 51 deletions R/acc2lin.R
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@
#' Sink Reset
#'
#' @return No return value; called for its side effect of closing all
#' outstanding `sink()`s.
#'
#' @export
#'
#' @examples
Expand All @@ -18,7 +20,7 @@
#' }
sinkReset <- function() {
  # `sink.number()` is evaluated once, before any diversion is removed, so the
  # loop body runs exactly once per output diversion that was open on entry.
  for (i in seq_len(sink.number())) {
    # Pop a single diversion per pass. Popping more than one per pass would
    # reach an empty stack part-way through the loop and make the remaining
    # calls raise "no sink to remove" warnings.
    sink(NULL)
  }
}

Expand Down Expand Up @@ -56,18 +58,20 @@ addLineage <- function(df, acc_col = "AccNum", assembly_path,
accessions <- df %>% pull(acc_col)
lins <- acc2Lineage(accessions, assembly_path, lineagelookup_path, ipgout_path, plan)

# Drop a lot of the unimportant columns for now? will make merging much easier
lins <- lins[, c(
# Drop a lot of the unimportant columns for now?
# will make merging much easier
lins <- lins[, c(
"Strand", "Start", "Stop", "Nucleotide Accession", "Source",
"Id", "Strain"
) := NULL]
lins <- unique(lins)
) := NULL]
lins <- unique(lins)

# dup <- lins %>% group_by(Protein) %>% summarize(count = n()) %>% filter(count > 1) %>%
# pull(Protein)
# dup <- lins %>% group_by(Protein) %>%
# summarize(count = n()) %>% filter(count > 1) %>%
# pull(Protein)

merged <- merge(df, lins, by.x = acc_col, by.y = "Protein", all.x = TRUE)
return(merged)
merged <- merge(df, lins, by.x = acc_col, by.y = "Protein", all.x = TRUE)
return(merged)
}


Expand All @@ -83,7 +87,8 @@ addLineage <- function(df, acc_col = "AccNum", assembly_path,
#' This file can be generated using the \link[MolEvolvR]{downloadAssemblySummary} function
#' @param lineagelookup_path String of the path to the lineage lookup file
#' (taxid to lineage mapping). This file can be generated using the
#' \link[MolEvolvR]{createLineageLookup} function
#' @param ipgout_path Path to write the results of the efetch run of the accessions
#' @param ipgout_path Path to write the results
#' of the efetch run of the accessions
#' on the ipg database. If NULL, the file will not be written. Defaults to NULL
#' @param plan A string specifying the parallelization strategy for the future
#' package, such as `"sequential"` or `"multisession"`.
Expand All @@ -107,17 +112,18 @@ acc2Lineage <- function(accessions, assembly_path, lineagelookup_path, ipgout_pa

lins <- IPG2Lineage(accessions, ipgout_path, assembly_path, lineagelookup_path)

if (tmp_ipg) {
unlink(tempdir(), recursive = T)
}
return(lins)
if (tmp_ipg) {
unlink(tempdir(), recursive = T)
}
return(lins)
}

#' efetchIPG
#'
#' @author Samuel Chen, Janani Ravi
#'
#' @description Perform efetch on the ipg database and write the results to out_path
#' @description Perform efetch on the ipg database
#' and write the results to out_path
#'
#' @param accnums Character vector containing the accession numbers to query on
#' the ipg database
Expand All @@ -144,56 +150,64 @@ efetchIPG <- function(accnums, out_path, plan = "sequential", ...) {
# limit of 10/second w/ key
l <- length(in_data)

partitioned <- list()
for (i in 1:groups)
{
partitioned[[i]] <- in_data[seq.int(i, l, groups)]
}

return(partitioned)
}

plan(strategy = plan, .skip = T)


min_groups <- length(accnums) / 200
groups <- min(max(min_groups, 15), length(accnums))
partitioned_acc <- partition(accnums, groups)
sink(out_path)

a <- future_map(1:length(partitioned_acc), function(x) {
# Avoid hitting the rate API limit
if (x %% 9 == 0) {
Sys.sleep(1)
}
cat(
entrez_fetch(
id = partitioned_acc[[x]],
db = "ipg",
rettype = "xml",
api_key = "YOUR_KEY_HERE" ## Can this be included in public package?
)
)
})
sink(NULL)
partitioned <- list()
for (i in 1:groups){
partitioned[[i]] <- in_data[seq.int(i, l, groups)]
}

return(partitioned)
}

# Set the future plan strategy
plan(strategy = plan, .skip = T)


min_groups <- length(accnums) / 200
groups <- min(max(min_groups, 15), length(accnums))
partitioned_acc <- partition(accnums, groups)

# Open the sink to the output path
sink(out_path)

a <- future_map(1:length(partitioned_acc), function(x) {
# Avoid hitting the rate API limit
if (x %% 9 == 0) {
Sys.sleep(1)
}
cat(
entrez_fetch(
id = partitioned_acc[[x]],
db = "ipg",
rettype = "xml",
api_key = "YOUR_KEY_HERE" ## Can this be included in public package?
)
)
})
sink(NULL)

}
}



#' IPG2Lineage
#'
#' @author Samuel Chen, Janani Ravi
#'
#' @description Takes the resulting file of an efetch run on the ipg database and
#' @description Takes the resulting file
#' of an efetch run on the ipg database and
#' maps the requested protein accessions to their lineages
#'
#' @param accessions Character vector of protein accessions
#' @param ipg_file Filepath to the file containing results of an efetch run on the
#' ipg database. The protein accession in 'accessions' should be contained in this
#' @param ipg_file Filepath to the file
#' containing results of an efetch run on the
#' ipg database. The protein accession in
#' 'accessions' should be contained in this
the-mayer marked this conversation as resolved.
Show resolved Hide resolved
#' file
#' @param assembly_path String of the path to the assembly_summary file.
#' This file can be generated using the \link[MolEvolvR]{downloadAssemblySummary} function
#' @param lineagelookup_path String of the path to the lineage lookup file
#' (taxid to lineage mapping). This file can be generated using the
#' "create_lineage_lookup()" function
#' "createLineageLookup()" function
#'
#' @importFrom data.table fread
#'
Expand All @@ -209,8 +223,10 @@ efetchIPG <- function(accnums, out_path, plan = "sequential", ...) {
IPG2Lineage <- function(accessions, ipg_file, assembly_path, lineagelookup_path, ...) {
ipg_dt <- fread(ipg_file, sep = "\t", fill = T)

# Filter the IPG data table to only include the accessions
ipg_dt <- ipg_dt[Protein %in% accessions]

# Rename the 'Assembly' column to 'GCA_ID'
ipg_dt <- setnames(ipg_dt, "Assembly", "GCA_ID")

lins <- GCA2Lineage(prot_data = ipg_dt, assembly_path, lineagelookup_path)
Expand Down
Loading
Loading