Skip to content

Commit

Permalink
Merge commit '8e9901b0db9c870469a9ae58f42becc5a0249b93'
Browse files Browse the repository at this point in the history
  • Loading branch information
the-mayer committed Oct 22, 2024
2 parents b665a4d + 8e9901b commit 1c6aaa7
Show file tree
Hide file tree
Showing 29 changed files with 480 additions and 403 deletions.
31 changes: 16 additions & 15 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -9,18 +9,19 @@ export(addLeaves2Alignment)
export(addLineage)
export(addName)
export(addTaxID)
export(advanced_opts2est_walltime)
export(alignFasta)
export(assign_job_queue)
export(assignJobQueue)
export(calculateEstimatedWallTimeFromOpts)
export(calculateProcessRuntime)
export(cleanClusters)
export(cleanDomainArchitecture)
export(cleanGeneDescription)
export(cleanGenomicContext)
export(cleanLineage)
export(cleanSpecies)
export(combine_files)
export(combine_full)
export(combine_ipr)
export(combineFiles)
export(combineFullAnalysis)
export(combineIPR)
export(condenseRepeatedDomains)
export(convert2TitleCase)
export(convertAlignment2FA)
Expand All @@ -35,11 +36,11 @@ export(createJobResultsURL)
export(createJobStatusEmailMessage)
export(createMSA_Kalign)
export(createMSA_PDF)
export(createLineageLookup)
export(createRepresentativeAccNum)
export(createUndirectedGenomicContextNetwork)
export(createWordCloud2Element)
export(createWordCloudElement)
export(create_lineage_lookup)
export(downloadAssemblySummary)
export(efetchIPG)
export(extractAccNum)
Expand All @@ -49,12 +50,13 @@ export(findParalogs)
export(formatJobArgumentsHTML)
export(generateAllAlignments2FA)
export(getAccNumFromFA)
export(getProcessRuntimeWeights)
export(getTopAccByLinDomArch)
export(get_proc_medians)
export(get_proc_weights)
export(make_opts2procs)
export(mapAcc2Name)
export(map_advanced_opts2procs)
export(mapAdvOption2Process)
export(mapOption2Process)
export(msa_pdf)
export(plotEstimatedWallTimes)
export(plotIPR2Viz)
export(plotIPR2VizWeb)
export(plotLineageDA)
Expand All @@ -67,7 +69,6 @@ export(plotStackedLineage)
export(plotSunburst)
export(plotTreemap)
export(plotUpSet)
export(plot_estimated_walltimes)
export(prepareColumnParams)
export(prepareSingleColumnParams)
export(proteinAcc2TaxID)
Expand All @@ -79,8 +80,8 @@ export(renameFA)
export(rename_fasta)
export(replaceQuestionMarks)
export(reverseOperonSeq)
export(run_deltablast)
export(run_rpsblast)
export(runDeltaBlast)
export(runRPSBlast)
export(selectLongestDuplicate)
export(sendJobStatusEmail)
export(shortenLineage)
Expand All @@ -97,8 +98,8 @@ export(totalGenContextOrDomArchCounts)
export(validateCountDF)
export(wordcloud3)
export(writeMSA_AA2FA)
export(write_proc_medians_table)
export(write_proc_medians_yml)
export(writeProcessRuntime2TSV)
export(writeProcessRuntime2YML)
importFrom(Biostrings,AAStringSet)
importFrom(Biostrings,readAAStringSet)
importFrom(Biostrings,toString)
Expand Down
106 changes: 58 additions & 48 deletions R/acc2lin.R
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
#' Sink Reset
#'
#' @return No return, but run to close all outstanding `sink()`s
#'
#' @export
#'
#' @examples
Expand All @@ -18,7 +19,7 @@
#' }
sinkReset <- function() {
  # sink.number() reports how many output diversions are currently stacked.
  # seq_len() yields an empty sequence when there are none, so the loop body
  # is skipped instead of calling sink(NULL) on an empty stack.
  for (i in seq_len(sink.number())) {
    # Pop exactly one diversion per iteration; popping more than once per
    # pass would exhaust the stack early and raise "no sink to remove"
    # warnings on the remaining iterations.
    sink(NULL)
  }
}

Expand Down Expand Up @@ -56,18 +57,20 @@ addLineage <- function(df, acc_col = "AccNum", assembly_path,
accessions <- df %>% pull(acc_col)
lins <- acc2Lineage(accessions, assembly_path, lineagelookup_path, ipgout_path, plan)

# Drop a lot of the unimportant columns for now? will make merging much easier
lins <- lins[, c(
# Drop a lot of the unimportant columns for now?
# will make merging much easier
lins <- lins[, c(
"Strand", "Start", "Stop", "Nucleotide Accession", "Source",
"Id", "Strain"
) := NULL]
lins <- unique(lins)
) := NULL]
lins <- unique(lins)

# dup <- lins %>% group_by(Protein) %>% summarize(count = n()) %>% filter(count > 1) %>%
# pull(Protein)
# dup <- lins %>% group_by(Protein) %>%
# summarize(count = n()) %>% filter(count > 1) %>%
# pull(Protein)

merged <- merge(df, lins, by.x = acc_col, by.y = "Protein", all.x = TRUE)
return(merged)
merged <- merge(df, lins, by.x = acc_col, by.y = "Protein", all.x = TRUE)
return(merged)
}


Expand Down Expand Up @@ -107,18 +110,17 @@ acc2Lineage <- function(accessions, assembly_path, lineagelookup_path, ipgout_pa

lins <- IPG2Lineage(accessions, ipgout_path, assembly_path, lineagelookup_path)

if (tmp_ipg) {
unlink(tempdir(), recursive = T)
}
return(lins)
if (tmp_ipg) {
unlink(tempdir(), recursive = T)
}
return(lins)
}

#' efetchIPG
#'
#' @author Samuel Chen, Janani Ravi
#'
#' @description Perform efetch on the ipg database and write the results to out_path
#'
#' @param accnums Character vector containing the accession numbers to query on
#' the ipg database
#' @param out_path Path to write the efetch results to
Expand All @@ -144,46 +146,52 @@ efetchIPG <- function(accnums, out_path, plan = "sequential", ...) {
# limit of 10/second w/ key
l <- length(in_data)

partitioned <- list()
for (i in 1:groups)
{
partitioned[[i]] <- in_data[seq.int(i, l, groups)]
}

return(partitioned)
}

plan(strategy = plan, .skip = T)


min_groups <- length(accnums) / 200
groups <- min(max(min_groups, 15), length(accnums))
partitioned_acc <- partition(accnums, groups)
sink(out_path)

a <- future_map(1:length(partitioned_acc), function(x) {
# Avoid hitting the rate API limit
if (x %% 9 == 0) {
Sys.sleep(1)
}
cat(
entrez_fetch(
id = partitioned_acc[[x]],
db = "ipg",
rettype = "xml",
api_key = "YOUR_KEY_HERE" ## Can this be included in public package?
)
)
})
sink(NULL)
partitioned <- list()
for (i in 1:groups){
partitioned[[i]] <- in_data[seq.int(i, l, groups)]
}

return(partitioned)
}

# Set the future plan strategy
plan(strategy = plan, .skip = T)


min_groups <- length(accnums) / 200
groups <- min(max(min_groups, 15), length(accnums))
partitioned_acc <- partition(accnums, groups)

# Open the sink to the output path
sink(out_path)

a <- future_map(1:length(partitioned_acc), function(x) {
# Avoid hitting the rate API limit
if (x %% 9 == 0) {
Sys.sleep(1)
}
cat(
entrez_fetch(
id = partitioned_acc[[x]],
db = "ipg",
rettype = "xml",
api_key = "YOUR_KEY_HERE" ## Can this be included in public package?
)
)
})
sink(NULL)

}
}



#' IPG2Lineage
#'
#' @author Samuel Chen, Janani Ravi
#'
#' @description Takes the resulting file
#' of an efetch run on the ipg database and
#' maps the requested protein accessions to their lineages.
#'
#' @param accessions Character vector of protein accessions
#' @param ipg_file Filepath to the file containing results of an efetch run on the
Expand All @@ -193,7 +201,7 @@ efetchIPG <- function(accnums, out_path, plan = "sequential", ...) {
#' This file can be generated using the \link[MolEvolvR]{downloadAssemblySummary} function
#' @param lineagelookup_path String of the path to the lineage lookup file
#' (taxid to lineage mapping). This file can be generated using the
#' "create_lineage_lookup()" function
#' "createLineageLookup()" function
#'
#' @importFrom data.table fread
#'
Expand All @@ -209,8 +217,10 @@ efetchIPG <- function(accnums, out_path, plan = "sequential", ...) {
IPG2Lineage <- function(accessions, ipg_file, assembly_path, lineagelookup_path, ...) {
ipg_dt <- fread(ipg_file, sep = "\t", fill = T)

# Filter the IPG data table to only include the accessions
ipg_dt <- ipg_dt[Protein %in% accessions]

# Rename the 'Assembly' column to 'GCA_ID'
ipg_dt <- setnames(ipg_dt, "Assembly", "GCA_ID")

lins <- GCA2Lineage(prot_data = ipg_dt, assembly_path, lineagelookup_path)
Expand Down
Loading

0 comments on commit 1c6aaa7

Please sign in to comment.