Commit 44f1e86

formating

SilasK committed Jul 26, 2023
1 parent 25bebfb commit 44f1e86

Showing 7 changed files with 103 additions and 109 deletions.
75 changes: 29 additions & 46 deletions atlas/sample_table.py
@@ -42,15 +42,10 @@ def validate_sample_table(sampleTable):
         )
         exit(1)
 
-
     ### Validate BinGroup
 
     if sampleTable.BinGroup.isnull().any():
-        logger.warning(
-            f"Found empty values in the sample table column 'BinGroup'"
-        )
-
-
+        logger.warning(f"Found empty values in the sample table column 'BinGroup'")
 
     if sampleTable.BinGroup.str.contains("_").any():
         logger.error(
@@ -65,103 +60,91 @@ def validate_sample_table(sampleTable):
         exit(1)
 
 
-
-
-
 def load_sample_table(sample_table="samples.tsv"):
     sampleTable = pd.read_csv(sample_table, index_col=0, sep="\t")
     validate_sample_table(sampleTable)
     return sampleTable
 
 
-def validate_bingroup_size_cobinning(sampleTable,logger):
+def validate_bingroup_size_cobinning(sampleTable, logger):
     """
     Validate that the bingroups are not too large, nor too small for co-binning.
     e.g. vamb and SemiBin
     """
 
-    bin_group_sizes = sampleTable.BinGroup.value_counts()
-
+    bin_group_sizes = sampleTable.BinGroup.value_counts()
 
     if bin_group_sizes.max() > 200:
         logger.warning(
             f"Found a bin group with more than 200 samples. This might lead to memory issues. \n {bin_group_sizes}"
         )
 
-    if bin_group_sizes.min() <=5:
+    if bin_group_sizes.min() <= 5:
         logger.error(
             "If you want to use co-binning, you should have at least 5-10 samples per bin group. \n"
         )
         exit(1)
 
 
-def validate_bingroup_size_metabat(sampleTable,logger):
+def validate_bingroup_size_metabat(sampleTable, logger):
 
     bin_group_sizes = sampleTable.BinGroup.value_counts()
 
     max_bin_group_size = bin_group_sizes.max()
 
-    warn_message= ( "Co-binning with metabat uses cross-mapping which scales quadratically."
-        f"You have a bingroup with {max_bin_group_size} samples, which already leads to {max_bin_group_size*max_bin_group_size} cross-mappings."
+    warn_message = (
+        "Co-binning with metabat uses cross-mapping which scales quadratically. "
+        f"You have a bingroup with {max_bin_group_size} samples, which already leads to {max_bin_group_size*max_bin_group_size} cross-mappings. "
     )
 
     if max_bin_group_size > 50:
-        logger.error( warn_message + "This is too much for metabat. Please use vamb, or SemiBin or split your samples into smaller groups." )
+        logger.error(
+            warn_message
+            + "This is too much for metabat. Please use vamb or SemiBin, or split your samples into smaller groups."
+        )
         exit(1)
 
     if max_bin_group_size > 10:
-        logger.warning( warn_message + "This might be too much for metabat. Consider using vamb, or SemiBin or split your samples into smaller groups.")
-
-    elif max_bin_group_size ==1:
-
-        logger.warning("You have only one sample per bingroup. This doesn't use the co-abundance information.")
+        logger.warning(
+            warn_message
+            + "This might be too much for metabat. Consider using vamb or SemiBin, or split your samples into smaller groups."
+        )
 
+    elif max_bin_group_size == 1:
 
+        logger.warning(
+            "You have only one sample per bingroup. This doesn't use the co-abundance information."
+        )
 
-def validate_bingroup_size(sample, config,logger):
 
-    if config["final_binner"]=="DASTool":
+def validate_bingroup_size(sample, config, logger):
+
+    if config["final_binner"] == "DASTool":
 
         binners = config["binner"]
 
         logger.info(f"DASTool uses the following binners: {binners}")
 
         if ("vamb" in binners) or ("SemiBin" in binners):
 
-            validate_bingroup_size_cobinning(sample,logger)
+            validate_bingroup_size_cobinning(sample, logger)
 
         if "metabat" in binners:
 
-            validate_bingroup_size_metabat(sample,logger)
+            validate_bingroup_size_metabat(sample, logger)
 
-    elif config["final_binner"]=="metabat":
+    elif config["final_binner"] == "metabat":
 
-        validate_bingroup_size_metabat(sample,logger)
+        validate_bingroup_size_metabat(sample, logger)
 
-    elif config["final_binner"] in ["vamb","SemiBin"]:
+    elif config["final_binner"] in ["vamb", "SemiBin"]:
 
-        validate_bingroup_size_cobinning(sample,logger)
+        validate_bingroup_size_cobinning(sample, logger)
 
     elif config["final_binner"] == "maxbin":
 
         logger.warning("maxbin doesn't use co-abundance for binning.")
 
     else:
         logger.error(f"Unknown final binner: {config['final_binner']}")
-
-
-
-
-
-
-
-
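Aside: the whole dispatch in validate_bingroup_size hangs off config["final_binner"]. Below is a minimal sketch of how these validators are exercised; the sample table and config are invented for illustration, only the import and function names come from this file:

    import logging

    import pandas as pd

    from atlas.sample_table import validate_bingroup_size

    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger("atlas")

    # Invented sample table: six samples in a single BinGroup, enough for
    # co-binning (the co-binning check calls exit(1) for groups of <= 5).
    sampleTable = pd.DataFrame(
        {"BinGroup": ["G1"] * 6}, index=[f"sample{i}" for i in range(1, 7)]
    )

    # Invented config: DASTool combines vamb and metabat, so both the
    # co-binning and the metabat group-size checks run.
    config = {"final_binner": "DASTool", "binner": ["vamb", "metabat"]}

    validate_bingroup_size(sampleTable, config, logger)

With six samples, metabat's cross-mapping amounts to 6 * 6 = 36 alignment jobs, well under the warning threshold of 10 samples (100 cross-mappings) and the hard limit of 50 (2500).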
4 changes: 1 addition & 3 deletions workflow/Snakefile
@@ -33,12 +33,10 @@ min_version("6.1")
 container: "docker://continuumio/miniconda3:4.4.10"
 
 
-
-
-
 wildcard_constraints:
     binner="[A-Za-z]+",
 
+
 include: "rules/sample_table.smk"
 include: "rules/download.smk"  # contains hard-coded variables
 include: "rules/qc.smk"
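The binner wildcard constraint above is what keeps Snakemake from matching composite names. A small illustration of the regex it applies (plain Python, invented example values):

    import re

    # wildcard_constraints: binner="[A-Za-z]+" restricts the wildcard to
    # purely alphabetic binner names such as "metabat" or "DASTool".
    binner_pattern = re.compile(r"[A-Za-z]+")

    assert binner_pattern.fullmatch("DASTool") is not None
    assert binner_pattern.fullmatch("vamb_groupA") is None  # "_" not allowed

Presumably the underscore check in validate_sample_table exists for a similar reason: composite BinGroup names would make wildcard and separator parsing ambiguous.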
5 changes: 0 additions & 5 deletions workflow/rules/assemble.smk
@@ -491,7 +491,6 @@ rule rename_contigs:
         " minscaf={params.minlength} &> {log} "
 
 
-
 if config["filter_contigs"]:
 
     ruleorder: align_reads_to_prefilter_contigs > align_reads_to_final_contigs
@@ -604,8 +603,6 @@ rule finalize_contigs:
         os.symlink(os.path.relpath(input[0], os.path.dirname(output[0])), output[0])
 
 
-
-
 rule calculate_contigs_stats:
     input:
         "{sample}/{sample}_contigs.fasta",
@@ -623,8 +620,6 @@ rule calculate_contigs_stats:
         "stats.sh in={input} format=3 out={output} &> {log}"
 
 
-
-
 # generalized rule so that reads from any "sample" can be aligned to contigs from "sample_contigs"
 rule align_reads_to_final_contigs:
     input:
35 changes: 19 additions & 16 deletions workflow/rules/cobinning.smk
@@ -33,36 +33,40 @@ def get_samples_of_bingroup(wildcards):
 
     return sampleTable.query(f'BinGroup=="{wildcards.bingroup}"').index.tolist()
 
 
 def get_filtered_contigs_of_bingroup(wildcards):
 
     samples_of_group = get_samples_of_bingroup(wildcards)
 
-
     if len(samples_of_group) <= 5:
-        raise ValueError(f"Bin group {wildcards.bingroup} has {len(samples_of_group)} less than 5 samples."
-            "For cobinning we reccomend at least 5 samples per bin group."
-            "Adapt the sample.tsv to set BinGroup of size [5- 1000]"
-            )
+        raise ValueError(
+            f"Bin group {wildcards.bingroup} has only {len(samples_of_group)} samples, fewer than the 5 required. "
+            "For co-binning we recommend at least 5 samples per bin group. "
+            "Adapt the sample.tsv to set BinGroups of size [5-1000]."
+        )
 
 
     return {
-        #Trigers rerun if contigs change
+        # Triggers rerun if contigs change
         "flag": expand("{sample}/{sample}_contigs.fasta", sample=samples_of_group),
-        "fasta": ancient(expand(rules.filter_contigs.output[0], sample=samples_of_group)),
+        "fasta": ancient(
+            expand(rules.filter_contigs.output[0], sample=samples_of_group)
+        ),
     }
 
 
 def get_bams_of_bingroup(wildcards):
 
     samples_of_group = get_samples_of_bingroup(wildcards)
 
-    return expand("Intermediate/cobinning/{bingroup}/bams/{sample}.sorted.bam", sample= samples_of_group)
+    return expand(
+        "Intermediate/cobinning/{bingroup}/bams/{sample}.sorted.bam",
+        sample=samples_of_group,
+    )
 
 
 rule combine_contigs:
     input:
         unpack(get_filtered_contigs_of_bingroup),
-
     output:
         "Intermediate/cobinning/{bingroup}/combined_contigs.fasta.gz",
     log:
@@ -166,7 +170,7 @@ rule sort_bam:
 
 rule summarize_bam_contig_depths:
     input:
-        bams= get_bams_of_bingroup,
+        bams=get_bams_of_bingroup,
     output:
         "Intermediate/cobinning/{bingroup}/coverage.jgi.tsv",
     log:
@@ -200,7 +204,7 @@ rule convert_jgi2vamb_coverage:
 
 rule run_vamb:
     input:
-        coverage= "Intermediate/cobinning/{bingroup}/coverage.tsv",
+        coverage="Intermediate/cobinning/{bingroup}/coverage.tsv",
         fasta=rules.combine_contigs.output,
     output:
         directory("Intermediate/cobinning/vamb_{bingroup}"),
@@ -228,7 +232,6 @@ rule run_vamb:
         "2> {log}"
 
 
-
 vamb_cluster_attribution_path = "{sample}/binning/vamb/cluster_attribution.tsv"
 
 
@@ -238,16 +241,16 @@ localrules:
 
 rule parse_vamb_output:
     input:
-        expand(rules.run_vamb.output, bingroup = sampleTable.BinGroup.unique()),
+        expand(rules.run_vamb.output, bingroup=sampleTable.BinGroup.unique()),
     output:
         renamed_clusters="Intermediate/cobinning/vamb_clusters.tsv.gz",
-        cluster_atributions= expand(vamb_cluster_attribution_path, sample=SAMPLES),
+        cluster_atributions=expand(vamb_cluster_attribution_path, sample=SAMPLES),
     log:
         "logs/cobinning/vamb_parse_output.log",
     params:
         separator=config["cobinning_separator"],
         fasta_extension=".fna",
-        output_path=lambda wc: vamb_cluster_attribution_path, # path with {sample} to replace
+        output_path=lambda wc: vamb_cluster_attribution_path,  # path with {sample} to replace
         samples=SAMPLES,
     conda:
         "../envs/fasta.yaml"
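A note on the pattern behind combine_contigs: a Snakemake input function that returns a dict must be wrapped in unpack() so that its keys become named inputs, and ancient() marks an input whose modification time is ignored. The plain "flag" entry therefore still triggers a rerun when the contigs are regenerated, while timestamp-only changes on the filtered contigs do not. A minimal self-contained sketch, with invented file names:

    rule example:
        input:
            # unpack() turns the returned dict into named inputs;
            # ancient() suppresses timestamp-based reruns for "b.fasta".
            unpack(lambda wildcards: {"flag": "a.txt", "fasta": ancient("b.fasta")}),
        output:
            "combined.txt",
        shell:
            "cat {input.flag} {input.fasta} > {output}"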
9 changes: 4 additions & 5 deletions workflow/rules/sample_table.smk
@@ -1,9 +1,8 @@
-
-
-from atlas.sample_table import load_sample_table,validate_bingroup_size
+from atlas.sample_table import load_sample_table, validate_bingroup_size
 
 sampleTable = load_sample_table()
-validate_bingroup_size(sampleTable,config,logger)
+validate_bingroup_size(sampleTable, config, logger)
 
 
 def io_params_for_tadpole(io, key="in"):
     """This function generates the input flag needed for bbwrap/tadpole for all cases
@@ -183,4 +182,4 @@ def get_quality_controlled_reads(wildcards, include_se=False):
 
 
 def get_assembly(wildcards):
-    return "{sample}/assembly/{sample}_contigs.fasta".format(sample=wildcards.sample )
+    return "{sample}/assembly/{sample}_contigs.fasta".format(sample=wildcards.sample)
51 changes: 34 additions & 17 deletions workflow/rules/semibin.smk
@@ -4,7 +4,7 @@ rule semibin_generate_data_multi:
         fasta=rules.combine_contigs.output,
         bams=get_bams_of_bingroup,
     output:
-        directory("Intermediate/cobinning/{bingroup}/semibin/data_multi")
+        directory("Intermediate/cobinning/{bingroup}/semibin/data_multi"),
         # expand(
         #     "Cobinning/SemiBin/samples/{sample}/{files}",
        #     sample=SAMPLES,
@@ -35,10 +35,10 @@ rule semibin_generate_data_multi:
 
 rule semibin_train:
     input:
-        flag = "{sample}/{sample}_contigs.fasta",
-        fasta_sample = rules.filter_contigs.output,
-        bams= get_bams_of_bingroup,
-        data_folder= rules.semibin_generate_data_multi.output[0],
+        flag="{sample}/{sample}_contigs.fasta",
+        fasta_sample=rules.filter_contigs.output,
+        bams=get_bams_of_bingroup,
+        data_folder=rules.semibin_generate_data_multi.output[0],
     output:
         "Intermediate/cobinning/{bingroup}/semibin/models/{sample}/model.h5",
     conda:
@@ -53,8 +53,14 @@ rule semibin_train:
         "logs/benchmarks/semibin/{bingroup}/train/{sample}.tsv"
     params:
         output_dir=lambda wc, output: os.path.dirname(output[0]),
-        data = lambda wc, input: Path(input.data_folder)/"samples"/wc.sample/"data.csv",
-        data_split = lambda wc, input: Path(input.data_folder)/"samples"/wc.sample/"data_split.csv",
+        data=lambda wc, input: Path(input.data_folder)
+        / "samples"
+        / wc.sample
+        / "data.csv",
+        data_split=lambda wc, input: Path(input.data_folder)
+        / "samples"
+        / wc.sample
+        / "data_split.csv",
         extra=config["semibin_train_extra"],
     shell:
         "SemiBin train_self "
@@ -69,23 +75,31 @@ def semibin_input(wildcards):
 def semibin_input(wildcards):
 
     bingroup_of_sample = sampleTable.loc[wildcards.sample, "BinGroup"]
-    samples_of_bingroup = sampleTable.query(f'BinGroup=="{bingroup_of_sample}"').index.tolist()
-
+
+    samples_of_bingroup = sampleTable.query(
+        f'BinGroup=="{bingroup_of_sample}"'
+    ).index.tolist()
 
     return dict(
-        flag= "{sample}/{sample}_contigs.fasta",
-        fasta = rules.filter_contigs.output,
-        bams =lambda wc: expand(rules.sort_bam.output, sample= samples_of_bingroup),
-        data_folder = rules.semibin_generate_data_multi.output[0].format(bingroup=bingroup_of_sample),
-        model = rules.semibin_train.output[0].format(bingroup=bingroup_of_sample, sample=wildcards.sample),
+        flag="{sample}/{sample}_contigs.fasta",
+        fasta=rules.filter_contigs.output,
+        bams=lambda wc: expand(rules.sort_bam.output, sample=samples_of_bingroup),
+        data_folder=rules.semibin_generate_data_multi.output[0].format(
+            bingroup=bingroup_of_sample
+        ),
+        model=rules.semibin_train.output[0].format(
+            bingroup=bingroup_of_sample, sample=wildcards.sample
+        ),
     )
 
 
 rule run_semibin:
     input:
         unpack(semibin_input),
     output:
         # contains no info about the bingroup
-        directory("Intermediate/cobinning/semibin_output/{sample}/output_recluster_bins/"),
+        directory(
+            "Intermediate/cobinning/semibin_output/{sample}/output_recluster_bins/"
+        ),
     conda:
         "../envs/semibin.yaml"
     threads: config["threads"]
@@ -97,8 +111,11 @@ rule run_semibin:
     benchmark:
         "logs/benchmarks/semibin/bin/{sample}.tsv"
     params:
-        output_dir= lambda wc, output: os.path.dirname(output[0]),
-        data = lambda wc, input: Path(input.data_folder)/"samples"/wc.sample/"data.csv",
+        output_dir=lambda wc, output: os.path.dirname(output[0]),
+        data=lambda wc, input: Path(input.data_folder)
+        / "samples"
+        / wc.sample
+        / "data.csv",
         min_bin_kbs=int(config["cobining_min_bin_size"] / 1000),
         extra=config["semibin_options"],
     shell:
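The multi-line params above lean on pathlib's overloaded division operator; black merely breaks the chain before each "/". A quick equivalence check, with invented folder and sample names:

    from pathlib import Path

    data_folder = Path("Intermediate/cobinning/G1/semibin/data_multi")

    # The one-line and the black-wrapped forms build the same path object.
    one_line = data_folder / "samples" / "sample1" / "data.csv"
    wrapped = (
        data_folder
        / "samples"
        / "sample1"
        / "data.csv"
    )
    assert one_line == wrapped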