diff --git a/atlas/sample_table.py b/atlas/sample_table.py
index aa3a899d..8b9f9558 100644
--- a/atlas/sample_table.py
+++ b/atlas/sample_table.py
@@ -42,15 +42,10 @@ def validate_sample_table(sampleTable):
         )
         exit(1)

-    ### Validate BinGroup
     if sampleTable.BinGroup.isnull().any():
-        logger.warning(
-            f"Found empty values in the sample table column 'BinGroup'"
-        )
-
-
+        logger.warning(f"Found empty values in the sample table column 'BinGroup'")

     if sampleTable.BinGroup.str.contains("_").any():
         logger.error(
@@ -65,69 +60,67 @@ def validate_sample_table(sampleTable):
         exit(1)


-
-
-
 def load_sample_table(sample_table="samples.tsv"):
     sampleTable = pd.read_csv(sample_table, index_col=0, sep="\t")
     validate_sample_table(sampleTable)
     return sampleTable


-def validate_bingroup_size_cobinning(sampleTable,logger):
+def validate_bingroup_size_cobinning(sampleTable, logger):
     """
     Validate that the bingroups are not too large, nor too small for co-binning.

     e.g. vamb and SemiBin
     """

-
-    bin_group_sizes = sampleTable.BinGroup.value_counts()
+    bin_group_sizes = sampleTable.BinGroup.value_counts()

     if bin_group_sizes.max() > 200:
         logger.warning(
             f"Found a bin group with more than 250 samples. This might lead to memory issues. \n {bin_group_sizes}"
         )

-    if bin_group_sizes.min() <=5:
+    if bin_group_sizes.min() <= 5:
         logger.error(
             "If you want to use co-binning, you should have at least 5-10 samples per bin group. \n"
         )
         exit(1)


-def validate_bingroup_size_metabat(sampleTable,logger):
+def validate_bingroup_size_metabat(sampleTable, logger):
     bin_group_sizes = sampleTable.BinGroup.value_counts()

     max_bin_group_size = bin_group_sizes.max()

-    warn_message= ( "Co-binning with metabat uses cross-mapping which scales quadratically."
-        f"You have a bingroup with {max_bin_group_size} samples, which already leads to {max_bin_group_size*max_bin_group_size} cross-mappings."
-
+    warn_message = (
+        "Co-binning with metabat uses cross-mapping which scales quadratically."
+        f"You have a bingroup with {max_bin_group_size} samples, which already leads to {max_bin_group_size*max_bin_group_size} cross-mappings."
     )

-
-
     if max_bin_group_size > 50:
-        logger.error( warn_message + "This is too much for metabat. Please use vamb, or SemiBin or split your samples into smaller groups." )
+        logger.error(
+            warn_message
+            + "This is too much for metabat. Please use vamb, or SemiBin or split your samples into smaller groups."
+        )
         exit(1)

     if max_bin_group_size > 10:
-        logger.warning( warn_message + "This might be too much for metabat. Consider using vamb, or SemiBin or split your samples into smaller groups.")
-
-    elif max_bin_group_size ==1:
-
-        logger.warning("You have only one sample per bingroup. This doesn't use the co-abundance information.")
-
-
+        logger.warning(
+            warn_message
+            + "This might be too much for metabat. Consider using vamb, or SemiBin or split your samples into smaller groups."
+        )
+    elif max_bin_group_size == 1:

-def validate_bingroup_size(sample, config,logger):
+        logger.warning(
+            "You have only one sample per bingroup. This doesn't use the co-abundance information."
+        )


+def validate_bingroup_size(sample, config, logger):

-    if config["final_binner"]=="DASTool":
+    if config["final_binner"] == "DASTool":

         binners = config["binner"]
@@ -135,19 +128,19 @@ def validate_bingroup_size(sample, config,logger):

         if ("vamb" in binners) or ("SemiBin" in binners):
-            validate_bingroup_size_cobinning(sample,logger)
+            validate_bingroup_size_cobinning(sample, logger)

         if "metabat" in binners:
-
-            validate_bingroup_size_metabat(sample,logger)
-    elif config["final_binner"]=="metabat":
+            validate_bingroup_size_metabat(sample, logger)

-        validate_bingroup_size_metabat(sample,logger)
+    elif config["final_binner"] == "metabat":

-    elif config["final_binner"] in ["vamb","SemiBin"]:
+        validate_bingroup_size_metabat(sample, logger)

-        validate_bingroup_size_cobinning(sample,logger)
+    elif config["final_binner"] in ["vamb", "SemiBin"]:
+
+        validate_bingroup_size_cobinning(sample, logger)

     elif config["final_binner"] == "maxbin":
@@ -155,13 +148,3 @@ def validate_bingroup_size(sample, config,logger):

     else:
         logger.error(f"Unknown final binner: {config['final_binner']}")
-
-
-
-
-
-
-
-
-
-
diff --git a/workflow/Snakefile b/workflow/Snakefile
index e626da30..3e73784c 100644
--- a/workflow/Snakefile
+++ b/workflow/Snakefile
@@ -33,12 +33,10 @@ min_version("6.1")
 container: "docker://continuumio/miniconda3:4.4.10"


-
-
-
 wildcard_constraints:
     binner="[A-Za-z]+",

+
 include: "rules/sample_table.smk"
 include: "rules/download.smk"  # contains hard coded variables
 include: "rules/qc.smk"
diff --git a/workflow/rules/assemble.smk b/workflow/rules/assemble.smk
index ce96e2fb..118e8e75 100644
--- a/workflow/rules/assemble.smk
+++ b/workflow/rules/assemble.smk
@@ -491,7 +491,6 @@ rule rename_contigs:
         " minscaf={params.minlength} &> {log} "


-
 if config["filter_contigs"]:

     ruleorder: align_reads_to_prefilter_contigs > align_reads_to_final_contigs
@@ -604,8 +603,6 @@ rule finalize_contigs:
         os.symlink(os.path.relpath(input[0], os.path.dirname(output[0])), output[0])


-
-
 rule calculate_contigs_stats:
     input:
         "{sample}/{sample}_contigs.fasta",
@@ -623,8 +620,6 @@ rule calculate_contigs_stats:
         "stats.sh in={input} format=3 out={output} &> {log}"


-
-
 # generalized rule so that reads from any "sample" can be aligned to contigs from "sample_contigs"
 rule align_reads_to_final_contigs:
     input:
diff --git a/workflow/rules/cobinning.smk b/workflow/rules/cobinning.smk
index c8fac0b5..6fd83ce4 100644
--- a/workflow/rules/cobinning.smk
+++ b/workflow/rules/cobinning.smk
@@ -33,22 +33,24 @@ def get_samples_of_bingroup(wildcards):
     return sampleTable.query(f'BinGroup=="{wildcards.bingroup}"').index.tolist()

+
 def get_filtered_contigs_of_bingroup(wildcards):
     samples_of_group = get_samples_of_bingroup(wildcards)

-
     if len(samples_of_group) <= 5:
-        raise ValueError(f"Bin group {wildcards.bingroup} has {len(samples_of_group)} less than 5 samples."
-        "For cobinning we reccomend at least 5 samples per bin group."
-        "Adapt the sample.tsv to set BinGroup of size [5- 1000]"
-        )
+        raise ValueError(
+            f"Bin group {wildcards.bingroup} has {len(samples_of_group)} less than 5 samples."
+            "For cobinning we reccomend at least 5 samples per bin group."
+            "Adapt the sample.tsv to set BinGroup of size [5- 1000]"
+        )

-
     return {
-        #Trigers rerun if contigs change
+        # Trigers rerun if contigs change
         "flag": expand("{sample}/{sample}_contigs.fasta", sample=samples_of_group),
-        "fasta": ancient(expand(rules.filter_contigs.output[0], sample=samples_of_group)),
+        "fasta": ancient(
+            expand(rules.filter_contigs.output[0], sample=samples_of_group)
+        ),
     }
@@ -56,13 +58,15 @@ def get_bams_of_bingroup(wildcards):

     samples_of_group = get_samples_of_bingroup(wildcards)

-    return expand("Intermediate/cobinning/{bingroup}/bams/{sample}.sorted.bam", sample= samples_of_group)
+    return expand(
+        "Intermediate/cobinning/{bingroup}/bams/{sample}.sorted.bam",
+        sample=samples_of_group,
+    )


 rule combine_contigs:
     input:
         unpack(get_filtered_contigs_of_bingroup),
-
     output:
         "Intermediate/cobinning/{bingroup}/combined_contigs.fasta.gz",
     log:
@@ -166,7 +170,7 @@ rule sort_bam:

 rule summarize_bam_contig_depths:
     input:
-        bams= get_bams_of_bingroup,
+        bams=get_bams_of_bingroup,
     output:
         "Intermediate/cobinning/{bingroup}/coverage.jgi.tsv",
     log:
@@ -200,7 +204,7 @@ rule convert_jgi2vamb_coverage:

 rule run_vamb:
     input:
-        coverage= "Intermediate/cobinning/{bingroup}/coverage.tsv",
+        coverage="Intermediate/cobinning/{bingroup}/coverage.tsv",
         fasta=rules.combine_contigs.output,
     output:
         directory("Intermediate/cobinning/vamb_{bingroup}"),
@@ -228,7 +232,6 @@ rule run_vamb:
         "2> {log}"


-
 vamb_cluster_attribution_path = "{sample}/binning/vamb/cluster_attribution.tsv"
@@ -238,16 +241,16 @@ localrules:

 rule parse_vamb_output:
     input:
-        expand(rules.run_vamb.output, bingroup = sampleTable.BinGroup.unique()),
+        expand(rules.run_vamb.output, bingroup=sampleTable.BinGroup.unique()),
     output:
         renamed_clusters="Intermediate/cobinning/vamb_clusters.tsv.gz",
-        cluster_atributions= expand(vamb_cluster_attribution_path, sample=SAMPLES),
+        cluster_atributions=expand(vamb_cluster_attribution_path, sample=SAMPLES),
     log:
         "logs/cobinning/vamb_parse_output.log",
     params:
         separator=config["cobinning_separator"],
         fasta_extension=".fna",
-        output_path=lambda wc: vamb_cluster_attribution_path, # path with {sample} to replace
+        output_path=lambda wc: vamb_cluster_attribution_path,  # path with {sample} to replace
         samples=SAMPLES,
     conda:
         "../envs/fasta.yaml"
diff --git a/workflow/rules/sample_table.smk b/workflow/rules/sample_table.smk
index 02ef47ae..c748a38e 100644
--- a/workflow/rules/sample_table.smk
+++ b/workflow/rules/sample_table.smk
@@ -1,9 +1,8 @@
-
-
-from atlas.sample_table import load_sample_table,validate_bingroup_size
+from atlas.sample_table import load_sample_table, validate_bingroup_size

 sampleTable = load_sample_table()

-validate_bingroup_size(sampleTable,config,logger)
+validate_bingroup_size(sampleTable, config, logger)
+

 def io_params_for_tadpole(io, key="in"):
     """This function generates the input flag needed for bbwrap/tadpole for all cases
@@ -183,4 +182,4 @@ def get_quality_controlled_reads(wildcards, include_se=False):


 def get_assembly(wildcards):
-    return "{sample}/assembly/{sample}_contigs.fasta".format(sample=wildcards.sample )
+    return "{sample}/assembly/{sample}_contigs.fasta".format(sample=wildcards.sample)
diff --git a/workflow/rules/semibin.smk b/workflow/rules/semibin.smk
index 01d6d0e6..dc398c28 100644
--- a/workflow/rules/semibin.smk
+++ b/workflow/rules/semibin.smk
@@ -4,7 +4,7 @@ rule semibin_generate_data_multi:
         fasta=rules.combine_contigs.output,
         bams=get_bams_of_bingroup,
     output:
-        directory("Intermediate/cobinning/{bingroup}/semibin/data_multi")
+        directory("Intermediate/cobinning/{bingroup}/semibin/data_multi"),
         # expand(
         #     "Cobinning/SemiBin/samples/{sample}/{files}",
         #     sample=SAMPLES,
@@ -35,10 +35,10 @@ rule semibin_generate_data_multi:

 rule semibin_train:
     input:
-        flag = "{sample}/{sample}_contigs.fasta",
-        fasta_sample = rules.filter_contigs.output,
-        bams= get_bams_of_bingroup,
-        data_folder= rules.semibin_generate_data_multi.output[0],
+        flag="{sample}/{sample}_contigs.fasta",
+        fasta_sample=rules.filter_contigs.output,
+        bams=get_bams_of_bingroup,
+        data_folder=rules.semibin_generate_data_multi.output[0],
     output:
         "Intermediate/cobinning/{bingroup}/semibin/models/{sample}/model.h5",
     conda:
@@ -53,8 +53,14 @@ rule semibin_train:
         "logs/benchmarks/semibin/{bingroup}/train/{sample}.tsv"
     params:
         output_dir=lambda wc, output: os.path.dirname(output[0]),
-        data = lambda wc, input: Path(input.data_folder)/"samples"/wc.sample/"data.csv",
-        data_split = lambda wc, input: Path(input.data_folder)/"samples"/wc.sample/"data_split.csv",
+        data=lambda wc, input: Path(input.data_folder)
+        / "samples"
+        / wc.sample
+        / "data.csv",
+        data_split=lambda wc, input: Path(input.data_folder)
+        / "samples"
+        / wc.sample
+        / "data_split.csv",
         extra=config["semibin_train_extra"],
     shell:
         "SemiBin train_self "
@@ -69,23 +75,31 @@ def semibin_input(wildcards):
     bingroup_of_sample = sampleTable.loc[wildcards.sample, "bingroup"]

-    samples_of_bingroup = sampleTable.query(f'BinGroup=="{bingroup_of_sample}"').index.tolist()
-
+    samples_of_bingroup = sampleTable.query(
+        f'BinGroup=="{bingroup_of_sample}"'
+    ).index.tolist()

     return dict(
-        flag= "{sample}/{sample}_contigs.fasta",
-        fasta = rules.filter_contigs.output,
-        bams =lambda wc: expand(rules.sort_bam.output, sample= samples_of_bingroup),
-        data_folder = rules.semibin_generate_data_multi.output[0].format(bingroup=bingroup_of_sample),
-        model = rules.semibin_train.output[0].format(bingroup=bingroup_of_sample, sample=wildcards.sample),
+        flag="{sample}/{sample}_contigs.fasta",
+        fasta=rules.filter_contigs.output,
+        bams=lambda wc: expand(rules.sort_bam.output, sample=samples_of_bingroup),
+        data_folder=rules.semibin_generate_data_multi.output[0].format(
+            bingroup=bingroup_of_sample
+        ),
+        model=rules.semibin_train.output[0].format(
+            bingroup=bingroup_of_sample, sample=wildcards.sample
+        ),
     )

+
 rule run_semibin:
     input:
         unpack(semibin_input),
     output:
         # contains no info to bingroup
-        directory("Intermediate/cobinning/semibin_output/{sample}/output_recluster_bins/"),
+        directory(
+            "Intermediate/cobinning/semibin_output/{sample}/output_recluster_bins/"
+        ),
     conda:
         "../envs/semibin.yaml"
     threads: config["threads"]
@@ -97,8 +111,11 @@ rule run_semibin:
     benchmark:
         "logs/benchmarks/semibin/bin/{sample}.tsv"
     params:
-        output_dir= lambda wc, output: os.path.dirname(output[0]),
-        data = lambda wc, input: Path(input.data_folder)/"samples"/wc.sample/"data.csv",
+        output_dir=lambda wc, output: os.path.dirname(output[0]),
+        data=lambda wc, input: Path(input.data_folder)
+        / "samples"
+        / wc.sample
+        / "data.csv",
         min_bin_kbs=int(config["cobining_min_bin_size"] / 1000),
         extra=config["semibin_options"],
     shell:
diff --git a/workflow/scripts/parse_vamb.py b/workflow/scripts/parse_vamb.py
index 7efa91a5..c6e36344 100644
--- a/workflow/scripts/parse_vamb.py
+++ b/workflow/scripts/parse_vamb.py
@@ -51,9 +51,11 @@ def handle_exception(exc_type, exc_value, exc_traceback):
     vamb_folder = Path(vamb_folder)

     # extract the bingroup name from the path
-    vamb_folder_name= vamb_folder.parts[-1]
-    assert vamb_folder_name.startswith("vamb_"), f"Folder {vamb_folder} does not start with vamb_"
-    bingroup = vamb_folder_name[len("vamb_"):]
+    vamb_folder_name = vamb_folder.parts[-1]
+    assert vamb_folder_name.startswith(
+        "vamb_"
+    ), f"Folder {vamb_folder} does not start with vamb_"
+    bingroup = vamb_folder_name[len("vamb_") :]

     logging.info(f"Parse vamb output for bingroup {bingroup}")
@@ -72,13 +74,11 @@ def handle_exception(exc_type, exc_value, exc_traceback):
         if extension == fasta_extension:
             big_bins.append(bin_name)

-
    logging.info(
        f"Found {len(big_bins)} bins created by Vamb (above size limit)\n"
        f"E.g. {big_bins[:5]}"
    )

-
    logging.info(f"Load vamb cluster file {vamb_cluster_file}")
    clusters_contigs = pd.read_table(vamb_cluster_file, header=None)
    clusters_contigs.columns = ["OriginalName", "Contig"]
@@ -88,15 +88,15 @@ def handle_exception(exc_type, exc_value, exc_traceback):
     clusters.columns = ["Sample", "Contig"]

     # get number of BinID given by vamb, prefix with bingroup
-    clusters["BinID"] = bingroup + clusters_contigs.OriginalName.str.rsplit(
-        separator, n=1, expand=True
-    )[1]
+    clusters["BinID"] = (
+        bingroup
+        + clusters_contigs.OriginalName.str.rsplit(separator, n=1, expand=True)[1]
+    )

     # Add information if the bin is large enough
     clusters["OriginalName"] = clusters_contigs.OriginalName
     clusters["Large_enough"] = clusters.OriginalName.isin(big_bins)

-
     # Add information about the bingroup
     clusters["BinGroup"] = bingroup
@@ -106,20 +106,19 @@ def handle_exception(exc_type, exc_value, exc_traceback):

 logging.info(f"Concatenate all clusters")

-clusters = pd.concat(all_clusters,axis=0)
-
-
+clusters = pd.concat(all_clusters, axis=0)

 logging.info(f"Write reformated table to {output_culsters}")
 clusters.to_csv(output_culsters, sep="\t", index=False)

-
-n_bins= clusters.query("Large_enough").groupby(["BinGroup", "Sample"])["BinId"].nunique()
-logging.info(f"Number of bins per sample and bingroup passing the size filter:\n{n_bins}")
-
-
+n_bins = (
+    clusters.query("Large_enough").groupby(["BinGroup", "Sample"])["BinId"].nunique()
+)
+logging.info(
+    f"Number of bins per sample and bingroup passing the size filter:\n{n_bins}"
+)

 clusters["SampleBin"] = clusters.Sample + "_vamb_" + clusters.BinID