From 9872a2b7e2bc8fa8036f037ebc2e19720af06f22 Mon Sep 17 00:00:00 2001 From: LaraFuhrmann <55209716+LaraFuhrmann@users.noreply.github.com> Date: Mon, 30 Oct 2023 15:47:37 +0100 Subject: [PATCH] more methods + updated populations --- .../config_distance/config.yaml | 2 + .../config_distance/params.csv | 57 ++++--- .../viloca_alpha_0.000001_K300.py | 144 ++++++++++++++++++ 3 files changed, 185 insertions(+), 18 deletions(-) create mode 100644 resources/auxiliary_workflows/benchmark/resources/method_definitions/viloca_alpha_0.000001_K300.py diff --git a/resources/auxiliary_workflows/benchmark/resources/local_haplotype_setup/config_distance/config.yaml b/resources/auxiliary_workflows/benchmark/resources/local_haplotype_setup/config_distance/config.yaml index e2f5888b..15fb1782 100644 --- a/resources/auxiliary_workflows/benchmark/resources/local_haplotype_setup/config_distance/config.yaml +++ b/resources/auxiliary_workflows/benchmark/resources/local_haplotype_setup/config_distance/config.yaml @@ -5,6 +5,8 @@ method_list: [lofreq_local_haplo, cliquesnv_local_haplo_tf0.01, cliquesnv_local_haplo_snv_tf0.01, viloca_alpha_0.000001, + viloca_alpha_0.000001_K300, + viloca_alpha_0.00001, viloca_alpha_0.00001_K100, # shorah_default_amplicon, --> here the short illumina simulation still not work due to b2w errors ] diff --git a/resources/auxiliary_workflows/benchmark/resources/local_haplotype_setup/config_distance/params.csv b/resources/auxiliary_workflows/benchmark/resources/local_haplotype_setup/config_distance/params.csv index 6c61265d..de190433 100644 --- a/resources/auxiliary_workflows/benchmark/resources/local_haplotype_setup/config_distance/params.csv +++ b/resources/auxiliary_workflows/benchmark/resources/local_haplotype_setup/config_distance/params.csv @@ -1,9 +1,10 @@ seq_tech,seq_mode,seq_mode_param,read_length,genome_size,coverage,haplos # Population 1 illumina,shotgun,single_amplicon,249,249,1000,2@3@30@20@20@geom@0.75 +illumina,shotgun,single_amplicon,201,201,1000,2@3@30@20@20@geom@0.75 # Population 2 illumina,shotgun,single_amplicon,249,249,1000,5@5@20@4@12@geom@0.75 -illumina,shotgun,single_amplicon,201,201,1000,5@5@20@4@12@geom@0.75 +illumina,shotgun,single_amplicon,201,201,1000,5@5@20@4@15@geom@0.75 # Population 3 (vary covverage) illumina,shotgun,single_amplicon,249,249,100,5@5@30@10@15@geom@0.75 illumina,shotgun,single_amplicon,249,249,500,5@5@30@10@15@geom@0.75 @@ -11,22 +12,33 @@ illumina,shotgun,single_amplicon,249,249,800,5@5@30@10@15@geom@0.75 illumina,shotgun,single_amplicon,249,249,1000,5@5@30@10@15@geom@0.75 illumina,shotgun,single_amplicon,249,249,5000,5@5@30@10@15@geom@0.75 illumina,shotgun,single_amplicon,249,249,10000,5@5@30@10@15@geom@0.75 +illumina,shotgun,single_amplicon,201,201,100,5@5@30@10@15@geom@0.75 +illumina,shotgun,single_amplicon,201,201,500,5@5@30@10@15@geom@0.75 +illumina,shotgun,single_amplicon,201,201,800,5@5@30@10@15@geom@0.75 +illumina,shotgun,single_amplicon,201,201,1000,5@5@30@10@15@geom@0.75 +illumina,shotgun,single_amplicon,201,201,5000,5@5@30@10@15@geom@0.75 +illumina,shotgun,single_amplicon,201,201,10000,5@5@30@10@15@geom@0.75 # Population 4 illumina,shotgun,single_amplicon,201,201,1000,5@10@30@10@10@geom@0.75 illumina,shotgun,single_amplicon,249,249,1000,5@10@30@10@10@geom@0.75 +illumina,shotgun,single_amplicon,201,201,1000,5@10@30@10@15@geom@0.75 illumina,shotgun,single_amplicon,249,249,1000,5@10@30@10@15@geom@0.75 # Population 5 illumina,shotgun,single_amplicon,249,249,1000,5@15@30@10@15@geom@0.75 +illumina,shotgun,single_amplicon,201,201,1000,5@15@30@10@15@geom@0.75 # Population 6 -#nanopore,shotgun,single_amplicon,5000,5000,800,2@3@600@400@400@geom@0.75 -#nanopore,shotgun,single_amplicon,5000,5000,5000,2@3@600@400@400@geom@0.75 +nanopore,shotgun,single_amplicon,5000,5000,800,2@3@600@400@400@geom@0.75 +nanopore,shotgun,single_amplicon,5000,5000,2000,2@3@600@400@400@geom@0.75 nanopore,shotgun,single_amplicon,5000,5000,1000,2@3@600@400@400@geom@0.75 -pacbio,shotgun,single_amplicon,5000,5000,1000,2@3@600@400@400@geom@0.75 +#pacbio,shotgun,single_amplicon,5000,5000,1000,2@3@600@400@400@geom@0.75 # Population 7 -#nanopore,shotgun,single_amplicon,5000,5000,800,5@5@400@80@240@geom@0.75 -#nanopore,shotgun,single_amplicon,5000,5000,5000,5@5@400@80@240@geom@0.75 +nanopore,shotgun,single_amplicon,5000,5000,800,5@5@400@80@240@geom@0.75 +nanopore,shotgun,single_amplicon,5000,5000,2000,5@5@400@80@240@geom@0.75 nanopore,shotgun,single_amplicon,5000,5000,1000,5@5@400@80@240@geom@0.75 -pacbio,shotgun,single_amplicon,5000,5000,1000,5@5@400@80@240@geom@0.75 +nanopore,shotgun,single_amplicon,5000,5000,800,5@5@400@80@200@geom@0.75 +#nanopore,shotgun,single_amplicon,5000,5000,2000,5@5@400@80@200@geom@0.75 +nanopore,shotgun,single_amplicon,5000,5000,1000,5@5@400@80@200@geom@0.75 +#pacbio,shotgun,single_amplicon,5000,5000,1000,5@5@400@80@240@geom@0.75 # Population 8 (vary coverage) nanopore,shotgun,single_amplicon,5000,5000,100,5@5@600@200@300@geom@0.75 nanopore,shotgun,single_amplicon,5000,5000,200,5@5@600@200@300@geom@0.75 @@ -34,19 +46,28 @@ nanopore,shotgun,single_amplicon,5000,5000,500,5@5@600@200@300@geom@0.75 nanopore,shotgun,single_amplicon,5000,5000,800,5@5@600@200@300@geom@0.75 nanopore,shotgun,single_amplicon,5000,5000,1000,5@5@600@200@300@geom@0.75 nanopore,shotgun,single_amplicon,5000,5000,5000,5@5@600@200@300@geom@0.75 -pacbio,shotgun,single_amplicon,5000,5000,100,5@5@600@200@300@geom@0.75 -pacbio,shotgun,single_amplicon,5000,5000,200,5@5@600@200@300@geom@0.75 -pacbio,shotgun,single_amplicon,5000,5000,500,5@5@600@200@300@geom@0.75 -pacbio,shotgun,single_amplicon,5000,5000,800,5@5@600@200@300@geom@0.75 -pacbio,shotgun,single_amplicon,5000,5000,1000,5@5@600@200@300@geom@0.75 -pacbio,shotgun,single_amplicon,5000,5000,5000,5@5@600@200@300@geom@0.75 +nanopore,shotgun,single_amplicon,5000,5000,100,5@5@600@200@240@geom@0.75 +nanopore,shotgun,single_amplicon,5000,5000,200,5@5@600@200@240@geom@0.75 +nanopore,shotgun,single_amplicon,5000,5000,500,5@5@600@200@240@geom@0.75 +nanopore,shotgun,single_amplicon,5000,5000,800,5@5@600@200@240@geom@0.75 +nanopore,shotgun,single_amplicon,5000,5000,1000,5@5@600@200@240@geom@0.75 +nanopore,shotgun,single_amplicon,5000,5000,5000,5@5@600@200@240@geom@0.75 +#pacbio,shotgun,single_amplicon,5000,5000,100,5@5@600@200@300@geom@0.75 +#pacbio,shotgun,single_amplicon,5000,5000,200,5@5@600@200@300@geom@0.75 +#pacbio,shotgun,single_amplicon,5000,5000,500,5@5@600@200@300@geom@0.75 +#pacbio,shotgun,single_amplicon,5000,5000,800,5@5@600@200@300@geom@0.75 +#pacbio,shotgun,single_amplicon,5000,5000,1000,5@5@600@200@300@geom@0.75 +#pacbio,shotgun,single_amplicon,5000,5000,5000,5@5@600@200@300@geom@0.75 # Population 9 -#nanopore,shotgun,single_amplicon,5000,5000,800,5@10@600@200@300@geom@0.75 -#nanopore,shotgun,single_amplicon,5000,5000,5000,5@10@600@200@300@geom@0.75 +nanopore,shotgun,single_amplicon,5000,5000,800,5@10@600@200@300@geom@0.75 +nanopore,shotgun,single_amplicon,5000,5000,2000,5@10@600@200@300@geom@0.75 nanopore,shotgun,single_amplicon,5000,5000,1000,5@10@600@200@300@geom@0.75 -pacbio,shotgun,single_amplicon,5000,5000,1000,5@10@600@200@300@geom@0.75 +nanopore,shotgun,single_amplicon,5000,5000,800,5@10@600@200@240@geom@0.75 +nanopore,shotgun,single_amplicon,5000,5000,2000,5@10@600@200@240@geom@0.75 +nanopore,shotgun,single_amplicon,5000,5000,1000,5@10@600@200@240@geom@0.75 +#pacbio,shotgun,single_amplicon,5000,5000,1000,5@10@600@200@300@geom@0.75 # Population 10 #nanopore,shotgun,single_amplicon,5000,5000,800,15@5@600@300@200@geom@0.75 --> coverage_haplotype 0 #nanopore,shotgun,single_amplicon,5000,5000,5000,15@5@600@300@200@geom@0.75 -nanopore,shotgun,single_amplicon,5000,5000,1000,15@5@600@300@200@geom@0.75 -pacbio,shotgun,single_amplicon,5000,5000,1000,15@5@600@300@200@geom@0.75 +#nanopore,shotgun,single_amplicon,5000,5000,1000,15@5@600@300@200@geom@0.75 +#pacbio,shotgun,single_amplicon,5000,5000,1000,15@5@600@300@200@geom@0.75 diff --git a/resources/auxiliary_workflows/benchmark/resources/method_definitions/viloca_alpha_0.000001_K300.py b/resources/auxiliary_workflows/benchmark/resources/method_definitions/viloca_alpha_0.000001_K300.py new file mode 100644 index 00000000..9773a9a1 --- /dev/null +++ b/resources/auxiliary_workflows/benchmark/resources/method_definitions/viloca_alpha_0.000001_K300.py @@ -0,0 +1,144 @@ +# GROUP: global +# CONDA: libshorah +# CONDA: biopython = 1.79 +# PIP: git+https://github.com/LaraFuhrmann/shorah@master + +import subprocess +from pathlib import Path +from os import listdir +from os.path import isfile, join +from Bio import SeqIO +import gzip + + +def gunzip(source_filepath, dest_filepath, block_size=65536): + with gzip.open(source_filepath, "rb") as s_file, open( + dest_filepath, "wb" + ) as d_file: + while True: + block = s_file.read(block_size) + if not block: + break + else: + d_file.write(block) + + +def main( + fname_bam, + fname_reference, + fname_insert_bed, + fname_results_snv, + fname_result_haplos, + dname_work, + threads=1, +): + genome_size = str(fname_bam).split("genome_size~")[1].split("__coverage")[0] + alpha = 0.000001 + n_max_haplotypes = 300 + n_mfa_starts = 1 + win_min_ext = 0.85 + + read_length = str(fname_bam).split("read_length~")[1].split("__")[0] + if read_length == "Ten_strain_IAV": + sampler = "learn_error_params" + win_min_ext = 0.7 + else: + sampler = "use_quality_scores" + + dname_work.mkdir(parents=True, exist_ok=True) + if fname_insert_bed == []: + subprocess.run( + [ + "shorah", + "shotgun", + "-b", + fname_bam.resolve(), + "-f", + Path(fname_reference).resolve(), + "--mode", + str(sampler), + "--alpha", + str(alpha), + "--n_max_haplotypes", + str(n_max_haplotypes), + "--n_mfa_starts", + str(n_mfa_starts), + "--win_min_ext", + str(win_min_ext), + "--threads", + str(threads), + ], + cwd=dname_work, + ) + else: + # insert bed file is there + subprocess.run( + [ + "shorah", + "shotgun", + "-b", + fname_bam.resolve(), + "-f", + Path(fname_reference).resolve(), + "-z", + Path(fname_insert_bed).resolve(), + "--mode", + str(sampler), + "--alpha", + str(alpha), + "--n_max_haplotypes", + str(n_max_haplotypes), + "--n_mfa_starts", + str(n_mfa_starts), + "--win_min_ext", + str(win_min_ext), + "--threads", + str(threads), + ], + cwd=dname_work, + ) + + (dname_work / "snv" / "SNVs_0.010000_final.vcf").rename(fname_results_snv) + + mypath = (dname_work / "support").resolve() + onlyfiles = [f for f in listdir(mypath) if isfile(join(mypath, f))] + print("onlyfiles", onlyfiles) + for file in onlyfiles: + if "reads-support.fas" in file: + file_name = onlyfiles[0] + fname_haplos = (dname_work / "support" / onlyfiles[0]).resolve() + if file.endswith(".gz"): + fname_zipped = (dname_work / "support" / onlyfiles[0]).resolve() + fname_haplos = onlyfiles[0].split(".gz")[0] + fname_unzipped = (dname_work / "support" / fname_haplos).resolve() + # unzip + gunzip(fname_zipped, fname_result_haplos) + + elif file.endswith(".fas"): + fname_haplos = (dname_work / "support" / onlyfiles[0]).resolve() + (dname_work / "support" / file).rename(fname_result_haplos) + + # fix frequency information + + freq_list = [] + for record in SeqIO.parse(fname_result_haplos, "fasta"): + freq_list.append(float(record.description.split("ave_reads=")[-1])) + norm_freq_list = [float(i) / sum(freq_list) for i in freq_list] + + record_list = [] + for idx, record in enumerate(SeqIO.parse(fname_result_haplos, "fasta")): + record.description = f"freq:{norm_freq_list[idx]}" + record_list.append(record) + SeqIO.write(record_list, fname_result_haplos, "fasta") + + +if __name__ == "__main__": + main( + Path(snakemake.input.fname_bam), + Path(snakemake.input.fname_reference), + snakemake.input.fname_insert_bed, + Path(snakemake.output.fname_result), + Path(snakemake.output.fname_result_haplos), + Path(snakemake.output.dname_work), + snakemake.threads, + )