Skip to content

Commit

Permalink
more methods + updated populations
Browse files Browse the repository at this point in the history
  • Loading branch information
LaraFuhrmann committed Oct 30, 2023
1 parent 9795475 commit 9872a2b
Show file tree
Hide file tree
Showing 3 changed files with 185 additions and 18 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@ method_list: [lofreq_local_haplo,
cliquesnv_local_haplo_tf0.01,
cliquesnv_local_haplo_snv_tf0.01,
viloca_alpha_0.000001,
viloca_alpha_0.000001_K300,
viloca_alpha_0.00001,
viloca_alpha_0.00001_K100,
# shorah_default_amplicon, --> here the short illumina simulation still not work due to b2w errors
]
Expand Down
Original file line number Diff line number Diff line change
@@ -1,52 +1,73 @@
seq_tech,seq_mode,seq_mode_param,read_length,genome_size,coverage,haplos
# Population 1
illumina,shotgun,single_amplicon,249,249,1000,2@3@30@20@20@[email protected]
illumina,shotgun,single_amplicon,201,201,1000,2@3@30@20@20@[email protected]
# Population 2
illumina,shotgun,single_amplicon,249,249,1000,5@5@20@4@12@[email protected]
illumina,shotgun,single_amplicon,201,201,1000,5@5@20@4@12@[email protected]
illumina,shotgun,single_amplicon,201,201,1000,5@5@20@4@15@[email protected]
# Population 3 (vary covverage)
illumina,shotgun,single_amplicon,249,249,100,5@5@30@10@15@[email protected]
illumina,shotgun,single_amplicon,249,249,500,5@5@30@10@15@[email protected]
illumina,shotgun,single_amplicon,249,249,800,5@5@30@10@15@[email protected]
illumina,shotgun,single_amplicon,249,249,1000,5@5@30@10@15@[email protected]
illumina,shotgun,single_amplicon,249,249,5000,5@5@30@10@15@[email protected]
illumina,shotgun,single_amplicon,249,249,10000,5@5@30@10@15@[email protected]
illumina,shotgun,single_amplicon,201,201,100,5@5@30@10@15@[email protected]
illumina,shotgun,single_amplicon,201,201,500,5@5@30@10@15@[email protected]
illumina,shotgun,single_amplicon,201,201,800,5@5@30@10@15@[email protected]
illumina,shotgun,single_amplicon,201,201,1000,5@5@30@10@15@[email protected]
illumina,shotgun,single_amplicon,201,201,5000,5@5@30@10@15@[email protected]
illumina,shotgun,single_amplicon,201,201,10000,5@5@30@10@15@[email protected]
# Population 4
illumina,shotgun,single_amplicon,201,201,1000,5@10@30@10@10@[email protected]
illumina,shotgun,single_amplicon,249,249,1000,5@10@30@10@10@[email protected]
illumina,shotgun,single_amplicon,201,201,1000,5@10@30@10@15@[email protected]
illumina,shotgun,single_amplicon,249,249,1000,5@10@30@10@15@[email protected]
# Population 5
illumina,shotgun,single_amplicon,249,249,1000,5@15@30@10@15@[email protected]
illumina,shotgun,single_amplicon,201,201,1000,5@15@30@10@15@[email protected]
# Population 6
#nanopore,shotgun,single_amplicon,5000,5000,800,2@3@600@400@400@[email protected]
#nanopore,shotgun,single_amplicon,5000,5000,5000,2@3@600@400@400@[email protected]
nanopore,shotgun,single_amplicon,5000,5000,800,2@3@600@400@400@[email protected]
nanopore,shotgun,single_amplicon,5000,5000,2000,2@3@600@400@400@[email protected]
nanopore,shotgun,single_amplicon,5000,5000,1000,2@3@600@400@400@[email protected]
pacbio,shotgun,single_amplicon,5000,5000,1000,2@3@600@400@400@[email protected]
#pacbio,shotgun,single_amplicon,5000,5000,1000,2@3@600@400@400@[email protected]
# Population 7
#nanopore,shotgun,single_amplicon,5000,5000,800,5@5@400@80@240@[email protected]
#nanopore,shotgun,single_amplicon,5000,5000,5000,5@5@400@80@240@[email protected]
nanopore,shotgun,single_amplicon,5000,5000,800,5@5@400@80@240@[email protected]
nanopore,shotgun,single_amplicon,5000,5000,2000,5@5@400@80@240@[email protected]
nanopore,shotgun,single_amplicon,5000,5000,1000,5@5@400@80@240@[email protected]
pacbio,shotgun,single_amplicon,5000,5000,1000,5@5@400@80@240@[email protected]
nanopore,shotgun,single_amplicon,5000,5000,800,5@5@400@80@200@[email protected]
#nanopore,shotgun,single_amplicon,5000,5000,2000,5@5@400@80@200@[email protected]
nanopore,shotgun,single_amplicon,5000,5000,1000,5@5@400@80@200@[email protected]
#pacbio,shotgun,single_amplicon,5000,5000,1000,5@5@400@80@240@[email protected]
# Population 8 (vary coverage)
nanopore,shotgun,single_amplicon,5000,5000,100,5@5@600@200@300@[email protected]
nanopore,shotgun,single_amplicon,5000,5000,200,5@5@600@200@300@[email protected]
nanopore,shotgun,single_amplicon,5000,5000,500,5@5@600@200@300@[email protected]
nanopore,shotgun,single_amplicon,5000,5000,800,5@5@600@200@300@[email protected]
nanopore,shotgun,single_amplicon,5000,5000,1000,5@5@600@200@300@[email protected]
nanopore,shotgun,single_amplicon,5000,5000,5000,5@5@600@200@300@[email protected]
pacbio,shotgun,single_amplicon,5000,5000,100,5@5@600@200@300@[email protected]
pacbio,shotgun,single_amplicon,5000,5000,200,5@5@600@200@300@[email protected]
pacbio,shotgun,single_amplicon,5000,5000,500,5@5@600@200@300@[email protected]
pacbio,shotgun,single_amplicon,5000,5000,800,5@5@600@200@300@[email protected]
pacbio,shotgun,single_amplicon,5000,5000,1000,5@5@600@200@300@[email protected]
pacbio,shotgun,single_amplicon,5000,5000,5000,5@5@600@200@300@[email protected]
nanopore,shotgun,single_amplicon,5000,5000,100,5@5@600@200@240@[email protected]
nanopore,shotgun,single_amplicon,5000,5000,200,5@5@600@200@240@[email protected]
nanopore,shotgun,single_amplicon,5000,5000,500,5@5@600@200@240@[email protected]
nanopore,shotgun,single_amplicon,5000,5000,800,5@5@600@200@240@[email protected]
nanopore,shotgun,single_amplicon,5000,5000,1000,5@5@600@200@240@[email protected]
nanopore,shotgun,single_amplicon,5000,5000,5000,5@5@600@200@240@[email protected]
#pacbio,shotgun,single_amplicon,5000,5000,100,5@5@600@200@300@[email protected]
#pacbio,shotgun,single_amplicon,5000,5000,200,5@5@600@200@300@[email protected]
#pacbio,shotgun,single_amplicon,5000,5000,500,5@5@600@200@300@[email protected]
#pacbio,shotgun,single_amplicon,5000,5000,800,5@5@600@200@300@[email protected]
#pacbio,shotgun,single_amplicon,5000,5000,1000,5@5@600@200@300@[email protected]
#pacbio,shotgun,single_amplicon,5000,5000,5000,5@5@600@200@300@[email protected]
# Population 9
#nanopore,shotgun,single_amplicon,5000,5000,800,5@10@600@200@300@[email protected]
#nanopore,shotgun,single_amplicon,5000,5000,5000,5@10@600@200@300@[email protected]
nanopore,shotgun,single_amplicon,5000,5000,800,5@10@600@200@300@[email protected]
nanopore,shotgun,single_amplicon,5000,5000,2000,5@10@600@200@300@[email protected]
nanopore,shotgun,single_amplicon,5000,5000,1000,5@10@600@200@300@[email protected]
pacbio,shotgun,single_amplicon,5000,5000,1000,5@10@600@200@300@[email protected]
nanopore,shotgun,single_amplicon,5000,5000,800,5@10@600@200@240@[email protected]
nanopore,shotgun,single_amplicon,5000,5000,2000,5@10@600@200@240@[email protected]
nanopore,shotgun,single_amplicon,5000,5000,1000,5@10@600@200@240@[email protected]
#pacbio,shotgun,single_amplicon,5000,5000,1000,5@10@600@200@300@[email protected]
# Population 10
#nanopore,shotgun,single_amplicon,5000,5000,800,15@5@600@300@200@[email protected] --> coverage_haplotype 0
#nanopore,shotgun,single_amplicon,5000,5000,5000,15@5@600@300@200@[email protected]
nanopore,shotgun,single_amplicon,5000,5000,1000,15@5@600@300@200@[email protected]
pacbio,shotgun,single_amplicon,5000,5000,1000,15@5@600@300@200@[email protected]
#nanopore,shotgun,single_amplicon,5000,5000,1000,15@5@600@300@200@[email protected]
#pacbio,shotgun,single_amplicon,5000,5000,1000,15@5@600@300@200@[email protected]
Original file line number Diff line number Diff line change
@@ -0,0 +1,144 @@
# GROUP: global
# CONDA: libshorah
# CONDA: biopython = 1.79
# PIP: git+https://github.com/LaraFuhrmann/shorah@master

import subprocess
from pathlib import Path
from os import listdir
from os.path import isfile, join
from Bio import SeqIO
import gzip


def gunzip(source_filepath, dest_filepath, block_size=65536):
with gzip.open(source_filepath, "rb") as s_file, open(
dest_filepath, "wb"
) as d_file:
while True:
block = s_file.read(block_size)
if not block:
break
else:
d_file.write(block)


def main(
fname_bam,
fname_reference,
fname_insert_bed,
fname_results_snv,
fname_result_haplos,
dname_work,
threads=1,
):
genome_size = str(fname_bam).split("genome_size~")[1].split("__coverage")[0]
alpha = 0.000001
n_max_haplotypes = 300
n_mfa_starts = 1
win_min_ext = 0.85

read_length = str(fname_bam).split("read_length~")[1].split("__")[0]
if read_length == "Ten_strain_IAV":
sampler = "learn_error_params"
win_min_ext = 0.7
else:
sampler = "use_quality_scores"

dname_work.mkdir(parents=True, exist_ok=True)
if fname_insert_bed == []:
subprocess.run(
[
"shorah",
"shotgun",
"-b",
fname_bam.resolve(),
"-f",
Path(fname_reference).resolve(),
"--mode",
str(sampler),
"--alpha",
str(alpha),
"--n_max_haplotypes",
str(n_max_haplotypes),
"--n_mfa_starts",
str(n_mfa_starts),
"--win_min_ext",
str(win_min_ext),
"--threads",
str(threads),
],
cwd=dname_work,
)
else:
# insert bed file is there
subprocess.run(
[
"shorah",
"shotgun",
"-b",
fname_bam.resolve(),
"-f",
Path(fname_reference).resolve(),
"-z",
Path(fname_insert_bed).resolve(),
"--mode",
str(sampler),
"--alpha",
str(alpha),
"--n_max_haplotypes",
str(n_max_haplotypes),
"--n_mfa_starts",
str(n_mfa_starts),
"--win_min_ext",
str(win_min_ext),
"--threads",
str(threads),
],
cwd=dname_work,
)

(dname_work / "snv" / "SNVs_0.010000_final.vcf").rename(fname_results_snv)

mypath = (dname_work / "support").resolve()
onlyfiles = [f for f in listdir(mypath) if isfile(join(mypath, f))]
print("onlyfiles", onlyfiles)
for file in onlyfiles:
if "reads-support.fas" in file:
file_name = onlyfiles[0]
fname_haplos = (dname_work / "support" / onlyfiles[0]).resolve()
if file.endswith(".gz"):
fname_zipped = (dname_work / "support" / onlyfiles[0]).resolve()
fname_haplos = onlyfiles[0].split(".gz")[0]
fname_unzipped = (dname_work / "support" / fname_haplos).resolve()
# unzip
gunzip(fname_zipped, fname_result_haplos)

elif file.endswith(".fas"):
fname_haplos = (dname_work / "support" / onlyfiles[0]).resolve()
(dname_work / "support" / file).rename(fname_result_haplos)

# fix frequency information

freq_list = []
for record in SeqIO.parse(fname_result_haplos, "fasta"):
freq_list.append(float(record.description.split("ave_reads=")[-1]))
norm_freq_list = [float(i) / sum(freq_list) for i in freq_list]

record_list = []
for idx, record in enumerate(SeqIO.parse(fname_result_haplos, "fasta")):
record.description = f"freq:{norm_freq_list[idx]}"
record_list.append(record)
SeqIO.write(record_list, fname_result_haplos, "fasta")


if __name__ == "__main__":
main(
Path(snakemake.input.fname_bam),
Path(snakemake.input.fname_reference),
snakemake.input.fname_insert_bed,
Path(snakemake.output.fname_result),
Path(snakemake.output.fname_result_haplos),
Path(snakemake.output.dname_work),
snakemake.threads,
)

0 comments on commit 9872a2b

Please sign in to comment.