From a7dba063189409078864acc16228c31dcf5f7030 Mon Sep 17 00:00:00 2001
From: nacnoriko
Date: Thu, 20 Jul 2023 16:13:22 +0200
Subject: [PATCH] add first inphared-db wrapper

---
 bio/reference/inphared-db/environment.yaml    |  5 ++
 bio/reference/inphared-db/meta.yaml           |  4 +
 bio/reference/inphared-db/old_wrapper.py      | 80 +++++++++++++++++++
 bio/reference/inphared-db/test/Snakefile      | 12 +++
 bio/reference/inphared-db/test/config.yaml    |  9 +++
 .../inphared-db/test/old_release.smk          | 29 +++++++
 .../inphared-db/test/old_snakefile.smk        | 30 +++++++
 bio/reference/inphared-db/wrapper.py          | 16 ++++
 8 files changed, 185 insertions(+)
 create mode 100644 bio/reference/inphared-db/environment.yaml
 create mode 100644 bio/reference/inphared-db/meta.yaml
 create mode 100644 bio/reference/inphared-db/old_wrapper.py
 create mode 100644 bio/reference/inphared-db/test/Snakefile
 create mode 100644 bio/reference/inphared-db/test/config.yaml
 create mode 100644 bio/reference/inphared-db/test/old_release.smk
 create mode 100644 bio/reference/inphared-db/test/old_snakefile.smk
 create mode 100644 bio/reference/inphared-db/wrapper.py

diff --git a/bio/reference/inphared-db/environment.yaml b/bio/reference/inphared-db/environment.yaml
new file mode 100644
index 00000000000..3f8f6dadf4a
--- /dev/null
+++ b/bio/reference/inphared-db/environment.yaml
@@ -0,0 +1,5 @@
+channels:
+  - conda-forge
+  - nodefaults
+dependencies:
+  - curl
diff --git a/bio/reference/inphared-db/meta.yaml b/bio/reference/inphared-db/meta.yaml
new file mode 100644
index 00000000000..48abf517578
--- /dev/null
+++ b/bio/reference/inphared-db/meta.yaml
@@ -0,0 +1,4 @@
+name: inphared-db
+description: Download a sequence file from the Inphared database (https://github.com/RyanCook94/inphared/blob/main/README.md) and store it in a single .fasta file. Please check the current database available at the above link and adjust the config file.
+authors:
+  - Noriko A. Cassman
diff --git a/bio/reference/inphared-db/old_wrapper.py b/bio/reference/inphared-db/old_wrapper.py
new file mode 100644
index 00000000000..50ea7d46b96
--- /dev/null
+++ b/bio/reference/inphared-db/old_wrapper.py
@@ -0,0 +1,80 @@
+__author__ = "Johannes Köster"
+__copyright__ = "Copyright 2019, Johannes Köster"
+__email__ = "johannes.koester@uni-due.de"
+__license__ = "MIT"
+
+import subprocess as sp
+import sys
+from itertools import product
+from snakemake.shell import shell
+
+species = snakemake.params.species.lower()
+release = int(snakemake.params.release)
+build = snakemake.params.build
+
+branch = ""
+if release >= 81 and build == "GRCh37":
+    # use the special grch37 branch for new releases
+    branch = "grch37/"
+elif snakemake.params.get("branch"):
+    branch = snakemake.params.branch + "/"
+
+log = snakemake.log_fmt_shell(stdout=False, stderr=True)
+
+spec = ("{build}" if int(release) > 75 else "{build}.{release}").format(
+    build=build, release=release
+)
+
+suffixes = ""
+datatype = snakemake.params.get("datatype", "")
+chromosome = snakemake.params.get("chromosome", "")
+if datatype == "dna":
+    if chromosome:
+        suffixes = ["dna.chromosome.{}.fa.gz".format(chromosome)]
+    else:
+        suffixes = ["dna.primary_assembly.fa.gz", "dna.toplevel.fa.gz"]
+elif datatype == "cdna":
+    suffixes = ["cdna.all.fa.gz"]
+elif datatype == "cds":
+    suffixes = ["cds.all.fa.gz"]
+elif datatype == "ncrna":
+    suffixes = ["ncrna.fa.gz"]
+elif datatype == "pep":
+    suffixes = ["pep.all.fa.gz"]
+else:
+    raise ValueError("invalid datatype, must be one of dna, cdna, cds, ncrna, pep")
+
+if chromosome:
+    if not datatype == "dna":
+        raise ValueError(
+            "invalid datatype, to select a single chromosome the datatype must be dna"
+        )
+
+spec = spec.format(build=build, release=release)
+url_prefix = f"ftp://ftp.ensembl.org/pub/{branch}release-{release}/fasta/{species}/{datatype}/{species.capitalize()}.{spec}"
+
+success = False
+for suffix in suffixes:
+    url = f"{url_prefix}.{suffix}"
+
+    try:
+        shell("curl -sSf {url} > /dev/null 2> /dev/null")
+    except sp.CalledProcessError:
+        continue
+
+    shell("(curl -L {url} | gzip -d > {snakemake.output[0]}) {log}")
+    success = True
+    break
+
+if not success:
+    if len(suffixes) > 1:
+        url = f"{url_prefix}.[{'|'.join(suffixes)}]"
+    else:
+        url = f"{url_prefix}.{suffixes[0]}"
+    print(
+        f"Unable to download requested sequence data from Ensembl ({url}). "
+        "Please check whether above URL is currently available (might be a temporal server issue). "
+        "Apart from that, did you check that this combination of species, build, and release is actually provided?",
+        file=sys.stderr,
+    )
+    exit(1)
diff --git a/bio/reference/inphared-db/test/Snakefile b/bio/reference/inphared-db/test/Snakefile
new file mode 100644
index 00000000000..0c09c919190
--- /dev/null
+++ b/bio/reference/inphared-db/test/Snakefile
@@ -0,0 +1,12 @@
+configfile: "config.yaml"
+
+rule get_inphareddb:
+    output:
+        expand("{date}{suffix}", date=config["date"], suffix=config["suffix"])
+    params:
+        prefix = config["prefix"],
+        date = config["date"],
+        suffix = config["suffix"]
+    wrapper:
+        "master/bio/reference/inphared-db"
+
diff --git a/bio/reference/inphared-db/test/config.yaml b/bio/reference/inphared-db/test/config.yaml
new file mode 100644
index 00000000000..0f3ca1d9ef8
--- /dev/null
+++ b/bio/reference/inphared-db/test/config.yaml
@@ -0,0 +1,9 @@
+date:
+  "2Jul2023"
+
+suffix:
+  "_refseq_genomes.fa"
+  #"_genomes_excluding_refseq.fa"
+
+prefix:
+  "https://millardlab-inphared.s3.climb.ac.uk/"
diff --git a/bio/reference/inphared-db/test/old_release.smk b/bio/reference/inphared-db/test/old_release.smk
new file mode 100644
index 00000000000..a698b982d04
--- /dev/null
+++ b/bio/reference/inphared-db/test/old_release.smk
@@ -0,0 +1,29 @@
+rule get_genome:
+    output:
+        "refs/genome.fasta",
+    params:
+        species="saccharomyces_cerevisiae",
+        datatype="dna",
+        build="R64-1-1",
+        release="75",
+    log:
+        "logs/get_genome.log",
+    cache: "omit-software"  # save space and time with between workflow caching (see docs)
+    wrapper:
+        "master/bio/reference/ensembl-sequence"
+
+
+rule get_chromosome:
+    output:
+        "refs/old_release.chr1.fasta",
+    params:
+        species="saccharomyces_cerevisiae",
+        datatype="dna",
+        build="R64-1-1",
+        release="75",
+        chromosome="I",
+    log:
+        "logs/get_genome.log",
+    cache: "omit-software"  # save space and time with between workflow caching (see docs)
+    wrapper:
+        "master/bio/reference/ensembl-sequence"
diff --git a/bio/reference/inphared-db/test/old_snakefile.smk b/bio/reference/inphared-db/test/old_snakefile.smk
new file mode 100644
index 00000000000..79f249a0d3b
--- /dev/null
+++ b/bio/reference/inphared-db/test/old_snakefile.smk
@@ -0,0 +1,30 @@
+rule get_genome:
+    output:
+        "refs/genome.fasta",
+    params:
+        species="saccharomyces_cerevisiae",
+        datatype="dna",
+        build="R64-1-1",
+        release="98",
+    log:
+        "logs/get_genome.log",
+    cache: "omit-software"  # save space and time with between workflow caching (see docs)
+    wrapper:
+        "master/bio/reference/ensembl-sequence"
+
+
+rule get_chromosome:
+    output:
+        "refs/chr1.fasta",
+    params:
+        species="saccharomyces_cerevisiae",
+        datatype="dna",
+        build="R64-1-1",
+        release="101",
+        chromosome="I",  # optional: restrict to chromosome
+        # branch="plants",  # optional: specify branch
+    log:
+        "logs/get_genome.log",
+    cache: "omit-software"  # save space and time with between workflow caching (see docs)
+    wrapper:
+        "master/bio/reference/ensembl-sequence"
diff --git a/bio/reference/inphared-db/wrapper.py b/bio/reference/inphared-db/wrapper.py
new file mode 100644
index 00000000000..be912f72f31
--- /dev/null
+++ b/bio/reference/inphared-db/wrapper.py
@@ -0,0 +1,16 @@
+__author__ = "Noriko A. Cassman"
+__copyright__ = "Copyright 2023, Noriko A. Cassman"
+__email__ = "noriko.cassman@gmail.com"
+__license__ = "MIT"
+
+from snakemake.shell import shell
+
+# Redirect curl's stderr into the rule's log, as the Ensembl wrapper does.
+log = snakemake.log_fmt_shell(stdout=False, stderr=True)
+
+# The download URL is the plain concatenation of the three rule params.
+url = f"{snakemake.params.prefix}{snakemake.params.date}{snakemake.params.suffix}"
+
+# -f makes curl exit non-zero on HTTP errors instead of saving the error page
+# as the FASTA output; -L follows redirects; -sS hides progress but keeps errors.
+shell("curl -sSfL {url} -o {snakemake.output[0]} {log}")