From 8d30eeadc9ce7cdb80e0f75fb96ee3e6dcefeb8b Mon Sep 17 00:00:00 2001 From: David Laehnemann Date: Wed, 6 Nov 2024 23:29:24 +0100 Subject: [PATCH 1/6] fix: get ensembl-reference wrapper to download more than one chromosome --- bio/reference/ensembl-sequence/wrapper.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/bio/reference/ensembl-sequence/wrapper.py b/bio/reference/ensembl-sequence/wrapper.py index cb2956a6c04..48ae87b8054 100644 --- a/bio/reference/ensembl-sequence/wrapper.py +++ b/bio/reference/ensembl-sequence/wrapper.py @@ -31,6 +31,7 @@ if datatype == "dna": if chromosome: suffixes = [f"dna.chromosome.{chrom}.fa.gz" for chrom in chromosome] + print(suffixes) else: suffixes = ["dna.primary_assembly.fa.gz", "dna.toplevel.fa.gz"] elif datatype == "cdna": @@ -62,7 +63,7 @@ shell("curl -sSf {url} > /dev/null 2> /dev/null") except sp.CalledProcessError: continue - + print(url) shell("(curl -L {url} | gzip -d >> {snakemake.output[0]}) {log}") success = True break From dfdd74347470179d288ee0e2c62d58c1ab933879 Mon Sep 17 00:00:00 2001 From: David Laehnemann Date: Wed, 6 Nov 2024 23:34:52 +0100 Subject: [PATCH 2/6] different debugging output --- bio/reference/ensembl-sequence/wrapper.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bio/reference/ensembl-sequence/wrapper.py b/bio/reference/ensembl-sequence/wrapper.py index 48ae87b8054..dd1819162a2 100644 --- a/bio/reference/ensembl-sequence/wrapper.py +++ b/bio/reference/ensembl-sequence/wrapper.py @@ -58,12 +58,12 @@ success = False for suffix in suffixes: url = f"{url_prefix}.{suffix}" - + print(url) try: shell("curl -sSf {url} > /dev/null 2> /dev/null") except sp.CalledProcessError: continue - print(url) + shell("(curl -L {url} | gzip -d >> {snakemake.output[0]}) {log}") success = True break From 8fae1dd396a47c7814540f49af65dd0c91ba6da9 Mon Sep 17 00:00:00 2001 From: David Laehnemann Date: Wed, 6 Nov 2024 23:41:43 +0100 Subject: [PATCH 3/6] fix: avoid `break` for chromosome specs additional debugging output --- bio/reference/ensembl-sequence/wrapper.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/bio/reference/ensembl-sequence/wrapper.py b/bio/reference/ensembl-sequence/wrapper.py index dd1819162a2..9e2d644b1fd 100644 --- a/bio/reference/ensembl-sequence/wrapper.py +++ b/bio/reference/ensembl-sequence/wrapper.py @@ -48,7 +48,7 @@ if chromosome: if not datatype == "dna": raise ValueError( - "invalid datatype, to select a single chromosome the datatype must be dna" + "Invalid datatype. To select individual chromosomes, the datatype must be dna." ) url = snakemake.params.get("url", "ftp://ftp.ensembl.org/pub") @@ -56,6 +56,7 @@ url_prefix = f"{url}/{branch}release-{release}/fasta/{species}/{datatype}/{species.capitalize()}.{spec}" success = False +print(suffixes) for suffix in suffixes: url = f"{url_prefix}.{suffix}" print(url) @@ -66,7 +67,8 @@ shell("(curl -L {url} | gzip -d >> {snakemake.output[0]}) {log}") success = True - break + if not chromosome: + break if not success: if len(suffixes) > 1: From 249d5e4a8cd543effd84b5691661b2b1f2394113 Mon Sep 17 00:00:00 2001 From: David Laehnemann Date: Wed, 6 Nov 2024 23:45:27 +0100 Subject: [PATCH 4/6] remove debugging output --- bio/reference/ensembl-sequence/wrapper.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/bio/reference/ensembl-sequence/wrapper.py b/bio/reference/ensembl-sequence/wrapper.py index 9e2d644b1fd..d75adefc7e2 100644 --- a/bio/reference/ensembl-sequence/wrapper.py +++ b/bio/reference/ensembl-sequence/wrapper.py @@ -31,7 +31,6 @@ if datatype == "dna": if chromosome: suffixes = [f"dna.chromosome.{chrom}.fa.gz" for chrom in chromosome] - print(suffixes) else: suffixes = ["dna.primary_assembly.fa.gz", "dna.toplevel.fa.gz"] elif datatype == "cdna": @@ -56,15 +55,14 @@ url_prefix = f"{url}/{branch}release-{release}/fasta/{species}/{datatype}/{species.capitalize()}.{spec}" success = False -print(suffixes) for suffix in suffixes: url = f"{url_prefix}.{suffix}" - print(url) + try: shell("curl -sSf {url} > /dev/null 2> /dev/null") except sp.CalledProcessError: continue - + shell("(curl -L {url} | gzip -d >> {snakemake.output[0]}) {log}") success = True if not chromosome: From 23b878875f724b5898b7d8e6dff302024d97d281 Mon Sep 17 00:00:00 2001 From: David Laehnemann Date: Thu, 7 Nov 2024 00:00:02 +0100 Subject: [PATCH 5/6] reset success status for every try of a chromosome --- bio/reference/ensembl-sequence/wrapper.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bio/reference/ensembl-sequence/wrapper.py b/bio/reference/ensembl-sequence/wrapper.py index d75adefc7e2..58aa0645864 100644 --- a/bio/reference/ensembl-sequence/wrapper.py +++ b/bio/reference/ensembl-sequence/wrapper.py @@ -54,8 +54,8 @@ spec = spec.format(build=build, release=release) url_prefix = f"{url}/{branch}release-{release}/fasta/{species}/{datatype}/{species.capitalize()}.{spec}" -success = False for suffix in suffixes: + success = False url = f"{url_prefix}.{suffix}" try: From 19536bca3bc7531430e5476d387484919d24feec Mon Sep 17 00:00:00 2001 From: David Laehnemann Date: Fri, 8 Nov 2024 12:03:44 +0100 Subject: [PATCH 6/6] fix chromosome download failure logic and error message --- bio/reference/ensembl-sequence/wrapper.py | 23 +++++++++++++++++------ 1 file changed, 17 insertions(+), 6 deletions(-) diff --git a/bio/reference/ensembl-sequence/wrapper.py b/bio/reference/ensembl-sequence/wrapper.py index 58aa0645864..791728fa8ab 100644 --- a/bio/reference/ensembl-sequence/wrapper.py +++ b/bio/reference/ensembl-sequence/wrapper.py @@ -61,7 +61,14 @@ try: shell("curl -sSf {url} > /dev/null 2> /dev/null") except sp.CalledProcessError: - continue + if chromosome: + print( + f"Unable to download the requested chromosome sequence from Ensembl at: {url_prefix}.{suffix}.", + file=sys.stderr, + ) + break + else: + continue shell("(curl -L {url} | gzip -d >> {snakemake.output[0]}) {log}") success = True @@ -69,12 +76,16 @@ break if not success: - if len(suffixes) > 1: - url = f"{url_prefix}.[{'|'.join(suffixes)}]" - else: - url = f"{url_prefix}.{suffixes[0]}" + if not chromosome: + if len(suffixes) > 1: + url = f"{url_prefix}.[{'|'.join(suffixes)}]" + else: + url = f"{url_prefix}.{suffixes[0]}" + print( + f"Unable to download the requested reference sequence data from Ensembl at: {url}.", + file=sys.stderr, + ) print( - f"Unable to download requested sequence data from Ensembl ({url}). " "Please check whether above URL is currently available (might be a temporal server issue). " "Apart from that, did you check that this combination of species, build, and release is actually provided?", file=sys.stderr,