Skip to content

Commit

Permalink
Merge pull request #27 from pachterlab/dev
Browse files Browse the repository at this point in the history
Added plants to gget ref
  • Loading branch information
lauraluebbert authored Jul 8, 2022
2 parents 2e559be + ed884b7 commit 114084f
Show file tree
Hide file tree
Showing 7 changed files with 201 additions and 84 deletions.
2 changes: 1 addition & 1 deletion gget/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,6 @@
# Mute numexpr threads info
logging.getLogger("numexpr").setLevel(logging.WARNING)

__version__ = "0.2.5"
__version__ = "0.2.6"
__author__ = "Laura Luebbert"
__email__ = "[email protected]"
1 change: 1 addition & 0 deletions gget/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
# Ensembl REST API server for gget seq and info
ENSEMBL_REST_API = "http://rest.ensembl.org/"
ENSEMBL_FTP_URL = "http://ftp.ensembl.org/pub/"
ENSEMBL_FTP_URL_PLANT = "http://ftp.ensemblgenomes.org/pub/plants/"

# NCBI URL for gget info
NCBI_URL = "https://www.ncbi.nlm.nih.gov"
Expand Down
134 changes: 60 additions & 74 deletions gget/gget_ref.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,12 +17,12 @@
# Custom functions
from .utils import ref_species_options, find_latest_ens_rel

from .constants import ENSEMBL_FTP_URL
from .constants import ENSEMBL_FTP_URL, ENSEMBL_FTP_URL_PLANT


def ref(species, which="all", release=None, ftp=False, save=False, list_species=False):
"""
Fetch FTPs for reference genomes and annotations by species.
Fetch FTPs for reference genomes and annotations by species from Ensembl.
Args:
- species Defines the species for which the reference should be fetched in the format "<genus>_<species>",
Expand All @@ -42,7 +42,8 @@ def ref(species, which="all", release=None, ftp=False, save=False, list_species=
- ftp Return only the requested FTP links in a list (default: False).
- save Save the results in the local directory (default: False).
- list_species If True and `species=None`, returns a list of all available species (default: False).
- list_species If True and `species=None`, returns a list of all available species from the Ensembl database for large genomes
(not including plants/bacteria) (default: False).
(Can be combined with `release` to get the available species from a specific Ensembl release.)
Returns a dictionary containing the requested URLs with their respective Ensembl version and release date and time.
Expand Down Expand Up @@ -81,26 +82,17 @@ def ref(species, which="all", release=None, ftp=False, save=False, list_species=
# In case species was passed with upper case letters
species = species.lower()

## Find latest Ensembl release
url = ENSEMBL_FTP_URL
html = requests.get(url)

# Raise error if status code not "OK" Response
if html.status_code != 200:
raise RuntimeError(
f"Ensembl FTP site returned error status code {html.status_code}. Please try again."
)

soup = BeautifulSoup(html.text, "html.parser")
# Find all releases
releases = soup.body.findAll(text=re.compile("release-"))
# Get release numbers
rels = []
for rel in releases:
rels.append(rel.split("/")[0].split("-")[-1])
## For plants and bacteria, switch to other databases
# Check if species in plant or bacteria Ensembl database
if species in ref_species_options(
"dna", database=ENSEMBL_FTP_URL_PLANT, release=None
):
database = ENSEMBL_FTP_URL_PLANT
else:
database = ENSEMBL_FTP_URL

# Find highest release number (= latest release)
ENS_rel = np.array(rels).astype(int).max()
## Find latest Ensembl release
ENS_rel = find_latest_ens_rel(database)

# If release != None, use user-defined Ensembl release
if release != None:
Expand All @@ -114,9 +106,9 @@ def ref(species, which="all", release=None, ftp=False, save=False, list_species=

## Raise error if species not found
# Find all available species for GTFs for this Ensembl release
species_list_gtf = ref_species_options("gtf", release=ENS_rel)
species_list_gtf = ref_species_options("gtf", database=database, release=ENS_rel)
# Find all available species for FASTAs for this Ensembl release
species_list_dna = ref_species_options("dna", release=ENS_rel)
species_list_dna = ref_species_options("dna", database=database, release=ENS_rel)

# Find intersection of the two lists
# (Only species which have GTF and FASTAs available can continue)
Expand All @@ -126,12 +118,12 @@ def ref(species, which="all", release=None, ftp=False, save=False, list_species=
raise ValueError(
f"Species does not match any available species for Ensembl release {ENS_rel}.\n"
"Please double-check spelling.\n"
"'gget ref --list' -> lists out all available species (Python: 'gget.ref(None, list_species=True)').\n"
"'gget ref --list_species' -> lists out all available species (Python: 'gget.ref(None, list_species=True)').\n"
"Combine with `release` argument to define specific Ensembl release (default: latest).\n"
)

## Get GTF link for this species and release
url = ENSEMBL_FTP_URL + f"release-{ENS_rel}/gtf/{species}/"
url = database + f"release-{ENS_rel}/gtf/{species}/"
html = requests.get(url)

# Raise error if status code not "OK" Response
Expand Down Expand Up @@ -160,13 +152,13 @@ def ref(species, which="all", release=None, ftp=False, save=False, list_species=
# Get release date and time from <None> elements (since there are twice as many, 2x and +1 to move from string to date)
gtf_date_size = nones[i * 2 + 1]

gtf_url = ENSEMBL_FTP_URL + f"release-{ENS_rel}/gtf/{species}/{gtf_str['href']}"
gtf_url = database + f"release-{ENS_rel}/gtf/{species}/{gtf_str['href']}"

gtf_date = gtf_date_size.strip().split(" ")[0]
gtf_size = gtf_date_size.strip().split(" ")[-1]

## Get cDNA FASTA link for this species and release
url = ENSEMBL_FTP_URL + f"release-{ENS_rel}/fasta/{species}/cdna"
url = database + f"release-{ENS_rel}/fasta/{species}/cdna"
html = requests.get(url)

# Raise error if status code not "OK" Response
Expand Down Expand Up @@ -195,15 +187,13 @@ def ref(species, which="all", release=None, ftp=False, save=False, list_species=
# Get release date and time from <None> elements (since there are twice as many, 2x and +1 to move from string to date)
cdna_date_size = nones[i * 2 + 1]

cdna_url = (
ENSEMBL_FTP_URL + f"release-{ENS_rel}/fasta/{species}/cdna/{cdna_str['href']}"
)
cdna_url = database + f"release-{ENS_rel}/fasta/{species}/cdna/{cdna_str['href']}"

cdna_date = cdna_date_size.strip().split(" ")[0]
cdna_size = cdna_date_size.strip().split(" ")[-1]

## Get DNA FASTA link for this species and release
url = ENSEMBL_FTP_URL + f"release-{ENS_rel}/fasta/{species}/dna"
url = database + f"release-{ENS_rel}/fasta/{species}/dna"
html = requests.get(url)

# Raise error if status code not "OK" Response
Expand Down Expand Up @@ -242,9 +232,7 @@ def ref(species, which="all", release=None, ftp=False, save=False, list_species=
# Get date from non-assigned values (since there are twice as many, 2x and +1 to move from string to date)
dna_date_size = nones[i * 2 + 1]

dna_url = (
ENSEMBL_FTP_URL + f"release-{ENS_rel}/fasta/{species}/dna/{dna_str['href']}"
)
dna_url = database + f"release-{ENS_rel}/fasta/{species}/dna/{dna_str['href']}"

dna_date = dna_date_size.strip().split(" ")[0]
dna_size = dna_date_size.strip().split(" ")[-1]
Expand All @@ -253,7 +241,7 @@ def ref(species, which="all", release=None, ftp=False, save=False, list_species=
dna_size = dna_size.strip()

## Get CDS FASTA link for this species and release
url = ENSEMBL_FTP_URL + f"release-{ENS_rel}/fasta/{species}/cds"
url = database + f"release-{ENS_rel}/fasta/{species}/cds"
html = requests.get(url)

# Raise error if status code not "OK" Response
Expand Down Expand Up @@ -282,52 +270,52 @@ def ref(species, which="all", release=None, ftp=False, save=False, list_species=
# Get release date and time from <None> elements (since there are twice as many, 2x and +1 to move from string to date)
cds_date_size = nones[i * 2 + 1]

cds_url = (
ENSEMBL_FTP_URL + f"release-{ENS_rel}/fasta/{species}/cds/{cds_str['href']}"
)
cds_url = database + f"release-{ENS_rel}/fasta/{species}/cds/{cds_str['href']}"

cds_date = cds_date_size.strip().split(" ")[0]
cds_size = cds_date_size.strip().split(" ")[-1]

## Get ncRNA FASTA link for this species and release
url = ENSEMBL_FTP_URL + f"release-{ENS_rel}/fasta/{species}/ncrna"
## Get ncRNA FASTA link for this species and release (if available)
url = database + f"release-{ENS_rel}/fasta/{species}/ncrna"
html = requests.get(url)

# Raise error if status code not "OK" Response
if html.status_code != 200:
raise RuntimeError(
f"HTTP response status code {html.status_code}. Please try again.\n"
)

soup = BeautifulSoup(html.text, "html.parser")

# The url can be found under an <a> object tag in the html,
# but the date and size do not have an object tag (element=None)
nones = []
a_elements = []
pre = soup.find("pre")
for element in pre.descendants:
if element.name == "a":
a_elements.append(element)
elif element.name != "a":
nones.append(element)
# If ncRNA data is not available, HTML requests returns an error code (!= 200)
if html.status_code == 200:
soup = BeautifulSoup(html.text, "html.parser")

# The url can be found under an <a> object tag in the html,
# but the date and size do not have an object tag (element=None)
nones = []
a_elements = []
pre = soup.find("pre")
for element in pre.descendants:
if element.name == "a":
a_elements.append(element)
elif element.name != "a":
nones.append(element)

# Find the <a> element containing the url
for i, string in enumerate(a_elements):
if ".ncrna.fa" in string["href"]:
ncrna_str = string
# Get release date and time from <None> elements (since there are twice as many, 2x and +1 to move from string to date)
ncrna_date_size = nones[i * 2 + 1]

# Find the <a> element containing the url
for i, string in enumerate(a_elements):
if ".ncrna.fa" in string["href"]:
ncrna_str = string
# Get release date and time from <None> elements (since there are twice as many, 2x and +1 to move from string to date)
ncrna_date_size = nones[i * 2 + 1]
ncrna_url = (
database + f"release-{ENS_rel}/fasta/{species}/ncrna/{ncrna_str['href']}"
)

ncrna_url = (
ENSEMBL_FTP_URL + f"release-{ENS_rel}/fasta/{species}/ncrna/{ncrna_str['href']}"
)
ncrna_date = ncrna_date_size.strip().split(" ")[0]
ncrna_size = ncrna_date_size.strip().split(" ")[-1]

ncrna_date = ncrna_date_size.strip().split(" ")[0]
ncrna_size = ncrna_date_size.strip().split(" ")[-1]
# If the HTML request returned an error code here, I will assume that ncRNA data is not available
else:
ncrna_url = ""
ncrna_date = " "
ncrna_size = ""

## Get pep FASTA link for this species and release
url = ENSEMBL_FTP_URL + f"release-{ENS_rel}/fasta/{species}/pep"
url = database + f"release-{ENS_rel}/fasta/{species}/pep"
html = requests.get(url)

# Raise error if status code not "OK" Response
Expand Down Expand Up @@ -356,9 +344,7 @@ def ref(species, which="all", release=None, ftp=False, save=False, list_species=
# Get release date and time from <None> elements (since there are twice as many, 2x and +1 to move from string to date)
pep_date_size = nones[i * 2 + 1]

pep_url = (
ENSEMBL_FTP_URL + f"release-{ENS_rel}/fasta/{species}/pep/{pep_str['href']}"
)
pep_url = database + f"release-{ENS_rel}/fasta/{species}/pep/{pep_str['href']}"

pep_date = pep_date_size.strip().split(" ")[0]
pep_size = pep_date_size.strip().split(" ")[-1]
Expand Down
2 changes: 1 addition & 1 deletion gget/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,7 @@ def main():
required=False,
help=(
"""
List all available species.
List all available species from the Ensembl database for large genomes (not including plants/bacteria).
(Combine with `--release` to get the available species from a specific Ensembl release.)
"""
),
Expand Down
17 changes: 10 additions & 7 deletions gget/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -409,12 +409,14 @@ def rest_query(server, query, content_type):
return r.text


def find_latest_ens_rel():
def find_latest_ens_rel(database=ENSEMBL_FTP_URL):
"""
Returns the latest Ensembl release number.
Args:
- database Link to Ensembl database.
"""
url = ENSEMBL_FTP_URL
html = requests.get(url)
html = requests.get(database)

# Raise error if status code not "OK" Response
if html.status_code != 200:
Expand Down Expand Up @@ -480,18 +482,19 @@ def gget_species_options(release=None):
return databases


def ref_species_options(which, release=None):
def ref_species_options(which, database=ENSEMBL_FTP_URL, release=None):
"""
Function to find all available species for gget ref.
Args:
- which Which type of FTP. Possible entries: 'dna', 'cdna', 'gtf'.
- database Link to Ensembl database.
- release Ensembl release for which available species should be fetched.
Returns list of available species.
"""
# Find latest Ensembl release
ENS_rel = find_latest_ens_rel()
ENS_rel = find_latest_ens_rel(database)

# If release != None, use user-defined Ensembl release
if release != None:
Expand All @@ -505,9 +508,9 @@ def ref_species_options(which, release=None):

# Find all available species for this release and FTP type
if which == "gtf":
url = ENSEMBL_FTP_URL + f"release-{ENS_rel}/gtf/"
url = database + f"release-{ENS_rel}/gtf/"
if which == "dna" or which == "cdna":
url = ENSEMBL_FTP_URL + f"release-{ENS_rel}/fasta/"
url = database + f"release-{ENS_rel}/fasta/"
html = requests.get(url)

# Raise error if status code not "OK" Response
Expand Down
Loading

0 comments on commit 114084f

Please sign in to comment.