get_ucsc_genome_gtf_download_link.py
# -*- coding: utf-8 -*-
"""
Scrape the UCSC Genome Browser downloads page and collect, for each genome
assembly, the download links for its genome FASTA, GTF annotation, and
md5sum.txt files.

Created on Wed Oct 13 22:21:33 2021
@author: liuh
"""
import urllib3
from bs4 import BeautifulSoup
import re
import copy
downloads_page = "https://hgdownload.soe.ucsc.edu/downloads.html"
http = urllib3.PoolManager()
r = http.request('GET', downloads_page)
soup = BeautifulSoup(r.data, 'html.parser')
base = "https://hgdownload.soe.ucsc.edu/"
# Each assembly's bulk-download directory is linked as ".../bigZips/"
bigzip_links = soup.find_all('a', href=re.compile("bigZips/$"))
# assembly name, e.g. "hg38", extracted from the bigZips directory path
genome_name = [link['href'].split("/")[2] for link in bigzip_links]
genome_fa_pattern = [re.compile(r'{}\.fa\.gz'.format(name)) for name in genome_name]
genome_download_links = {}
for gn in genome_name:
    genome_download_links.setdefault(gn, {})
    for k in ["md5sum", "gtf", "genome"]:
        genome_download_links[gn].setdefault(k, None)
for i, anchor in enumerate(bigzip_links):
    bgzp_link = base + anchor['href']
    # e.g. bgzp_link = "https://hgdownload.soe.ucsc.edu/goldenPath/hg38/bigZips/"
    req = http.request('GET', bgzp_link)
    soup = BeautifulSoup(req.data, 'html.parser')
    md5sum_m = soup.find('a', href=re.compile(r'md5sum\.txt'))
    gtf_m = soup.find('a', href=re.compile(r"^genes/$"))
    genome_m = soup.find('a', href=genome_fa_pattern[i])
    chrom_m = soup.find('a', href=re.compile(r"(chromFa\.tar\.gz|chromFa\.zip)"))
    sm_chrom_m = soup.find('a', href=re.compile(r"softmask\.fa\.gz"))
    contig_scaffold_m = soup.find('a', href=re.compile(r"(contigFa\.zip|scaffold\.fa\.gz)"))
    if md5sum_m:
        genome_download_links[genome_name[i]]["md5sum"] = bgzp_link + md5sum_m['href']
    if gtf_m:
        annotation_link = bgzp_link + gtf_m['href']
        gtf_res = http.request('GET', annotation_link)
        soup = BeautifulSoup(gtf_res.data, 'html.parser')
        ens_gtf = soup.find('a', href=re.compile("ensGene"))
        ref_gtf = soup.find('a', href=re.compile("refGene"))
        ncbiRef_gtf = soup.find('a', href=re.compile("ncbiRef"))
        ## If a genome has multiple GTF annotation files, the priority of the GTF
        ## to download is ncbiRefSeq.gtf, then ensGene.gtf, followed by refGene.gtf.
        if ncbiRef_gtf:
            genome_download_links[genome_name[i]]["gtf"] = annotation_link + ncbiRef_gtf['href']
        elif ens_gtf:
            genome_download_links[genome_name[i]]["gtf"] = annotation_link + ens_gtf['href']
        elif ref_gtf:
            genome_download_links[genome_name[i]]["gtf"] = annotation_link + ref_gtf['href']
    if genome_m:
        genome_download_links[genome_name[i]]["genome"] = bgzp_link + genome_m['href']
    elif chrom_m:
        genome_download_links[genome_name[i]]["genome"] = bgzp_link + chrom_m['href']
    elif sm_chrom_m:
        genome_download_links[genome_name[i]]["genome"] = bgzp_link + sm_chrom_m['href']
    elif contig_scaffold_m:
        genome_download_links[genome_name[i]]["genome"] = bgzp_link + contig_scaffold_m['href']
# Work on a snapshot so entries can be deleted from the original dict while iterating
genome_download_links_copy = copy.deepcopy(genome_download_links)
# Some species have no single genome.fa.gz file but instead a tar ball containing
# per-chromosome FASTA sequences; some others have no genes/*.gtf annotation file.
# Remove genomes that lack a GTF annotation file, and select the single genome.fa.gz
# file where possible; otherwise choose the tar ball (attention needed for further
# processing: uncompress first, then concatenate; see the sketch below).
for gn in genome_download_links_copy:
    if not genome_download_links_copy[gn]["gtf"] or not genome_download_links_copy[gn]["genome"]:
        del genome_download_links[gn]
print(len(genome_download_links))  ## 146 genome assemblies at the time of writing
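# A minimal sketch (not part of the original link collection) of the
# "uncompress first, then concatenate" step for assemblies whose "genome" link
# points at a chromFa.tar.gz tar ball of per-chromosome FASTA files; the
# function name and local file names below are hypothetical examples.
# import tarfile
# import gzip
# import shutil
# def concat_chrom_tarball(tarball_path, out_fa="genome.fa"):
#     """Extract per-chromosome FASTA members and concatenate them into one file."""
#     with tarfile.open(tarball_path, "r:gz") as tar, open(out_fa, "wb") as out:
#         for member in tar.getmembers():
#             if not member.isfile():
#                 continue
#             fh = tar.extractfile(member)
#             if member.name.endswith(".gz"):
#                 # per-chromosome members may themselves be gzip-compressed
#                 with gzip.open(fh) as gz:
#                     shutil.copyfileobj(gz, out)
#             else:
#                 shutil.copyfileobj(fh, out)
# concat_chrom_tarball("chromFa.tar.gz")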
# Final download links for each species
# with open("test4.txt", "w") as t_fh:
#     for g in sorted(genome_download_links):
#         t_fh.write(str(g) + "\t")
#         for k in sorted(genome_download_links[g]):
#             t_fh.write(str(genome_download_links[g][k]) + "\t")
#         t_fh.write("\n")
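# A possible next step (not in the original script): stream one of the collected
# links to disk and check it against the corresponding md5sum.txt entry. This is
# a sketch only; "hg38" and the local file names are hypothetical examples, and
# it assumes md5sum.txt uses the standard "<digest>  <filename>" line format.
# import hashlib
# def download_file(url, dest):
#     """Stream a URL to a local file and return its MD5 hex digest."""
#     md5 = hashlib.md5()
#     with http.request('GET', url, preload_content=False) as res, open(dest, "wb") as out:
#         for chunk in res.stream(64 * 1024):
#             md5.update(chunk)
#             out.write(chunk)
#     return md5.hexdigest()
# links = genome_download_links["hg38"]
# digest = download_file(links["genome"], "hg38.fa.gz")
# expected = dict(
#     line.split()[::-1]
#     for line in http.request('GET', links["md5sum"]).data.decode().splitlines()
#     if line.strip()
# )
# print(digest == expected.get("hg38.fa.gz"))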