-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathUniprotIdRetrieval.py
146 lines (110 loc) · 4.56 KB
/
UniprotIdRetrieval.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
#!python2
# coding: utf-8
"""
UNIPROT ID RETRIEVAL
Downloads protein sequences from Uniprot, in the desired output format, based on the given ids (either passed as
arguments or identified in a file)
DEPENDENCIES:
- Python 2.7
- requests
HOW TO RUN:
Through the command line:
UniprotIdRetrieval.py [-h] (-f <file> | -i <id ...>) [-fo {fasta, gff, tab, txt, rdf, xml}]
- '-h, --help' shows the help text
- '-f, --file' takes a file path to search for ids
- '-i, --ids' takes 1 or more ids
- '-fo, --format' defines the output format of downloaded files (fasta, gff, tab, txt, rdf or xml),
defaults to 'fasta'
"""
import argparse
import re
import requests
import time
__author__ = 'Pedro HC David, https://github.com/Kronopt'
__credits__ = ['Pedro HC David']
__version__ = '2.0'
__status__ = 'Finished'
# http://www.uniprot.org/help/accession_numbers
# The 10-character alternative comes first: alternation is tried left to right, and the first 6 characters of a
# 10-character accession would otherwise satisfy the second alternative on their own.
UNIPROTREGEX = ("[A-NR-Z][0-9][A-Z][A-Z0-9]{2}[0-9][A-Z][A-Z0-9]{2}[0-9]"
                + "|[A-NR-Z][0-9][A-Z][A-Z0-9]{2}[0-9]|[OPQ][0-9][A-Z0-9]{3}[0-9]")


def extract_uniprot_ids(file_name):
    """
    Collects every substring of a file that matches the uniprot accession number pattern (UNIPROTREGEX)

    The pattern is not anchored, so an accession embedded in a longer token is also reported. Every alternative of
    the pattern is exactly 6 or 10 characters long, so no additional length filtering is required.

    Parameters
    ----------
    file_name: str
        Path/file name of the file to parse

    Returns
    -------
    Set with the identified uniprot accession numbers (native str objects).
    If the file can't be opened or read (IOError), an error message is printed and the ids collected up to that
    point (possibly none) are returned. Any other exception propagates to the caller.
    """
    uniprot_regex = re.compile(UNIPROTREGEX)
    uniprot_ids = set()

    try:
        with open(file_name, "rb") as file_with_ids:
            for line in file_with_ids:
                # latin-1 maps every byte to a character, so decoding never fails, whatever the file contains,
                # and the text pattern can be applied on both Python 2 and Python 3
                text = line.decode('latin-1')
                # matches only contain A-Z / 0-9, so str() safely yields the native string type
                uniprot_ids.update(str(match) for match in uniprot_regex.findall(text))
    except IOError as e:
        print('Error with "{0}" file: {1}'.format(e.filename, e.strerror))

    # returned outside of a 'finally' clause on purpose: returning from 'finally' would discard any
    # exception that is not an IOError (including KeyboardInterrupt)
    return uniprot_ids
def retrieve_sequences(ids_, output_format, timeout=30):
    """
    Searches for a protein sequence for each id and downloads it (if found) in the desired output format

    Each id is requested from the Uniprot webservice as '<id>.<output_format>'. A successful response is saved to
    a file of that same name in the current working directory. One status line is printed per id:
    'ok', 'http error <code>' or 'not available in .<output_format> or does not exist'.
    Nothing is written to disk for an id whose request did not succeed.

    Parameters
    ----------
    ids_: set / list
        Uniprot ids
    output_format: str
        Output format
    timeout: float
        Seconds to wait for the webservice before giving up on a request (passed to requests.get)

    Returns
    -------
    Fetching of each protein sequence file (on the desired output format) for each id

    Raises
    ------
    Exceptions raised by requests.get (connection problems, timeouts) or by writing the output file are not
    handled here and propagate to the caller
    """
    import sys  # function-scope import: only needed to emit the id before its request is made

    if ids_:
        print('\n' + output_format.upper() + ' sequences')

    unavailable = 'not available in .' + output_format + ' or does not exist'

    for uniprot_id in sorted(ids_):
        # show the id right away, the status is appended to the same line once the request finishes
        sys.stdout.write(uniprot_id + ': ')
        sys.stdout.flush()
        resource = uniprot_id + '.' + output_format

        # Uniprot webservice
        # NOTE(review): legacy endpoint; Uniprot appears to have moved its REST API, confirm this URL still resolves
        response = requests.get('http://www.uniprot.org/uniprot/' + resource, timeout=timeout)
        body = response.content  # raw bytes, so the same checks work on Python 2 and Python 3

        if not body:
            # If response is empty
            status = unavailable
        elif response.status_code != 200:
            # http not 200: report it and do not save the error body
            status = 'http error ' + str(response.status_code)
        elif b'<!DOCTYPE html' in body:
            # If response is html, then it's invalid
            status = unavailable
        else:
            with open(resource, "wb") as sequence_out:
                # every line is terminated by a single '\n', whatever line ending the server used
                sequence_out.writelines(line + b'\n' for line in body.splitlines())
            status = 'ok'

        print(status)
        # follow max of 3 requests per second rule of Uniprot (failed lookups count as requests too)
        time.sleep(0.33)
if __name__ == '__main__':
    # Command line interface: exactly one source of ids (-f or -i) is mandatory, the output format is optional
    cli = argparse.ArgumentParser(
        description='Uniprot id retrieval tool. Searches Uniprot for protein sequences and downloads them. '
                    'Either uses the ids passed as arguments (-i, --ids) OR ids found in a file (-f, --file)')

    id_source = cli.add_mutually_exclusive_group(required=True)
    id_source.add_argument('-f', '--file', metavar='<file>', help='file path')
    id_source.add_argument('-i', '--ids', nargs='+', metavar='<id>', help='uniprot ids')

    cli.add_argument('-fo', '--format',
                     choices=['fasta', 'gff', 'tab', 'txt', 'rdf', 'xml'],
                     default='fasta',
                     help='output format (defaults to fasta)')

    arguments = cli.parse_args()

    # ids given on the command line are used as they are (duplicates removed),
    # otherwise they are extracted from the given file
    if arguments.ids is not None:
        requested_ids = set(arguments.ids)
    else:
        requested_ids = extract_uniprot_ids(arguments.file)

    retrieve_sequences(requested_ids, arguments.format)