-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathget-longest-isoform.py
executable file
·153 lines (131 loc) · 4.66 KB
/
get-longest-isoform.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
#!/usr/local/bin/python2.7
# -*- coding: utf-8 -*-
#
# get-longest-isoform.py
#==============================================================================
import argparse
import sys
import os
#==============================================================================
#Command line options==========================================================
#==============================================================================
parser = argparse.ArgumentParser()
parser.add_argument("genomes", type=str, nargs="+",
help="A list of input genomes in fasta format\
or an input folder")
parser.add_argument("outdir", type=str, default="./",
help="The output directory for filtered genomes\
[default: ./]")
if len(sys.argv) == 1:
parser.print_help()
sys.exit(1)
args = parser.parse_args()
#==============================================================================
def read_fasta(source):
SG = {}
c = 0
try:
with open(source, "r") as file:
for line in file.readlines():
if line[0] == ">":
name = line[1:].rstrip().split()[0]
# Correct for missing names in Macaca_fascicularis
if not name.startswith("ENS"):
c += 1
name = "peptide_"+str(c)
header = line[1:].rstrip()
SG[name] = ["", header]
else:
SG[name][0] += line.rstrip()
return SG
except IOError:
print "File does not exit!"
def is_number(s):
try:
int(s)
return True
except ValueError:
return False
def valid_chromosome(seq_header):
chrom = seq_header.split()[2].split(":")[2]
names = ["GL", # Callithrix
"ACF", # Callithrix
"AQ", # Chlorocebus
"KE", # Chlorocebus
"unplaced", # Gorilla
"cutchr", # Gorilla
"scaffold", # Microcebus, Tarsius
"GeneScaffold", # Microcebus, Tarsius
"ADF", # Nomascus
"AAQ", # Otolemur
"AAC", # Pan
"AHZ", # Papio
"JH", # Papio
"_random", # Pongo
"X",
"Y",
"A",
"a",
"b",
"B",
"MT"
]
if is_number(chrom):
return True
for n in names:
if chrom.startswith(n):
return True
# For Pongo
elif chrom.endswith(n):
return True
return False
def find_isoforms(genome):
isoforms = {}
for seq in genome:
if valid_chromosome(genome[seq][1]):
gname = genome[seq][1].split()[3].split(":")[-1]
if gname not in isoforms:
isoforms[gname] = [(seq, len(genome[seq][0]))]
else:
isoforms[gname].append((seq, len(genome[seq][0])))
return isoforms
def write_longest_isoform(isoforms, genome, fname):
with open(fname, "w") as outfile:
for seq in isoforms:
if len(isoforms[seq]) > 1:
m = max(zip(*isoforms[seq])[1])
longest = [p[0] for p in isoforms[seq] if p[1] is m][0]
header = ">" + genome[longest][1]+"\n"
outfile.write(header)
outfile.write(genome[longest][0]+"\n")
else:
longest = isoforms[seq][0][0]
header = ">" + genome[longest][1]+"\n"
outfile.write(header)
outfile.write(genome[longest][0]+"\n")
def list_files(current_dir):
file_list = []
for path, subdirs, files in os.walk(current_dir): # Walk directory tree
for name in files:
f = os.path.join(path, name)
file_list.append(f)
return file_list
#==============================================================================
def main():
if not os.path.isdir(args.outdir):
os.mkdir(args.outdir)
if len(args.genomes) == 1:
if os.path.isdir(args.genomes[0]):
infiles = list_files(args.genomes[0])
else:
infiles = args.genomes
else:
infiles = args.genomes
for genome in [f for f in infiles if f.endswith(".fa")]:
print genome
fname = ".".join(genome.split("/")[-1].split(".")[:-1]+["longest", "fa"])
genome = read_fasta(genome)
isoforms = find_isoforms(genome)
write_longest_isoform(isoforms, genome, args.outdir+"/"+fname)
if __name__ == "__main__":
main()