-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathMIP_1.py
178 lines (161 loc) · 6.09 KB
/
MIP_1.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
# -*- coding: utf-8 -*-
"""
@author: Sakshi
"""
from Bio.SeqUtils import GC
from Bio import SeqIO
from Bio.SeqUtils import MeltingTemp as mt
from Bio.Seq import Seq
import re
import pandas as pd
import os
import sys
import configparser
fname = sys.argv[1]
#read in the length of the target region and the length of the extension and ligation arms
config = configparser.ConfigParser()
config.read_file(open(r'config.txt'))
rep = config.get('My Section', 'Number of repeats',fallback='No Value Entered')
armlengthel = config.get('My Section', 'length of MIP ext and lig arm',fallback='No Value Entered')
targetlen = config.get('My Section', 'length of target region',fallback='No Value Entered')
alllen=int(targetlen)+int(armlengthel)+int(armlengthel)
#check definiton line for organism from excel file Organism Dictionary
#following this if organism not found ask for user input to identify the Organism
#full scientific name of the organism should be entered. Ex. Streptomyces lavendulae
df=pd.read_excel('Organism Dictionary.xlsx', index_col=None)
org=df['Organism'].tolist()
accid=[]
seqs=[]
#Read in the sequences and definition lines from the FASTA file
with open(fname+".fasta") as fh:
for line in fh:
if line.startswith(">"):
accid.append(line)
fasta_sequences = SeqIO.parse(open(fname+".fasta"),'fasta')
for fasta in fasta_sequences:
name, sequence = fasta.id, str(fasta.seq)
seqs.append(sequence)
bad_chars = [';', ':', '!', "*", '_']
cnt=0
cntno=0
org_notfound=[]
organism=[]
#Search for the organism against the dictionary file
for i in accid:
for a in bad_chars:
i=i.replace(a," ")
#check for any special characters and remove them
if any(o in i for o in org):
matched = [o for o in org if o in i]
organism.append(matched[0])
cnt+=1
else:
cntno+=1
org_notfound.append(i)
if cntno != 0:
for i in org_notfound:
print("An organism could not be detected in the following definition line:")
print(i)
print("Please enter the full scientific name of the organism:")
org1=input()
organism.append(org1)
arm1=[]
arm2=[]
v4seq=[]
temp=[]
gccontent=[]
temp1=[]
gccontent1=[]
stretch=[]
stretch1=[]
# v4 region is 576-682 numbering based on the E. coli system of nomenclature (Brosius et al., 1978)
target=[]
mseq=[]
revarm1=[]
revarm2=[]
revtarget=[]
accid2=[]
organism2=[]
extstart=[]
extend=[]
ligationstart=[]
ligationend=[]
#generate arms for both the forward and reverse strand
for s in seqs:
indi=seqs.index(s)
seqmain=s
acid=accid[indi]
org2=organism[indi]
out =[]
for i in range(len(s)):
for j in range(i + 1, len(s) + 1):
if (len(s[i:j]) == alllen):
out.append(s[i:j])
extstart.append(i)
extend.append(i+int(armlengthel))
ligationstart.append(j-int(armlengthel))
ligationend.append(j)
lenarm=int(armlengthel)
for g in out:
#EDIT HERE
organism2.append(org2)
accid2.append(acid)
mseq.append(seqmain)
a1=Seq(g[:lenarm])
a2=Seq(g[-lenarm:])
#EDIT HERE
arm1.append(a1.complement())
arm2.append(a2.reverse_complement())
target.append(g[lenarm:-lenarm])
srev2=a2.complement()
revarm1.append(a1)
revarm2.append(srev2.reverse_complement())
temprev=[]
gccontentrev=[]
stretch1rev=[]
temprev1=[]
gccontentrev1=[]
stretch1rev1=[]
#get temperature, gc content, and check for polyNs
for a in arm1:
myseq = a
inn=arm1.index(a)
temp.append('%0.2f' % mt.Tm_NN(myseq, nn_table=mt.DNA_NN2))
gccontent.append(GC(myseq))
a=str(a)
if re.search(r"G{rep,}", a) or re.search(r"A{rep,}", a) or re.search(r"C{rep,}", a) or re.search(r"T{rep,}", a):
stretch.append('yes')
else:
stretch.append('no')
for b in arm2:
myseq = b
temp1.append('%0.2f' % mt.Tm_NN(myseq, nn_table=mt.DNA_NN2))
gccontent1.append(GC(myseq))
b=str(b)
if re.search(r"G{rep,}", b) or re.search(r"A{rep,}", b) or re.search(r"C{rep,}", b) or re.search(r"T{rep,}", b):
stretch1.append('yes')
else:
stretch1.append('no')
for c in revarm1:
myseq = c
temprev1.append('%0.2f' % mt.Tm_NN(myseq, nn_table=mt.DNA_NN2))
gccontentrev1.append(GC(myseq))
c=str(c)
if re.search(r"G{rep,}", c) or re.search(r"A{rep,}", c) or re.search(r"C{rep,}", c) or re.search(r"T{rep,}", c):
stretch1rev1.append('yes')
else:
stretch1rev1.append('no')
for d in revarm2:
myseq = d
temprev.append('%0.2f' % mt.Tm_NN(myseq, nn_table=mt.DNA_NN2))
gccontentrev.append(GC(myseq))
d=str(d)
if re.search(r"G{rep,}", d) or re.search(r"A{rep,}", d) or re.search(r"C{rep,}", d) or re.search(r"T{rep,}", d):
stretch1rev.append('yes')
else:
stretch1rev.append('no')
for i in target:
revtarget.append(Seq(i).complement())
#output all MIPs possible for the FASTA file input by the user
df=pd.DataFrame({"Def Line":pd.Series(accid2), "Main Sequence":pd.Series(mseq),"Target region":pd.Series(target),"Ligation Arm":pd.Series(arm1),"TM 1":pd.Series(temp),"GC content 1":pd.Series(gccontent),"Continuous stretch":pd.Series(stretch),"Extension Arm":pd.Series(arm2),"TM 2":pd.Series(temp1),"GC content 2":pd.Series(gccontent1),"Continuous stretch 2":pd.Series(stretch1),"Reverse Strand Target region":pd.Series(revtarget),"Ligation Arm(Rev)":pd.Series(revarm1),"TM Rev":pd.Series(temprev1),"GC content Rev":pd.Series(gccontentrev1),"Continuous stretch Rev":pd.Series(stretch1rev1),"Extension Arm Rev":pd.Series(revarm2),"TM 2 Rev":pd.Series(temprev),"GC content 2 Rev":pd.Series(gccontentrev),"Continuous stretch 2 Rev":pd.Series(stretch1rev),"Extension Start":pd.Series(extstart),"Extension End":pd.Series(extend),"Ligation Start":pd.Series(ligationstart), "Ligation End":pd.Series(ligationend),"Organism":pd.Series(organism2)})
df.to_excel(fname+"_MIPs.xlsx", index=False)