-
-
Notifications
You must be signed in to change notification settings - Fork 11
/
Copy pathvector_strip.py
131 lines (111 loc) · 4.66 KB
/
vector_strip.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
# Copyright 2022 Forrest Sheng Bao forrestbao.github.io
# This script removes adapters in RNA-seqs
# To see usage, go to test() function at the end
import typing
import functools
import multiprocessing
import pandas
import funix
@funix.funix(disable=True)
def remove_3_prime_adapter_one(
sRNA: str = "AAGCTCAGGAGGGATAGCGCCTCGTATGCCGTCTTCTGCTT",
adapter_3_prime: str = "TCGTATGCCGTCTTCTGCTT",
minimal_match_length: int = 0,
verbose_level: int = 1,
):
"""Use reverse shift to match the sRNA with the 3' adapter
minimal_match_length: the sRNA needs at least that much matchness
with the 3' adapter to be considered valid.
verbose_level = 0, 1, 2. O means no details. 1: show final alignment per sequence. 2: show aligment at every step.
Example:
sRNA = AAGCTCAGGAGGGATAGCGCCTCGTATGCCGTCTTCTGCTT
adapter_3_prime = TCGTATGCCGTCTTCTGCTT
"""
return_code = -1 # 1: found. 0: match to short. -1: no match at all.
return_seq = "no match at all"
# check whether the 3' adapter is fully enclosed in sRNA
i = sRNA.find(adapter_3_prime)
if i > -1:
return_code, return_seq = 1, sRNA[:i]
print(sRNA, "\n", " " * (i - 1) + adapter_3_prime)
else:
for i in range(len(adapter_3_prime)):
sRNA_tail = sRNA[-1 * i :]
adapter_3_prime_head = adapter_3_prime[:i]
if verbose_level == 2:
print(sRNA, "\n", adapter_3_prime_head.rjust(len(sRNA) - 1))
if sRNA_tail == adapter_3_prime_head:
if i + 1 > minimal_match_length:
return_code = 1
return_seq = sRNA[: -1 * i]
if verbose_level == 1:
print(sRNA, "\n", adapter_3_prime_head.rjust(len(sRNA) - 1))
break
else:
return_code, return_seq = 0, f"match <{minimal_match_length} nts"
return return_code, return_seq
@funix.funix(disable=True)
def remove_3_prime_adapter_vectorized(
sRNAs: typing.List[str] = [
"AAGCTCAGGAGGGATAGCGCCTCGTATGCCGTCTTCTGC", # shorter than full 3' adapter
"AAGCTCAGGAGGGATAGCGCCTCGTATGCCGTCTTCTGCTT", # full 3' adapter
"AAGCTCAGGAGGGATAGCGCCTCGTATGCCGTCTTCTGCTTCTGAATTAATT", # additional seq after 3' adapter,
"AAGCTCAGGAGGGATAGCGCCTCGTATG", # <8 nt io 3' adapter
"AAGCTCAGGAGGGATAGCGCCGTATG", # no match at all
],
adapter_3_prime: str = "TCGTATGCCGTCTTCTGCTT",
minimal_match_length: int = 8,
verbose_level: int = 1,
):
# print (adapter_3_prime)
func_remove_3_prime_adapter_partial = functools.partial(
remove_3_prime_adapter_one,
adapter_3_prime=adapter_3_prime,
minimal_match_length=minimal_match_length,
verbose_level=1,
)
# with multiprocessing.Pool() as pool:
# result = pool.map(func_remove_3_prime_adapter_partial, sRNAs)
# BUG: Not a priority. Funix cannot work with multiprocessing. The two lines above will throw an error. But there is no problem to run this script locally.
# Comment: I won't fix multiprocessing issue now.
result = list(map(func_remove_3_prime_adapter_partial, sRNAs))
returns = list(zip(*result))
# print (returns)
[return_codes, return_seqs] = returns
return return_codes, return_seqs
@funix.funix(disable=True)
def test():
return_codes, return_seqs = remove_3_prime_adapter_vectorized() # all default value
for code, seq in zip(return_codes, return_seqs):
print(code, ":", seq)
@funix.funix(
description="Remove 3' prime adapter from the end of an RNA-seq",
)
def remove_3_prime_adapter(
# adapter_3_prime: str="TCGTATGCCGTCTTCTGCTT",
adapter_3_prime: str = "TCGTA",
minimal_match_length: int = 8,
sRNAs: pandas.DataFrame = pandas.DataFrame(
{
"sRNAs": [
"AAGCTCAGGAGGGATAGCGCCTCGTATGCCGTCTTCTGC", # shorter than full 3' adapter
"AAGCTCAGGAGGGATAGCGCCTCGTATGCCGTCTTCTGCTT", # full 3' adapter
# additional seq after 3' adapter,
"AAGCTCAGGAGGGATAGCGCCTCGTATGCCGTCTTCTGCTTCTGAATTAATT",
"AAGCTCAGGAGGGATAGCGCCTCGTATG", # <8 nt io 3' adapter
"AAGCTCAGGAGGGATAGCGCCGTATG", # no match at all
]
}
),
# ) -> pandera.typing.DataFrame[OutputSchema]:
) -> pandas.DataFrame:
return_codes, return_seqs = remove_3_prime_adapter_vectorized(
sRNAs=sRNAs["sRNAs"].tolist(),
adapter_3_prime=adapter_3_prime,
minimal_match_length=minimal_match_length,
)
return pandas.DataFrame(
{"original sRNA": sRNAs["sRNAs"], "adapter removed": list(return_seqs)}
)
if __name__ == "__main__":
test()