-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathdeletion_create_input.sh
executable file
·121 lines (83 loc) · 5.2 KB
/
deletion_create_input.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
#! /bin/bash
# usage deletion_create_input.sh RM_insertions_TSD_strands
source ./parameterfile_Ref.init
# source ~/CorrectHet/reGenotypeTE_del/reGenotypeTE/parameterfile_Ref.init
# find the 1000 bp up and downstream of the TE
# change delimiter (IFS) to new line.
IFS_BAK=$IFS
IFS=$'\n'
insertions=$(cat $1)
for locus in $insertions
do
loc=$(echo "$locus" | awk '{print $1}')
left=$(echo "$locus" | awk '{print $2}')
right=$(echo "$locus" | awk '{print $3}')
TSD=$(echo "$locus" | awk '{print $4}')
strand=$(echo "$locus" | awk '{print $5}')
# echo "$locus"
# echo "$loc"
# echo "$left"
# echo "$right"
# echo "$TSD"
if [[ $TSD == "noTSDs" ]]
then
left_TE_del=$(sed 's/:/\t/g;s/-/\t/g' <(echo "$left") | awk '{print $1,($2-500),($2-1)}')
right_TE_del=$(sed 's/:/\t/g;s/-/\t/g' <(echo "$right") | awk '{print $1,$3,($3+500)}')
TE_loc=$(sed 's/:/\t/g;s/-/\t/g' <(echo "$left") | awk '{print $1,($2-500),($3+500)}') # the left and right TSD coordinates in case of "noTSDs" are actually the start and stop of the TE of the input file
# echo "$left_TE_del"
# echo "$right_TE_del"
# echo "$TE_loc"
# extract the sequence of them | change the refgenome path by a user input
leftsq=$($BEDTOOLS getfasta -fi $GENOME -bed <(echo "$left_TE_del" | sed 's/ /\t/g'))
rightsq=$($BEDTOOLS getfasta -fi $GENOME -bed <(echo "$right_TE_del" | sed 's/ /\t/g'))
noTE=$(paste -d "," <(echo "$leftsq") <(echo "$rightsq") | awk 'getline seq {print seq}' | sed $'s/,/\t/g')
TE=$($BEDTOOLS getfasta -fi $GENOME -bed <(echo "$TE_loc" | sed 's/ /\t/g') | awk 'getline seq {print seq}')
# generate the table
paste <(echo "$left-$right" | sed 's/-/\t/g' | awk '{print $1"-"$4}') <(echo "$left" | sed 's/:/\t/g;s/-/\t/g' | awk '{print $1"\t"$2}') <(echo ".") <(echo "$left" | sed 's/:/\t/g;s/-/\t/g' | awk '{print ($2-500)}') <(echo "$right" | sed 's/:/\t/g;s/-/\t/g' | awk '{print ($3+500)}') <(echo "$TE") <(echo "$noTE")
elif [[ $TSD == *.* ]] # if TSDs have mismatches, will keep the 5' one for the non TE
then
left_TE_del=$(sed 's/:/\t/g;s/-/\t/g' <(echo "$left") | awk '{print $1,($2-500),($2-1)}')
right_TE_del=$(sed 's/:/\t/g;s/-/\t/g' <(echo "$right") | awk '{print $1,$3,($3+500)}')
TE_loc=$(sed 's/:/\t/g;s/-/\t/g' <(echo "$left") | awk '{print $1,($2-500),($3+500)}') # the left and right TSD coordinates in case of "noTSDs" are actually the start and stop of the TE of the input file
# echo "TWO TSD !!!!!!!!!!"
# echo "$left_TE_del"
# echo "$right_TE_del"
# echo "$TE_loc"
# extract the sequence of them | change the refgenome path by a user input
leftsq=$($BEDTOOLS getfasta -fi $GENOME -bed <(echo "$left_TE_del" | sed 's/ /\t/g'))
rightsq=$($BEDTOOLS getfasta -fi $GENOME -bed <(echo "$right_TE_del" | sed 's/ /\t/g'))
noTE=$(paste -d "," <(echo "$leftsq") <(echo "$rightsq") | awk 'getline seq {print seq}' | sed $'s/,/\t/g')
leftTSD=$(echo "$TSD" | sed 's/\./\t/g' | awk '{print $1}')
rightTSD=$(echo "$TSD" | sed 's/\./\t/g' | awk '{print $2}')
if [[ $strand == "+" ]] # keep left if + ; keep right if -
then # keep left TSD
noTE=$(paste -d "," <(echo "$leftsq" | awk 'getline seq {print seq}') <(echo "$leftTSD") <(echo "$rightsq" | awk 'getline seq {print seq}') | sed $'s/,//g')
else # keep right TSD
noTE=$(paste -d "," <(echo "$leftsq" | awk 'getline seq {print seq}') <(echo "$rightTSD") <(echo "$rightsq" | awk 'getline seq {print seq}') | sed $'s/,//g')
fi
TE=$($BEDTOOLS getfasta -fi $GENOME -bed <(echo "$TE_loc" | sed 's/ /\t/g') | awk 'getline seq {print seq}')
# generate the table
paste <(echo "$left-$right" | sed 's/-/\t/g' | awk '{print $1"-"$4}') <(echo "$left" | sed 's/:/\t/g;s/-/\t/g' | awk '{print $1"\t"$2}') <(echo ".") <(echo "$left" | sed 's/:/\t/g;s/-/\t/g' | awk '{print ($2-500)}') <(echo "$right" | sed 's/:/\t/g;s/-/\t/g' | awk '{print ($3+500)}') <(echo "$TE") <(echo "$noTE")
else
left_TE_del=$(sed 's/:/\t/g;s/-/\t/g' <(echo "$left") | awk '{print $1,($2-500),($2-1)}')
right_TE_del=$(sed 's/:/\t/g;s/-/\t/g' <(echo "$right") | awk '{print $1,$3,($3+500)}')
TE_loc=$(paste -d "," <(sed 's/:/\t/g;s/-/\t/g' <(echo "$left") | awk '{print $1,($2-500)}') <(sed 's/:/\t/g;s/-/\t/g' <(echo "$right") | awk '{print ($3+500)}') | sed $'s/,/\t/g')
# echo "$left_TE_del"
# echo "$right_TE_del"
# echo "$TE_loc"
# extract the sequence of them | change the refgenome path by a user input
leftsq=$($BEDTOOLS getfasta -fi $GENOME -bed <(echo "$left_TE_del" | sed 's/ /\t/g'))
rightsq=$($BEDTOOLS getfasta -fi $GENOME -bed <(echo "$right_TE_del" | sed 's/ /\t/g'))
noTE=$(paste -d "," <(echo "$leftsq" | awk 'getline seq {print seq}') <(echo "$TSD") <(echo "$rightsq" | awk 'getline seq {print seq}') | sed $'s/,//g')
TE=$($BEDTOOLS getfasta -fi $GENOME -bed <(echo "$TE_loc" | sed 's/ /\t/g') | awk 'getline seq {print seq}')
#echo "$leftsq"
#echo "$rightsq"
#echo "$TE"
#echo "$noTE"
# generate the table
paste <(echo "$left-$right" | sed 's/-/\t/g' | awk '{print $1"-"$4}') <(echo "$left" | sed 's/:/\t/g;s/-/\t/g' | awk '{print $1"\t"$2}') <(echo ".") <(echo "$left" | sed 's/:/\t/g;s/-/\t/g' | awk '{print ($2-500)}') <(echo "$right" | sed 's/:/\t/g;s/-/\t/g' | awk '{print ($3+500)}') <(echo "$TE") <(echo "$noTE")
fi
done
# return delimiter to previous value
IFS=$IFSq_BAK
IFS_BAK=