-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathGet_SexSpef_Kmers.sh
147 lines (103 loc) · 3.29 KB
/
Get_SexSpef_Kmers.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
#!/bin/bash
#SBATCH -t 5:0:0
#SBATCH -N 1
#SBATCH -n 1
#SBATCH --mem=120gb
#SBATCH --share
#SBATCH --mail-user=
#SBATCH --mail-type=ALL
#
# Slurm batch job that drives sex_specific_kmers.py over a pair of Jellyfish
# dump files (one female, one male).
# NOTE(review): --mail-user above is empty; fill in an address (or drop the two
# --mail-* directives) before submitting.

# Load the login environment, then make python available through the module system.
. ~/.profile
module load python

# This script calls and uses sex_specific_kmers.py
# Assign each Jellyfish dump output file to a variable here.
# Every later step refers to these two variables, so the script can be reused
# without replacing file names throughout.
FEMALE=
MALE=

# Fail fast when the placeholders above were not filled in. With an empty value
# the sort step would receive no input file and read from stdin instead, and
# every later step would operate on files named ".sorted", ".pt1", and so on.
if [[ -z "${FEMALE}" || -z "${MALE}" ]]; then
  printf 'Error: set FEMALE and MALE to the Jellyfish dump files before running this script\n' >&2
  exit 1
fi
for dump_file in "${FEMALE}" "${MALE}"; do
  if [[ ! -f "${dump_file}" || ! -r "${dump_file}" ]]; then
    printf 'Error: dump file is missing or unreadable: %s\n' "${dump_file}" >&2
    exit 1
  fi
done
# Sort the dump files on their first column to reduce memory use in later steps.
# Dump files must be in column format, not fasta.
# sort_tmp_dir is the directory sort uses for its temporary files;
# "file_location" is a placeholder - replace it with a real scratch directory.
# NOTE(review): --parallel=8 asks sort for 8 threads while the job header
# requests a single task (-n 1) - confirm the allocation really provides 8 cores.
# NOTE(review): prefixing sort with LC_ALL=C would give byte-order collation and
# is usually faster - confirm the ordering sex_specific_kmers.py expects first.
sort_tmp_dir=file_location
for dump_file in "${FEMALE}" "${MALE}"; do
  # Stop immediately on failure so later steps never run on a missing or
  # truncated .sorted file.
  sort -T "${sort_tmp_dir}" --parallel=8 -k1,1 \
    -o "${dump_file}.sorted" -- "${dump_file}" || {
    printf 'Error: sorting %s failed\n' "${dump_file}" >&2
    exit 1
  }
done
# Split files into chunks < 4Gb to avoid exceeding the memory limit.
# This depends on your available computational power; you may not need to split.
# For very large files, divide into more pieces: add suffixes to chunk_suffixes
# (split names its outputs aa, ab, ac, ... in order) and extend the comparison
# and merge steps further down to cover the extra .ptN files.
# The chunk count must be given as -n l/N rather than -n N: the l/N form only
# cuts at line boundaries, whereas plain N cuts by size and can break a line in
# two, which messes the files up and throws a line error.
chunk_suffixes=(aa ab ac ad)

# Split one sorted dump into ${#chunk_suffixes[@]} line-aligned pieces and
# rename them to <stem>.pt1, <stem>.pt2, ...
#   $1 - sorted dump file to split
#   $2 - output prefix handed to split (pieces are created as <prefix>aa, ...)
#   $3 - stem for the renamed pieces
# Returns non-zero as soon as split or any rename fails.
split_and_rename() {
  local sorted_file=$1 prefix=$2 stem=$3
  local suffix part=0
  split -n "l/${#chunk_suffixes[@]}" -- "${sorted_file}" "${prefix}" || return 1
  for suffix in "${chunk_suffixes[@]}"; do
    part=$((part + 1))
    mv -- "${prefix}${suffix}" "${stem}.pt${part}" || return 1
  done
}

split_and_rename "${MALE}.sorted" Male "${MALE}" || {
  printf 'Error: splitting %s failed\n' "${MALE}.sorted" >&2
  exit 1
}
split_and_rename "${FEMALE}.sorted" Female "${FEMALE}" || {
  printf 'Error: splitting %s failed\n' "${FEMALE}.sorted" >&2
  exit 1
}
# Now that the dump files are in reasonably sized pieces, run the script to
# compare male and female kmers in each subset (piece N of the male file
# against piece N of the female file).
for part in 1 2 3 4; do
  # Abort on the first failed comparison: continuing would merge incomplete
  # results, and the cleanup at the end would then delete the inputs needed
  # to rerun this step.
  python sex_specific_kmers.py \
    -m "${MALE}.pt${part}" -f "${FEMALE}.pt${part}" \
    -M "${MALE}_spefpt${part}" -F "${FEMALE}_spefpt${part}" || {
    printf 'Error: sex_specific_kmers.py failed on part %s\n' "${part}" >&2
    exit 1
  }
done
# Merge together all of the resulting files for males and all the files for females.
for stem in "${MALE}" "${FEMALE}"; do
  cat -- "${stem}_spefpt1" "${stem}_spefpt2" "${stem}_spefpt3" "${stem}_spefpt4" \
    > "${stem}_compiled" || {
    printf 'Error: could not build %s\n' "${stem}_compiled" >&2
    exit 1
  }
done

# Run the python script a final time on the merged files.
# This ensures that any matches that might have been missed because of the
# splits get captured and removed.
# Exit here on failure so the intermediate files are still on disk for a rerun
# instead of being deleted by the cleanup that follows.
python sex_specific_kmers.py \
  -m "${MALE}_compiled" -f "${FEMALE}_compiled" \
  -M "${MALE}_FINAL" -F "${FEMALE}_FINAL" || {
  printf 'Error: final sex_specific_kmers.py pass failed; intermediate files kept\n' >&2
  exit 1
}
# Final output should be a male file and a female file, each with two columns -
# the sex-unique kmer and its frequency.
# Clean up all the intermediate files.
# NOTE(review): the <dump>.sorted files are not removed here - confirm whether
# they should be kept.

# Delete the split pieces (<stem>.ptN), the per-piece results (<stem>_spefptN)
# and the merged file (<stem>_compiled) for every stem given as an argument.
# Parts 1-10 are covered so that runs split into more than four pieces are
# cleaned up too; rm -f keeps the parts that were never created from raising
# "No such file" errors.
# Returns non-zero if any removal fails.
remove_intermediates() {
  local stem part status=0
  for stem in "$@"; do
    # An empty stem would target ".pt1", "_compiled", ... in the current
    # directory; skip it rather than delete unrelated files.
    [[ -n "${stem}" ]] || continue
    for part in {1..10}; do
      rm -f -- "${stem}.pt${part}" "${stem}_spefpt${part}" || status=1
    done
    rm -f -- "${stem}_compiled" || status=1
  done
  return "${status}"
}

remove_intermediates "${MALE}" "${FEMALE}"