-
Notifications
You must be signed in to change notification settings - Fork 11
/
ngs.sh
executable file
·319 lines (264 loc) · 9.84 KB
/
ngs.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
#!/bin/bash
# Copyright (c) 2012-2014 Stephen Fisher and Junhyong Kim, University of
# Pennsylvania. All Rights Reserved.
#
# You may not use this file except in compliance with the Kim Lab License
# located at
#
# http://kim.bio.upenn.edu/software/LICENSE
#
# Unless required by applicable law or agreed to in writing, this
# software is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
# CONDITIONS OF ANY KIND, either express or implied. See the License
# for the specific language governing permissions and limitations
# under the License.
##########################################################################################
# Adding new processing step:
# 1. create a shell script containing the following elements ("COMMAND" should be a unique command name)
# section: SOURCE MODULES HERE
#
# 2. ngsUsage_COMMAND -- one line command usage and description
# section: ADD MODULE USAGE HERE
#
# 3. ngsHelp_COMMAND -- expanded command help
# section: ADD MODULE HELP HERE
#
# 4. ngsArgs_COMMAND() -- function for processing command line arguments
# section: ADD MODULE ARGUMENT FUNCTION HERE
#
# 5. ngsCmd_COMMAND() -- function that performs command operation
# section: ADD MODULE COMMAND FUNCTIONS HERE
##########################################################################################
# Pipeline version. prnVersion() writes this value as the first column
# of every module's version file.
VERSION=2.2
# each module should output a tab-delimited list of file and program
# version information. This file should have two lines, the first line
# being header information and the second line being the versions. The
# prnVersion() command should be used to generate this file. This file
# will live in the respective module subdirectory. The value below is
# only a suffix: once the sample is known it is prefixed with the
# sample name, so the file ends up at $SAMPLE/MODULE/$SAMPLE.versions
VERSION_FILE="versions"
# When DEBUG is true a "# DEBUG MODE" line is written to the journal
# before the command line is logged.
DEBUG=false # disable commands when true, use to see what commands would be run.
# output every line of code to the console when running
#if "$DEBUG"; then set -x; fi
###############################################################################################
# ****************************** BEGIN USER DEFINED VARIABLES *********************************
# this is the location of the demultiplexed files, with each sample in
# a separate subdirectory named with the sample ID. This is often a
# symbolic link to the appropriate raw directory in the data
# repository.
RAW=raw
# this is the place where analyzed data will be stored. Each sample
# will be put into a separate subdirectory. This is often a symbolic
# link to the appropriate analysis directory in the data repository.
ANALYZED=analyzed
# location of genomic databases and library files. Every *_REPO
# variable below is a subdirectory of REPO_LOCATION.
REPO_LOCATION=/lab/repo/resources
BOWTIE_REPO=$REPO_LOCATION/bowtie
RUM_REPO=$REPO_LOCATION/rum2
STAR_REPO=$REPO_LOCATION/star
HTSEQ_REPO=$REPO_LOCATION/htseq
SNP_REPO=$REPO_LOCATION/snp
# NOTE(review): this name is lower-case, unlike the other *_REPO
# variables. It is presumably referenced by the VERSE module under this
# exact spelling -- confirm before renaming.
verse_REPO=$REPO_LOCATION/verse
# list of all modules available to be run. Each entry NAME is loaded by
# sourcing the file ngs_NAME.sh.
MODULES=( "HELP" "INIT" "FASTQC" "BLAST" "RMDUP" "BOWTIE" "TRIM" "STAR" "RUM" "RUMSTATUS" "POST" "BLASTDB" "HTSEQ" "VERSE" "SNP" "SPADES" "BARCODE" "RSYNC" "STATS" "PIPELINE" "VERSION" )
# ****************************** END USER DEFINED VARIABLES ***********************************
###############################################################################################
###############################################################################################
# ****************************** GLOBAL DEFINED VARIABLES AND OPTIONS *************************
# when modules are loaded, they add their usage to this variable.
NGS_USAGE=""
# single-end flag. The default (false) is paired-end.
SE=false
# species is used to determine what library files to use.
SPECIES=""
# number of cpu/core to use in multi-processing modules
NUMCPU=""
# name of sample. This starts out empty and must be non-empty once the
# module's argument function has run, otherwise the script aborts with
# "Error processing command arguments."
SAMPLE=""
# read length. If paired end then this is the length of one mate
# (default = 100). This is used in BLAST, STAR and PIPELINE.
READ_LENGTH="100"
# cause the application to crash if any command generates an
# error. This is equivalent to the "-e" flag.
set -o errexit
# cause an error to happen if trying to use an unset variable. We
# don't use the -u option when launching bash above as that may cause
# bash initialization scripts to error.
set -o nounset
# make comparisons case insensitive. Need to use [[ and ]] in if
# conditionals, rather than [ and ]. Note that this option also makes
# "case" pattern matching case insensitive.
shopt -s nocasematch
# get OS name.
# Record the OS name and pick the grep command used for regular
# expression searches. grep on the Mac (and likely BSD) has no "-P"
# ("perl-regexp") option, so "egrep" is used there. Every other OS is
# assumed to be Linux (RedHat / Centos), where "grep -P" is used at
# various places. Other OS versions could be added here.
OS_VERSION=$(uname)
if [[ ${OS_VERSION} == Darwin ]]; then
    GREPP="egrep"
else
    GREPP="grep -P"
fi
# ****************************** END GLOBAL DEFINED VARIABLES AND OPTIONS *********************
###############################################################################################
# @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
# LOAD MODULES
# Pull in every module listed in MODULES. Each module lives in its own
# file named ngs_<MODULE>.sh and is read into the current shell.
for module in "${MODULES[@]}"; do
    . "ngs_${module}.sh"
done
# @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
###############################################################################################
# HELPER FUNCTIONS
# Print the expanded help text for one module, then exit.
#
# Globals:   MODULES (read), NGS_USAGE (read)
# Arguments: $1 - module name (optional). It is compared against the
#                 entries of MODULES; when nocasematch is enabled the
#                 comparison is case insensitive.
# Outputs:   the output of ngsHelp_MODULE when the module is known,
#            otherwise the general usage text ($NGS_USAGE).
# Returns:   never returns; the script always exits with status 1.
printHelp() {
    # tolerate a missing argument under "set -o nounset"
    local requested="${1:-}"
    local module

    for module in "${MODULES[@]}"; do
        if [[ "$module" == "$requested" ]]; then
            # Call the help function through the name stored in MODULES
            # rather than the caller's spelling: the match above may be
            # case insensitive, but function names are case sensitive.
            "ngsHelp_${module}"
            exit 1
        fi
    done

    # unknown (or missing) module: fall back to the general usage
    echo -e "$NGS_USAGE"
    exit 1
}
# Output version information to the module subdirectory. We assume the
# output directory already exists ($SAMPLE/MODULE).
#
# Globals:   SAMPLE (read), VERSION_FILE (read), VERSION (read)
# Arguments: $1 - module (name of the subdirectory under $SAMPLE)
#            $2 - header: tab-delimited column names ("\t" escapes are
#                 expanded)
#            $3 - values: tab-delimited versions, one per header column
# Outputs:   writes a two line file (header, values) to
#            $SAMPLE/$1/$VERSION_FILE, with the pipeline version
#            prepended as the first column.
# Returns:   exits via prnError() unless exactly 3 arguments are given.
prnVersion() {
    if [[ $# -ne 3 ]]; then
        prnError "prnVersion() requires 3 arguments. Received $#."
    fi

    # we can't rely on COMMAND to know the module calling this
    # function since COMMAND might be pipeline.
    local outFile="$SAMPLE/$1/$VERSION_FILE"

    # We intentionally write over the previous file, if it exists. The
    # expansions are quoted so that spaces and glob characters in the
    # path or in the version strings are written out unchanged.
    {
        # prepend the pipeline column to both lines
        printf '%b\n' "pipeline\t$2"
        printf '%b\n' "$VERSION\t$3"
    } > "$outFile"
}
# This uniformly formats the output that is put into the JOURNAL file.
# Every journal line is "<date><TAB><text>". The following bash command
# can be used to strip off the time stamp and generate an executable
# bash file:
# cat analysis.log | awk -F\\t '{print \$2}'
#
# Globals:   JOURNAL (read; the file is appended to), SAMPLE (read)
# Arguments: $1 - text to log, typically the command about to be run.
#                 Backslash escapes in the text are expanded when it is
#                 written to the journal.
# Outputs:   appends to $JOURNAL. When the text contains "# BEGIN:" or
#            "# FINISHED" a banner with the text, date and sample is
#            also printed to the console and a separator line is added
#            to the journal.
prnCmd() {
    # tolerate a missing argument under "set -o nounset"
    local msg="${1:-}"
    local rule="##################################################################"

    if [[ "$msg" == *"# BEGIN:"* ]]; then
        # copy to console
        printf '\n%s\n' "$rule"
        printf '%s\n' "$msg"
        printf '%s %s\n' "$(date)" "$SAMPLE"
        printf '%s\n' "$rule"

        printf '%s\t%s\n' "$(date)" "$rule" >> "$JOURNAL"
    fi

    # The text is quoted so that it is logged exactly as given: no glob
    # expansion and no collapsing of whitespace. It is printed by its
    # own printf so that a "\c" escape cannot swallow the newline.
    {
        printf '%s\t' "$(date)"
        printf '%b' "$msg"
        printf '\n'
    } >> "$JOURNAL"

    if [[ "$msg" == *"# FINISHED"* ]]; then
        # separator followed by an extra blank line between sections
        printf '%s\t%s\n\n' "$(date)" "$rule" >> "$JOURNAL"

        # copy to console
        printf '\n%s\n' "$rule"
        printf '%s\n' "$msg"
        printf '%s %s\n' "$(date)" "$SAMPLE"
        printf '%s\n' "$rule"
    fi
}
# Print a warning banner to the console (stderr) and carry on.
#
# Arguments: $1 - warning text; backslash escapes are expanded.
# Outputs:   a blank line, a separator, "WARNING: <date>" and the text,
#            all written to stderr.
prnWarning() {
    {
        printf '\n%s\n' "************************************************"
        printf 'WARNING: %s\n' "$(date)"
        # quoted so the message is not word-split or glob-expanded
        printf '%b\n' "${1:-}"
    } >&2
}
# Print an error banner to the console (stderr) and abort the script.
#
# Arguments: $1 - error text; backslash escapes are expanded.
# Outputs:   a blank line, a separator, "ERROR: <date>" and the text,
#            all written to stderr.
# Returns:   never returns; exits with status 1.
prnError() {
    {
        printf '\n%s\n' "************************************************"
        printf 'ERROR: %s\n' "$(date)"
        # quoted so the message is not word-split or glob-expanded
        printf '%b\n' "${1:-}"
    } >&2
    exit 1
}
###############################################################################################
# PROCESS COMMAND ARGUMENTS
# if no args then print out usage
if [[ $# -lt 1 ]]; then
    echo -e "$NGS_USAGE"
    exit 0
fi

# get command as upper case. The module functions are named after the
# upper case command (ngsArgs_COMMAND, ngsCmd_COMMAND).
COMMAND=$( printf '%s' "$1" | tr 'a-z' 'A-Z' )

shift  # shift removes $1 (ie COMMAND) from the argument list

# @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
# PROCESS MODULE'S ARGUMENT FUNCTION
# Fail with the usage text, rather than a bare "command not found",
# when the command does not correspond to a loaded module.
if ! declare -F "ngsArgs_${COMMAND}" > /dev/null; then
    echo -e "$NGS_USAGE"
    prnError "Unknown command: ${COMMAND}"
fi

# "$@" is quoted so that each argument reaches the module exactly as
# the user typed it (arguments containing spaces stay in one piece).
"ngsArgs_${COMMAND}" "$@"
# @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
# if we've gotten to this point and $SAMPLE is not set, then something
# went wrong and abort
# The module's argument function is expected to have set SAMPLE. The
# usage text is quoted so that it is printed as written (no word
# splitting or glob expansion).
if [[ -z "$SAMPLE" ]]; then
    printf '\n%s\n' "************************************************"
    printf 'ERROR: %s\n' "$(date)"
    echo "Error processing command arguments."
    echo -e "$NGS_USAGE"
    exit 1
fi
# remove trailing "/" from $SAMPLE, if present
SAMPLE="${SAMPLE%/}"

# adjust the name of the VERSION file, now that we know the SAMPLE
VERSION_FILE="$SAMPLE.${VERSION_FILE}"

# create output directory. STATS only reports on an existing sample, so
# it must never create one. All paths are quoted so that sample names
# containing spaces work.
if [[ ! -d "$SAMPLE" ]]; then
    if [[ "$COMMAND" == "STATS" ]]; then
        prnError "Cannot stat a sample that doesn't exist"
    fi
    mkdir "$SAMPLE"
fi

# create log directory. This needs to happen prior to using the
# prnCmd() function, so we can create the output file ($JOURNAL) that
# is used by prnCmd. This happens even during debugging because this
# directory is where the $JOURNAL file is located by default.
if [[ ! -d "$SAMPLE/log" ]]; then
    mkdir "$SAMPLE/log"
fi
# set the location of the JOURNAL file, now that we know the SAMPLE
# and COMMAND. We use a date-time stamp plus the module name. The file
# will be located in the $SAMPLE/log directory. We don't use a ":" in
# the timestamp because Mac file systems don't allow colons in file
# names.
# Default journal name: minute resolution time stamp plus module name.
JOURNAL="$SAMPLE/log/$(date +%Y-%m-%d_%H-%M).$COMMAND.log"

# If journal file already exists, then regenerate the filename using
# seconds. The path is quoted so the test also works when the sample
# name contains spaces.
if [[ -f "$JOURNAL" ]]; then
    JOURNAL="$SAMPLE/log/$(date +%Y-%m-%d_%H-%M-%S).$COMMAND.log"
fi
# STATS shouldn't write anything to the log file
# COMMAND was converted to upper case above, so compare against the
# upper case name (as the output directory check does) instead of
# depending on nocasematch.
if [[ "$COMMAND" != "STATS" ]]; then
    # log version and run-time information
    if [[ "$DEBUG" == true ]]; then prnCmd "# DEBUG MODE"; fi

    # script name without its directory
    _cmd="${0##*/}"
    # "$*" joins the remaining arguments with single spaces without
    # glob-expanding them, so the journal shows what the user typed.
    _args="$*"
    prnCmd "# COMMAND: $_cmd $COMMAND $_args"
fi

###############################################################################################
# RUN COMMANDS
# @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
# RUN MODULE'S COMMAND FUNCTION
"ngsCmd_${COMMAND}"
# @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@