-
Notifications
You must be signed in to change notification settings - Fork 11
/
trimReads.py
executable file
·1405 lines (1182 loc) · 56.7 KB
/
trimReads.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
#!/usr/bin/env python
# Copyright (c) 2013, Stephen Fisher and Junhyong Kim, University of
# Pennsylvania. All Rights Reserved.
#
# You may not use this file except in compliance with the Kim Lab License
# located at
#
# http://kim.bio.upenn.edu/software/LICENSE
#
# Unless required by applicable law or agreed to in writing, this
# software is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
# CONDITIONS OF ANY KIND, either express or implied. See the License
# for the specific language governing permissions and limitations
# under the License.
"""
by: S. Fisher, 2013
J. Shallcross, 2017
usage: trimReads.py [-h] [-t THREADS] [-C CHUNK_SIZE] [-v] [-p] [-m MIN_LEN] [-c3 NUM_CUT_3] [-c5 NUM_CUT_5] [-q PHRED] [-rN] [-rAT REMOVE_AT] [-c CONTAMINANTS_FA] -f FIRST_FQ [-r SECOND_FQ] -o OUTPUT_PREFIX
Contaminants file:
1. Sequences must be on a single line (ie can't contain line breaks)
2. No blank lines in file.
3. Sequence header is space delimited list of options. Option names and values should be separated by a colon.
Ex "> name:oligo size:10 windows:5"
4. Options:
name: this option is required for every sequence
method: there are three trimming methods (0 = full contaminant, 1 = mapped contaminant, 2 = identity based). (default 2)
size: size of k-mer (default 7)
windows: how many k-mers to seek. can not be larger than (contaminant length - k-mer). (default 6)
percentIdentity: percent identity threshold for trimming method 2 (0.0 < percentIdentity <= 1.0). (default 0.9)
totalIdentity: total identity threshold for trimming method 2. If this is less than the k-mer size then it will have no impact on trimming. (default 16)
REQUIRES Python 2.7 or later
"""
#------------------------------------------------------------------------------------------
# INITIALIZATIONS
#------------------------------------------------------------------------------------------
import sys, os, argparse
import multiprocessing
# Global debug switch; when True, the trimming functions collect per-trim
# diagnostics in each read's "debug" list.
DEBUG = False
if DEBUG: print 'DEBUG MODE: ON'
VERSION = '0.7.4'
# Whether a second (paired) fastq file is being processed; presumably set
# after argument parsing — TODO confirm against main().
PAIRED = False
# dict keys for the read record built by nextRead()
HEADER = 'header'
SEQUENCE = 'seq'
QUALS = 'quals'
LENGTH = 'length'
TRIMMED = 'trimmed'
FLAGS = 'flags'
DISCARDED = 'discarded'
# defaults for command-line options and per-contaminant header options
DEFAULT_WORKERS = 4
DEFAULT_END = 3
DEFAULT_MAX = 0
DEFAULT_METHOD = 2
DEFAULT_PERCENT_IDENTITY = .9
DEFAULT_SIZE_KMER = 7
DEFAULT_TOTAL_IDENTITY = 16
DEFAULT_WINDOWS_NUMBER = 6
# used by removePolyAT(): an end must carry at least 10 of the base
# before any trimming happens
POLY_A = 'AAAAAAAAAA'
POLY_T = 'TTTTTTTTTT'
# Declare as global so each thread gets it's own copy w/o the overhead of passing as a param for each read (ie 4 copies instead of 40M)
clArgs = None
# print error and quit
def quitOnError(msg):
print 'ERROR:', msg
sys.exit(0)
# generate output file with every trim event. First and second
# reads will be combined in this output file.
#------------------------------------------------------------------------------------------
# FUNCTIONS FOR HANDLING READING AND WRITING OF READS
#------------------------------------------------------------------------------------------
def nextRead(inFile):
    """
    Read the next FASTQ record (4 lines) from inFile.

    Returns a dict keyed by HEADER, SEQUENCE (upper-cased) and QUALS.
    Returns an empty dict when no more reads are available.
    """
    line = inFile.readline().rstrip()
    # an empty header line means end-of-file
    if len(line) == 0:
        return {}
    read = {}
    try:
        # load read values from fastq file
        read[HEADER] = line
        # make sure sequence is upper case for comparisons
        read[SEQUENCE] = inFile.readline().rstrip().upper()
        # this is the '+' separator which we can ignore for now
        # XXX CONFIRM THAT THE + IN THE FASTQ FILE IS A CONSTANT.
        inFile.readline()
        read[QUALS] = inFile.readline().rstrip()
        return read
    except Exception:
        # BUG FIX: the original built a tuple ('Problem loading read:', line)
        # instead of a message string; also narrow the bare 'except:' so
        # KeyboardInterrupt/SystemExit are not swallowed.
        quitOnError('Problem loading read: %s' % line)
def writeRead(read, outFile):
    """
    Append one FASTQ record to outFile.

    read -- dict as produced by nextRead().
    outFile -- writable file object.
    """
    # emit the four FASTQ lines: header, sequence, '+' separator, qualities
    record = read[HEADER] + '\n' + read[SEQUENCE] + '\n+\n' + read[QUALS] + '\n'
    outFile.write(record)
#------------------------------------------------------------------------------------------
# TRIMMING FUNCTIONS
#------------------------------------------------------------------------------------------
# truncate 3' end of reads. Truncated reads don't count as having been
# trimmed as this impacts all reads.
def cut3(read, nCut):
    """
    Remove nCut bases from the 3' end of the read, keeping the quality
    string in step with the sequence.
    """
    keep = read[LENGTH] - nCut
    # retain the same prefix of both the sequence and the qualities
    newSeq = read[SEQUENCE][:keep]
    read[SEQUENCE] = newSeq
    read[QUALS] = read[QUALS][:keep]
    read[LENGTH] = len(newSeq)
    return read
# truncate 5' end of reads. Truncated reads don't count as having been
# trimmed as this impacts all reads.
def cut5(read, nCut):
    """
    Remove nCut bases from the 5' end of the read, keeping the quality
    string in step with the sequence.
    """
    # drop the same prefix from both the sequence and the qualities
    newSeq = read[SEQUENCE][nCut:]
    read[SEQUENCE] = newSeq
    read[QUALS] = read[QUALS][nCut:]
    read[LENGTH] = len(newSeq)
    return read
# Replace base with N if quality score is below the user-specified
# phredThreshold. Low quality bases at the ends of the read will be
# removed if -rN flag is set. We do not change the quality score, just
# the base.
def qualityScoreThreshold(read, phredThreshold, isFirstRead):
    """
    Replace every base whose quality character (by ord()) falls below
    phredThreshold with 'N'. Quality scores themselves are unchanged
    and the read length is unaffected; reads with at least one swap
    get a 'qualityScoreThreshold' flag.
    """
    seq = read[SEQUENCE]
    quals = read[QUALS]
    length = read[LENGTH]
    wasTrimmed = False # local flag for trimming
    # build the new sequence in a list to avoid quadratic '+=' string
    # concatenation; enumerate keeps base and quality aligned by index
    newBases = []
    for i, qualityScore in enumerate(quals):
        if ord(qualityScore) < phredThreshold:
            newBases.append('N')
            wasTrimmed = True
        else:
            newBases.append(seq[i])
    if wasTrimmed:
        newSeq = ''.join(newBases)
        # track number of reads that have a base swap (FLAGS constant used
        # for consistency with the contaminant-trimming functions)
        read[FLAGS].append('qualityScoreThreshold')
        if DEBUG: read["debug"].append([read[SEQUENCE], length, newSeq, length, 'qualityScoreThreshold', ''])
        read[SEQUENCE] = newSeq
    return read
# remove N's on either end of the read
def removeNs(read, isFirstRead):
    """
    Strip runs of N from the 5' and 3' ends of the read, trimming the
    quality string in step with the sequence. Trimmed reads are marked
    with 'remove5N'/'remove3N' flags and TRIMMED is set.
    """
    seq = read[SEQUENCE]
    quals = read[QUALS]
    length = read[LENGTH]
    # records whether either end actually lost bases. Unnecessary while
    # removeNs() is the first trimming step, but it keeps the function
    # safe if another trimming option is ever added before it.
    wasTrimmed = False
    # 5' end: drop leading N's and the same number of leading qualities
    stripped = seq.lstrip('N')
    if len(stripped) != len(seq):
        quals = quals[len(quals) - len(stripped):]
        seq = stripped
        wasTrimmed = True
        read["flags"].append('remove5N')
    # 3' end: drop trailing N's and the same number of trailing qualities
    stripped = seq.rstrip('N')
    if len(stripped) != len(seq):
        quals = quals[:len(stripped)]
        seq = stripped
        wasTrimmed = True
        read["flags"].append('remove3N')
    if wasTrimmed:
        if DEBUG: read["debug"].append([read[SEQUENCE], length, seq, len(seq), 'removeNs', ''])
        read[SEQUENCE] = seq
        read[QUALS] = quals
        read[LENGTH] = len(seq)
        read[TRIMMED] = True
    return read
# remove runs of 6 or more of the same base on either end of the read
def removePolyBases(read, isFirstRead, nMin = 6):
    """
    Remove a run of nMin or more identical bases from the 5' and/or 3'
    end of the read, trimming qualities in step. Each end is trimmed at
    most once, with bases tried in A, T, C, G order.
    """
    seq = read[SEQUENCE]
    quals = read[QUALS]
    length = read[LENGTH]
    # records whether either end actually lost bases
    wasTrimmed = False
    # only trim each end once
    leftTrimmed = False
    rightTrimmed = False
    for base in ['A', 'T', 'C', 'G']:
        if not leftTrimmed:
            # trim run from beginning of read (5' end)
            if seq.startswith(base * nMin): # 'A' * 4 = 'AAAA' etc
                # trim sequence
                seq = seq.lstrip(base)
                # need to trim quals in same way we trimmed sequence
                quals = quals[len(quals) - len(seq):]
                # flag read as having been trimmed
                wasTrimmed = True
                leftTrimmed = True
                if 'remove5' + base not in read["flags"]:
                    read["flags"].append('remove5' + base)
        if not rightTrimmed:
            # trim run from end of read (3' end)
            if seq.endswith(base * nMin):
                # trim sequence
                seq = seq.rstrip(base)
                # need to trim quals in same way we trimmed sequence
                quals = quals[:len(seq)]
                # flag read as having been trimmed
                wasTrimmed = True
                # BUG FIX: mark the 3' end as trimmed; the original never
                # set rightTrimmed, so later bases in the loop could trim
                # the 3' end again, contradicting "only trim each end once".
                rightTrimmed = True
                if 'remove3' + base not in read["flags"]:
                    read["flags"].append('remove3' + base)
    if wasTrimmed:
        # BUG FIX: debug label previously said 'removeNs' (copy-paste error)
        if DEBUG: read["debug"].append([read[SEQUENCE], length, seq, len(seq), 'removePolyBases', ''])
        read[SEQUENCE] = seq
        read[QUALS] = quals
        read[LENGTH] = len(seq)
        read[TRIMMED] = True
    return read
# trim contaminant removing entire contaminant based on single k-mer
# mapping, from the 5' end of the contaminant to the 3' end of the
# read.
def fullContaminantTrimming(read, contaminant, isFirstRead):
    """
    Trim read based on contaminant. Contaminant is a tuple of
    (options dict, contaminant sequence, k-mer list). A k-mer is
    perfectly aligned against the read; the inferred position of the
    contaminant's 5' end is then used as the trim point, removing
    everything from there to the 3' end of the read. This is an
    aggressive scheme and may remove read bases that do not actually
    map to the contaminant.
    """
    seq = read[SEQUENCE]
    quals = read[QUALS]
    length = read[LENGTH]
    # list of contaminant k-mers to look for in read
    kmers = contaminant[2]
    # load contaminant's options
    options = contaminant[0]
    # contaminant name
    name = options['name']
    # Attempt to perfectly align each k-mer against the sequence. A k-mer
    # whose 5' end sits `offset` bases into the contaminant implies the
    # contaminant starts `offset` bases before the k-mer's match position
    # in the read. If no k-mer maps then no trimming occurs.
    found = False
    pos = -1
    for kmer, offset in kmers:
        # rfind: if the k-mer maps multiple times in the read, use the
        # mapping closest to the 3' end
        index = seq.rfind(kmer)
        if index > -1:
            found = True
            pos = index - offset
            # break out of loop since we found a match
            break
    # if k-mers not found, then no trimming
    if not found: return read
    # BUG FIX: when the contaminant's inferred start hangs off the 5' end
    # of the read, pos is negative. The original either skipped trimming
    # (pos == -1 looked like "not found") or sliced from the wrong end
    # (seq[:pos] with pos < -1). Clamp to 0 so the whole read is trimmed,
    # matching the handling in identityTrimming().
    if pos < 0: pos = 0
    seq = seq[:pos]
    quals = quals[:pos]
    if DEBUG: read["debug"].append([read[SEQUENCE], length, seq, len(seq), 'fullContaminantTrimming', name])
    read[FLAGS].append(name)
    read[SEQUENCE] = seq
    read[QUALS] = quals
    read[LENGTH] = len(seq)
    read[TRIMMED] = True
    return read
# trim contaminant removing the portion of the contaminant that maps
# from the k-mer region to the end of the sequence.
def mappedContaminantTrimming(read, contaminant, isFirstRead):
    """Trim read based on contaminant. Contaminant is a tuple of
    (options dict, contaminant sequence, k-mer list). A k-mer is
    perfectly aligned against the read, then the mapping is extended
    toward the 5' end of the read allowing only exact matches; the
    read is trimmed from the 5'-most mapped base to its 3' end. This
    is a conservative scheme and may miss contaminant regions smaller
    than a k-mer.
    """
    seq = read[SEQUENCE]
    quals = read[QUALS]
    length = read[LENGTH]
    # contaminant sequence (used when extending the mapping)
    cSeq = contaminant[1]
    # list of contaminant k-mers to look for in read
    kmers = contaminant[2]
    # load contaminant's options
    options = contaminant[0]
    # contaminant name
    name = options['name']
    # pos is the trim point; -1 means no k-mer mapped
    pos = -1
    for kmer, offset in kmers:
        # rfind: the k-mer is expected near the 3' end of the read, so if
        # it maps multiple times use the mapping closest to the 3' end
        index = seq.rfind(kmer)
        if index > -1:
            # found a mapping: extend it toward the 5' end, one exact
            # base at a time. offset is the k-mer's 5' position within
            # the contaminant (0 means we're at the contaminant's 5'
            # end); index is the current position in the read (0 means
            # we're at the read's 5' end).
            while offset > 0 and index > 0:
                if seq[index-1] != cSeq[offset-1]:
                    break
                index -= 1
                offset -= 1
            # BUG FIX: trim from the 5'-most mapped base (index). The
            # original left pos = 0 whenever the extension reached the
            # contaminant's 5' end (offset == 0) with index > 0, which
            # discarded the entire read instead of just the mapped
            # region plus everything 3' of it.
            pos = index
            # stop after the first k-mer that maps
            break
    # if k-mers not found, then no trimming
    if pos == -1: return read
    seq = seq[:pos]
    quals = quals[:pos]
    if DEBUG: read["debug"].append([read[SEQUENCE], length, seq, len(seq), 'mappedContaminantTrimming', name])
    read[FLAGS].append(name)
    read[SEQUENCE] = seq
    read[QUALS] = quals
    read[LENGTH] = len(seq)
    read[TRIMMED] = True
    return read
# trim contaminant removing entire contaminant based on single k-mer
# mapping and whether or not the percent identity between the
# contaminant and the read is above the specified thresholds
def identityTrimming(read, contaminant, isFirstRead):
    """
    Trim read based on contaminant. Contaminant is expected to be a
    tuple containing a set of trimming options, the contaminant
    sequence, and a list of (k-mer, offset) tuples. For trimming first
    map a k-mer to align the contaminant to the read. Then compute the
    percent identity between the contaminant and the read. If the
    percent identity is above the specified threshold then trim the
    read from the position of the 5' end of the contaminant to the 3'
    end of the read. If the threshold is not reached then do not trim
    the read. With a high threshold this function will bias away from
    trimming; as the threshold is decreased (toward 0) the likelihood
    of trimming increases. Note that the identity is only computed for
    the portion of the read that overlaps with the contaminant, so a
    short contaminant that perfectly maps in the middle of a long read
    would have a high percent identity, causing the read to be trimmed.

    Contaminant example: [{'name': 'indexAdapter', 'windows': 6,
    'percentIdentity': 0.9, 'method': 2,
    'totalIdentity': 16, 'size': 10},
    'AGATCGGAAGAGCACACGTCTGAACTCCAGTCAC', [('AGATCGGAAG', 0),
    ('CGGAAGAGCA', 4), ('AGAGCACACG', 8), ('CACACGTCTG', 12),
    ('CGTCTGAACT', 16), ('TGAACTCCAG', 20), ('CTCCAGTCAC', 24)]]
    """
    seq = read[SEQUENCE]
    quals = read[QUALS]
    length = read[LENGTH]
    # contaminant sequence
    cSeq = contaminant[1]
    cLength = len(cSeq)
    # list of contaminant k-mers to look for in read
    kmers = contaminant[2]
    # load contaminant's options
    options = contaminant[0]
    # contaminant name
    name = options['name']
    # percent identity threshold
    percIdentityThreshold = options['percentIdentity']
    # total identity threshold
    totIdentityThreshold = options['totalIdentity']
    # we attempt to perfectly align each k-mer against the
    # sequence. If any alignment succeeds, then we compute the percent
    # identity between the contaminant and read. If above our
    # threshold then trim the entire contaminant, otherwise do not
    # trim anything.
    percIdentity = 0
    totIdentity = 0
    for kmer, offset in kmers:
        # look for k-mer in read sequence. Index is the position from
        # the 5' end of the read where the k-mer mapped to the read.
        # If the k-mer maps multiple times in the read, then the
        # mapping closest to the 3' end is used.
        index = seq.rfind(kmer)
        if index > -1:
            # Pos is the position where the 5' end of contaminant
            # overlaps the read, based on k-mer mapping. Offset is
            # the position of the 5' end of the k-mer within the
            # contaminant.
            pos = index - offset
            # cPos is the position in the contaminant
            cPos = 0
            # if the contaminant hangs off the 5' end of the read then
            # we need to adjust the search to begin from the 5' end of
            # the read
            if pos < 0:
                # shift where we begin in the contaminant based on how
                # many bases are hanging off the 5' end of the read
                cPos = pos * -1
                # start at 5' end of read
                pos = 0
            # rPos is the position in the read
            rPos = pos
            totIdentity = 0
            count = 0
            # walk the overlapping region base by base
            while cPos < cLength and rPos < length:
                # count the number of matching bases
                if seq[rPos] == cSeq[cPos]: totIdentity += 1
                # increment positions so we check next base
                cPos += 1
                rPos += 1
                # count the number of bases checked
                count += 1
            # we've now checked all overlapping bases, so we can
            # compute the percent identity.
            percIdentity = float(totIdentity) / float(count)
            # break out of loop since we found a match
            break
    # if k-mers not found, then no trimming.
    # NOTE(review): if the kmers list is ever empty, 'index' is unbound
    # here and this raises NameError — presumably the contaminant loader
    # guarantees at least one k-mer; confirm upstream.
    if index == -1: return read
    # if the k-mers were not found or the contaminant didn't map
    # sufficiently to the read then we don't trim.
    if (percIdentity < percIdentityThreshold) or (totIdentity < totIdentityThreshold): return read
    seq = seq[:pos]
    quals = quals[:pos]
    if DEBUG:
        misc = name + "\t" + str(percIdentity) + "\t" + str(totIdentity)
        read["debug"].append([read[SEQUENCE], length, seq, len(seq), 'identityTrimming', misc])
    read[FLAGS].append(name)
    read[SEQUENCE] = seq
    read[QUALS] = quals
    read[LENGTH] = len(seq)
    read[TRIMMED] = True
    return read
# remove poly A from 3' end and poly T from 5' end
def removePolyAT(read, trimLength, isFirstRead):
    """
    Remove up to trimLength T's from the 5' end and up to trimLength
    A's from the 3' end of the read. An end is only trimmed when it
    carries at least 10 of the base (POLY_T / POLY_A). Qualities are
    trimmed in step with the sequence.
    """
    seq = read[SEQUENCE]
    quals = read[QUALS]
    length = read[LENGTH]
    wasTrimmed = False # local flag for trimming
    # trim poly-T from 5' of read
    if seq.startswith(POLY_T):
        # only remove up to trimLength number of bases from poly-T.
        # Get copy of trimLength number of bases in sequence.
        n_mer = seq[:trimLength]
        # remove poly-T from 5' end of n-mer
        n_mer = n_mer.lstrip('T')
        # append remaining bases, if any, back onto 5' end of read
        seq = n_mer + seq[trimLength:]
        # need to trim quals to same degree that we trimmed sequence
        quals = quals[len(quals) - len(seq):]
        # flag read as having been trimmed
        wasTrimmed = True
        read[FLAGS].append("polyT")
    # trim poly-A from 3' end of read
    if seq.endswith(POLY_A):
        # only remove up to trimLength number of bases from poly-A.
        # Get copy of last trimLength number of bases in sequence.
        idx = len(seq) - trimLength
        n_mer = seq[idx:]
        # remove poly-A from 3' end of n-mer
        n_mer = n_mer.rstrip('A')
        # append remaining bases back onto 3' end of read
        seq = seq[:idx] + n_mer
        # need to trim quals in same way we trimmed sequence
        quals = quals[:len(seq)]
        # flag read as having been trimmed
        wasTrimmed = True
        read[FLAGS].append("polyA")
    if wasTrimmed:
        # BUG FIX: the debug values must be wrapped in a list; the original
        # passed six separate arguments to append(), raising TypeError
        # whenever DEBUG was enabled.
        if DEBUG: read["debug"].append([read[SEQUENCE], length, seq, len(seq), 'removePolyAT', ''])
        read[SEQUENCE] = seq
        read[QUALS] = quals
        read[LENGTH] = len(seq)
        read[TRIMMED] = True
    return read
#------------------------------------------------------------------------------------------
# LOAD CONTAMINANTS
#------------------------------------------------------------------------------------------
def computeKMers(seq, kmerSize, numWindows):
"""
This will return a tuple of k-mers that includes the k-mer
sequence. The tuple will also include either 5' position of the
k-mer within the original sequence. The k-mers are ordered from
the 5' end of the contaminant to the 3' end. We don't expect the
read sequence to contain the entire contaminant so we start at the
5' end when we are searching. Expecting the k-mer to map to the 3'
end of the read then we search the k-mers that are located near
the 5' end of the contaminant before searching for the k-mers
located near the 3' end of the contaminant.
"""
seqLength = len(seq)
# the last window is bounded by the length of the sequence and the size of a k-mer
lastWindow = seqLength - kmerSize
# divide up the length of the sequence allowing for the first
# k-mer to be at the beginning of the sequence and the last k-mer
# to be approximate at the end of the sequence. The ordering of
# the k-mers is dependent on which end of the read is expected to
# contain the contaminant. Depending on the length of the
# contaminant, the k-mer size and the number of windows, the last
# few bases on the tail end of the contaminant might not be
# included in any k-mer. For a contaminant expected to be found on
# the 3' end of a read this means the last few bases on the 3' end
# of the contaminant might not be in any k-mer. For a contaminant
# expected to be found on the 5' end of a read then the last few
# bases on the 5' end of the contaminant might not be in a k-mer.
if numWindows > 1:
windowStep = lastWindow / (numWindows - 1)
else:
# make sure we only get one k-mer
windowStep = seqLength
kmerList = []
# slice up the sequence based on the window step size and the k-mer size
for index in range(0, lastWindow + 1, windowStep):
# store k-mer and the k-mer position offset for a 3'
# mapping. Offset is the 5' position of the k-mer.
kmerList.append((seq[index:index+kmerSize], index))
return kmerList
def debugTrimOutput(origSeq, origLen, trimSeq, trimLen, trimType, misc):
    """
    Write one tab-separated record describing a trim event to the
    global debug file. Must only be called when DEBUG is enabled.
    """
    assert DEBUG == True
    # assemble the record as one tab-delimited line
    fields = (origSeq, str(origLen), trimSeq, str(trimLen), trimType, misc)
    debugFile.write('\t'.join(fields) + '\n')
def main():
argParser = argparse.ArgumentParser(version=VERSION,
description='Trim NGS reads (requires Python 2.7 or later).',
formatter_class=argparse.RawDescriptionHelpFormatter,
epilog='' +
'Trimming will happen in the following order depending on which options are selected:\n' +
'\t1. Cut 3\' end of read.\n' +
'\t2. Cut 5\' end of read.\n' +
'\t3. Replace any base with N if the quality score is below the Phred threshold. Quality score is not changed.\n' +
'\t4. Remove N\'s from both ends.\n' +
'\t5. Process contaminants file, removing contaminants based on their order in the contaminants file.\n' +
'\t6a. Single-end: discard read if shorter than the minimum length.\n' +
'\t6b. Paired-end: if only one of the paired reads is shorter than the minimum length, then replace that read\'s sequence with N\'s and replace that read\'s quality scores with #. If both paired reads are shorter than the minimum length, then discard the read pair.\n' +
'\t7. Pad paired reads with N\'s so that they are both the same length. For every N that is added, also add a # to the quality score.\n' +
'\t8. Add "L:X" to the read header with X being the new length of the sequence (including any N\'s that were added to the sequence).\n\n' +
'Contaminants file (fasta-like file):\n' +
'\t1. Sequences must be on a single line (ie can\'t contain line breaks).\n' +
'\t2. No blank lines in file.\n' +
'\t3. Sequence header is space delimited list of options that begins with a \'>\' (similar to fasta files). Option names and values should be separated by a colon. \n' +
'\t\t Example header "> name:oligo end:3 size:10 windows:5"\n' +
'\t4. Options:\n' +
'\t\t * name: this option is required for every sequence\n' +
'\t\t * method: there are three trimming methods (0 = full contaminant, 1 = mapped contaminant, 2 = identity based). (default 2)\n' +
'\t\t\t 0. Full contaminant trimming means that when a k-mer is mapped then it is expected that the entire contaminant mapped and the read is trimmed accordingly. For example lets assume we have a k-mer that is located 4 bases from the 5\' end of a contaminant. If that k-mer maps then we would shift where we trim the read by 4 bases in the direction of the 5\' end of the read. We would then remove all bases from that position to the 3\' end of the read, regardless of the additional bases mapped to the contaminant.\n' +
'\t\t\t 1. Mapped contaminant trimming means that when a k-mer is mapped then we extend the mapping and trimmed accord to the mapping. For example lets assume we have a k-mer that is located 4 bases from the 5\' end of a contaminant. If that k-mer maps then we would extend the mapped region one base at a time, in the 5\' direction until we found a base that didn\'t map. We would then trim from that postion to the 3\' end of the read.\n' +
'\t\t\t 2. If a k-mer maps to the read then the location of the mapping is used to anchor the contaminant to the read. The percent and total identity between the contaminant and the read is computed. If both the percent and total identity are above a user-defined threshold then the read is trimmed from the beginning of the contaminant to the 3\' end of the read. If not then the read is not trimmed.\n' +
'\t\t * size: size of k-mer (default 7)\n' +
'\t\t * windows: how many k-mers to seek. can not be larger than (contaminant length - k-mer). (default 6)\n' +
'\t\t * percentIdentity: percent identity threshold for trimming method 2 (0.0 < percentIdentity <= 1.0). (default 0.9)\n' +
'\t\t * totalIdentity: total identity threshold for trimming method 2. If this is less than the k-mer size then it will have no impact on trimming. (default 16)\n'
)
argParser.add_argument( '-p', '--padPaired', dest='padPaired', action='store_true', default=False,
help='Pad paired reads so that they are the same length after all trimming has occured. N\'s will be added to the 3\' end with \'#\' added to the quality score for each N that is added. This will not do anything for single-end reads. (default: no)' )
argParser.add_argument( '-m', '--minLen', dest='min_len', action='store', default=0, type=int,
help='Minimum size of trimmed read. If trimmed beyond minLen, then read is discarded. If read is paired then read is replaced with N\'s, unless both reads in pair are smaller than minLen in which case the pair is discarded. (default: no minimum length)' )
argParser.add_argument( '-c3', '--cut3', dest='num_cut_3', action='store', default=0, type=int,
help='number of bases to remove from 3\' end of read. TRUNCATING READS DOES NOT COUNT TOWARD TRIMMING TOTALS. This happens prior to the removing of N\'s and hence prior to contaminant trimming. (default: 0)' )
argParser.add_argument( '-c5', '--cut5', dest='num_cut_5', action='store', default=0, type=int,
help='number of bases to remove from 5\' end of read. TRUNCATING READS DOES NOT COUNT TOWARD TRIMMING TOTALS. This happens prior to the removing of N\'s and hence prior to contaminant trimming. (default: 0)' )
argParser.add_argument( '-q', dest='phred_threshold', action='store', default=0, type=int,
help='threshold for Phred score below which the base will be changed to an N. This happens prior to the removing of N\'s and hence low quality bases at the ends of the read will be removed if this is used in conjunction with the -rN flag. QUALITY SCORES ARE NOT SCALED BASED ON ENCODING SCHEME. For example, use a threshold of 53 to filter Illumina 1.8 fastq files (Phred+33) based on a Phred score of 20. REPLACING BASES WITH N DOES NOT COUNT TOWARD TRIMMING TOTAL. (default: 0)' )
argParser.add_argument( '-rN', '--removeNs', dest='removeN', action='store_true', default=False,
help='remove N\'s from both ends of the read. This trimming happens before contaminant trimming. (default: no)' )
argParser.add_argument( '-rAT', '--removePolyAT', dest='remove_AT', action='store', default=-1, type=int,
help='length of 3\' poly-A and 5\' poly-T to remove from the respective ends of the read. If all poly A/T is to be removed then the value should be equal to or greater than the length of the read. A minimum of ten A\'s or ten T\'s must exist in order for this trimming to happen, regardless of the trimming length; that is, poly-A and poly-T fragments are defined as being at least 10 nt in length. A sequences of A\'s or T\'s are ignored. This trimming happens after contaminant trimming. (default: no trimming)' )
argParser.add_argument( '-rPoly', '--removePolyBases', dest='remove_ATCG', action='store_true', default=False,
help='Remove runs of 6 or more of the same base from either end. Potentially useful for removing poly-G strings created by NextSeq problems, or with protocols that can create long runs of the same base. This trimming happens after contaminant trimming. (default: no trimming)' )
argParser.add_argument( '-c', dest='contaminants_fa', action='store', default=None,
help='fasta-like file containing list of contaminants to trim from the 3\' end of the read' )
argParser.add_argument( '-f', dest='first_fq', action='store', required=True,
help='fastq file with reads to be trimmed' )
argParser.add_argument( '-r', dest='second_fq',
help='second read with paired-end reads. This file is not present for single-end reads.' )
argParser.add_argument( '-o', dest='output_prefix', action='store', required=True,
help='prefix for output file(s). A \'_1\' will be appended to the first reads output file and if paired reads then a \'_2\' will be appended to the second reads file. Output files similarly named and with the suffix \'.disc.txt\' will be created to store the fastq headers for reads shorter than the minimum length threshold after trimming.' )
argParser.add_argument( '-t', dest='threads', type=int, default=4,
help='Number of worker processes. Set to 1 for serial execution in a single process. Increasing above 6 is unlikely to produce any speedup. (default: 4)' )
argParser.add_argument( '-C', '--chunk-size', dest='chunk_size', type=int, default=200,
help='Number of reads passed to each worker thread at a time. For 200 read chunks and 5 threads, blocks of 1000 reads would be read in at a time and split between those 5 threads. The "sweet spot" of low memory usage and high cpu utilization appears to be fairly low, somewhere between 100-200, in inverse proportion to the number of threads.' )
global clArgs
clArgs = argParser.parse_args() #Again, global so each thread can read it. Should be read-only from here out
if DEBUG: print clArgs
# flag if paired-end reads
global PAIRED
if clArgs.second_fq:
PAIRED = True
if DEBUG:
try:
debugFile = open("debug.tsv", 'w')
debugFile.write('Orig Seq\tLen\tTrimmed Seq\tLen\tTrim Type\tMisc\n')
except:
quitOnError('Unable to open output file debug.tsv')
#------------------------------------------------------------------------------------------
# LOAD CONTAMINANTS
#------------------------------------------------------------------------------------------
nFirstContaminantsTrim = {} # number of each contaminant trimmed from first read
nSecondContaminantsTrim = {} # number of each contaminant trimmed from second read
if clArgs.contaminants_fa:
try: contFile = open(clArgs.contaminants_fa, 'r')
except IOError: print 'ERROR: No such file', clArgs.contaminants_fa
# need to retain order so using tuple instead of set
global contaminantList
contaminantList = []
count = 0
for line in contFile:
if line.startswith('>'):
header = line[1:].strip()
# need to process header arguments
args = header.split()
# convert options string into set. Initialize values to
# defaults. The initialized set doesn't include the name
# as name is required.
options = { 'method':DEFAULT_METHOD,
'size':DEFAULT_SIZE_KMER,
'windows':DEFAULT_WINDOWS_NUMBER,
'percentIdentity':DEFAULT_PERCENT_IDENTITY,
'totalIdentity':DEFAULT_TOTAL_IDENTITY }
for option in args:
key, value = option.split(':')
if key == 'method':
value = int(value)
if value != 0 and value != 1 and value != 2:
msg = 'Invalid "method" option (%d) for contaminant. Must be 0, 1, or 2.' % value
quitOnError(msg)
elif key == 'size':
value = int(value)
elif key == 'windows':
value = int(value)
elif key == 'totalIdentity':
value = int(value)
elif key == 'percentIdentity':
value = float(value)
if value <= 0.0 or value > 1.0:
msg = 'Invalid "percentIdentity" option (%f) for contaminant. Percent identity must be greater than 0 and less than or equal to 1.' % value
quitOnError(msg)
# at this point we've accounted for all options but
# the name. If we don't have a name then the option
# doesn't exist.
elif key != 'name':
msg = 'Invalid contamination option "%s".' % option
quitOnError(msg)
# save value in options
options[key] = value
# make sure we loaded a name.
if 'name' not in options:
quitOnError('Contaminant does not have a name.')
# use name to initialize contaminant trimming counts. We
# initialize the counts for N and poly A/T removal below
# (ie outside of the if/then statement) as we might not
# always have a contaminants file.
nFirstContaminantsTrim[options['name']] = 0
if PAIRED:
nSecondContaminantsTrim[options['name']] = 0
else:
seq = line.strip().upper()
# compute set of k-mers
kmerList = computeKMers(seq, options['size'], options['windows'])
if len(seq) < options['size']:
msg = 'The k-mer size (%d) must be smaller than the length of the sequence (%s).' % (options['size'], seq)
quitOnError(msg)
# save contaminant and related values in tuple
contaminantList.append([options, seq, kmerList])
count += 1
contFile.close()
if DEBUG:
for c in contaminantList: print c
print
print 'Loaded contaminants: ' + str(count)
#------------------------------------------------------------------------------------------
# INITIALIZE STATS
#------------------------------------------------------------------------------------------
# track trimming stats
# TODO: do these still work?
nBothTrimmed = 0 # total number of read pairs in which both reads were trimmed (equal to nFirstTrimmed if single-end)
nFirstTrimmed = 0 # num first reads trimmed
nSecondTrimmed = 0 # num second reads trimmed
nBothDiscarded = 0 # num read pairs discarded (equal to nFirstDiscarded if single-end)
nFirstDiscarded = 0 # num of first reads replaced with N's
nSecondDiscarded = 0 # num of second reads replaced with N's
nTotalReadPairs = 0 # total number of read pairs processed
# end TODO
# need to initialize the trim counters for N removal and poly A/T
# removal. We initialized the counters for contaminants in the if/then
# statement above.
nFirstContaminantsTrim['remove5N'] = 0
nFirstContaminantsTrim['remove3N'] = 0
nFirstContaminantsTrim['polyA'] = 0
nFirstContaminantsTrim['polyT'] = 0
nFirstContaminantsTrim['qualityScoreThreshold'] = 0
# Initialize stat counters for poly base trimming at the end of each read
if clArgs.remove_ATCG:
nFirstContaminantsTrim['remove5A'] = 0
nFirstContaminantsTrim['remove3A'] = 0
nFirstContaminantsTrim['remove5T'] = 0
nFirstContaminantsTrim['remove3T'] = 0
nFirstContaminantsTrim['remove5C'] = 0
nFirstContaminantsTrim['remove3C'] = 0
nFirstContaminantsTrim['remove5G'] = 0
nFirstContaminantsTrim['remove3G'] = 0
if PAIRED:
nSecondContaminantsTrim['remove5N'] = 0
nSecondContaminantsTrim['remove3N'] = 0
nSecondContaminantsTrim['polyA'] = 0
nSecondContaminantsTrim['polyT'] = 0
nSecondContaminantsTrim['qualityScoreThreshold'] = 0
if clArgs.remove_ATCG:
nSecondContaminantsTrim['remove5A'] = 0
nSecondContaminantsTrim['remove3A'] = 0
nSecondContaminantsTrim['remove5T'] = 0
nSecondContaminantsTrim['remove3T'] = 0
nSecondContaminantsTrim['remove5C'] = 0
nSecondContaminantsTrim['remove3C'] = 0
nSecondContaminantsTrim['remove5G'] = 0
nSecondContaminantsTrim['remove3G'] = 0
#------------------------------------------------------------------------------------------
# OPEN READ INPUT AND OUTPUT FILES
#------------------------------------------------------------------------------------------
# open input file
firstReadIn = ''
try:
firstReadIn = open(clArgs.first_fq, 'r')
except:
msg = 'Unable to load reads file ' + clArgs.first_fq
quitOnError(msg)
# open file that will store non-discarded reads
firstReadOut = ''
try:
firstReadOut = open(clArgs.output_prefix + "_1.fq", 'w')
except:
msg = 'Unable to open output file ' + clArgs.output_prefix + "_1.fq"
quitOnError(msg)
# open file that will store discarded reads
firstReadDiscardedOut = ''
try:
firstReadDiscardedOut = open(clArgs.output_prefix + "_1.disc.txt", 'w')
except:
msg = 'Unable to open output file ' + clArgs.output_prefix + "_1.disc.txt"
quitOnError(msg)
if PAIRED:
secondReadIn = ''
try:
secondReadIn = open(clArgs.second_fq, 'r')
except:
msg = 'Unable to load reads file ' + clArgs.second_fq
quitOnError(msg)
secondReadOut = ''
try:
secondReadOut = open(clArgs.output_prefix + "_2.fq", 'w')
except:
msg = 'Unable to open output file ' + clArgs.output_prefix + "_2.fq"
quitOnError(msg)
secondReadDiscardedOut = ''
try:
secondReadDiscardedOut = open(clArgs.output_prefix + "_2.disc.txt", 'w')
except:
msg = 'Unable to open output file ' + clArgs.output_prefix + "_2.disc.txt"
quitOnError(msg)
#------------------------------------------------------------------------------------------
# TRIM READS. LOAD ONE READ FROM BOTH FQ FILES AT SAME TIME. PROCESS
# READ PAIR TOGETHER AND THEN EITHER WRITE TO OUTPUT FILE OR DISCARD.
# ------------------------------------------------------------------------------------------
# If only one thread, don't use the overhead of starting a pool
# Multiprocessing also mangles stack traces, so for debugging 1 thread is desirable
if clArgs.threads > 1:
pool = multiprocessing.Pool(clArgs.threads)
#--------------------------------------------------------------------------------------
# main loop
#--------------------------------------------------------------------------------------
done = False
runningJob = None
results = []
# If there's either a running job to be processed, or we haven't read all the input, loop
while runningJob or not done:
#--------------------------------------------------------------------------------------
# build read chunk. we use chunks to prevent the memory overhead of splitting all reads at once
chunk = []
if not done: #done set to True when we read the last record
for i in xrange(clArgs.chunk_size * clArgs.threads):
firstRead = nextRead(firstReadIn)