-
Notifications
You must be signed in to change notification settings - Fork 21
/
filter_ffpe.R
executable file
·113 lines (92 loc) · 4.66 KB
/
filter_ffpe.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
#!/usr/bin/env Rscript
##########################################################################################
# MSKCC CMO
# Identify samples with FFPE artifacts or filter the artifacts from a maf file.
##########################################################################################
strReverse <- function(x) sapply(lapply(strsplit(x, NULL), rev), paste, collapse="")
revc <- function(x) strReverse(chartr('ACGT', 'TGCA', x))
# Annotate maf with Stratton Plot bin
add_mut_tri <- function(maf) {
if (!"TriNuc" %in% names(maf)) {
if (!"flanking_bps" %in% names(maf)) {
stop("must have TriNuc/flanking_bps column")
}
}
### check for t_var_freq
if (!'t_var_freq' %in% names(maf))
maf[!t_ref_count %in% c(NA,'.') & !t_alt_count %in% c(NA,'.'),
t_var_freq := as.numeric(t_alt_count)/(as.numeric(t_alt_count)+as.numeric(t_ref_count))]
### reverse complement Ref_Tri if ref is either G or A
if("TriNuc" %in% names(maf)){
maf[Variant_Type == "SNP",
Ref_Tri := ifelse(Reference_Allele %in% c('G', 'A'),
revc(TriNuc),
TriNuc)]
}else{
maf[Variant_Type == "SNP",
Ref_Tri := ifelse(Reference_Allele %in% c('G', 'A'),
revc(flanking_bps),
flanking_bps)]
}
### reverse complement Tumor_Seq_Allele2 if ref is either G or A
Tumor_Seq_Allele2_CT <- maf[Variant_Type == "SNP",
ifelse(Reference_Allele %in% c('G', 'A'),
revc(Tumor_Seq_Allele2),
Tumor_Seq_Allele2)]
### combine Ref_Tri and Tumor_Seq_Allele2
### (with conditional reverse compliment)
maf[Variant_Type == "SNP",
Mut_Tri := paste0(substr(Ref_Tri, 1, 2),
Tumor_Seq_Allele2_CT,
substr(Ref_Tri, 3, 3))]
maf
}
# For each sample, this function returns
# the fraction of low allele fraction mutations that have NCG > NTG context
# Useful to identify samples with FFPE artifacts.
identify_artifacts <- function(maf, threshold = 0.1){
m <- merge(
maf[t_var_freq < threshold, list(frac_lo = mean(Mut_Tri %like% "CTG$")),
keyby = Tumor_Sample_Barcode],
maf[t_var_freq > threshold, list(frac_hi = mean(Mut_Tri %like% "CTG$")),
keyby = Tumor_Sample_Barcode])
m[, frac_diff := frac_lo - frac_hi]
m <- m[order(frac_diff)]
m
}
# Remove low allele fraction mutations with the C>T context
filter_artifacts <- function(maf, threshold = 0.1) {
if (!('FILTER' %in% names(maf))) maf$FILTER = '.'
maf.annotated = maf[, ffpe_artifact := t_var_freq <= threshold & Mut_Tri %like% ".CT."]
maf.annotated <- maf[, FILTER := ifelse(FILTER == '.' & ffpe_artifact == TRUE, 'ffpe_artifact',
ifelse(FILTER != '.' & ffpe_artifact == TRUE,
paste0(FILTER, ',ffpe_artifact'), FILTER))]
maf.annotated
}
if (!interactive()) {
pkgs = c('data.table', 'argparse')
junk <- lapply(pkgs, function(p){suppressPackageStartupMessages(require(p, character.only = T))})
rm(junk)
parser=ArgumentParser()
parser$add_argument('-m', '--maf', type = 'character', help = 'SOMATIC_FACETS.vep.maf file', default = 'stdin')
parser$add_argument('-t', '--threshold', type = 'double', default = 10, help = 'allele fraction threshold (pct.)')
parser$add_argument('-i', '--identify', action = "store_true", default = FALSE, help = 'Identify samples with FFPE artifacts')
parser$add_argument('-f', '--filter', action = "store_true", default = TRUE, help = 'Filter FFPE artifacts')
parser$add_argument('-o', '--outfile', type = 'character', help = 'Output file', default = 'stdout')
args=parser$parse_args()
outfile = args$outfile
threshold <- as.numeric(args$threshold) / 100
if(args$identify & args$filter) stop("Cannot identify and filter at the same time")
if (args$maf == 'stdin') { maf = suppressWarnings(fread('cat /dev/stdin', colClasses=c(Chromosome="character"), showProgress = F))
} else { maf <- suppressWarnings(fread(args$maf, colClasses=c(Chromosome="character"), showProgress = F)) }
if(args$identify) {
maf <- add_mut_tri(maf)
m <- identify_artifacts(maf, threshold = threshold)
write.table(format(m, digits=2), stdout(), col.names = T, row.names = F, sep = '\t', quote = F)
} else if(args$filter) {
maf <- add_mut_tri(maf)
maf.out <- filter_artifacts(maf)
if (outfile == 'stdout') { write.table(maf.out, na="", stdout(), sep = "\t", col.names = T, row.names = F, quote = F)
} else { write.table(maf.out, outfile, na="", sep = "\t", col.names = T, row.names = F, quote = F) }
}
}