-
Notifications
You must be signed in to change notification settings - Fork 0
/
snATAC_r3fan.R
65 lines (58 loc) · 3.02 KB
/
snATAC_r3fan.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
setwd("e:/Soloway/sciATAC/")
##################################################################
# 1) reads per barcode >= 1000
# 2) consecutive promoter coverage > 3%
# 3) reads in peak ratio >= 20%
# NOTE: The cutoff can vary singificantly between different samples
# 4) filter potential doublets (OPTIONAL NOT SUGGUESTED, UNSTABLE)
##################################################################
setwd("20180321_mix_data//")
consecutive_promoters <- read.table("20180316_p56_redo/mm10_consecutive_promoters.bed")
num_of_reads = read.table("20180316_p56_redo/p56.rep1.reads_per_cell")
promoter_cov = read.table("20180316_p56_redo/p56.rep1.promoter_cov")
read_in_peak = read.table("20180316_p56_redo/p56.rep1.reads_in_peak")
qc = num_of_reads;
colnames(qc) <- c("barcode", "num_of_reads")
qc$promoter_cov = 0;
qc$read_in_peak = 0;
qc$promoter_cov[match(promoter_cov$V1, qc$barcode)] = promoter_cov$V2/nrow(consecutive_promoters)
qc$read_in_peak[match(read_in_peak$V1, qc$barcode)] = read_in_peak$V2
qc$ratio = qc$read_in_peak/qc$num_of_reads
idx <- which(qc$ratio > 0.2 & qc$num_of_reads > 1000)
qc_sel <- qc[idx,]
# among these cells, further filter PUTATIVE DOUBLETS ((OPTIONAL NOT SUGGUESTED, UNSTABLE))
#pvalues <- sapply(qc_sel$num_of_reads, function(x)
# poisson.test(x, mean(qc_sel$num_of_reads),
# alternative="greater")$p.value)
#fdrs <- p.adjust(pvalues, "BH")
#idx <- which(fdrs < 1e-2)
write.table(qc_sel[,1], file = "2kb.no_ConstPromFiltering.xgi", append = FALSE,
quote = FALSE, sep = "\t", eol = "\n",
na = "NA", dec = ".", row.names = FALSE,
col.names = FALSE, qmethod = c("escape", "double"),
fileEncoding = "")
# Feature Selection
##################################################################
# 1) Filter top 5% peaks
# 2) Filter promoters
# 3) extend and merge
##################################################################
library(GenomicRanges)
peaks.df <- read.table("20180316_p56_redo/p56.rep1_peaks.narrowPeak")
# remove top 5% peaks
cutoff <- quantile((peaks.df$V5), probs = 0.95)
peaks.df <- peaks.df[which(peaks.df$V5 < cutoff),]
proms.df <- read.table("20180316_p56_redo/refseq_mm10.chopped.bed")
peaks.gr <- GRanges(peaks.df[,1], IRanges(peaks.df[,2], peaks.df[,3]))
proms.gr <- GRanges(proms.df[,1], IRanges(proms.df[,2], proms.df[,3]))
peaks.sel.gr <- proms.df[-queryHits(findOverlaps(peaks.gr, proms.gr))]
peaks.sel.ex.gr <- resize(reduce(resize(peaks.sel.gr, 1000,
fix="center")), 1000, fix="center")
# note that I changed the below input from peaks.sel.ex.gr to peaks.sel.gr because I didn't want to resize the 2kb file
peaks.sel.ex.df <- as.data.frame(peaks.sel.gr)[,1:3]
write.table(peaks.sel.ex.df, file = "2kb_window.ygi",
append = FALSE, quote = FALSE, sep = "\t",
eol = "\n", na = "NA", dec = ".",
row.names = FALSE, col.names = FALSE,
qmethod = c("escape", "double"),
fileEncoding = "")