-
Notifications
You must be signed in to change notification settings - Fork 1
/
DESeq_for_ATACseq.Rmd
243 lines (179 loc) · 9.6 KB
/
DESeq_for_ATACseq.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
---
title: "DESeq"
output: html_document
---
```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
```
Reference: https://github.com/ThomasCarroll/ATAC_Workshop/blob/master/RU_ATAC_Workshop.Rmd
```{r packages}
# ## In conda r-environment:
# Set up R conda environment.
# $ conda create -n r-environment r-essentials r-base
# $ source activate r-environment
# conda install r-ggplot2
# conda install -c bioconda bioconductor-soggi
# conda install -c bioconda bioconductor-deseq2
# ##
#install.packages("knitr")
#install.packages("rmdformats")
#install.packages("dplyr")
#install.packages("DT")
#install.packages("tidyr")
#install.packages("ggplot2")
#install.packages("magrittr")
#install.packages("devtools")
#source("https://bioconductor.org/biocLite.R")
# Needed for mac and Linux only
#biocLite("Rsubread")
#
#biocLite("Rsamtools")
#biocLite("soGGi")
#biocLite("GenomicAlignments")
#biocLite("TxDb.Hsapiens.UCSC.hg19.knownGene")
#biocLite("rtracklayer")
#biocLite("ChIPQC")
#biocLite("ChIPseeker")
#biocLite("rGREAT")
#biocLite("limma")
#biocLite("DESeq2")
#biocLite("tracktables")
#biocLite("clusterProfiler")
#biocLite("org.Mm.eg.db")
#biocLite("MotifDb")
#biocLite("Biostrings")
#biocLite("BSgenome.Hsapiens.UCSC.hg19")
library(TxDb.Hsapiens.UCSC.hg19.knownGene)
library(ChIPseeker)
library(org.Hs.eg.db)
library(ChIPQC)
library(soGGi)
library(dplyr)
library(magrittr)
library(GenomicRanges)
library(limma)
library(Rsubread)
library(DESeq2)
library(clusterProfiler)
library(rGREAT)
library(Rsamtools)
library(ggplot2)
library(rtracklayer)
library(Biostrings)
library(clusterProfiler)
library(GenomicAlignments)
library(tracktables)
```
Set working directory.
```{r setwd, echo=TRUE,eval=TRUE,cache=TRUE}
setwd("/athena/khuranalab/scratch/kac2053/projects/ctDNA/plots/")
```
## Differential ATAC-seq
We have briefly reviewed the processing and initial analysis of one ATAC-seq sample using R.
In the next part we will look at how we can identify changes in open regions using R/Bioconductor.
Here we will take an approach akin that in Diffbind and reasonably esatablished in ATAC-seq analysis.
First, We will define a set of non-redundant peaks present in at least 2 samples and use these to assess changes in nuc-free ATAC-seq signal using DESeq2.
## Identifying a set of non-redundant peaks.
Here we will use soGGi to produce merge our open regions from all samples into a set of non-redundant (no overlapping regions) open regions present in any sample.
```{r processData_consensus, echo=TRUE,eval=TRUE,cache=TRUE}
# Get narrowpeak files.
sample_dir_fanying <- "/athena/khuranalab/scratch/kac2053/projects/ctDNA/Fanying_NEPC_CRPC_ATAC-seq/macs2"
sample_dir_park <- "/athena/khuranalab/scratch/kac2053/projects/ctDNA/GSE118207/macs2"
sample_dir_GM12878 <- "/athena/khuranalab/scratch/kac2053/projects/ctDNA/GSE47753/macs2"
peaks_fanying <- dir(sample_dir_fanying, pattern = "*header.narrowPeak", full.names = TRUE)
peaks_park <- dir(sample_dir_park, pattern = "*header.narrowPeak", full.names = TRUE)
peaks_GM12878 <- dir(sample_dir_GM12878, pattern = "*header.narrowPeak", full.names = TRUE)
peaks <- c(peaks_fanying, peaks_park, peaks_GM12878)
# Obtain sample names.
peak.names <- strsplit(peaks, "/")
peak.names <- lapply(peak.names, tail, n = 1)
peak.names <- unlist(peak.names)
peak.names <- strsplit(peak.names,"_")
peak.names <- lapply(peak.names, head, n = 1)
peak.names <- unlist(peak.names)
# Get GRanges in narrowpeak files and merge all GRanges as set of non-redundant (no overlapping regions) open regions using soGGi.
myPeaks <- lapply(peaks,ChIPQC:::GetGRanges,simple=TRUE)
names(myPeaks) <- peak.names
Group <- factor(c("CRPC", "CRPC", "NEPC", "NEPC", "CRPC", "CRPC", "NEPC", "NEPC", "lymphoblastoid", "lymphoblastoid", "lymphoblastoid", "lymphoblastoid", "lymphoblastoid", "lymphoblastoid", "lymphoblastoid"))
prostatevslymph <- factor(c("prostate", "prostate", "prostate", "prostate", "prostate", "prostate", "prostate", "prostate", "lymphoblastoid", "lymphoblastoid", "lymphoblastoid", "lymphoblastoid", "lymphoblastoid", "lymphoblastoid", "lymphoblastoid"))
consensusToCount <- soGGi:::runConsensusRegions(GRangesList(myPeaks),"none")
```
## PCA of overlaps (occupancy analysis).
We can also Diffbind style PCA analysis (Occupancy analysis in Diffbind) of peak overlaps to get an overall view of correspondance between peak calls.
Here we pass the matrix of peak overlaps from soGGi to prcomp function and plot the results in ggplot2.
```{r processData_consensus_diffbindStylePCA, echo=TRUE,eval=TRUE,cache=TRUE,dependson="processData_consensus"}
library(tidyr)
savePlot <- function(myPlot, plotname) {
pdf(paste0(plotname, ".pdf"))
print(myPlot)
dev.off()
}
pca <- as.data.frame(elementMetadata(consensusToCount)) %>%
dplyr::select(-consensusIDs) %>%
as.matrix %>% t %>% prcomp %>% .$x %>% data.frame %>%
mutate(Samples=rownames(.)) %>%
mutate(Group=gsub("_\\d","",Samples)) %>%
ggplot(aes(x=PC1,y=PC2,colour=Group))+geom_point(size=5)
savePlot(pca, "pca_occurence")
```
## Counting for differential ATAC-seq.
The presense or absense of a peak does not fully capture the changes in ATAC-seq signal observed in a genome broswer. Identifying changes of ATAC-seq signal within peaks will allow us to better capture ATAC-seq signal differences.
To do this we will borrow some methods from RNA-seq, namely DESeq2, to evaluate changes in ATAC-seq signal between groups/tissues.
First we will filter our peaks in a manner similar to Diffbind, where we keep only peaks which are present in at least two replicates.
```{r processData_consensusCounting, echo=TRUE,eval=TRUE,cache=TRUE,dependson="processData_consensus"}
library(Rsubread)
# Count the number of occurences in each genomic range and store as a vector.
occurrences <- elementMetadata(consensusToCount) %>% as.data.frame %>% dplyr::select(-consensusIDs) %>% rowSums
table(occurrences) %>% rev %>% cumsum
# Only save the genomic ranges that occur in 2+ samples.
consensusToCount <- consensusToCount[occurrences >= 2,]
```
Now we have to set of regions to count in we can use Rsubread to count paired reads landing in peaks.
**Note that Rsubread allows for maximum and minimum fragment lengths!**
**This takes awhile so run this after workshop if interested**
**Takes ~ 10 minutes on 3.1 GHz Intel Core i7 Mac pro**
```{r processData_consensusCounting2, echo=TRUE,eval=FALSE,cache=TRUE,dependson="processData_consensusCounting"}
bam_fanying <- dir("/athena/khuranalab/scratch/kac2053/projects/ctDNA/Fanying_NEPC_CRPC_ATAC-seq/original_data", full.names = TRUE,pattern = "*.\\.bam$")
bam_park <- dir("/athena/khuranalab/scratch/kac2053/projects/ctDNA/GSE118207/bam", full.names = TRUE,pattern = "*rmdups.bam$")
bam_GM12878 <- dir("/athena/khuranalab/scratch/kac2053/projects/ctDNA/GSE47753/bam", full.names = TRUE,pattern = "SRR891274_rmdups.bam$")
bamsToCount <- c(bam_fanying, bam_park, bam_GM12878)
#indexBam(bamsToCount)
regionsToCount <- data.frame(GeneID=paste("ID",seqnames(consensusToCount),start(consensusToCount),end(consensusToCount),sep="_"),Chr=seqnames(consensusToCount),Start=start(consensusToCount),End=end(consensusToCount),Strand=strand(consensusToCount))
fcResults <- featureCounts(bamsToCount,annot.ext=regionsToCount,isPairedEnd = TRUE,countMultiMappingReads = FALSE,maxFragLength=100)
myCounts <- fcResults$counts
colnames(myCounts) <- c("C4.2_1","C4.2_2","H660_1","H660_2","VCaP_1", "VCaP_2", "MSKCC-EF1", "H660_Park", "GM12878_50k_1", "GM12878_50k_2", "GM12878_50k_3", "GM12878_50k_4", "GM12878_500_1", "GM12878_500_2", "GM12878_500_3")
save(myCounts,file=paste0("countsFromATAC.RData"))
```
## DESeq2 for differential ATAC-seq.
With our counts of fragments in nucleosome free regions we can now contruct a DESeq2 object and perform a PCA again but this time using signal within peaks, not just occurrence in regions.
We pass the GRanges of regions we count to DESeqDataSetFromMatrix function so as to access these from DESeq2 later.
```{r processData_DEseq2_PCA, echo=TRUE,eval=TRUE,cache=TRUE}
library(DESeq2)
library(ggrepel)
load(paste0(sample_dir,"countsFromATAC.RData"))
metaData <- data.frame(Group,row.names=colnames(myCounts))
atacDDS <- DESeqDataSetFromMatrix(myCounts,metaData,~Group,rowRanges=consensusToCount)
atacDDS <- DESeq(atacDDS) #view atacDDS with > counts(atacDDS, normalized=TRUE)
atac_Rlog <- rlog(atacDDS)
pca_signal <- plotPCA(atac_Rlog,intgroup="CRPCvsNEPCvslymphoblastoid",ntop=nrow(atac_Rlog)) + geom_label_repel(label = rownames(colData(atac_Rlog)), nudge_y=50, nudge_x=50)
savePlot(pca_signal, "pca_signal")
save(atacDDS,file=paste0("atacDDS_Park_Fanying_GM12878_CRPCvsNEPCvslymphoblastoid.RData"))
save(atac_Rlog,file=paste0("atac_Rlog_Park_Fanying_GM12878_CRPCvsNEPCvslymphoblastoid.RData"))
```
Plot unsupervised heatmap. - Karen Chu 10/29/18
```{r heatmap, echo=TRUE,eval=TRUE,cache=TRUE}
library(ComplexHeatmap)
load(paste0("atacDDS_Park_Fanying_GM12878_CRPCvsNEPCvslymphoblastoid.RData"))
load(paste0("atac_Rlog_Park_Fanying_GM12878_CRPCvsNEPCvslymphoblastoid.RData"))
atac_heatmap <- counts(atacDDS, normalized=TRUE)
atac_var_heatmap <- apply(atac_heatmap,1,var)
var_regions <- sort(atac_var_heatmap, decreasing = TRUE)
var_regions <- var_regions[1:1000] #Top 1000 variance regions.
# Subset atac-seq data to contain only top 1000 variance regions.
atac_1000var_heatmap <- subset(atac_heatmap, rownames(atac_heatmap) %in% names(var_regions) )
atac_log_heatmap <- log2(atac_1000var_heatmap+1)
pdf(paste0("differential_accessibility_CRPCvsNEPCvslymphoblastoid.pdf"))
Heatmap(atac_log_heatmap,column_title = "top 1000 variant regions", name = "normalized peak signal", show_row_names = FALSE)
dev.off()
#savePlot(atac_heatmap, "diffaccess_heatmap_prostatevslymph")
```