Skip to content

Latest commit

 

History

History
610 lines (493 loc) · 21.6 KB

coverage_rep_RNAseq_test.md

File metadata and controls

610 lines (493 loc) · 21.6 KB

coverage_rep_RNAseq_test

Ruijuan Li
4/15/2017

# goal: 
# determine the most cost effecient coverage (1 VS 3/4 VS 1/2 VS 1/4) & rep (3 VS 2 VS 1) # for illumina RNAseq by checking the correlation of glmfit fitted values for reduced dataset with the full dataset (3 reps & full coverage)

# Trimmomatic trimming --> STAR mapping --> ReadsPerGene.out.tab    

function

library(edgeR)
## Loading required package: limma
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.2.5
##### with reps 
get.fitted.value <- function(reads.formated){

  # check test data type 
  gene_num_dim <- dim(reads.formated) 
  cat(cat(gene_num_dim[1]) , cat(" genes ") , cat("exsited in this dataset with "), cat(gene_num_dim[2]/2), cat(" reps")) 
  cat("\n") 
  
  # set up sample description 
  reads.formated.sample <-data.frame(file=colnames(reads.formated),
                               genotype=factor(gsub("([[:print:]]+)(_)([[:print:]]+)(_)(1|2|3)","\\1",colnames(reads.formated))),	
                               stage=factor(gsub("([[:print:]]+)(_)([[:print:]]+)(_)(1|2|3)","\\3",colnames(reads.formated))),	
                               group=factor(gsub("([[:print:]]+)(_)([[:print:]]+)(_)(1|2|3)","\\1\\3",colnames(reads.formated)))
  )
  
  # filter based on read count 
  cat("number of expressed genes if filtered based greater than 10 reads in at least 3 libs:")
  reads.formated.small <- reads.formated[rowSums(reads.formated > 10) >= 3,]
  cat(dim(reads.formated.small)[1])
  cat("\n")
  
  # normalize 
  dge.reads.formated <- DGEList(counts=reads.formated, group=reads.formated.sample$group)
  dge.reads.formated <- calcNormFactors(dge.reads.formated, method = "TMM") 
  
  # pairwise comparison using GLM model 
  # design matrix # w/o batch effect
  design.reads.formated <- model.matrix(~0+group, data = reads.formated.sample)
  dge.reads.formated <- estimateGLMCommonDisp(dge.reads.formated, design.reads.formated,verbose = F) # Disp = 0.13675 , BCV = 0.3698
  dge.reads.formated <- estimateGLMTrendedDisp(dge.reads.formated,design.reads.formated)
  dge.reads.formated <- estimateGLMTagwiseDisp(dge.reads.formated,design.reads.formated)
  
  ## fit model & look for DEGs
  fit.reads.formated <- glmFit(dge.reads.formated, design.reads.formated)
  
  # colnames(fit.flowering.halfhalf$coefficients)
  lrt.reads.formated <- glmLRT(fit.reads.formated, contrast = c(1, -1))
  number.reads.formated <- summary(de.reads.formated <- decideTestsDGE(lrt.reads.formated, p=0.05))
  DEgene.reads.formated <- topTags(lrt.reads.formated,n = Inf)$table[topTags(lrt.reads.formated,n = Inf)$table$FDR<0.05,]
  
  cat("number of differentially expressed genes between genotypes:")
  cat(dim(DEgene.reads.formated)[1])
  cat("\n")
  
  return(fit.reads.formated)
} 

##### w/o rep 
get.fitted.value.no.rep <- function(reads.formated){

  # check test data type 
  gene_num_dim <- dim(reads.formated) 
  cat(cat(gene_num_dim[1]) , cat(" genes ") , cat("exsited in this dataset with "), cat(gene_num_dim[2]/2), cat(" reps")) 
  cat("\n") 
  
  # set up sample description 
  reads.formated.sample <-data.frame(file=colnames(reads.formated),
                               genotype=factor(gsub("([[:print:]]+)(_)([[:print:]]+)(_)(1|2|3)","\\1",colnames(reads.formated))),	
                               stage=factor(gsub("([[:print:]]+)(_)([[:print:]]+)(_)(1|2|3)","\\3",colnames(reads.formated))),	
                               group=factor(gsub("([[:print:]]+)(_)([[:print:]]+)(_)(1|2|3)","\\1\\3",colnames(reads.formated)))
  )
  
  # filter based on read count 
  cat("number of expressed genes if filtered based greater than 10 reads in 2 libs:")
  reads.formated.small <- reads.formated[rowSums(reads.formated > 10) >= 2,]
  cat(dim(reads.formated.small)[1])
  cat("\n")
  
  # normalize 
  dge.reads.formated <- DGEList(counts=reads.formated, group=reads.formated.sample$group)
  dge.reads.formated <- calcNormFactors(dge.reads.formated, method = "TMM") 
  
  # pairwise comparison using GLM model 
  # design matrix # w/o batch effect
  design.reads.formated <- model.matrix(~0+group, data = reads.formated.sample)
  # pick a reasonable dispersion value for data w/o rep to account for biological varaibility ... 
  bcv <- 0.4 # pick 0.4 because this is close to the value I get from 3 reps 
  dge.reads.formated <- estimateGLMCommonDisp(dge.reads.formated, design.reads.formated,verbose = F) # Disp = 0.13675 , BCV = 0.3698
  cat("set common dispersion to 0.4\n")  
  dge.reads.formated$common.dispersion <- bcv ^ 2
  # dge.reads.formated <- estimateGLMTrendedDisp(dge.reads.formated,design.reads.formated)
  # dge.reads.formated <- estimateGLMTagwiseDisp(dge.reads.formated,design.reads.formated)

  ## fit model & look for DEGs
  fit.reads.formated <- glmFit(dge.reads.formated, design.reads.formated)
  
  # colnames(fit.flowering.halfhalf$coefficients)
  lrt.reads.formated <- glmLRT(fit.reads.formated, contrast = c(1, -1))
  number.reads.formated <- summary(de.reads.formated <- decideTestsDGE(lrt.reads.formated, p=0.05))
  DEgene.reads.formated <- topTags(lrt.reads.formated,n = Inf)$table[topTags(lrt.reads.formated,n = Inf)$table$FDR<0.05,]
  
  cat("number of differentially expressed genes between genotypes:")
  cat(dim(DEgene.reads.formated)[1])
  cat("\n")
  
  return(fit.reads.formated) 
}  

coverage test (all VS 3/4 VS 1/2 VS 1/4)

######### all trimmed data 
flowering.read.count.all <- read.table("/Users/ruijuanli/Desktop/Brassica_project/KIAT_RNA_seq/read_count/read.count.flowering.all.tsv", header = T, check.names = F)
rownames(flowering.read.count.all) <- flowering.read.count.all[,1]
flowering.read.count.all <- flowering.read.count.all[,-1]

# format data 
head(flowering.read.count.all)
##                 2   6 Ae_Gae_2 Ae_Gae_3 All1_Gae_2 All1_Gae_3
## BnaC09g12820D  25  43       14       18          0          8
## BnaC09g12810D 341 295      544      398        374        519
## BnaC09g12800D 102  74       66       51         75         71
## BnaC09g12790D   0   0        0        0          0          0
## BnaC09g12780D 129  99       59       69         63         78
## BnaC09g12770D 112 109       65       50         65         69
dim(flowering.read.count.all) # 101040     6 
## [1] 101040      6
colnames(flowering.read.count.all)
## [1] "2"          "6"          "Ae_Gae_2"   "Ae_Gae_3"   "All1_Gae_2"
## [6] "All1_Gae_3"
# replace sample ID 
colnames(flowering.read.count.all) <- c("Da-Ol-1_flowering_3","Da-Ae_flowering_3","Da-Ae_flowering_1","Da-Ae_flowering_2","Da-Ol-1_flowering_1","Da-Ol-1_flowering_2")
head(flowering.read.count.all)
##               Da-Ol-1_flowering_3 Da-Ae_flowering_3 Da-Ae_flowering_1
## BnaC09g12820D                  25                43                14
## BnaC09g12810D                 341               295               544
## BnaC09g12800D                 102                74                66
## BnaC09g12790D                   0                 0                 0
## BnaC09g12780D                 129                99                59
## BnaC09g12770D                 112               109                65
##               Da-Ae_flowering_2 Da-Ol-1_flowering_1 Da-Ol-1_flowering_2
## BnaC09g12820D                18                   0                   8
## BnaC09g12810D               398                 374                 519
## BnaC09g12800D                51                  75                  71
## BnaC09g12790D                 0                   0                   0
## BnaC09g12780D                69                  63                  78
## BnaC09g12770D                50                  65                  69
fitted.all.3 <- get.fitted.value(flowering.read.count.all)
## 101040 genes exsited in this dataset with 3 reps
## number of expressed genes if filtered based greater than 10 reads in at least 3 libs:48968
## number of differentially expressed genes between genotypes:7338
##### half of the reads 
flowering.read.count.half <- read.table("/Users/ruijuanli/Desktop/Brassica_project/KIAT_RNA_seq/read_count/read.count.flowering.half.tsv", header = T, check.names = F)
rownames(flowering.read.count.half) <- flowering.read.count.half[,1]
flowering.read.count.half <- flowering.read.count.half[,-1]

# format data 
head(flowering.read.count.half)
##                 2   6 Ae_Gae_2 Ae_Gae_3 All1_Gae_2 All1_Gae_3
## BnaC09g12820D   9  25        9        7          0          3
## BnaC09g12810D 179 156      303      201        184        260
## BnaC09g12800D  47  35       33       33         35         36
## BnaC09g12790D   0   0        0        0          0          0
## BnaC09g12780D  56  54       30       31         30         42
## BnaC09g12770D  66  51       38       21         35         43
dim(flowering.read.count.half) # 101040     6 
## [1] 101040      6
colnames(flowering.read.count.half)
## [1] "2"          "6"          "Ae_Gae_2"   "Ae_Gae_3"   "All1_Gae_2"
## [6] "All1_Gae_3"
# replace sample ID 
colnames(flowering.read.count.half) <- c("Da-Ol-1_flowering_3","Da-Ae_flowering_3","Da-Ae_flowering_1","Da-Ae_flowering_2","Da-Ol-1_flowering_1","Da-Ol-1_flowering_2")
head(flowering.read.count.half)
##               Da-Ol-1_flowering_3 Da-Ae_flowering_3 Da-Ae_flowering_1
## BnaC09g12820D                   9                25                 9
## BnaC09g12810D                 179               156               303
## BnaC09g12800D                  47                35                33
## BnaC09g12790D                   0                 0                 0
## BnaC09g12780D                  56                54                30
## BnaC09g12770D                  66                51                38
##               Da-Ae_flowering_2 Da-Ol-1_flowering_1 Da-Ol-1_flowering_2
## BnaC09g12820D                 7                   0                   3
## BnaC09g12810D               201                 184                 260
## BnaC09g12800D                33                  35                  36
## BnaC09g12790D                 0                   0                   0
## BnaC09g12780D                31                  30                  42
## BnaC09g12770D                21                  35                  43
fitted.half.3 <- get.fitted.value(flowering.read.count.half)
## 101040 genes exsited in this dataset with 3 reps
## number of expressed genes if filtered based greater than 10 reads in at least 3 libs:41412
## number of differentially expressed genes between genotypes:6006
##### 1/4 of the reads 
flowering.read.count.halfhalf <- read.table("/Users/ruijuanli/Desktop/Brassica_project/KIAT_RNA_seq/read_count/read.count.flowering.halfhalf.tsv", header = T, check.names = F)
rownames(flowering.read.count.halfhalf) <- flowering.read.count.halfhalf[,1]
flowering.read.count.halfhalf <- flowering.read.count.halfhalf[,-1]

# format data 
head(flowering.read.count.halfhalf)
##                2  6 Ae_Gae_2 Ae_Gae_3 All1_Gae_2 All1_Gae_3
## BnaC09g12820D  6 15        4        5          0          2
## BnaC09g12810D 84 77      156      117         86        144
## BnaC09g12800D 25 19       13       20         21         13
## BnaC09g12790D  0  0        0        0          0          0
## BnaC09g12780D 33 26       13       11         13         29
## BnaC09g12770D 33 23       18       11         16         26
dim(flowering.read.count.halfhalf) # 101040     6 
## [1] 101040      6
colnames(flowering.read.count.halfhalf)
## [1] "2"          "6"          "Ae_Gae_2"   "Ae_Gae_3"   "All1_Gae_2"
## [6] "All1_Gae_3"
# replace sample ID 
colnames(flowering.read.count.halfhalf) <- c("Da-Ol-1_flowering_3","Da-Ae_flowering_3","Da-Ae_flowering_1","Da-Ae_flowering_2","Da-Ol-1_flowering_1","Da-Ol-1_flowering_2") 
head(flowering.read.count.halfhalf)
##               Da-Ol-1_flowering_3 Da-Ae_flowering_3 Da-Ae_flowering_1
## BnaC09g12820D                   6                15                 4
## BnaC09g12810D                  84                77               156
## BnaC09g12800D                  25                19                13
## BnaC09g12790D                   0                 0                 0
## BnaC09g12780D                  33                26                13
## BnaC09g12770D                  33                23                18
##               Da-Ae_flowering_2 Da-Ol-1_flowering_1 Da-Ol-1_flowering_2
## BnaC09g12820D                 5                   0                   2
## BnaC09g12810D               117                  86                 144
## BnaC09g12800D                20                  21                  13
## BnaC09g12790D                 0                   0                   0
## BnaC09g12780D                11                  13                  29
## BnaC09g12770D                11                  16                  26
fitted.quarter.3 <- get.fitted.value(flowering.read.count.halfhalf)
## 101040 genes exsited in this dataset with 3 reps
## number of expressed genes if filtered based greater than 10 reads in at least 3 libs:32179
## number of differentially expressed genes between genotypes:4597
### 3/4 
flowering.read.count.3_4 <- read.table("/Users/ruijuanli/Desktop/Brassica_project/KIAT_RNA_seq/read_count/read.count.flowering.3_4.tsv", header = T, check.names = F)
rownames(flowering.read.count.3_4) <- flowering.read.count.3_4[,1]
flowering.read.count.3_4 <- flowering.read.count.3_4[,-1]

# format data 
head(flowering.read.count.3_4)
##                 2   6 Ae_Gae_2 Ae_Gae_3 All1_Gae_2 All1_Gae_3
## BnaC09g12820D  18  35       12       11          0          6
## BnaC09g12810D 267 234      419      296        273        388
## BnaC09g12800D  75  50       54       43         52         49
## BnaC09g12790D   0   0        0        0          0          0
## BnaC09g12780D  90  70       46       49         50         60
## BnaC09g12770D  84  84       50       40         49         56
dim(flowering.read.count.3_4) # 101040     6 
## [1] 101040      6
colnames(flowering.read.count.3_4)
## [1] "2"          "6"          "Ae_Gae_2"   "Ae_Gae_3"   "All1_Gae_2"
## [6] "All1_Gae_3"
# replace sample ID 
colnames(flowering.read.count.3_4) <- c("Da-Ol-1_flowering_3","Da-Ae_flowering_3","Da-Ae_flowering_1","Da-Ae_flowering_2","Da-Ol-1_flowering_1","Da-Ol-1_flowering_2")
head(flowering.read.count.3_4)
##               Da-Ol-1_flowering_3 Da-Ae_flowering_3 Da-Ae_flowering_1
## BnaC09g12820D                  18                35                12
## BnaC09g12810D                 267               234               419
## BnaC09g12800D                  75                50                54
## BnaC09g12790D                   0                 0                 0
## BnaC09g12780D                  90                70                46
## BnaC09g12770D                  84                84                50
##               Da-Ae_flowering_2 Da-Ol-1_flowering_1 Da-Ol-1_flowering_2
## BnaC09g12820D                11                   0                   6
## BnaC09g12810D               296                 273                 388
## BnaC09g12800D                43                  52                  49
## BnaC09g12790D                 0                   0                   0
## BnaC09g12780D                49                  50                  60
## BnaC09g12770D                40                  49                  56
fitted.three.quarter.3 <- get.fitted.value(flowering.read.count.3_4) 
## 101040 genes exsited in this dataset with 3 reps
## number of expressed genes if filtered based greater than 10 reads in at least 3 libs:45933
## number of differentially expressed genes between genotypes:6825

rep test: 2 reps & no rep

##### two reps 
flowering.read.count.two.rep <- read.table("/Users/ruijuanli/Desktop/Brassica_project/KIAT_RNA_seq/read_count/read.count.flowering.two.rep.tsv", header = T, check.names = F)
rownames(flowering.read.count.two.rep) <- flowering.read.count.two.rep[,1]
flowering.read.count.two.rep <- flowering.read.count.two.rep[,-1]

# format data 
head(flowering.read.count.two.rep)
##                 2   6 Ae_Gae_2 All1_Gae_2
## BnaC09g12820D  25  43       14          0
## BnaC09g12810D 341 295      544        374
## BnaC09g12800D 102  74       66         75
## BnaC09g12790D   0   0        0          0
## BnaC09g12780D 129  99       59         63
## BnaC09g12770D 112 109       65         65
dim(flowering.read.count.two.rep) # 101040     6 
## [1] 101040      4
colnames(flowering.read.count.two.rep)
## [1] "2"          "6"          "Ae_Gae_2"   "All1_Gae_2"
# replace sample ID 
colnames(flowering.read.count.two.rep) <- c("Da-Ol-1_flowering_1","Da-Ae_flowering_1","Da-Ae_flowering_2","Da-Ol-1_flowering_2")
head(flowering.read.count.two.rep)
##               Da-Ol-1_flowering_1 Da-Ae_flowering_1 Da-Ae_flowering_2
## BnaC09g12820D                  25                43                14
## BnaC09g12810D                 341               295               544
## BnaC09g12800D                 102                74                66
## BnaC09g12790D                   0                 0                 0
## BnaC09g12780D                 129                99                59
## BnaC09g12770D                 112               109                65
##               Da-Ol-1_flowering_2
## BnaC09g12820D                   0
## BnaC09g12810D                 374
## BnaC09g12800D                  75
## BnaC09g12790D                   0
## BnaC09g12780D                  63
## BnaC09g12770D                  65
fitted.all.two.rep <- get.fitted.value(flowering.read.count.two.rep)
## 101040 genes exsited in this dataset with 2 reps
## number of expressed genes if filtered based greater than 10 reads in at least 3 libs:44554
## number of differentially expressed genes between genotypes:4790
##### no rep 
flowering.read.count.no.rep <- read.table("/Users/ruijuanli/Desktop/Brassica_project/KIAT_RNA_seq/read_count/read.count.flowering.no.rep.tsv", header = T, check.names = F)
rownames(flowering.read.count.no.rep) <- flowering.read.count.no.rep[,1]
flowering.read.count.no.rep <- flowering.read.count.no.rep[,-1]

# format data 
head(flowering.read.count.no.rep)
##                 2   6
## BnaC09g12820D  25  43
## BnaC09g12810D 341 295
## BnaC09g12800D 102  74
## BnaC09g12790D   0   0
## BnaC09g12780D 129  99
## BnaC09g12770D 112 109
dim(flowering.read.count.no.rep) # 101040     6 
## [1] 101040      2
colnames(flowering.read.count.no.rep)
## [1] "2" "6"
# replace sample ID 
colnames(flowering.read.count.no.rep) <- c("Da-Ol-1_flowering_1","Da-Ae_flowering_1")
head(flowering.read.count.no.rep)
##               Da-Ol-1_flowering_1 Da-Ae_flowering_1
## BnaC09g12820D                  25                43
## BnaC09g12810D                 341               295
## BnaC09g12800D                 102                74
## BnaC09g12790D                   0                 0
## BnaC09g12780D                 129                99
## BnaC09g12770D                 112               109
fitted.all.no.rep <- get.fitted.value.no.rep(flowering.read.count.no.rep) 
## 101040 genes exsited in this dataset with 1 reps
## number of expressed genes if filtered based greater than 10 reads in 2 libs:44151
## Warning in estimateGLMCommonDisp.default(y = y$counts, design = design, :
## No residual df: setting dispersion to NA
## set common dispersion to 0.4
## number of differentially expressed genes between genotypes:5067

correlation test for the fitted coefficient

# fitted coefficient 
ref.vs.rep2 <- cor(fitted.all.3$coefficients, fitted.all.two.rep$coefficients)
ref.vs.rep1 <- cor(fitted.all.3$coefficients, fitted.all.no.rep$coefficients)
ref.vs.three.quarter <- cor(fitted.all.3$coefficients, fitted.three.quarter.3$coefficients)
ref.vs.one.quarter <- cor(fitted.all.3$coefficients, fitted.quarter.3$coefficients)
ref.vs.half <- cor(fitted.all.3$coefficients, fitted.half.3$coefficients)

ref.vs.rep2 <- ref.vs.rep2[1,]
ref.vs.rep1 <- ref.vs.rep1[1,]
ref.vs.three.quarter <- ref.vs.three.quarter[1,]
ref.vs.one.quarter <- ref.vs.one.quarter[1,]
ref.vs.half <- ref.vs.half[1,]

coefficients.fit <- as.data.frame(rbind(ref.vs.three.quarter, ref.vs.rep2, ref.vs.half, ref.vs.rep1, ref.vs.one.quarter))
coefficients.fit$value <- as.numeric(rowMeans(coefficients.fit))
coefficients.fit
##                      groupDa-Aeflowering groupDa-Ol-1flowering     value
## ref.vs.three.quarter           0.9971269             0.9320508 0.9645889
## ref.vs.rep2                    0.9948364             0.9274069 0.9611217
## ref.vs.half                    0.9922013             0.9295043 0.9608528
## ref.vs.rep1                    0.9641464             0.9138257 0.9389861
## ref.vs.one.quarter             0.9798733             0.9216243 0.9507488
coefficients.fit$type <- factor(rownames(coefficients.fit), levels = c("ref.vs.three.quarter", "ref.vs.rep2", "ref.vs.half", "ref.vs.one.quarter", "ref.vs.rep1"))

# plot 
p.coefficient <- ggplot(data = coefficients.fit)
p.coefficient <- p.coefficient + geom_count(mapping = aes(x = factor(coefficients.fit$type), y = value), stat = "identity")
p.coefficient <- p.coefficient + theme(axis.text.x=element_text(angle=90),strip.text.y = element_text(angle=0)) 
p.coefficient <- p.coefficient + labs(y = "coefficient correlation", x = "")

p.coefficient 

ggsave(p.coefficient, filename = "~/Desktop/Brassica_project/KIAT_RNA_seq/output/p.coefficients.png", width = 4, height = 4)