From f843bceabd8e662de720abc4122b0997854d4258 Mon Sep 17 00:00:00 2001 From: LynnLy Date: Thu, 8 Feb 2018 19:08:46 -0800 Subject: [PATCH 1/5] Measuring system time --- F1/MBASED_F1.rmd | 65 ++++++++++++++++++++++++++++++++---------------- 1 file changed, 43 insertions(+), 22 deletions(-) diff --git a/F1/MBASED_F1.rmd b/F1/MBASED_F1.rmd index 2297950..593d6b1 100644 --- a/F1/MBASED_F1.rmd +++ b/F1/MBASED_F1.rmd @@ -4,17 +4,17 @@ author: Lynn Ly output: html_document --- -Author comment -File description comment, including purpose of program, inputs, and outputs -source() and library() statements -Function definitions -Executed statements, if applicable (e.g., print, plot) - -Purpose: This uses the MBASED package to analyze allele specific expression. - Single sample analysis: Find genes that display allelic imbalance ie. not expressed 0.5/0.5. Default threshold: 0.7 - Two sample analysis: Find genes that display different ASE ratios between two samples. Doesn't matter what the actual ratios are, as long as they differ by over 0.2 -Inputs: Pre-filtered VCF file, simplified gff file with just ranges and feature name -Outputs: major allele frequencies, frequency differences, list of genes displaying ASE +Author comment +File description comment, including purpose of program, inputs, and outputs +source() and library() statements +Function definitions +Executed statements, if applicable (e.g., print, plot) + +Purpose: This uses the MBASED package to analyze allele specific expression. + Single sample analysis: Find genes that display allelic imbalance ie. not expressed 0.5/0.5. Default threshold: 0.7 + Two sample analysis: Find genes that display different ASE ratios between two samples with the SAME HAPLOTYPE. Doesn't matter what the actual ratios are, as long as they differ by over 0.2 +Inputs: Pre-filtered VCF file, simplified gff file with just ranges and feature name +Outputs: major allele frequencies, frequency differences, list of genes displaying ASE ```{r setup, include=FALSE} library(MBASED) @@ -64,7 +64,7 @@ SummarizeASEResults_1s <- function(MBASEDOutput) { pValueHeterogeneity = assays(MBASEDOutput)$pValueHeterogeneity[,1]) lociOutputGR <- rowRanges(metadata(MBASEDOutput)$locusSpecificResults) - lociOutputGR$allele1IsMajor <- assays(metadata(MBASEDOutput)$locusSpecificResults)$allele1IsMajor[,1] + lociOutputGR$allele1IsMajor <- assays(metadata(MBASEDOutput)$locusSpecificResults)$allele1IsMajor[,1] lociOutputGR$MAF <- assays(metadata(MBASEDOutput)$locusSpecificResults)$MAF[,1] lociOutputList <- split(lociOutputGR, factor(lociOutputGR$aseID, levels = unique(lociOutputGR$aseID))) return(list(geneOutput=geneOutputDF, locusOutput=lociOutputList)) @@ -78,7 +78,9 @@ ExtractASE <- function(MBASEDOutput) { results <- SummarizeASEResults_1s(MBASEDOutput) - ASEindexes <- results$geneOutput$pValueASE * 46577 < 0.05 & + adjustedP <- p.adjust(results$geneOutput$pValueASE, method = "BH", n = length(results$geneOutput$pValueASE)) + + ASEindexes <- adjustedP < 0.05 & results$geneOutput$majorAlleleFrequency > 0.7 significantResults <- list(results$geneOutput[ASEindexes, ], @@ -121,18 +123,25 @@ MBASED.Ol <- SingleSample(annotatedData, mySNVs, genotype = "Ol") save(MBASED.F1.414, file = "MBASED.F1.Ol.Rdata") ``` -```{r Data to use} -# Change the following line if not using B. 
napus +```{r Data Filtering} gff.mRNA <- read.table("/Network/Servers/avalanche.plb.ucdavis.edu/Volumes/Mammoth/Users/ruijuanli/Reference/B.napus/gff.mRNA") -# Data to use +# Data to use (basicallly, VCF data pre-filtered for quality) load("/Network/Servers/avalanche.plb.ucdavis.edu/Volumes/Mammoth/Users/mizukikadowaki/project/output/F1.young.GQ.filtered.Rdata") annotatedData <- AnnotateSNPs(SNPdata = F1.young.GQ.filtered, gff.mRNA = gff.mRNA) # Remove SNVs with no associated genes annotatedData <- filter(annotatedData, !is.na(GeneID)) - + +# The following filters are applied to avoid spurious ASE calls. + +# Require at least 5 reads and at least 10% of all reads support each allele + +# Discard all SNVs within 10 bp of another called variant (if one read spans both SNVs, some independence assumptions are violated) + +# NOT AVAILABLE FOR B NAPUS. Discard all SNVs in highly repetitive genomic regions, which we defined as regions with sequence identity of > 95% to another genomic region based on selfChain Link track from UCSC genome browser. + mySNVs <- GRanges( seqnames = annotatedData$CHROM, ranges = IRanges(start = annotatedData$POS, width = 1), @@ -143,7 +152,7 @@ mySNVs <- GRanges( names(mySNVs) <- annotatedData$GeneID ``` -In Progress: TwoSample analysis +In Progress: TwoSample analysis ```{r Two Sample Function} TwoSample <- function(annotatedData, mySNVs, genotype1, genotype2){ @@ -163,16 +172,29 @@ TwoSample <- function(annotatedData, mySNVs, genotype1, genotype2){ MBASEDOutput <- runMBASED( ASESummarizedExperiment = mySample, isPhased = FALSE, - numSim = 0, + numSim = 10 #BPPARAM = SerialParam() # Default: No paralellization ) return(MBASEDOutput) -} +} + ``` ```{r Run Two Sample} -MBASED.F1.414.vs.F1.415 <- TwoSample(annotatedData, mySNVs, "F1_414", "F1_415") +# Estimating two sample run time using smaller datasets +annotatedData.trimmed <- annotatedData[400:600 ,] + +mySNVs.trimmed <- GRanges( + seqnames = annotatedData.trimmed$CHROM, + ranges = IRanges(start = annotatedData.trimmed$POS, width = 1), + aseID = as.vector(annotatedData.trimmed$GeneID), + allele1 = annotatedData.trimmed$REF, + allele2 = annotatedData.trimmed$ALT) + +time100.10 <- system.time(MBASED.F1.414.vs.F1.415 <- TwoSample(annotatedData.trimmed, mySNVs.trimmed, "F1_414", "F1_415")) + +# 100: 9 seconds ``` @@ -197,4 +219,3 @@ dim(sig.Ae[[1]]) #11995 SNVs found to be in ASE sig.Ol <- ExtractASE(MBASED.Ol) dim(sig.Ol[[1]]) #11990 SNVs found to be in ASE ``` - From 522bb6fcebcf9cf64177093929b7b616ac49f989 Mon Sep 17 00:00:00 2001 From: LynnLy Date: Thu, 8 Feb 2018 23:04:46 -0800 Subject: [PATCH 2/5] Finished phasing, added filtering. To do: Finish rangeFilter --- F1/MBASED_F1.rmd | 176 ++++++++++++++++++++++++++++++++++------------- 1 file changed, 128 insertions(+), 48 deletions(-) diff --git a/F1/MBASED_F1.rmd b/F1/MBASED_F1.rmd index 593d6b1..862817b 100644 --- a/F1/MBASED_F1.rmd +++ b/F1/MBASED_F1.rmd @@ -22,6 +22,7 @@ library(tidyverse) ``` ```{r AnnotateSNPs Function} +# Modified from Ruijuan's function in helpler.R AnnotateSNPs <- function(SNPdata, gff.mRNA){ # Combines SNP/SNV loci with gene names # @@ -73,11 +74,10 @@ SummarizeASEResults_1s <- function(MBASEDOutput) { ExtractASE <- function(MBASEDOutput) { # Extract only desired genes # Modify ASEindexes to vary the strictness of selection. 
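# (Editor note) p.adjust(..., method = "BH") compares the i-th smallest p-value to
# i/m * 0.05 rather than penalising every test by the full m, so it is less conservative
# than the Bonferroni-style `pValueASE * 46577 < 0.05` cutoff it replaces. Toy illustration:
#   p.adjust(c(1e-6, 0.01, 0.04), method = "BH")          # 3e-06 0.015 0.040
#   p.adjust(c(1e-6, 0.01, 0.04), method = "bonferroni")  # 3e-06 0.030 0.120
# (n = length(p) is already the default, so it can be omitted.)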
- # TODO: Implement Benjamini-Hochberg correction (punish p-values based on rank) - # Currently using Bonferroni as a crude correction measure (punishes all p-values equally) results <- SummarizeASEResults_1s(MBASEDOutput) + # Apply Benjamini-Hochberg (fdr) correction for multiple testing adjustedP <- p.adjust(results$geneOutput$pValueASE, method = "BH", n = length(results$geneOutput$pValueASE)) ASEindexes <- adjustedP < 0.05 & @@ -89,10 +89,10 @@ ExtractASE <- function(MBASEDOutput) { } ``` -```{r Single Sample Function} +```{r Single and Two Sample Function} # Please change numSim to at least 1,000,000 for final analysis -SingleSample <- function(annotatedData, mySNVs, genotype){ +SingleSample <- function(annotatedData, mySNVs, genotype, numSim = 0){ # create RangedSummarizedExperiment object as input for runMBASED # then runMBASED @@ -107,41 +107,139 @@ SingleSample <- function(annotatedData, mySNVs, genotype){ MBASEDOutput <- runMBASED( ASESummarizedExperiment = mySample, - numSim = 1000, - isPhased = FALSE) + numSim = numSim, + isPhased = TRUE) return(MBASEDOutput) } -MBASED.F1.414 <- SingleSample(annotatedData, mySNVs, genotype = "F1_414") -save(MBASED.F1.414, file = "MBASED.F1.414.Rdata") -MBASED.F1.415 <- SingleSample(annotatedData, mySNVs, genotype = "F1_415") -save(MBASED.F1.414, file = "MBASED.F1.415.Rdata") -MBASED.Ae <- SingleSample(annotatedData, mySNVs, genotype = "Ae") -save(MBASED.F1.414, file = "MBASED.F1.Ae.Rdata") -MBASED.Ol <- SingleSample(annotatedData, mySNVs, genotype = "Ol") -save(MBASED.F1.414, file = "MBASED.F1.Ol.Rdata") +TwoSample <- function(annotatedData, mySNVs, genotype1, genotype2, numSim = 0){ + RO1 <- paste(genotype1, "RO", sep = "_") + AO1 <- paste(genotype1, "AO", sep = "_") + RO2 <- paste(genotype2, "RO", sep = "_") + AO2 <- paste(genotype2, "AO", sep = "_") + + mySample <- SummarizedExperiment( + assays = list(lociAllele1Counts = matrix(c(annotatedData[, RO1], annotatedData[, RO2]), ncol = 2, + dimnames = list(names(mySNVs), c(genotype1, genotype2))), + + lociAllele2Counts = matrix(c(annotatedData[, AO1], annotatedData[, AO2]), ncol = 2, + dimnames = list(names(mySNVs), c(genotype1, genotype2)))), + rowRanges=mySNVs) + + MBASEDOutput <- runMBASED( + ASESummarizedExperiment = mySample, + isPhased = TRUE, + numSim = numSim + #BPPARAM = SerialParam() # Default: No paralellization + ) + + return(MBASEDOutput) +} ``` -```{r Data Filtering} +```{r Phasing Data} gff.mRNA <- read.table("/Network/Servers/avalanche.plb.ucdavis.edu/Volumes/Mammoth/Users/ruijuanli/Reference/B.napus/gff.mRNA") # Data to use (basicallly, VCF data pre-filtered for quality) load("/Network/Servers/avalanche.plb.ucdavis.edu/Volumes/Mammoth/Users/mizukikadowaki/project/output/F1.young.GQ.filtered.Rdata") -annotatedData <- AnnotateSNPs(SNPdata = F1.young.GQ.filtered, gff.mRNA = gff.mRNA) +inputVCF <- F1.young.GQ.filtered +inputVCF.Ae <- inputVCF[inputVCF$Ae_GT == 1, ] +inputVCF.Ol <- inputVCF[inputVCF$Ol_GT == 1, ] + +# Switch the Ref and Alt alleles, and change the counts accordingly +colnames(inputVCF.Ol)[3:4] <- c("ALT", "REF") +colnames(inputVCF.Ol)[17:24] <- c("Ae_AO", "Ol_AO", "F1_414_AO", "F1_415_AO", + "Ae_RO", "Ol_RO", "F1_414_RO", "F1_415_RO") + +phasedData.unordered <- rbind(inputVCF.Ae, inputVCF.Ol) + +attach(phasedData.unordered) +phasedData <- phasedData.unordered[order(CHROM, POS), ] +detach(phasedData.unordered) +``` + +```{r Filtering Functions} + +CoverageFilter <- function(vcf) { + # Modified from Ruijuan's function in helpler.R + # Require at least 10 reads and at 
least 10% of all reads support each allele + + totalReads <- vcf$F1_414_DP + alleles <- vcf[, c(19, 23)] + vcf.filtered <- vcf[which(apply(alleles / totalReads, 1, min) > .10), ] + + totalReads <- vcf.filtered$F1_415_DP + alleles <- vcf.filtered[, c(20, 24)] + vcf.filtered <- vcf.filtered[which(apply(alleles / totalReads, 1, min) > .10), ] + + return(vcf.filtered) +} + +RangeFilter <- function(vcf) { + # Discard SNVs that are within 10 bp of another called SNV. + # This is necessary because if one read spans both SNVs, then our counts are not independent. + # There is probably a better way to do this... + + SNV.ranges <- GRanges(seqnames = Rle(SNPdata$CHROM), + ranges = IRanges(start = SNPdata$POS - 5, end = SNPdata$POS + 5), + CHROM = SNPdata$CHROM, + POS = SNPdata$POS) + + overlappedGenes <- mergeByOverlaps(SNV.ranges, SNV.ranges) + + # Get only the 10bp overlaps that were not with itself + actualOverlappedGenes <- overlappedGenes[which(overlappedGenes$POS != overlappedGenes$POS.1),] + + a <- as.data.frame(actualOverlappedGenes)[, c(8, 9, 17, 18)] + + # We don't want to get rid of all the SNVs though. We want to keep as many SNVs as possible that aren't overlapping. + # idea for tmrw morning: combine chrom and pos. + i <- 1 + while(i <= nrow(a)) { + if (a$POS[i] %in% a$POS.1) { + a <- a[-i, ] + } else { + i <- i + 1 + } + } + + rangeFilteredData <- anti_join(SNPdata, as.data.frame(actualOverlappedGenes), by = c("CHROM", "POS")) +} +``` + + +```{r Annotating and Filtering Data} +annotatedData <- AnnotateSNPs(SNPdata = phasedData, gff.mRNA = gff.mRNA) # Remove SNVs with no associated genes annotatedData <- filter(annotatedData, !is.na(GeneID)) +dim(annotatedData) # The following filters are applied to avoid spurious ASE calls. # Require at least 5 reads and at least 10% of all reads support each allele +annotatedData <- CoverageFilter(annotatedData) +dim(annotatedData) # Discard all SNVs within 10 bp of another called variant (if one read spans both SNVs, some independence assumptions are violated) +annotatedData <- RangeFilter(annotatedData) +dim(annotatedData) + +# NOT AVAILABLE FOR B NAPUS. Discard all SNVs in highly repetitive genomic regions, which we defined as regions with sequence identity of > 95% to another genomic region based on selfChain Link track from UCSC genome browser. + +``` +```{r Estimate Overdispersion and Reference Allele Bias} +# If our data presents any, we should include these metrics when creating mySNVs. +# Use the additional functions in MBASED + +``` + -# NOT AVAILABLE FOR B NAPUS. Discard all SNVs in highly repetitive genomic regions, which we defined as regions with sequence identity of > 95% to another genomic region based on selfChain Link track from UCSC genome browser. 
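# (Editor sketch, not part of the patch) Two possible refinements to the filters above,
# assuming GenomicRanges/IRanges are attached (they load with MBASED):
# 1. CoverageFilter() as written only enforces the 10% per-allele fraction; the stated
#    minimum read support per allele is not checked and could be added explicitly, e.g.
#      vcf <- vcf[vcf$F1_414_RO >= 5 & vcf$F1_414_AO >= 5, ]   # and likewise for F1_415
# 2. The 10 bp proximity rule can be written directly with countOverlaps(); note this
#    version drops *both* members of a close pair, which is stricter than keeping one.
RangeFilterStrict <- function(vcf) {   # hypothetical helper, same input as RangeFilter()
  snvWindows <- GRanges(seqnames = Rle(vcf$CHROM),
                        ranges = IRanges(start = vcf$POS - 5, end = vcf$POS + 5))
  hits <- countOverlaps(snvWindows, snvWindows)  # each window overlaps itself exactly once
  vcf[hits == 1, ]                               # keep SNVs with no neighbour within 10 bp
}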
+ +```{r Single Sample Analysis} mySNVs <- GRanges( seqnames = annotatedData$CHROM, ranges = IRanges(start = annotatedData$POS, width = 1), @@ -150,40 +248,22 @@ mySNVs <- GRanges( allele2 = annotatedData$ALT) names(mySNVs) <- annotatedData$GeneID -``` - -In Progress: TwoSample analysis - -```{r Two Sample Function} -TwoSample <- function(annotatedData, mySNVs, genotype1, genotype2){ - RO1 <- paste(genotype1, "RO", sep = "_") - AO1 <- paste(genotype1, "AO", sep = "_") - RO2 <- paste(genotype2, "RO", sep = "_") - AO2 <- paste(genotype2, "AO", sep = "_") - - mySample <- SummarizedExperiment( - assays = list(lociAllele1Counts = matrix(c(annotatedData[, RO1], annotatedData[, RO2]), ncol = 2, - dimnames = list(names(mySNVs), c(genotype1, genotype2))), - - lociAllele2Counts = matrix(c(annotatedData[, AO1], annotatedData[, AO2]), ncol = 2, - dimnames = list(names(mySNVs), c(genotype1, genotype2)))), - rowRanges=mySNVs) - - MBASEDOutput <- runMBASED( - ASESummarizedExperiment = mySample, - isPhased = FALSE, - numSim = 10 - #BPPARAM = SerialParam() # Default: No paralellization - ) - - return(MBASEDOutput) -} +MBASED.F1.414 <- SingleSample(annotatedData, mySNVs, genotype = "F1_414", numSim = 0) +save(MBASED.F1.414, file = "MBASED.F1.414.Rdata") +MBASED.F1.415 <- SingleSample(annotatedData, mySNVs, genotype = "F1_415", numSim = 0) +save(MBASED.F1.414, file = "MBASED.F1.415.Rdata") +MBASED.Ae <- SingleSample(annotatedData, mySNVs, genotype = "Ae", numSim = 0) +save(MBASED.F1.414, file = "MBASED.F1.Ae.Rdata") +MBASED.Ol <- SingleSample(annotatedData, mySNVs, genotype = "Ol", numSim = 0) +save(MBASED.F1.414, file = "MBASED.F1.Ol.Rdata") ``` -```{r Run Two Sample} +In Progress: Two Sample analysis + +```{r Two Sample Analysis} # Estimating two sample run time using smaller datasets -annotatedData.trimmed <- annotatedData[400:600 ,] +annotatedData.trimmed <- annotatedData[1:500 ,] mySNVs.trimmed <- GRanges( seqnames = annotatedData.trimmed$CHROM, @@ -194,6 +274,7 @@ mySNVs.trimmed <- GRanges( time100.10 <- system.time(MBASED.F1.414.vs.F1.415 <- TwoSample(annotatedData.trimmed, mySNVs.trimmed, "F1_414", "F1_415")) +time100.10 # 100: 9 seconds ``` @@ -208,7 +289,6 @@ sig.F1.414 <- ExtractASE(MBASED.F1.414) dim(sig.F1.414[[1]]) #1946 SNVs found to be in ASE summary(sig.F1.414[[2]]) - sig.F1.415 <- ExtractASE(MBASED.F1.415) dim(sig.F1.415[[1]]) #2164 SNVs found to be in ASE head(sig.F1.415[[2]]$GeneID) From 25ee524c407c0f47e670342f37c97cea4bc69631 Mon Sep 17 00:00:00 2001 From: LynnLy Date: Tue, 13 Feb 2018 13:43:07 -0800 Subject: [PATCH 3/5] Working on dispersion practice --- F1/MBASED_F1.rmd | 176 ++++++++++++++++++++------------- F1/overdispersion_practice.Rmd | 174 ++++++++++++++++++++++++++++++++ 2 files changed, 284 insertions(+), 66 deletions(-) create mode 100644 F1/overdispersion_practice.Rmd diff --git a/F1/MBASED_F1.rmd b/F1/MBASED_F1.rmd index 862817b..85ac82e 100644 --- a/F1/MBASED_F1.rmd +++ b/F1/MBASED_F1.rmd @@ -92,7 +92,7 @@ ExtractASE <- function(MBASEDOutput) { ```{r Single and Two Sample Function} # Please change numSim to at least 1,000,000 for final analysis -SingleSample <- function(annotatedData, mySNVs, genotype, numSim = 0){ +SingleSample <- function(annotatedData, mySNVs, genotype, numSim = 0, refBias = 0.5){ # create RangedSummarizedExperiment object as input for runMBASED # then runMBASED @@ -101,7 +101,10 @@ SingleSample <- function(annotatedData, mySNVs, genotype, numSim = 0){ mySample <- SummarizedExperiment( assays = list(lociAllele1Counts = matrix(annotatedData[, RO], ncol = 
1, dimnames = list(names(mySNVs), 'mySample')), - lociAllele2Counts = matrix(annotatedData[, AO], ncol = 1, dimnames = list(names(mySNVs), 'mySample')) + lociAllele2Counts = matrix(annotatedData[, AO], ncol = 1, dimnames = list(names(mySNVs), 'mySample')), + lociAllele1CountsNoASEProbs=matrix(annotatedData$refBias, + ncol=1, dimnames=list(names(mySNVs), 'mySample')) + ), rowRanges=mySNVs) @@ -118,13 +121,18 @@ TwoSample <- function(annotatedData, mySNVs, genotype1, genotype2, numSim = 0){ AO1 <- paste(genotype1, "AO", sep = "_") RO2 <- paste(genotype2, "RO", sep = "_") AO2 <- paste(genotype2, "AO", sep = "_") + RAB1 <- paste(genotype1, "refBias", sep = "_") + RAB2 <- paste(genotype2, "refBias", sep = "_") mySample <- SummarizedExperiment( assays = list(lociAllele1Counts = matrix(c(annotatedData[, RO1], annotatedData[, RO2]), ncol = 2, dimnames = list(names(mySNVs), c(genotype1, genotype2))), lociAllele2Counts = matrix(c(annotatedData[, AO1], annotatedData[, AO2]), ncol = 2, - dimnames = list(names(mySNVs), c(genotype1, genotype2)))), + dimnames = list(names(mySNVs), c(genotype1, genotype2))), + + lociAllele1CountsNoASEProbs = matrix(c(annotatedData[, RAB1], annotatedData[, RAB2]), + ncol=2, dimnames=list(names(mySNVs), c(genotype1, genotype2)))), rowRanges=mySNVs) MBASEDOutput <- runMBASED( @@ -138,28 +146,6 @@ TwoSample <- function(annotatedData, mySNVs, genotype1, genotype2, numSim = 0){ } ``` -```{r Phasing Data} -gff.mRNA <- read.table("/Network/Servers/avalanche.plb.ucdavis.edu/Volumes/Mammoth/Users/ruijuanli/Reference/B.napus/gff.mRNA") - -# Data to use (basicallly, VCF data pre-filtered for quality) -load("/Network/Servers/avalanche.plb.ucdavis.edu/Volumes/Mammoth/Users/mizukikadowaki/project/output/F1.young.GQ.filtered.Rdata") - -inputVCF <- F1.young.GQ.filtered -inputVCF.Ae <- inputVCF[inputVCF$Ae_GT == 1, ] -inputVCF.Ol <- inputVCF[inputVCF$Ol_GT == 1, ] - -# Switch the Ref and Alt alleles, and change the counts accordingly -colnames(inputVCF.Ol)[3:4] <- c("ALT", "REF") -colnames(inputVCF.Ol)[17:24] <- c("Ae_AO", "Ol_AO", "F1_414_AO", "F1_415_AO", - "Ae_RO", "Ol_RO", "F1_414_RO", "F1_415_RO") - -phasedData.unordered <- rbind(inputVCF.Ae, inputVCF.Ol) - -attach(phasedData.unordered) -phasedData <- phasedData.unordered[order(CHROM, POS), ] -detach(phasedData.unordered) -``` - ```{r Filtering Functions} CoverageFilter <- function(vcf) { @@ -180,35 +166,97 @@ CoverageFilter <- function(vcf) { RangeFilter <- function(vcf) { # Discard SNVs that are within 10 bp of another called SNV. # This is necessary because if one read spans both SNVs, then our counts are not independent. - # There is probably a better way to do this... 
+ # There may be a more elegant way to do this - SNV.ranges <- GRanges(seqnames = Rle(SNPdata$CHROM), - ranges = IRanges(start = SNPdata$POS - 5, end = SNPdata$POS + 5), - CHROM = SNPdata$CHROM, - POS = SNPdata$POS) + SNV.ranges <- GRanges(seqnames = Rle(vcf$CHROM), + ranges = IRanges(start = vcf$POS - 5, end = vcf$POS + 5), + CHROM = vcf$CHROM, + POS = vcf$POS) overlappedGenes <- mergeByOverlaps(SNV.ranges, SNV.ranges) - # Get only the 10bp overlaps that were not with itself - actualOverlappedGenes <- overlappedGenes[which(overlappedGenes$POS != overlappedGenes$POS.1),] + # Get only the 10bp overlaps that were not self-overlaps + overlappedGenes <- overlappedGenes[which(overlappedGenes$POS != overlappedGenes$POS.1),] - a <- as.data.frame(actualOverlappedGenes)[, c(8, 9, 17, 18)] + # Trim to just the relevant columns, CHROM and POS + overlappedGenes <- as.data.frame(overlappedGenes)[, c(8, 9, 17, 18)] + overlappedGenes <- mutate(overlappedGenes, SNV = paste(CHROM, POS), SNV_match = paste(CHROM.1, POS.1)) # We don't want to get rid of all the SNVs though. We want to keep as many SNVs as possible that aren't overlapping. - # idea for tmrw morning: combine chrom and pos. i <- 1 - while(i <= nrow(a)) { - if (a$POS[i] %in% a$POS.1) { - a <- a[-i, ] + while(i <= nrow(overlappedGenes)) { + if (overlappedGenes$SNV[i] %in% overlappedGenes$SNV_match) { + overlappedGenes <- overlappedGenes[-i, ] } else { i <- i + 1 } } - rangeFilteredData <- anti_join(SNPdata, as.data.frame(actualOverlappedGenes), by = c("CHROM", "POS")) + rangeFilteredData <- anti_join(vcf, as.data.frame(overlappedGenes), by = c("CHROM", "POS")) +} +``` + +From Paper: We assumed that fref,SNV (0.5) is constant across all SNVs within a sample (global reference bias), and we estimated this value as +`Nref/Ntotal` +which is the ratio of all reference reads to total reads in the sample, after excluding the loci with the top 5% of read counts (trimmed for robustness) + +```{r Estimate Overdispersion and Reference Allele Bias} +# If our data presents any, we should include these metrics when creating mySNVs. +# Use the additional functions in MBASED + +# Reference Allele Bias is sample-specific and should be recalculated for each sample. +CalcReferenceBias <- function(vcf, genotype) { + RO <- paste(genotype, "RO", sep = "_") + DP <- paste(genotype, "DP", sep = "_") + + cutoff <- 0.95 * nrow(vcf) + orderedRO <- sort(vcf[, RO]) + orderedDP <- sort(vcf[, DP]) + + Nref <- sum(head(orderedRO, cutoff)) + Ntotal <- sum(head(orderedDP, cutoff)) + + globalReferenceBias <- Nref / Ntotal + + return(globalReferenceBias) +} + +# TODO: Figure out how to calculate dispersion +CalcDispersion <- function(vcf, genotype) { + return(0) } ``` +```{r Phasing Data} +gff.mRNA <- read.table("/Network/Servers/avalanche.plb.ucdavis.edu/Volumes/Mammoth/Users/ruijuanli/Reference/B.napus/gff.mRNA") + +# Data to use (basically, VCF data pre-filtered for quality) +load("/Network/Servers/avalanche.plb.ucdavis.edu/Volumes/Mammoth/Users/mizukikadowaki/project/output/F1.young.GQ.filtered.Rdata") + +inputVCF <- F1.young.GQ.filtered + +# Calculate the reference bias for each sample separately. 
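# (Editor note, sketch only) CalcReferenceBias() above sorts RO and DP independently
# before trimming, which decouples a locus's reference reads from its total reads.
# The quoted rule drops whole loci in the top 5% of read counts; a paired version of
# the trim could look like this (hypothetical helper, same arguments):
CalcReferenceBiasByLocus <- function(vcf, genotype) {
  RO <- paste(genotype, "RO", sep = "_")
  DP <- paste(genotype, "DP", sep = "_")
  keep <- vcf[, DP] <= quantile(vcf[, DP], 0.95)  # drop loci with the top 5% coverage
  sum(vcf[keep, RO]) / sum(vcf[keep, DP])
}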
+refBias414 <- CalcReferenceBias(inputVCF, "F1_414") # 0.5107107 +refBias415 <- CalcReferenceBias(inputVCF, "F1_415") # 0.5152294 + +inputVCF.Ae <- inputVCF[inputVCF$Ae_GT == 1, ] +inputVCF.Ae$F1_414_refBias <- refBias414 +inputVCF.Ae$F1_415_refBias <- refBias415 +inputVCF.Ol <- inputVCF[inputVCF$Ol_GT == 1, ] +inputVCF.Ol$F1_414_refBias <- 1 - refBias414 +inputVCF.Ol$F1_415_refBias <- 1 - refBias415 + +# Switch the Ref and Alt alleles, and change the counts accordingly +colnames(inputVCF.Ol)[3:4] <- c("ALT", "REF") +colnames(inputVCF.Ol)[17:24] <- c("Ae_AO", "Ol_AO", "F1_414_AO", "F1_415_AO", + "Ae_RO", "Ol_RO", "F1_414_RO", "F1_415_RO") + +phasedData.unordered <- rbind(inputVCF.Ae, inputVCF.Ol) + +attach(phasedData.unordered) +phasedData <- phasedData.unordered[order(CHROM, POS), ] +detach(phasedData.unordered) +``` ```{r Annotating and Filtering Data} annotatedData <- AnnotateSNPs(SNPdata = phasedData, gff.mRNA = gff.mRNA) @@ -230,14 +278,6 @@ dim(annotatedData) # NOT AVAILABLE FOR B NAPUS. Discard all SNVs in highly repetitive genomic regions, which we defined as regions with sequence identity of > 95% to another genomic region based on selfChain Link track from UCSC genome browser. ``` -```{r Estimate Overdispersion and Reference Allele Bias} -# If our data presents any, we should include these metrics when creating mySNVs. -# Use the additional functions in MBASED - -``` - - - ```{r Single Sample Analysis} mySNVs <- GRanges( @@ -249,21 +289,15 @@ mySNVs <- GRanges( names(mySNVs) <- annotatedData$GeneID -MBASED.F1.414 <- SingleSample(annotatedData, mySNVs, genotype = "F1_414", numSim = 0) +MBASED.F1.414 <- SingleSample(annotatedData, mySNVs, genotype = "F1_414", numSim = 0, refBias = refBias414) save(MBASED.F1.414, file = "MBASED.F1.414.Rdata") -MBASED.F1.415 <- SingleSample(annotatedData, mySNVs, genotype = "F1_415", numSim = 0) +MBASED.F1.415 <- SingleSample(annotatedData, mySNVs, genotype = "F1_415", numSim = 0, refBias = refBias415) save(MBASED.F1.414, file = "MBASED.F1.415.Rdata") -MBASED.Ae <- SingleSample(annotatedData, mySNVs, genotype = "Ae", numSim = 0) -save(MBASED.F1.414, file = "MBASED.F1.Ae.Rdata") -MBASED.Ol <- SingleSample(annotatedData, mySNVs, genotype = "Ol", numSim = 0) -save(MBASED.F1.414, file = "MBASED.F1.Ol.Rdata") ``` -In Progress: Two Sample analysis - ```{r Two Sample Analysis} # Estimating two sample run time using smaller datasets -annotatedData.trimmed <- annotatedData[1:500 ,] +annotatedData.trimmed <- annotatedData[1:10000, ] mySNVs.trimmed <- GRanges( seqnames = annotatedData.trimmed$CHROM, @@ -272,18 +306,34 @@ mySNVs.trimmed <- GRanges( allele1 = annotatedData.trimmed$REF, allele2 = annotatedData.trimmed$ALT) -time100.10 <- system.time(MBASED.F1.414.vs.F1.415 <- TwoSample(annotatedData.trimmed, mySNVs.trimmed, "F1_414", "F1_415")) +time100.10 <- system.time(MBASED.F1.414.vs.F1.415 <- TwoSample(annotatedData.trimmed, mySNVs.trimmed, "F1_414", "F1_415", numSim = 10)) + +n = 1 +while(n <= 10) { + timeTaken <- rbind(timeTaken, c(system.time(TwoSample(annotatedData.trimmed, mySNVs.trimmed, "F1_414", "F1_415", numSim = n)), 10000, n)) + n = n * 10 + print(paste("Next: N =", n)) +} -time100.10 -# 100: 9 seconds +t <- as.data.frame(timeTaken) + +p <- ggplot(data = t[t$SNVs == 100, ], aes(y = sys.self, x= log(numSims))) + + geom_point(aes(color = "100")) + + geom_point(data = t[t$SNVs == 1000, ], aes(color = "1000")) + + geom_point(data = t[t$SNVs ==5000, ], aes(color ="5000")) #+ +# geom_point(data = t[t$SNVs == 10000, ], aes(color = "10000")) + +p2 <- 
ggplot(data = t[t$SNVs == 100, ], aes(y = elapsed, x= log(numSims))) + + geom_point(aes(color = "100")) + + geom_point(data = t[t$SNVs == 1000, ], aes(color = "1000")) + + geom_point(data = t[t$SNVs ==5000, ], aes(color ="5000")) + + geom_point(data = t[t$SNVs == 10000, ], aes(color = "10000")) ``` ```{r Analysis and Summary} load("MBASED.F1.414.Rdata") load("MBASED.F1.415.Rdata") -load("MBASED.Ae.Rdata") -load("MBASED.Ol.Rdata") sig.F1.414 <- ExtractASE(MBASED.F1.414) dim(sig.F1.414[[1]]) #1946 SNVs found to be in ASE @@ -292,10 +342,4 @@ summary(sig.F1.414[[2]]) sig.F1.415 <- ExtractASE(MBASED.F1.415) dim(sig.F1.415[[1]]) #2164 SNVs found to be in ASE head(sig.F1.415[[2]]$GeneID) - -sig.Ae <- ExtractASE(MBASED.Ae) -dim(sig.Ae[[1]]) #11995 SNVs found to be in ASE - -sig.Ol <- ExtractASE(MBASED.Ol) -dim(sig.Ol[[1]]) #11990 SNVs found to be in ASE ``` diff --git a/F1/overdispersion_practice.Rmd b/F1/overdispersion_practice.Rmd new file mode 100644 index 0000000..fde72e5 --- /dev/null +++ b/F1/overdispersion_practice.Rmd @@ -0,0 +1,174 @@ +--- +title: "R Notebook" +output: html_notebook +--- +```{r} +library(MBASED) +library(tidyverse) +library(VGAM) +``` + +```{r} +load("annotatedData.Rdata") +load("/Network/Servers/avalanche.plb.ucdavis.edu/Volumes/Mammoth/Users/mizukikadowaki/project/output/F1.young.GQ.filtered.Rdata") +F1.414 <- F1.young.GQ.filtered +F1.414 <- mutate(F1.414, MAF = F1_414_RO / F1_414_DP) +``` + +```{r} +# log likelihood function +ll <- function(alpha, beta) { + x <- F1.414$F1_414_RO + total <- F1.414$F1_414_DP + -sum(VGAM::dbetabinom.ab(x, total, alpha, beta, log = TRUE)) +} + +m <- mle(ll, start = list(alpha = 1, beta = 1), method = "L-BFGS-B", lower = c(0.0001, 0.0001)) + +ab <- coef(m) + +alpha0 <- ab[1] +beta0 <- ab[2] + +F1.414 %>% + ggplot() + + geom_histogram(aes(MAF, y = ..density..), binwidth = .02) + + stat_function(fun = function(x) dbeta(x, alpha0, beta0), color = "red", + size = 1) + + xlab("Major Allele Frequency (MAF)") +``` + +```{r} +sim <- data.frame(a = c(alpha0, alpha0 + 10, alpha0 + 100), + b = c(beta0, beta0 + 8, beta0 + 10)) %>% + group_by(a, b) %>% + do(data_frame(x = seq(0, 1, .001), y = dbeta(x, .$a, .$b))) %>% + mutate(Parameters = paste0("\u03B1 = ", a, ", \u03B2 = ", b)) %>% + ungroup %>% + mutate(Parameters = factor(Parameters, levels = unique(Parameters))) + +sim %>% ggplot(aes(x, y, color = Parameters)) + geom_line() + + xlim(0, 1) + ylab("Density of beta") + + xlab("Major Allele Frequency") +``` +BETA BINOMIAL REGRESSION + +```{r} +prior_mu = alpha0 / (alpha0 + beta0) + +eb <- F1.414 %>% + mutate(eb_estimate = (F1_414_RO + alpha0) / (F1_414_DP + alpha0 + beta0)) %>% + mutate(alpha1 = F1_414_RO + alpha0, + beta1 = F1_414_AO + beta0) %>% + arrange(desc(eb_estimate)) + +ggplot(eb, aes(MAF, eb_estimate, color = F1_414_DP)) + + geom_hline(yintercept = alpha0 / (alpha0 + beta0), color = "red", lty = 2) + + geom_point() + + geom_abline(color = "red") + + scale_colour_gradient(trans = "log", breaks = 10 ^ (1:5)) + + xlab("Major Allele Frequency") + + ylab("Empirical Bayes MAF") + +``` + + +```{r} +eb %>% + ggplot(aes(F1_414_DP, MAF)) + + geom_point() + + geom_smooth(method = "lm", se = FALSE) + + stat_summary(fun.y = var, geom = "point", color = "blue") + + scale_x_log10() + +eb %>% + ggplot(aes(F1_414_DP, MAF)) + + stat_summary(fun.y = var, geom = "point") + + scale_x_log10() +``` + +```{r} +eb %>% + ggplot(aes(x = F1_414_DP, y = eb_estimate)) + + geom_point() + + scale_x_log10() + + geom_hline(color = "red", yintercept = prior_mu) + + 
#facet_wrap(~type) + + #ylab("average") + + geom_smooth(method = "lm") + +eb %>% + ggplot(aes(x = F1_414_DP, y = MAF)) + + geom_point() + + scale_x_log10() + + geom_hline(color = "red", yintercept = prior_mu) + + #facet_wrap(~type) + + #ylab("average") + + geom_smooth(method = "lm") +``` +BETA BINOMIAL REGRESSION + + +```{r} +library(gamlss) +library(broom) + +fit <- gamlss(cbind(F1_414_RO, F1_414_DP - F1_414_RO) ~ log(F1_414_DP), +data = eb, +family = BB(mu.link = "identity")) + +fit <- gamlss(cbind(F1_414_RO, F1_414_AO) ~ log(F1_414_DP), + data = eb, + family = BB(mu.link = "identity")) + +td <- tidy(fit) +td + +mu_0 <- td$estimate[1] +mu_AB <- td$estimate[2] +sigma <- exp(td$estimate[3]) + +crossing(x = seq(0.1, .9, .01), AB = c(1, 10, 100, 1000, 10000)) %>% + mutate(density = dbeta(x, (mu_0 + mu_AB * log(AB)) / sigma, + (1 - (mu_0 + mu_AB * log(AB))) / sigma)) %>% + mutate(AB = factor(AB)) %>% + ggplot(aes(x, density, color = AB, group = AB)) + + geom_line() + + xlab("Batting average") + + ylab("Prior density") +``` + + + +```{r Find Rho} + +muRho <- MBASED:::getMuRho(alpha0, beta0) + +# log likelihood function +ll <- function(rho) { + x <- F1.414$F1_414_RO + total <- F1.414$F1_414_DP + mu <- F1.414$F1_414_refBias + -sum(VGAM::dbetabinom(x, total, mu, rho, log = TRUE)) +} + +m <- mle(ll, start = list(rho = 0), method = "L-BFGS-B", lower = c(0.000001), upper = c(0.999)) + +ab <- coef(m) + +alpha0 <- ab[1] +beta0 <- ab[2] + +F1.414 %>% + ggplot() + + geom_histogram(aes(average, y = ..density..), binwidth = .02) + + stat_function(fun = function(x) dbeta(x, alpha0, beta0), color = "red", + size = 1) + +``` +Calculate rho at one locus + +```{r} + +``` + From a1abde3b729f42a07ecc3ca632c41915ee8638bc Mon Sep 17 00:00:00 2001 From: LynnLy Date: Tue, 13 Feb 2018 19:36:29 -0800 Subject: [PATCH 4/5] Finished adding function to calculate dispersion --- F1/MBASED_F1.rmd | 120 ++++++++++++++++++++++++++++++++++++----------- 1 file changed, 92 insertions(+), 28 deletions(-) diff --git a/F1/MBASED_F1.rmd b/F1/MBASED_F1.rmd index 85ac82e..398fd0c 100644 --- a/F1/MBASED_F1.rmd +++ b/F1/MBASED_F1.rmd @@ -221,24 +221,107 @@ CalcReferenceBias <- function(vcf, genotype) { return(globalReferenceBias) } -# TODO: Figure out how to calculate dispersion -CalcDispersion <- function(vcf, genotype) { - return(0) -} +# TODO: Have someone double check my math +CalcDispersion <- function(vcf, genotype, refBias) { + RO <- paste(genotype, "RO", sep = "_") + DP <- paste(genotype, "DP", sep = "_") + AO <- paste(genotype, "AO", sep = "_") + + # To eliminate potential artifacts that may affect the model fit, we require that the SNV be detected in at least 2 samples, with at least one sample presenting the reference as major and one presenting the alternate as major + # I'm not sure if this should apply to our data. We already pooled our samples; should we depool? + vcf <- vcf %>% + mutate(F1_414_ROFreq = F1_414_RO / F1_414_DP > 0.5, + F1_415_ROFreq = F1_415_RO / F1_415_DP > 0.5) %>% + filter(F1_414_ROFreq != F1_415_ROFreq) + + # Log Likelihood Function to help us maximize rho (dispersion) using mle() + ll <- function(rho) { + x <- vcf[, RO] + total <- vcf[, DP] + mu <- refBias + -sum(VGAM::dbetabinom(x, total, mu, rho, log = TRUE)) + } + + # Maximum likelihood function. 
Goal: Find ideal rho, given the data and our previously measured reference bias + m <- mle(ll, start = list(rho = 0.000001), method = "L-BFGS-B", lower = 0.000001, upper = 0.10 ) + rho1 <- coef(m) + print(paste0("The initial estimate for rho is ", rho1)) + + # We must estimate the likelihood for the observed data under BetaBin(n = RO, mu = ref bias, rho = dispersion) + # rho1 is the best p that maximizes the joint likelihood over all SNVs, but right now is not accurate due to outliers + + # Calculate the model based p-value at each SNV + vcf$p <- pbetabinom(q = vcf[, RO], size = vcf[, DP], prob = refBias, rho = rho1) + vcf$p2 <- pbetabinom(q = vcf[, AO], size = vcf[, DP], prob = 1 - refBias, rho = rho1) + + # Remove all SNVs with a BH-adjusted P < 0.05; these are outliers. There should not be too many, less than 1% of our total data. + # NOTE: Since we are looking at so many SNVs, it seems that no multiple testing method will give P < 0.9 + # I am testing just removing the top 63 + 71 SNVs (0.1% of the data), since they are clearly in ASE and inflating the dispersion + vcf$adjustedP <- p.adjust(vcf$p, method = "none") + vcf$adjustedP2 <- p.adjust(vcf$p2, method = "none") + + print(paste0("Removing ", nrow(vcf[vcf$adjustedP < 0.01 | vcf$adjustedP2 < 0.01,]), " rows")) + + # Refit the model for our final estimate of rho + + ll <- function(rho) { + x <- vcf[vcf$adjustedP > 0.01 & vcf$adjustedP2 > 0.01, RO] + total <- vcf[vcf$adjustedP > 0.01 & vcf$adjustedP2 > 0.01, DP] + mu <- refBias + -sum(VGAM::dbetabinom(x, total, mu, rho, log = TRUE)) + } + + m <- mle(ll, start = list(rho = 0.000001), method = "L-BFGS-B", lower = 0.000001, upper = 0.10 ) + + rhoFinal <- coef(m) + print(paste0("The final rho is ", rhoFinal)) + return(rhoFinal) + + } + ``` -```{r Phasing Data} +```{r Annotating and Filtering Data} gff.mRNA <- read.table("/Network/Servers/avalanche.plb.ucdavis.edu/Volumes/Mammoth/Users/ruijuanli/Reference/B.napus/gff.mRNA") +#gff.mRNA <- read.table("gff.mRNA") # Data to use (basically, VCF data pre-filtered for quality) load("/Network/Servers/avalanche.plb.ucdavis.edu/Volumes/Mammoth/Users/mizukikadowaki/project/output/F1.young.GQ.filtered.Rdata") +#load("F1.young.GQ.filtered.Rdata") + +annotatedData <- AnnotateSNPs(SNPdata = F1.young.GQ.filtered, gff.mRNA = gff.mRNA) + +# Remove SNVs with no associated genes +annotatedData <- filter(annotatedData, !is.na(GeneID)) +dim(annotatedData) + +# The following filters are applied to avoid spurious ASE calls. + +# Require at least 5 reads and at least 10% of all reads support each allele +annotatedData <- CoverageFilter(annotatedData) +dim(annotatedData) + +# Discard all SNVs within 10 bp of another called variant (if one read spans both SNVs, some independence assumptions are violated) +annotatedData <- RangeFilter(annotatedData) +dim(annotatedData) + +# NOT AVAILABLE FOR B NAPUS. Discard all SNVs in highly repetitive genomic regions, which we defined as regions with sequence identity of > 95% to another genomic region based on selfChain Link track from UCSC genome browser. +``` -inputVCF <- F1.young.GQ.filtered +```{r Phasing Data} +inputVCF <- annotatedData # Calculate the reference bias for each sample separately. 
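# (Editor note) The rho returned by CalcDispersion() is the beta-binomial overdispersion:
# a locus with depth n and mean mu has variance n * mu * (1 - mu) * (1 + (n - 1) * rho),
# so rho = 0 recovers the plain binomial. Once estimated, the per-sample values could be
# handed to runMBASED alongside the reference-bias probabilities by adding one more assay
# where SingleSample()/TwoSample() build their SummarizedExperiment, e.g. (assuming the
# lociCountsDispersions assay name described in the MBASED vignette, and the F1_414_disp
# column name used after the final rename below):
#   lociCountsDispersions = matrix(annotatedData$F1_414_disp, ncol = 1,
#                                  dimnames = list(names(mySNVs), 'mySample'))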
-refBias414 <- CalcReferenceBias(inputVCF, "F1_414") # 0.5107107 -refBias415 <- CalcReferenceBias(inputVCF, "F1_415") # 0.5152294 +refBias414 <- CalcReferenceBias(inputVCF, "F1_414") # 0.5107107 unfiltered, 0.509284 filtered +refBias415 <- CalcReferenceBias(inputVCF, "F1_415") # 0.5152294 unfiltered, 0.513306 filtered + +dispersion414 <- CalcDispersion(inputVCF, "F1_414", refBias414) +dispersion415 <- CalcDispersion(inputVCF, "F1_415", refBias415) + +inputVCF$disp414 <- dispersion414 +inputVCF$disp415 <- dispersion415 +# Now that necessary parameters have been calculated, phase the data so that all REF is from Da_Ae inputVCF.Ae <- inputVCF[inputVCF$Ae_GT == 1, ] inputVCF.Ae$F1_414_refBias <- refBias414 inputVCF.Ae$F1_415_refBias <- refBias415 @@ -256,27 +339,8 @@ phasedData.unordered <- rbind(inputVCF.Ae, inputVCF.Ol) attach(phasedData.unordered) phasedData <- phasedData.unordered[order(CHROM, POS), ] detach(phasedData.unordered) -``` - -```{r Annotating and Filtering Data} -annotatedData <- AnnotateSNPs(SNPdata = phasedData, gff.mRNA = gff.mRNA) - -# Remove SNVs with no associated genes -annotatedData <- filter(annotatedData, !is.na(GeneID)) -dim(annotatedData) - -# The following filters are applied to avoid spurious ASE calls. - -# Require at least 5 reads and at least 10% of all reads support each allele -annotatedData <- CoverageFilter(annotatedData) -dim(annotatedData) - -# Discard all SNVs within 10 bp of another called variant (if one read spans both SNVs, some independence assumptions are violated) -annotatedData <- RangeFilter(annotatedData) -dim(annotatedData) - -# NOT AVAILABLE FOR B NAPUS. Discard all SNVs in highly repetitive genomic regions, which we defined as regions with sequence identity of > 95% to another genomic region based on selfChain Link track from UCSC genome browser. +save(phasedData, file = "phasedData.Rdata") ``` ```{r Single Sample Analysis} From 64fd2179243e19ef887b812f52aecd82a252b123 Mon Sep 17 00:00:00 2001 From: LynnLy Date: Wed, 14 Feb 2018 11:27:32 -0800 Subject: [PATCH 5/5] Double check dispersion --- F1/MBASED_F1.rmd | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/F1/MBASED_F1.rmd b/F1/MBASED_F1.rmd index 398fd0c..5ecfd66 100644 --- a/F1/MBASED_F1.rmd +++ b/F1/MBASED_F1.rmd @@ -19,6 +19,7 @@ Outputs: major allele frequencies, frequency differences, list of genes displayi ```{r setup, include=FALSE} library(MBASED) library(tidyverse) +library(VGAM) ``` ```{r AnnotateSNPs Function} @@ -318,8 +319,8 @@ refBias415 <- CalcReferenceBias(inputVCF, "F1_415") # 0.5152294 unfiltered, 0.51 dispersion414 <- CalcDispersion(inputVCF, "F1_414", refBias414) dispersion415 <- CalcDispersion(inputVCF, "F1_415", refBias415) -inputVCF$disp414 <- dispersion414 -inputVCF$disp415 <- dispersion415 +inputVCF$F1_414_disp <- dispersion414 +inputVCF$F1_415_disp <- dispersion415 # Now that necessary parameters have been calculated, phase the data so that all REF is from Da_Ae inputVCF.Ae <- inputVCF[inputVCF$Ae_GT == 1, ]
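
A small consistency check could follow the phasing step once these patches are applied. The sketch below only assumes objects already created in the Phasing Data chunk (`phasedData`, `inputVCF.Ae`, `inputVCF.Ol`) and the existing count columns; it is an editor suggestion, not part of the commits above.

```{r Phasing sanity check}
# Sketch only: confirm that phasing preserved the SNV set and left the counts consistent.
stopifnot(nrow(phasedData) == nrow(inputVCF.Ae) + nrow(inputVCF.Ol))        # no SNVs lost
stopifnot(!any(duplicated(paste(phasedData$CHROM, phasedData$POS))))        # no SNV phased twice
stopifnot(all(phasedData$F1_414_RO + phasedData$F1_414_AO <= phasedData$F1_414_DP))  # counts never exceed depth
stopifnot(all(phasedData$F1_415_RO + phasedData$F1_415_AO <= phasedData$F1_415_DP))
```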