From f843bceabd8e662de720abc4122b0997854d4258 Mon Sep 17 00:00:00 2001 From: LynnLy Date: Thu, 8 Feb 2018 19:08:46 -0800 Subject: [PATCH 1/5] Measuring system time --- F1/MBASED_F1.rmd | 65 ++++++++++++++++++++++++++++++++---------------- 1 file changed, 43 insertions(+), 22 deletions(-) diff --git a/F1/MBASED_F1.rmd b/F1/MBASED_F1.rmd index 2297950..593d6b1 100644 --- a/F1/MBASED_F1.rmd +++ b/F1/MBASED_F1.rmd @@ -4,17 +4,17 @@ author: Lynn Ly output: html_document --- -Author comment -File description comment, including purpose of program, inputs, and outputs -source() and library() statements -Function definitions -Executed statements, if applicable (e.g., print, plot) - -Purpose: This uses the MBASED package to analyze allele specific expression. - Single sample analysis: Find genes that display allelic imbalance ie. not expressed 0.5/0.5. Default threshold: 0.7 - Two sample analysis: Find genes that display different ASE ratios between two samples. Doesn't matter what the actual ratios are, as long as they differ by over 0.2 -Inputs: Pre-filtered VCF file, simplified gff file with just ranges and feature name -Outputs: major allele frequencies, frequency differences, list of genes displaying ASE +Author comment +File description comment, including purpose of program, inputs, and outputs +source() and library() statements +Function definitions +Executed statements, if applicable (e.g., print, plot) + +Purpose: This uses the MBASED package to analyze allele specific expression. + Single sample analysis: Find genes that display allelic imbalance ie. not expressed 0.5/0.5. Default threshold: 0.7 + Two sample analysis: Find genes that display different ASE ratios between two samples with the SAME HAPLOTYPE. Doesn't matter what the actual ratios are, as long as they differ by over 0.2 +Inputs: Pre-filtered VCF file, simplified gff file with just ranges and feature name +Outputs: major allele frequencies, frequency differences, list of genes displaying ASE ```{r setup, include=FALSE} library(MBASED) @@ -64,7 +64,7 @@ SummarizeASEResults_1s <- function(MBASEDOutput) { pValueHeterogeneity = assays(MBASEDOutput)$pValueHeterogeneity[,1]) lociOutputGR <- rowRanges(metadata(MBASEDOutput)$locusSpecificResults) - lociOutputGR$allele1IsMajor <- assays(metadata(MBASEDOutput)$locusSpecificResults)$allele1IsMajor[,1] + lociOutputGR$allele1IsMajor <- assays(metadata(MBASEDOutput)$locusSpecificResults)$allele1IsMajor[,1] lociOutputGR$MAF <- assays(metadata(MBASEDOutput)$locusSpecificResults)$MAF[,1] lociOutputList <- split(lociOutputGR, factor(lociOutputGR$aseID, levels = unique(lociOutputGR$aseID))) return(list(geneOutput=geneOutputDF, locusOutput=lociOutputList)) @@ -78,7 +78,9 @@ ExtractASE <- function(MBASEDOutput) { results <- SummarizeASEResults_1s(MBASEDOutput) - ASEindexes <- results$geneOutput$pValueASE * 46577 < 0.05 & + adjustedP <- p.adjust(results$geneOutput$pValueASE, method = "BH", n = length(results$geneOutput$pValueASE)) + + ASEindexes <- adjustedP < 0.05 & results$geneOutput$majorAlleleFrequency > 0.7 significantResults <- list(results$geneOutput[ASEindexes, ], @@ -121,18 +123,25 @@ MBASED.Ol <- SingleSample(annotatedData, mySNVs, genotype = "Ol") save(MBASED.F1.414, file = "MBASED.F1.Ol.Rdata") ``` -```{r Data to use} -# Change the following line if not using B. 
napus +```{r Data Filtering} gff.mRNA <- read.table("/Network/Servers/avalanche.plb.ucdavis.edu/Volumes/Mammoth/Users/ruijuanli/Reference/B.napus/gff.mRNA") -# Data to use +# Data to use (basicallly, VCF data pre-filtered for quality) load("/Network/Servers/avalanche.plb.ucdavis.edu/Volumes/Mammoth/Users/mizukikadowaki/project/output/F1.young.GQ.filtered.Rdata") annotatedData <- AnnotateSNPs(SNPdata = F1.young.GQ.filtered, gff.mRNA = gff.mRNA) # Remove SNVs with no associated genes annotatedData <- filter(annotatedData, !is.na(GeneID)) - + +# The following filters are applied to avoid spurious ASE calls. + +# Require at least 5 reads and at least 10% of all reads support each allele + +# Discard all SNVs within 10 bp of another called variant (if one read spans both SNVs, some independence assumptions are violated) + +# NOT AVAILABLE FOR B NAPUS. Discard all SNVs in highly repetitive genomic regions, which we defined as regions with sequence identity of > 95% to another genomic region based on selfChain Link track from UCSC genome browser. + mySNVs <- GRanges( seqnames = annotatedData$CHROM, ranges = IRanges(start = annotatedData$POS, width = 1), @@ -143,7 +152,7 @@ mySNVs <- GRanges( names(mySNVs) <- annotatedData$GeneID ``` -In Progress: TwoSample analysis +In Progress: TwoSample analysis ```{r Two Sample Function} TwoSample <- function(annotatedData, mySNVs, genotype1, genotype2){ @@ -163,16 +172,29 @@ TwoSample <- function(annotatedData, mySNVs, genotype1, genotype2){ MBASEDOutput <- runMBASED( ASESummarizedExperiment = mySample, isPhased = FALSE, - numSim = 0, + numSim = 10 #BPPARAM = SerialParam() # Default: No paralellization ) return(MBASEDOutput) -} +} + ``` ```{r Run Two Sample} -MBASED.F1.414.vs.F1.415 <- TwoSample(annotatedData, mySNVs, "F1_414", "F1_415") +# Estimating two sample run time using smaller datasets +annotatedData.trimmed <- annotatedData[400:600 ,] + +mySNVs.trimmed <- GRanges( + seqnames = annotatedData.trimmed$CHROM, + ranges = IRanges(start = annotatedData.trimmed$POS, width = 1), + aseID = as.vector(annotatedData.trimmed$GeneID), + allele1 = annotatedData.trimmed$REF, + allele2 = annotatedData.trimmed$ALT) + +time100.10 <- system.time(MBASED.F1.414.vs.F1.415 <- TwoSample(annotatedData.trimmed, mySNVs.trimmed, "F1_414", "F1_415")) + +# 100: 9 seconds ``` @@ -197,4 +219,3 @@ dim(sig.Ae[[1]]) #11995 SNVs found to be in ASE sig.Ol <- ExtractASE(MBASED.Ol) dim(sig.Ol[[1]]) #11990 SNVs found to be in ASE ``` - From 522bb6fcebcf9cf64177093929b7b616ac49f989 Mon Sep 17 00:00:00 2001 From: LynnLy Date: Thu, 8 Feb 2018 23:04:46 -0800 Subject: [PATCH 2/5] Finished phasing, added filtering. To do: Finish rangeFilter --- F1/MBASED_F1.rmd | 176 ++++++++++++++++++++++++++++++++++------------- 1 file changed, 128 insertions(+), 48 deletions(-) diff --git a/F1/MBASED_F1.rmd b/F1/MBASED_F1.rmd index 593d6b1..862817b 100644 --- a/F1/MBASED_F1.rmd +++ b/F1/MBASED_F1.rmd @@ -22,6 +22,7 @@ library(tidyverse) ``` ```{r AnnotateSNPs Function} +# Modified from Ruijuan's function in helpler.R AnnotateSNPs <- function(SNPdata, gff.mRNA){ # Combines SNP/SNV loci with gene names # @@ -73,11 +74,10 @@ SummarizeASEResults_1s <- function(MBASEDOutput) { ExtractASE <- function(MBASEDOutput) { # Extract only desired genes # Modify ASEindexes to vary the strictness of selection. 
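# (Editor note) p.adjust(..., method = "BH") compares the i-th smallest p-value to
# i/m * 0.05 rather than penalising every test by the full m, so it is less conservative
# than the Bonferroni-style `pValueASE * 46577 < 0.05` cutoff it replaces. Toy illustration:
#   p.adjust(c(1e-6, 0.01, 0.04), method = "BH")          # 3e-06 0.015 0.040
#   p.adjust(c(1e-6, 0.01, 0.04), method = "bonferroni")  # 3e-06 0.030 0.120
# (n = length(p) is already the default, so it can be omitted.)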
- # TODO: Implement Benjamini-Hochberg correction (punish p-values based on rank) - # Currently using Bonferroni as a crude correction measure (punishes all p-values equally) results <- SummarizeASEResults_1s(MBASEDOutput) + # Apply Benjamini-Hochberg (fdr) correction for multiple testing adjustedP <- p.adjust(results$geneOutput$pValueASE, method = "BH", n = length(results$geneOutput$pValueASE)) ASEindexes <- adjustedP < 0.05 & @@ -89,10 +89,10 @@ ExtractASE <- function(MBASEDOutput) { } ``` -```{r Single Sample Function} +```{r Single and Two Sample Function} # Please change numSim to at least 1,000,000 for final analysis -SingleSample <- function(annotatedData, mySNVs, genotype){ +SingleSample <- function(annotatedData, mySNVs, genotype, numSim = 0){ # create RangedSummarizedExperiment object as input for runMBASED # then runMBASED @@ -107,41 +107,139 @@ SingleSample <- function(annotatedData, mySNVs, genotype){ MBASEDOutput <- runMBASED( ASESummarizedExperiment = mySample, - numSim = 1000, - isPhased = FALSE) + numSim = numSim, + isPhased = TRUE) return(MBASEDOutput) } -MBASED.F1.414 <- SingleSample(annotatedData, mySNVs, genotype = "F1_414") -save(MBASED.F1.414, file = "MBASED.F1.414.Rdata") -MBASED.F1.415 <- SingleSample(annotatedData, mySNVs, genotype = "F1_415") -save(MBASED.F1.414, file = "MBASED.F1.415.Rdata") -MBASED.Ae <- SingleSample(annotatedData, mySNVs, genotype = "Ae") -save(MBASED.F1.414, file = "MBASED.F1.Ae.Rdata") -MBASED.Ol <- SingleSample(annotatedData, mySNVs, genotype = "Ol") -save(MBASED.F1.414, file = "MBASED.F1.Ol.Rdata") +TwoSample <- function(annotatedData, mySNVs, genotype1, genotype2, numSim = 0){ + RO1 <- paste(genotype1, "RO", sep = "_") + AO1 <- paste(genotype1, "AO", sep = "_") + RO2 <- paste(genotype2, "RO", sep = "_") + AO2 <- paste(genotype2, "AO", sep = "_") + + mySample <- SummarizedExperiment( + assays = list(lociAllele1Counts = matrix(c(annotatedData[, RO1], annotatedData[, RO2]), ncol = 2, + dimnames = list(names(mySNVs), c(genotype1, genotype2))), + + lociAllele2Counts = matrix(c(annotatedData[, AO1], annotatedData[, AO2]), ncol = 2, + dimnames = list(names(mySNVs), c(genotype1, genotype2)))), + rowRanges=mySNVs) + + MBASEDOutput <- runMBASED( + ASESummarizedExperiment = mySample, + isPhased = TRUE, + numSim = numSim + #BPPARAM = SerialParam() # Default: No paralellization + ) + + return(MBASEDOutput) +} ``` -```{r Data Filtering} +```{r Phasing Data} gff.mRNA <- read.table("/Network/Servers/avalanche.plb.ucdavis.edu/Volumes/Mammoth/Users/ruijuanli/Reference/B.napus/gff.mRNA") # Data to use (basicallly, VCF data pre-filtered for quality) load("/Network/Servers/avalanche.plb.ucdavis.edu/Volumes/Mammoth/Users/mizukikadowaki/project/output/F1.young.GQ.filtered.Rdata") -annotatedData <- AnnotateSNPs(SNPdata = F1.young.GQ.filtered, gff.mRNA = gff.mRNA) +inputVCF <- F1.young.GQ.filtered +inputVCF.Ae <- inputVCF[inputVCF$Ae_GT == 1, ] +inputVCF.Ol <- inputVCF[inputVCF$Ol_GT == 1, ] + +# Switch the Ref and Alt alleles, and change the counts accordingly +colnames(inputVCF.Ol)[3:4] <- c("ALT", "REF") +colnames(inputVCF.Ol)[17:24] <- c("Ae_AO", "Ol_AO", "F1_414_AO", "F1_415_AO", + "Ae_RO", "Ol_RO", "F1_414_RO", "F1_415_RO") + +phasedData.unordered <- rbind(inputVCF.Ae, inputVCF.Ol) + +attach(phasedData.unordered) +phasedData <- phasedData.unordered[order(CHROM, POS), ] +detach(phasedData.unordered) +``` + +```{r Filtering Functions} + +CoverageFilter <- function(vcf) { + # Modified from Ruijuan's function in helpler.R + # Require at least 10 reads and at 
least 10% of all reads support each allele + + totalReads <- vcf$F1_414_DP + alleles <- vcf[, c(19, 23)] + vcf.filtered <- vcf[which(apply(alleles / totalReads, 1, min) > .10), ] + + totalReads <- vcf.filtered$F1_415_DP + alleles <- vcf.filtered[, c(20, 24)] + vcf.filtered <- vcf.filtered[which(apply(alleles / totalReads, 1, min) > .10), ] + + return(vcf.filtered) +} + +RangeFilter <- function(vcf) { + # Discard SNVs that are within 10 bp of another called SNV. + # This is necessary because if one read spans both SNVs, then our counts are not independent. + # There is probably a better way to do this... + + SNV.ranges <- GRanges(seqnames = Rle(SNPdata$CHROM), + ranges = IRanges(start = SNPdata$POS - 5, end = SNPdata$POS + 5), + CHROM = SNPdata$CHROM, + POS = SNPdata$POS) + + overlappedGenes <- mergeByOverlaps(SNV.ranges, SNV.ranges) + + # Get only the 10bp overlaps that were not with itself + actualOverlappedGenes <- overlappedGenes[which(overlappedGenes$POS != overlappedGenes$POS.1),] + + a <- as.data.frame(actualOverlappedGenes)[, c(8, 9, 17, 18)] + + # We don't want to get rid of all the SNVs though. We want to keep as many SNVs as possible that aren't overlapping. + # idea for tmrw morning: combine chrom and pos. + i <- 1 + while(i <= nrow(a)) { + if (a$POS[i] %in% a$POS.1) { + a <- a[-i, ] + } else { + i <- i + 1 + } + } + + rangeFilteredData <- anti_join(SNPdata, as.data.frame(actualOverlappedGenes), by = c("CHROM", "POS")) +} +``` + + +```{r Annotating and Filtering Data} +annotatedData <- AnnotateSNPs(SNPdata = phasedData, gff.mRNA = gff.mRNA) # Remove SNVs with no associated genes annotatedData <- filter(annotatedData, !is.na(GeneID)) +dim(annotatedData) # The following filters are applied to avoid spurious ASE calls. # Require at least 5 reads and at least 10% of all reads support each allele +annotatedData <- CoverageFilter(annotatedData) +dim(annotatedData) # Discard all SNVs within 10 bp of another called variant (if one read spans both SNVs, some independence assumptions are violated) +annotatedData <- RangeFilter(annotatedData) +dim(annotatedData) + +# NOT AVAILABLE FOR B NAPUS. Discard all SNVs in highly repetitive genomic regions, which we defined as regions with sequence identity of > 95% to another genomic region based on selfChain Link track from UCSC genome browser. + +``` +```{r Estimate Overdispersion and Reference Allele Bias} +# If our data presents any, we should include these metrics when creating mySNVs. +# Use the additional functions in MBASED + +``` + -# NOT AVAILABLE FOR B NAPUS. Discard all SNVs in highly repetitive genomic regions, which we defined as regions with sequence identity of > 95% to another genomic region based on selfChain Link track from UCSC genome browser. 
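# (Editor sketch, not part of the patch) Two possible refinements to the filters above,
# assuming GenomicRanges/IRanges are attached (they load with MBASED):
# 1. CoverageFilter() as written only enforces the 10% per-allele fraction; the stated
#    minimum read support per allele is not checked and could be added explicitly, e.g.
#      vcf <- vcf[vcf$F1_414_RO >= 5 & vcf$F1_414_AO >= 5, ]   # and likewise for F1_415
# 2. The 10 bp proximity rule can be written directly with countOverlaps(); note this
#    version drops *both* members of a close pair, which is stricter than keeping one.
RangeFilterStrict <- function(vcf) {   # hypothetical helper, same input as RangeFilter()
  snvWindows <- GRanges(seqnames = Rle(vcf$CHROM),
                        ranges = IRanges(start = vcf$POS - 5, end = vcf$POS + 5))
  hits <- countOverlaps(snvWindows, snvWindows)  # each window overlaps itself exactly once
  vcf[hits == 1, ]                               # keep SNVs with no neighbour within 10 bp
}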
+ +```{r Single Sample Analysis} mySNVs <- GRanges( seqnames = annotatedData$CHROM, ranges = IRanges(start = annotatedData$POS, width = 1), @@ -150,40 +248,22 @@ mySNVs <- GRanges( allele2 = annotatedData$ALT) names(mySNVs) <- annotatedData$GeneID -``` - -In Progress: TwoSample analysis - -```{r Two Sample Function} -TwoSample <- function(annotatedData, mySNVs, genotype1, genotype2){ - RO1 <- paste(genotype1, "RO", sep = "_") - AO1 <- paste(genotype1, "AO", sep = "_") - RO2 <- paste(genotype2, "RO", sep = "_") - AO2 <- paste(genotype2, "AO", sep = "_") - - mySample <- SummarizedExperiment( - assays = list(lociAllele1Counts = matrix(c(annotatedData[, RO1], annotatedData[, RO2]), ncol = 2, - dimnames = list(names(mySNVs), c(genotype1, genotype2))), - - lociAllele2Counts = matrix(c(annotatedData[, AO1], annotatedData[, AO2]), ncol = 2, - dimnames = list(names(mySNVs), c(genotype1, genotype2)))), - rowRanges=mySNVs) - - MBASEDOutput <- runMBASED( - ASESummarizedExperiment = mySample, - isPhased = FALSE, - numSim = 10 - #BPPARAM = SerialParam() # Default: No paralellization - ) - - return(MBASEDOutput) -} +MBASED.F1.414 <- SingleSample(annotatedData, mySNVs, genotype = "F1_414", numSim = 0) +save(MBASED.F1.414, file = "MBASED.F1.414.Rdata") +MBASED.F1.415 <- SingleSample(annotatedData, mySNVs, genotype = "F1_415", numSim = 0) +save(MBASED.F1.414, file = "MBASED.F1.415.Rdata") +MBASED.Ae <- SingleSample(annotatedData, mySNVs, genotype = "Ae", numSim = 0) +save(MBASED.F1.414, file = "MBASED.F1.Ae.Rdata") +MBASED.Ol <- SingleSample(annotatedData, mySNVs, genotype = "Ol", numSim = 0) +save(MBASED.F1.414, file = "MBASED.F1.Ol.Rdata") ``` -```{r Run Two Sample} +In Progress: Two Sample analysis + +```{r Two Sample Analysis} # Estimating two sample run time using smaller datasets -annotatedData.trimmed <- annotatedData[400:600 ,] +annotatedData.trimmed <- annotatedData[1:500 ,] mySNVs.trimmed <- GRanges( seqnames = annotatedData.trimmed$CHROM, @@ -194,6 +274,7 @@ mySNVs.trimmed <- GRanges( time100.10 <- system.time(MBASED.F1.414.vs.F1.415 <- TwoSample(annotatedData.trimmed, mySNVs.trimmed, "F1_414", "F1_415")) +time100.10 # 100: 9 seconds ``` @@ -208,7 +289,6 @@ sig.F1.414 <- ExtractASE(MBASED.F1.414) dim(sig.F1.414[[1]]) #1946 SNVs found to be in ASE summary(sig.F1.414[[2]]) - sig.F1.415 <- ExtractASE(MBASED.F1.415) dim(sig.F1.415[[1]]) #2164 SNVs found to be in ASE head(sig.F1.415[[2]]$GeneID) From 25ee524c407c0f47e670342f37c97cea4bc69631 Mon Sep 17 00:00:00 2001 From: LynnLy Date: Tue, 13 Feb 2018 13:43:07 -0800 Subject: [PATCH 3/5] Working on dispersion practice --- F1/MBASED_F1.rmd | 176 ++++++++++++++++++++------------- F1/overdispersion_practice.Rmd | 174 ++++++++++++++++++++++++++++++++ 2 files changed, 284 insertions(+), 66 deletions(-) create mode 100644 F1/overdispersion_practice.Rmd diff --git a/F1/MBASED_F1.rmd b/F1/MBASED_F1.rmd index 862817b..85ac82e 100644 --- a/F1/MBASED_F1.rmd +++ b/F1/MBASED_F1.rmd @@ -92,7 +92,7 @@ ExtractASE <- function(MBASEDOutput) { ```{r Single and Two Sample Function} # Please change numSim to at least 1,000,000 for final analysis -SingleSample <- function(annotatedData, mySNVs, genotype, numSim = 0){ +SingleSample <- function(annotatedData, mySNVs, genotype, numSim = 0, refBias = 0.5){ # create RangedSummarizedExperiment object as input for runMBASED # then runMBASED @@ -101,7 +101,10 @@ SingleSample <- function(annotatedData, mySNVs, genotype, numSim = 0){ mySample <- SummarizedExperiment( assays = list(lociAllele1Counts = matrix(annotatedData[, RO], ncol = 
1, dimnames = list(names(mySNVs), 'mySample')), - lociAllele2Counts = matrix(annotatedData[, AO], ncol = 1, dimnames = list(names(mySNVs), 'mySample')) + lociAllele2Counts = matrix(annotatedData[, AO], ncol = 1, dimnames = list(names(mySNVs), 'mySample')), + lociAllele1CountsNoASEProbs=matrix(annotatedData$refBias, + ncol=1, dimnames=list(names(mySNVs), 'mySample')) + ), rowRanges=mySNVs) @@ -118,13 +121,18 @@ TwoSample <- function(annotatedData, mySNVs, genotype1, genotype2, numSim = 0){ AO1 <- paste(genotype1, "AO", sep = "_") RO2 <- paste(genotype2, "RO", sep = "_") AO2 <- paste(genotype2, "AO", sep = "_") + RAB1 <- paste(genotype1, "refBias", sep = "_") + RAB2 <- paste(genotype2, "refBias", sep = "_") mySample <- SummarizedExperiment( assays = list(lociAllele1Counts = matrix(c(annotatedData[, RO1], annotatedData[, RO2]), ncol = 2, dimnames = list(names(mySNVs), c(genotype1, genotype2))), lociAllele2Counts = matrix(c(annotatedData[, AO1], annotatedData[, AO2]), ncol = 2, - dimnames = list(names(mySNVs), c(genotype1, genotype2)))), + dimnames = list(names(mySNVs), c(genotype1, genotype2))), + + lociAllele1CountsNoASEProbs = matrix(c(annotatedData[, RAB1], annotatedData[, RAB2]), + ncol=2, dimnames=list(names(mySNVs), c(genotype1, genotype2)))), rowRanges=mySNVs) MBASEDOutput <- runMBASED( @@ -138,28 +146,6 @@ TwoSample <- function(annotatedData, mySNVs, genotype1, genotype2, numSim = 0){ } ``` -```{r Phasing Data} -gff.mRNA <- read.table("/Network/Servers/avalanche.plb.ucdavis.edu/Volumes/Mammoth/Users/ruijuanli/Reference/B.napus/gff.mRNA") - -# Data to use (basicallly, VCF data pre-filtered for quality) -load("/Network/Servers/avalanche.plb.ucdavis.edu/Volumes/Mammoth/Users/mizukikadowaki/project/output/F1.young.GQ.filtered.Rdata") - -inputVCF <- F1.young.GQ.filtered -inputVCF.Ae <- inputVCF[inputVCF$Ae_GT == 1, ] -inputVCF.Ol <- inputVCF[inputVCF$Ol_GT == 1, ] - -# Switch the Ref and Alt alleles, and change the counts accordingly -colnames(inputVCF.Ol)[3:4] <- c("ALT", "REF") -colnames(inputVCF.Ol)[17:24] <- c("Ae_AO", "Ol_AO", "F1_414_AO", "F1_415_AO", - "Ae_RO", "Ol_RO", "F1_414_RO", "F1_415_RO") - -phasedData.unordered <- rbind(inputVCF.Ae, inputVCF.Ol) - -attach(phasedData.unordered) -phasedData <- phasedData.unordered[order(CHROM, POS), ] -detach(phasedData.unordered) -``` - ```{r Filtering Functions} CoverageFilter <- function(vcf) { @@ -180,35 +166,97 @@ CoverageFilter <- function(vcf) { RangeFilter <- function(vcf) { # Discard SNVs that are within 10 bp of another called SNV. # This is necessary because if one read spans both SNVs, then our counts are not independent. - # There is probably a better way to do this... 
+ # There may be a more elegant way to do this - SNV.ranges <- GRanges(seqnames = Rle(SNPdata$CHROM), - ranges = IRanges(start = SNPdata$POS - 5, end = SNPdata$POS + 5), - CHROM = SNPdata$CHROM, - POS = SNPdata$POS) + SNV.ranges <- GRanges(seqnames = Rle(vcf$CHROM), + ranges = IRanges(start = vcf$POS - 5, end = vcf$POS + 5), + CHROM = vcf$CHROM, + POS = vcf$POS) overlappedGenes <- mergeByOverlaps(SNV.ranges, SNV.ranges) - # Get only the 10bp overlaps that were not with itself - actualOverlappedGenes <- overlappedGenes[which(overlappedGenes$POS != overlappedGenes$POS.1),] + # Get only the 10bp overlaps that were not self-overlaps + overlappedGenes <- overlappedGenes[which(overlappedGenes$POS != overlappedGenes$POS.1),] - a <- as.data.frame(actualOverlappedGenes)[, c(8, 9, 17, 18)] + # Trim to just the relevant columns, CHROM and POS + overlappedGenes <- as.data.frame(overlappedGenes)[, c(8, 9, 17, 18)] + overlappedGenes <- mutate(overlappedGenes, SNV = paste(CHROM, POS), SNV_match = paste(CHROM.1, POS.1)) # We don't want to get rid of all the SNVs though. We want to keep as many SNVs as possible that aren't overlapping. - # idea for tmrw morning: combine chrom and pos. i <- 1 - while(i <= nrow(a)) { - if (a$POS[i] %in% a$POS.1) { - a <- a[-i, ] + while(i <= nrow(overlappedGenes)) { + if (overlappedGenes$SNV[i] %in% overlappedGenes$SNV_match) { + overlappedGenes <- overlappedGenes[-i, ] } else { i <- i + 1 } } - rangeFilteredData <- anti_join(SNPdata, as.data.frame(actualOverlappedGenes), by = c("CHROM", "POS")) + rangeFilteredData <- anti_join(vcf, as.data.frame(overlappedGenes), by = c("CHROM", "POS")) +} +``` + +From Paper: We assumed that fref,SNV (0.5) is constant across all SNVs within a sample (global reference bias), and we estimated this value as +`Nref/Ntotal` +which is the ratio of all reference reads to total reads in the sample, after excluding the loci with the top 5% of read counts (trimmed for robustness) + +```{r Estimate Overdispersion and Reference Allele Bias} +# If our data presents any, we should include these metrics when creating mySNVs. +# Use the additional functions in MBASED + +# Reference Allele Bias is sample-specific and should be recalculated for each sample. +CalcReferenceBias <- function(vcf, genotype) { + RO <- paste(genotype, "RO", sep = "_") + DP <- paste(genotype, "DP", sep = "_") + + cutoff <- 0.95 * nrow(vcf) + orderedRO <- sort(vcf[, RO]) + orderedDP <- sort(vcf[, DP]) + + Nref <- sum(head(orderedRO, cutoff)) + Ntotal <- sum(head(orderedDP, cutoff)) + + globalReferenceBias <- Nref / Ntotal + + return(globalReferenceBias) +} + +# TODO: Figure out how to calculate dispersion +CalcDispersion <- function(vcf, genotype) { + return(0) } ``` +```{r Phasing Data} +gff.mRNA <- read.table("/Network/Servers/avalanche.plb.ucdavis.edu/Volumes/Mammoth/Users/ruijuanli/Reference/B.napus/gff.mRNA") + +# Data to use (basically, VCF data pre-filtered for quality) +load("/Network/Servers/avalanche.plb.ucdavis.edu/Volumes/Mammoth/Users/mizukikadowaki/project/output/F1.young.GQ.filtered.Rdata") + +inputVCF <- F1.young.GQ.filtered + +# Calculate the reference bias for each sample separately. 
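# (Editor note, sketch only) CalcReferenceBias() above sorts RO and DP independently
# before trimming, which decouples a locus's reference reads from its total reads.
# The quoted rule drops whole loci in the top 5% of read counts; a paired version of
# the trim could look like this (hypothetical helper, same arguments):
CalcReferenceBiasByLocus <- function(vcf, genotype) {
  RO <- paste(genotype, "RO", sep = "_")
  DP <- paste(genotype, "DP", sep = "_")
  keep <- vcf[, DP] <= quantile(vcf[, DP], 0.95)  # drop loci with the top 5% coverage
  sum(vcf[keep, RO]) / sum(vcf[keep, DP])
}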
+refBias414 <- CalcReferenceBias(inputVCF, "F1_414") # 0.5107107 +refBias415 <- CalcReferenceBias(inputVCF, "F1_415") # 0.5152294 + +inputVCF.Ae <- inputVCF[inputVCF$Ae_GT == 1, ] +inputVCF.Ae$F1_414_refBias <- refBias414 +inputVCF.Ae$F1_415_refBias <- refBias415 +inputVCF.Ol <- inputVCF[inputVCF$Ol_GT == 1, ] +inputVCF.Ol$F1_414_refBias <- 1 - refBias414 +inputVCF.Ol$F1_415_refBias <- 1 - refBias415 + +# Switch the Ref and Alt alleles, and change the counts accordingly +colnames(inputVCF.Ol)[3:4] <- c("ALT", "REF") +colnames(inputVCF.Ol)[17:24] <- c("Ae_AO", "Ol_AO", "F1_414_AO", "F1_415_AO", + "Ae_RO", "Ol_RO", "F1_414_RO", "F1_415_RO") + +phasedData.unordered <- rbind(inputVCF.Ae, inputVCF.Ol) + +attach(phasedData.unordered) +phasedData <- phasedData.unordered[order(CHROM, POS), ] +detach(phasedData.unordered) +``` ```{r Annotating and Filtering Data} annotatedData <- AnnotateSNPs(SNPdata = phasedData, gff.mRNA = gff.mRNA) @@ -230,14 +278,6 @@ dim(annotatedData) # NOT AVAILABLE FOR B NAPUS. Discard all SNVs in highly repetitive genomic regions, which we defined as regions with sequence identity of > 95% to another genomic region based on selfChain Link track from UCSC genome browser. ``` -```{r Estimate Overdispersion and Reference Allele Bias} -# If our data presents any, we should include these metrics when creating mySNVs. -# Use the additional functions in MBASED - -``` - - - ```{r Single Sample Analysis} mySNVs <- GRanges( @@ -249,21 +289,15 @@ mySNVs <- GRanges( names(mySNVs) <- annotatedData$GeneID -MBASED.F1.414 <- SingleSample(annotatedData, mySNVs, genotype = "F1_414", numSim = 0) +MBASED.F1.414 <- SingleSample(annotatedData, mySNVs, genotype = "F1_414", numSim = 0, refBias = refBias414) save(MBASED.F1.414, file = "MBASED.F1.414.Rdata") -MBASED.F1.415 <- SingleSample(annotatedData, mySNVs, genotype = "F1_415", numSim = 0) +MBASED.F1.415 <- SingleSample(annotatedData, mySNVs, genotype = "F1_415", numSim = 0, refBias = refBias415) save(MBASED.F1.414, file = "MBASED.F1.415.Rdata") -MBASED.Ae <- SingleSample(annotatedData, mySNVs, genotype = "Ae", numSim = 0) -save(MBASED.F1.414, file = "MBASED.F1.Ae.Rdata") -MBASED.Ol <- SingleSample(annotatedData, mySNVs, genotype = "Ol", numSim = 0) -save(MBASED.F1.414, file = "MBASED.F1.Ol.Rdata") ``` -In Progress: Two Sample analysis - ```{r Two Sample Analysis} # Estimating two sample run time using smaller datasets -annotatedData.trimmed <- annotatedData[1:500 ,] +annotatedData.trimmed <- annotatedData[1:10000, ] mySNVs.trimmed <- GRanges( seqnames = annotatedData.trimmed$CHROM, @@ -272,18 +306,34 @@ mySNVs.trimmed <- GRanges( allele1 = annotatedData.trimmed$REF, allele2 = annotatedData.trimmed$ALT) -time100.10 <- system.time(MBASED.F1.414.vs.F1.415 <- TwoSample(annotatedData.trimmed, mySNVs.trimmed, "F1_414", "F1_415")) +time100.10 <- system.time(MBASED.F1.414.vs.F1.415 <- TwoSample(annotatedData.trimmed, mySNVs.trimmed, "F1_414", "F1_415", numSim = 10)) + +n = 1 +while(n <= 10) { + timeTaken <- rbind(timeTaken, c(system.time(TwoSample(annotatedData.trimmed, mySNVs.trimmed, "F1_414", "F1_415", numSim = n)), 10000, n)) + n = n * 10 + print(paste("Next: N =", n)) +} -time100.10 -# 100: 9 seconds +t <- as.data.frame(timeTaken) + +p <- ggplot(data = t[t$SNVs == 100, ], aes(y = sys.self, x= log(numSims))) + + geom_point(aes(color = "100")) + + geom_point(data = t[t$SNVs == 1000, ], aes(color = "1000")) + + geom_point(data = t[t$SNVs ==5000, ], aes(color ="5000")) #+ +# geom_point(data = t[t$SNVs == 10000, ], aes(color = "10000")) + +p2 <- 
ggplot(data = t[t$SNVs == 100, ], aes(y = elapsed, x= log(numSims))) + + geom_point(aes(color = "100")) + + geom_point(data = t[t$SNVs == 1000, ], aes(color = "1000")) + + geom_point(data = t[t$SNVs ==5000, ], aes(color ="5000")) + + geom_point(data = t[t$SNVs == 10000, ], aes(color = "10000")) ``` ```{r Analysis and Summary} load("MBASED.F1.414.Rdata") load("MBASED.F1.415.Rdata") -load("MBASED.Ae.Rdata") -load("MBASED.Ol.Rdata") sig.F1.414 <- ExtractASE(MBASED.F1.414) dim(sig.F1.414[[1]]) #1946 SNVs found to be in ASE @@ -292,10 +342,4 @@ summary(sig.F1.414[[2]]) sig.F1.415 <- ExtractASE(MBASED.F1.415) dim(sig.F1.415[[1]]) #2164 SNVs found to be in ASE head(sig.F1.415[[2]]$GeneID) - -sig.Ae <- ExtractASE(MBASED.Ae) -dim(sig.Ae[[1]]) #11995 SNVs found to be in ASE - -sig.Ol <- ExtractASE(MBASED.Ol) -dim(sig.Ol[[1]]) #11990 SNVs found to be in ASE ``` diff --git a/F1/overdispersion_practice.Rmd b/F1/overdispersion_practice.Rmd new file mode 100644 index 0000000..fde72e5 --- /dev/null +++ b/F1/overdispersion_practice.Rmd @@ -0,0 +1,174 @@ +--- +title: "R Notebook" +output: html_notebook +--- +```{r} +library(MBASED) +library(tidyverse) +library(VGAM) +``` + +```{r} +load("annotatedData.Rdata") +load("/Network/Servers/avalanche.plb.ucdavis.edu/Volumes/Mammoth/Users/mizukikadowaki/project/output/F1.young.GQ.filtered.Rdata") +F1.414 <- F1.young.GQ.filtered +F1.414 <- mutate(F1.414, MAF = F1_414_RO / F1_414_DP) +``` + +```{r} +# log likelihood function +ll <- function(alpha, beta) { + x <- F1.414$F1_414_RO + total <- F1.414$F1_414_DP + -sum(VGAM::dbetabinom.ab(x, total, alpha, beta, log = TRUE)) +} + +m <- mle(ll, start = list(alpha = 1, beta = 1), method = "L-BFGS-B", lower = c(0.0001, 0.0001)) + +ab <- coef(m) + +alpha0 <- ab[1] +beta0 <- ab[2] + +F1.414 %>% + ggplot() + + geom_histogram(aes(MAF, y = ..density..), binwidth = .02) + + stat_function(fun = function(x) dbeta(x, alpha0, beta0), color = "red", + size = 1) + + xlab("Major Allele Frequency (MAF)") +``` + +```{r} +sim <- data.frame(a = c(alpha0, alpha0 + 10, alpha0 + 100), + b = c(beta0, beta0 + 8, beta0 + 10)) %>% + group_by(a, b) %>% + do(data_frame(x = seq(0, 1, .001), y = dbeta(x, .$a, .$b))) %>% + mutate(Parameters = paste0("\u03B1 = ", a, ", \u03B2 = ", b)) %>% + ungroup %>% + mutate(Parameters = factor(Parameters, levels = unique(Parameters))) + +sim %>% ggplot(aes(x, y, color = Parameters)) + geom_line() + + xlim(0, 1) + ylab("Density of beta") + + xlab("Major Allele Frequency") +``` +BETA BINOMIAL REGRESSION + +```{r} +prior_mu = alpha0 / (alpha0 + beta0) + +eb <- F1.414 %>% + mutate(eb_estimate = (F1_414_RO + alpha0) / (F1_414_DP + alpha0 + beta0)) %>% + mutate(alpha1 = F1_414_RO + alpha0, + beta1 = F1_414_AO + beta0) %>% + arrange(desc(eb_estimate)) + +ggplot(eb, aes(MAF, eb_estimate, color = F1_414_DP)) + + geom_hline(yintercept = alpha0 / (alpha0 + beta0), color = "red", lty = 2) + + geom_point() + + geom_abline(color = "red") + + scale_colour_gradient(trans = "log", breaks = 10 ^ (1:5)) + + xlab("Major Allele Frequency") + + ylab("Empirical Bayes MAF") + +``` + + +```{r} +eb %>% + ggplot(aes(F1_414_DP, MAF)) + + geom_point() + + geom_smooth(method = "lm", se = FALSE) + + stat_summary(fun.y = var, geom = "point", color = "blue") + + scale_x_log10() + +eb %>% + ggplot(aes(F1_414_DP, MAF)) + + stat_summary(fun.y = var, geom = "point") + + scale_x_log10() +``` + +```{r} +eb %>% + ggplot(aes(x = F1_414_DP, y = eb_estimate)) + + geom_point() + + scale_x_log10() + + geom_hline(color = "red", yintercept = prior_mu) + + 
#facet_wrap(~type) + + #ylab("average") + + geom_smooth(method = "lm") + +eb %>% + ggplot(aes(x = F1_414_DP, y = MAF)) + + geom_point() + + scale_x_log10() + + geom_hline(color = "red", yintercept = prior_mu) + + #facet_wrap(~type) + + #ylab("average") + + geom_smooth(method = "lm") +``` +BETA BINOMIAL REGRESSION + + +```{r} +library(gamlss) +library(broom) + +fit <- gamlss(cbind(F1_414_RO, F1_414_DP - F1_414_RO) ~ log(F1_414_DP), +data = eb, +family = BB(mu.link = "identity")) + +fit <- gamlss(cbind(F1_414_RO, F1_414_AO) ~ log(F1_414_DP), + data = eb, + family = BB(mu.link = "identity")) + +td <- tidy(fit) +td + +mu_0 <- td$estimate[1] +mu_AB <- td$estimate[2] +sigma <- exp(td$estimate[3]) + +crossing(x = seq(0.1, .9, .01), AB = c(1, 10, 100, 1000, 10000)) %>% + mutate(density = dbeta(x, (mu_0 + mu_AB * log(AB)) / sigma, + (1 - (mu_0 + mu_AB * log(AB))) / sigma)) %>% + mutate(AB = factor(AB)) %>% + ggplot(aes(x, density, color = AB, group = AB)) + + geom_line() + + xlab("Batting average") + + ylab("Prior density") +``` + + + +```{r Find Rho} + +muRho <- MBASED:::getMuRho(alpha0, beta0) + +# log likelihood function +ll <- function(rho) { + x <- F1.414$F1_414_RO + total <- F1.414$F1_414_DP + mu <- F1.414$F1_414_refBias + -sum(VGAM::dbetabinom(x, total, mu, rho, log = TRUE)) +} + +m <- mle(ll, start = list(rho = 0), method = "L-BFGS-B", lower = c(0.000001), upper = c(0.999)) + +ab <- coef(m) + +alpha0 <- ab[1] +beta0 <- ab[2] + +F1.414 %>% + ggplot() + + geom_histogram(aes(average, y = ..density..), binwidth = .02) + + stat_function(fun = function(x) dbeta(x, alpha0, beta0), color = "red", + size = 1) + +``` +Calculate rho at one locus + +```{r} + +``` + From a1abde3b729f42a07ecc3ca632c41915ee8638bc Mon Sep 17 00:00:00 2001 From: LynnLy Date: Tue, 13 Feb 2018 19:36:29 -0800 Subject: [PATCH 4/5] Finished adding function to calculate dispersion --- F1/MBASED_F1.rmd | 120 ++++++++++++++++++++++++++++++++++++----------- 1 file changed, 92 insertions(+), 28 deletions(-) diff --git a/F1/MBASED_F1.rmd b/F1/MBASED_F1.rmd index 85ac82e..398fd0c 100644 --- a/F1/MBASED_F1.rmd +++ b/F1/MBASED_F1.rmd @@ -221,24 +221,107 @@ CalcReferenceBias <- function(vcf, genotype) { return(globalReferenceBias) } -# TODO: Figure out how to calculate dispersion -CalcDispersion <- function(vcf, genotype) { - return(0) -} +# TODO: Have someone double check my math +CalcDispersion <- function(vcf, genotype, refBias) { + RO <- paste(genotype, "RO", sep = "_") + DP <- paste(genotype, "DP", sep = "_") + AO <- paste(genotype, "AO", sep = "_") + + # To eliminate potential artifacts that may affect the model fit, we require that the SNV be detected in at least 2 samples, with at least one sample presenting the reference as major and one presenting the alternate as major + # I'm not sure if this should apply to our data. We already pooled our samples; should we depool? + vcf <- vcf %>% + mutate(F1_414_ROFreq = F1_414_RO / F1_414_DP > 0.5, + F1_415_ROFreq = F1_415_RO / F1_415_DP > 0.5) %>% + filter(F1_414_ROFreq != F1_415_ROFreq) + + # Log Likelihood Function to help us maximize rho (dispersion) using mle() + ll <- function(rho) { + x <- vcf[, RO] + total <- vcf[, DP] + mu <- refBias + -sum(VGAM::dbetabinom(x, total, mu, rho, log = TRUE)) + } + + # Maximum likelihood function. 
Goal: Find ideal rho, given the data and our previously measured reference bias + m <- mle(ll, start = list(rho = 0.000001), method = "L-BFGS-B", lower = 0.000001, upper = 0.10 ) + rho1 <- coef(m) + print(paste0("The initial estimate for rho is ", rho1)) + + # We must estimate the likelihood for the observed data under BetaBin(n = RO, mu = ref bias, rho = dispersion) + # rho1 is the best p that maximizes the joint likelihood over all SNVs, but right now is not accurate due to outliers + + # Calculate the model based p-value at each SNV + vcf$p <- pbetabinom(q = vcf[, RO], size = vcf[, DP], prob = refBias, rho = rho1) + vcf$p2 <- pbetabinom(q = vcf[, AO], size = vcf[, DP], prob = 1 - refBias, rho = rho1) + + # Remove all SNVs with a BH-adjusted P < 0.05; these are outliers. There should not be too many, less than 1% of our total data. + # NOTE: Since we are looking at so many SNVs, it seems that no multiple testing method will give P < 0.9 + # I am testing just removing the top 63 + 71 SNVs (0.1% of the data), since they are clearly in ASE and inflating the dispersion + vcf$adjustedP <- p.adjust(vcf$p, method = "none") + vcf$adjustedP2 <- p.adjust(vcf$p2, method = "none") + + print(paste0("Removing ", nrow(vcf[vcf$adjustedP < 0.01 | vcf$adjustedP2 < 0.01,]), " rows")) + + # Refit the model for our final estimate of rho + + ll <- function(rho) { + x <- vcf[vcf$adjustedP > 0.01 & vcf$adjustedP2 > 0.01, RO] + total <- vcf[vcf$adjustedP > 0.01 & vcf$adjustedP2 > 0.01, DP] + mu <- refBias + -sum(VGAM::dbetabinom(x, total, mu, rho, log = TRUE)) + } + + m <- mle(ll, start = list(rho = 0.000001), method = "L-BFGS-B", lower = 0.000001, upper = 0.10 ) + + rhoFinal <- coef(m) + print(paste0("The final rho is ", rhoFinal)) + return(rhoFinal) + + } + ``` -```{r Phasing Data} +```{r Annotating and Filtering Data} gff.mRNA <- read.table("/Network/Servers/avalanche.plb.ucdavis.edu/Volumes/Mammoth/Users/ruijuanli/Reference/B.napus/gff.mRNA") +#gff.mRNA <- read.table("gff.mRNA") # Data to use (basically, VCF data pre-filtered for quality) load("/Network/Servers/avalanche.plb.ucdavis.edu/Volumes/Mammoth/Users/mizukikadowaki/project/output/F1.young.GQ.filtered.Rdata") +#load("F1.young.GQ.filtered.Rdata") + +annotatedData <- AnnotateSNPs(SNPdata = F1.young.GQ.filtered, gff.mRNA = gff.mRNA) + +# Remove SNVs with no associated genes +annotatedData <- filter(annotatedData, !is.na(GeneID)) +dim(annotatedData) + +# The following filters are applied to avoid spurious ASE calls. + +# Require at least 5 reads and at least 10% of all reads support each allele +annotatedData <- CoverageFilter(annotatedData) +dim(annotatedData) + +# Discard all SNVs within 10 bp of another called variant (if one read spans both SNVs, some independence assumptions are violated) +annotatedData <- RangeFilter(annotatedData) +dim(annotatedData) + +# NOT AVAILABLE FOR B NAPUS. Discard all SNVs in highly repetitive genomic regions, which we defined as regions with sequence identity of > 95% to another genomic region based on selfChain Link track from UCSC genome browser. +``` -inputVCF <- F1.young.GQ.filtered +```{r Phasing Data} +inputVCF <- annotatedData # Calculate the reference bias for each sample separately. 
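# (Editor note) The rho returned by CalcDispersion() is the beta-binomial overdispersion:
# a locus with depth n and mean mu has variance n * mu * (1 - mu) * (1 + (n - 1) * rho),
# so rho = 0 recovers the plain binomial. Once estimated, the per-sample values could be
# handed to runMBASED alongside the reference-bias probabilities by adding one more assay
# where SingleSample()/TwoSample() build their SummarizedExperiment, e.g. (assuming the
# lociCountsDispersions assay name described in the MBASED vignette, and the F1_414_disp
# column name used after the final rename below):
#   lociCountsDispersions = matrix(annotatedData$F1_414_disp, ncol = 1,
#                                  dimnames = list(names(mySNVs), 'mySample'))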
-refBias414 <- CalcReferenceBias(inputVCF, "F1_414") # 0.5107107 -refBias415 <- CalcReferenceBias(inputVCF, "F1_415") # 0.5152294 +refBias414 <- CalcReferenceBias(inputVCF, "F1_414") # 0.5107107 unfiltered, 0.509284 filtered +refBias415 <- CalcReferenceBias(inputVCF, "F1_415") # 0.5152294 unfiltered, 0.513306 filtered + +dispersion414 <- CalcDispersion(inputVCF, "F1_414", refBias414) +dispersion415 <- CalcDispersion(inputVCF, "F1_415", refBias415) + +inputVCF$disp414 <- dispersion414 +inputVCF$disp415 <- dispersion415 +# Now that necessary parameters have been calculated, phase the data so that all REF is from Da_Ae inputVCF.Ae <- inputVCF[inputVCF$Ae_GT == 1, ] inputVCF.Ae$F1_414_refBias <- refBias414 inputVCF.Ae$F1_415_refBias <- refBias415 @@ -256,27 +339,8 @@ phasedData.unordered <- rbind(inputVCF.Ae, inputVCF.Ol) attach(phasedData.unordered) phasedData <- phasedData.unordered[order(CHROM, POS), ] detach(phasedData.unordered) -``` - -```{r Annotating and Filtering Data} -annotatedData <- AnnotateSNPs(SNPdata = phasedData, gff.mRNA = gff.mRNA) - -# Remove SNVs with no associated genes -annotatedData <- filter(annotatedData, !is.na(GeneID)) -dim(annotatedData) - -# The following filters are applied to avoid spurious ASE calls. - -# Require at least 5 reads and at least 10% of all reads support each allele -annotatedData <- CoverageFilter(annotatedData) -dim(annotatedData) - -# Discard all SNVs within 10 bp of another called variant (if one read spans both SNVs, some independence assumptions are violated) -annotatedData <- RangeFilter(annotatedData) -dim(annotatedData) - -# NOT AVAILABLE FOR B NAPUS. Discard all SNVs in highly repetitive genomic regions, which we defined as regions with sequence identity of > 95% to another genomic region based on selfChain Link track from UCSC genome browser. +save(phasedData, file = "phasedData.Rdata") ``` ```{r Single Sample Analysis} From 64fd2179243e19ef887b812f52aecd82a252b123 Mon Sep 17 00:00:00 2001 From: LynnLy Date: Wed, 14 Feb 2018 11:27:32 -0800 Subject: [PATCH 5/5] Double check dispersion --- F1/MBASED_F1.rmd | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/F1/MBASED_F1.rmd b/F1/MBASED_F1.rmd index 398fd0c..5ecfd66 100644 --- a/F1/MBASED_F1.rmd +++ b/F1/MBASED_F1.rmd @@ -19,6 +19,7 @@ Outputs: major allele frequencies, frequency differences, list of genes displayi ```{r setup, include=FALSE} library(MBASED) library(tidyverse) +library(VGAM) ``` ```{r AnnotateSNPs Function} @@ -318,8 +319,8 @@ refBias415 <- CalcReferenceBias(inputVCF, "F1_415") # 0.5152294 unfiltered, 0.51 dispersion414 <- CalcDispersion(inputVCF, "F1_414", refBias414) dispersion415 <- CalcDispersion(inputVCF, "F1_415", refBias415) -inputVCF$disp414 <- dispersion414 -inputVCF$disp415 <- dispersion415 +inputVCF$F1_414_disp <- dispersion414 +inputVCF$F1_415_disp <- dispersion415 # Now that necessary parameters have been calculated, phase the data so that all REF is from Da_Ae inputVCF.Ae <- inputVCF[inputVCF$Ae_GT == 1, ]
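
A small consistency check could follow the phasing step once these patches are applied. The sketch below only assumes objects already created in the Phasing Data chunk (`phasedData`, `inputVCF.Ae`, `inputVCF.Ol`) and the existing count columns; it is an editor suggestion, not part of the commits above.

```{r Phasing sanity check}
# Sketch only: confirm that phasing preserved the SNV set and left the counts consistent.
stopifnot(nrow(phasedData) == nrow(inputVCF.Ae) + nrow(inputVCF.Ol))        # no SNVs lost
stopifnot(!any(duplicated(paste(phasedData$CHROM, phasedData$POS))))        # no SNV phased twice
stopifnot(all(phasedData$F1_414_RO + phasedData$F1_414_AO <= phasedData$F1_414_DP))  # counts never exceed depth
stopifnot(all(phasedData$F1_415_RO + phasedData$F1_415_AO <= phasedData$F1_415_DP))
```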