-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathASE
56 lines (49 loc) · 3.5 KB
/
ASE
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
#Generating allele counts
java -jar GenomeAnalysisTK.jar -T ASEReadCounter -R oryCun2.fa -I Samplel_AA.bam -o Samplel_AA.csv -sites heterozygous_coding_map_1_non_repeat.vcf -U ALLOW_N_CIGAR_READS -minDepth 10 --minMappingQuality 20 --minBaseQuality 20 --countOverlapReadsType COUNT_FRAGMENTS_REQUIRE_SAME_BASE
java -jar GenomeAnalysisTK.jar -T ASEReadCounter -R oryCun2.fa -I Samplel_BB.bam -o Samplel_BB.csv -sites heterozygous_coding_map_1_non_repeat.vcf -U ALLOW_N_CIGAR_READS -minDepth 10 --minMappingQuality 20 --minBaseQuality 20 --countOverlapReadsType COUNT_FRAGMENTS_REQUIRE_SAME_BASE
##Rscript
#Calculating effective library size
correction_factors<-read.table("Correction_factor_AA_BB.txt",header=T) #Generated from edgeR
#Here, we sum up the read counts from two parental genomes and normalized based on effective library size generated by edgeR
individual_list<-read.table("Samples_AA.list", stringsAsFactors = F)
for(i in 1:nrow(individual_list))
{
AA<-get(paste(individual_list[i,1],"_AA",sep=""))#Allele count from parental genome AA
BB<-get(paste(individual_list[i,1],"_BB",sep=""))#Allele count from parental genome BB
##Normalize based on library size
AA[,7:9]<-round(AA[,7:9]/correction_factors[which(correction_factors[,1]==as.character(paste(individual_list[i,1],"_AA",sep=""))),2],0)
BB[,7:9]<-round(BB[,7:9]/correction_factors[which(correction_factors[,1]==as.character(paste(individual_list[i,1],"_BB",sep=""))),2],0)
##merging
tmp_merged<-merge(AA,BB,by="variantID")
tmp_merged$refCount.x=as.numeric(tmp_merged$refCount.x)+as.numeric(tmp_merged$refCount.y)
tmp_merged$altCount.x=as.numeric(tmp_merged$altCount.x)+as.numeric(tmp_merged$altCount.y)
tmp_merged$totalCount.x=tmp_merged$totalCount.x+tmp_merged$totalCount.y
tmp_merged$lowMAPQDepth.x=tmp_merged$lowMAPQDepth.x+tmp_merged$lowMAPQDepth.y
tmp_merged$lowBaseQDepth.x=tmp_merged$lowBaseQDepth.x+tmp_merged$lowBaseQDepth.y
tmp_merged$rawDepth.x=tmp_merged$rawDepth.x+tmp_merged$rawDepth.y
tmp_merged$otherBases.x=tmp_merged$otherBases.x+tmp_merged$otherBases.y
tmp_merged$improperPairs.x=tmp_merged$improperPairs.x+tmp_merged$improperPairs.y
tmp_merged=tmp_merged[,c(2:4,1,5:21)]
colnames(tmp_merged)=c(colnames(header),"Exon_Start","Exon_End","gene_exon_id")
assign(paste(as.character(individual_list[i,1]),"_AA_BB",sep=""),tmp_merged)
write.table(get(paste(as.character(individual_list[i,1]),"_AA_BB",sep="")),paste(as.character(individual_list[i,1]),"_AA_BB",sep=""),col.names=T,row.names = F,quote = F,sep="\t")
}
data<-read.table("Sample_AA_BB",header=T)
data<-cbind.data.frame(data[,1],data[,2]-1,data[,2:13],-1,-1,-100,-1)
colnames(data)[c(1:3,15:18)]<-c("Chr","Start","End","PValue","ABS_Imb","MValue","FDR")
cntr=0
for(i in 1:nrow(data))
{
cntr=cntr+1
if(data$totalCount[i]>=10)
{
data$PValue[i]=binom.test((data$refCount[i]),data$totalCount[i],0.5)$p.value
data$ABS_Imb[i]=abs(0.5-data$refCount[i]/(data$totalCount[i]))
data$MValue[i]=log2(data$refCount[i]/(1+data$altCount[i]))
}
print(cntr)
}
data$FDR<-p.adjust(data$PValue,method="fdr")
write.table(data,"Sample_AA_BB_summed_binom.bed",sep="\t",quote=F,row.names=F,col.names=F,append=F)
#Intersecting with annotation
intersectBed -wa -wb -a Sample_AA_BB.out -b merged.flat.gff | awk '{OFS="\t";print $1,$2,$3,$1"_"$2"_"$3,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$22,$23,$27,$28,$29,$30,$31,$32}' | sed 's/gene_id //' | sed 's/;.*num /_/' | sed 's/;.*//' | sort -k4,4 -u >Sample_AA_BB-merged.flat.gff.bed