-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathUtils.r
151 lines (138 loc) · 6.7 KB
/
Utils.r
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
#functions here
#For some of the measurements we have repeats. I think mainly Charles did this in order to see the batch effects.
#However this is probably unnecessary if the data are truly randomized, and then we can perform the best batch
#correction using Combat or something like that
#The input matrix should have the first column as the participant and the second column as the timepoint
averaging_replicates <- function(fiber_subsetted_dataframe) {
df <- data.frame(fiber_subsetted_dataframe)
colnames(df)[1] <- "Participant"
colnames(df)[2] <- "Time_point"
df[,3:ncol(df)] <- data.frame(lapply(df[,3:ncol(df)], as.character), stringsAsFactors=FALSE, row.names = rownames(df)) #To change to a double, this needs to go through character first
df[,3:ncol(df)] <- data.frame(lapply(df[,3:ncol(df)], as.numeric), stringsAsFactors=FALSE, row.names = rownames(df)) #This needs to be done for the transcript data, but causes problems with the pcl data.
df2 <- aggregate(df,list(Participant=df[,1], Visit=df[,2]), mean)
df2 <- data.frame(t(df2)) #popping out the analytes that have NAs
df2 <- na.omit(df2)
df2 <- data.frame(t(df2))
rownames(df2) <- paste(as.character(df2$Participant), as.character(df2$Visit),sep="_")
return(df2)
}
# A standatdized way of determining the correlations
correlations <- function(to_correlate_df) {
require(Hmisc)
cor <- rcorr(format(t(to_correlate_df),digits=20), type="spearman")
cor.data <- cor$r
cor.data[upper.tri(cor.data, diag = T)]<- 0
pval.data <- cor$P
pval.data[upper.tri(pval.data, diag = T)]<- NA
FDR.data <- apply(pval.data,2,p.adjust,method="BH", n = length(pval.data))
pdf("./pval_bonferonni_hist.pdf")
hist(pval.data, breaks = 100, col="darkblue")
dev.off()
pdf("./FDR_bonferonni_hist.pdf")
hist(FDR.data, breaks = 100, col="darkblue")
dev.off()
pdf("./cor_bonferonni_hist.pdf")
hist(cor.data, breaks = 10, col="red")
dev.off()
cor.data[FDR.data > 0.05]=0
return(cor.data)
}
# A standard way of running the standardise funciton
standardise_data <- function(data_frame) {
require(Mfuzz)
n <- as.matrix(data_frame)
class(n) <- "numeric"
eset1 <- ExpressionSet(n)
eset1 <- standardise(eset1) #Running standarise
o <- exprs(eset1)
o <- na.omit(o)
return(o)
}
# This function will perform mean imputation
# For the knn imputation, I can just use the funciton impute.knn
mean_imputation <- function(combined_df){
imputed_df <- data.frame(matrix(nrow = 0, ncol = 6)) #creating a matrix for the imputed values. I originally ran it on the non-imputed values, but I think I might as well add them
require(Xmisc)
combined_df <- as.matrix(combined_df)
class(combined_df) <- "numeric"
for (i in 1:nrow(combined_df)) {
j <- combined_df[i,]
j[is.na(j)] <- mean(j, na.rm = TRUE)
imputed_df[i,] <- j
}
rownames(imputed_df) <- rownames(combined_df)
}
plotting_fuzzy_clusters <- function(eset, mfuzzcl){
require(Mfuzz)
xaxis_ticks = c("B","10","20","30","D3","D10")
par(mfrow=c(5,4),oma = c(5,4,1,0) + 0.1,
mar = c(2,1,1,1) + 0.1)
mfuzz.plot2(eset,cl=mfuzzcl,mfrow=c(4,4), ylim=c(-3,3),time.labels = xaxis_ticks, bg="white",ax.col="red",col.lab="black",col.main="green",col.sub="blue",col="blue", Xwidth=20, Xheight=20,colo="fancy", lwd=1,ylab='',xlab='',x11=FALSE,cex.main=1.1,cex.lab=0.1)
}
rld_pca <- function (rld, intgroup = "condition", ntop = 500, colors=NULL, legendpos="bottomleft", main="PCA Biplot", textcx=1, ...) {
require(genefilter)
require(calibrate)
require(RColorBrewer)
rv = rowVars(assay(rld))
select = order(rv, decreasing = TRUE)[seq_len(min(ntop, length(rv)))]
pca = prcomp(t(assay(rld)[select, ]))
fac = factor(apply(as.data.frame(colData(rld)[, intgroup, drop = FALSE]), 1, paste, collapse = " : "))
if (is.null(colors)) {
if (nlevels(fac) >= 3) {
colors = brewer.pal(nlevels(fac), "Paired")
} else {
colors = c("black", "red")
}
}
pc1var <- round(summary(pca)$importance[2,1]*100, digits=1)
pc2var <- round(summary(pca)$importance[2,2]*100, digits=1)
pc1lab <- paste0("PC1 (",as.character(pc1var),"%)")
pc2lab <- paste0("PC1 (",as.character(pc2var),"%)")
plot(PC2~PC1, data=as.data.frame(pca$x), bg=colors[fac], pch=21, xlab=pc1lab, ylab=pc2lab, main=main, ...)
with(as.data.frame(pca$x), textxy(PC1, PC2, labs=rownames(as.data.frame(pca$x)), cex=textcx))
legend(legendpos, legend=levels(fac), col=colors, pch=20)
# rldyplot(PC2 ~ PC1, groups = fac, data = as.data.frame(pca$rld),
# pch = 16, cerld = 2, aspect = "iso", col = colours, main = draw.key(key = list(rect = list(col = colours),
# terldt = list(levels(fac)), rep = FALSE)))
}
volcanoplot <- function (res, lfcthresh=2, sigthresh=0.1, main="Volcano Plot", legendpos="bottomright", labelsig=TRUE, textcx=1, ...) {
with(res, plot(log2FoldChange, -log10(pvalue), pch=20, main=main, ...))
with(subset(res, padj<sigthresh ), points(log2FoldChange, -log10(pvalue), pch=20, col="red", ...))
with(subset(res, abs(log2FoldChange)>lfcthresh), points(log2FoldChange, -log10(pvalue), pch=20, col="orange", ...))
with(subset(res, padj<sigthresh & abs(log2FoldChange)>lfcthresh), points(log2FoldChange, -log10(pvalue), pch=20, col="green", ...))
if (labelsig) {
require(calibrate)
with(subset(res, padj<sigthresh & abs(log2FoldChange)>lfcthresh), textxy(log2FoldChange, -log10(pvalue), labs=Gene, cex=textcx, ...))
}
legend(legendpos, xjust=1, yjust=1, legend=c(paste("FDR<",sigthresh,sep=""), paste("|LogFC|>",lfcthresh,sep=""), "both"), pch=20, col=c("red","orange","green"))
}
#Must be run with resdata, not res! This is because of the lab=Gene
maplot <- function (res, thresh=0.1, labelsig=TRUE, textcx=1, ...) {
with(res, plot(baseMean, log2FoldChange, pch=20, cex=.5, log="x", ...))
with(subset(res, padj<thresh), points(baseMean, log2FoldChange, col="red", pch=20, cex=1.5))
if (labelsig) {
require(calibrate)
with(subset(res, padj<thresh), textxy(baseMean, log2FoldChange, labs=Gene, cex=textcx, col=2))
}
}
entrezid <- function( resdata ) {
require(EnsDb.Hsapiens.v79)
a = resdata$Gene #the column to iterate over will be different if I'm using res vs resdata
tmp=gsub("\\..*","",a)
tmp <- as.character(tmp)
txdb <- EnsDb.Hsapiens.v79
df <- AnnotationDbi::select(txdb, keys = tmp, keytype = "GENEID", columns = "ENTREZID")
df2 <- AnnotationDbi::select(txdb, keys = tmp, keytype = "GENEID", columns = "SYMBOL")
ENTREZID <- c()
SYMBOL <- c()
counter1 <- 0
for (i in tmp) {
counter1 <- counter1 + 1
j <- match(i,df$GENEID)
ENTREZID <- c(ENTREZID, toString(df[j,][2]))
SYMBOL <- c(SYMBOL, toString(df2[j,][2]))}
resdata$ENTREZID <- ENTREZID
resdata$SYMBOL <- SYMBOL
resdata$EnsemblGene <- tmp
resdata
}