-
Notifications
You must be signed in to change notification settings - Fork 1
/
genomics_tutorial.R
241 lines (182 loc) · 9.34 KB
/
genomics_tutorial.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
###########################################################################################################
### AI/ML POGO workshop - Ostend, Belgium, 20-22th May 2019
# Genomics - Anders Lanzén & Tristan Cordier
########################################### Setting the stage #############################################
# setting the working directory
setwd("/path/to/your/folder/POGO_ML_workshop_genomics")
# libraries we need
library('vegan')
library('matrixStats')
library('ranger')
library('irr')
library('BBI')
## for parralel CPU computation
library('doMC')
registerDoMC(cores = detectCores())
# customs functions
source("custom_functions/sml_compo.R")
source("custom_functions/ec_and_plot.R")
source("custom_functions/bar_plot.R")
### import our data
# the otu table
otu <- read.table("otu_table.tsv", header=TRUE, sep="\t", row.names=1)
# the metadata
met <- read.table("metadata.tsv", header=TRUE, sep="\t", dec = ".")
################################### 'Playing' and manipulating the data ###################################
### some constant variables handling
# to fecth easily the list of samples
s <- "samples_names"
# the cuttof to consider a sample
seq_depth_cutoff <- 10000
# check if empty rows or columns at the end of file
otu[(dim(otu)[1]-10):(dim(otu)[1]),1:3] ## normally there is nothing
otu[1:3,(dim(otu)[2]-10):(dim(otu)[2])] ## normally Taxa and X
# extract the OTU taxonomy assignments and total abundance from the OTU table as generated from the SLIM application (Dufresne et al., 2019)
taxo <- cbind(Total_abund = rowSums(otu[,c(1:(dim(otu)[[2]]-3))]), taxon = as.character(otu[,c(dim(otu)[[2]]-2)]))
### and keep the otu counts only (transposed: samples as rows, variables as columns)
otu <- data.frame(t(otu[,c(1:(dim(otu)[[2]]-3))]))
### making sure of the match between metadata and otu table
# are they there?
table(gsub("-",".",met[,s]) %in% rownames(otu))
# and do they match exactly?
table(gsub("-",".",met[,s]) == rownames(otu))
### keep the dataset at this point for coming back to it
otu_raw <- otu
### sum of samples
row_S <- rowSums(otu_raw)
### plot the sequencing depth
pdf("Sequencing_depth1.pdf", width=6,height=6)
plot(row_S, main="Sequencing depth - limit at 10,000 reads", ylab = "Reads counts", xlab = "sample ID", pch=16, cex=.4)
abline(h=10000, col="blue", lwd=2)
dev.off()
### plot the sequencing depth
pdf("Sequencing_depth2.pdf", width=6,height=6)
plot(log(row_S+1), main="Sequencing depth - limit at 10,000 reads", ylab = "log(Reads counts + 1)", xlab = "sample ID", yaxt="n", pch=16, cex=.4)
axis(2, at=log(c(10,100,1000,10000,100000,1000000)),labels=c(10,100,1000,10000,100000,1000000),cex.axis=0.7, las=2, ylim=log(c(10,1000000)))
abline(h=log(10000+1), col="blue", lwd=2)
dev.off()
## get samples with sequencing depth above the total average of reads per samples
otu <- subset(otu, row_S >= seq_depth_cutoff)
met <- subset(met, row_S >= seq_depth_cutoff)
## keep only samples for which we have metadata
otu <- subset(otu, is.na(met$AMBI) == F)
otu_raw <- subset(otu_raw, is.na(met$AMBI) == F)
met <- subset(met, is.na(met$AMBI) == F)
### normalize the counts (there is not "good" approach, see Weiss et al. Microbiome, 2017; Pereira et al. BMC Genomics, 2018)
# here let's do relative abundance..
OTU <- decostand(otu, method = "total")
OTUr <- otu
MET <- met
######### ready to go!
########################################### Real analysis begins here #############################################
############ Exploring big patterns
# ordination (NMDS)
mds <-metaMDS(OTU, distance = "bray", k = 2, binary = F)
# explore relations between ordinations and environmental parameters
env_dat <- MET[,c("Distance_cage", "AMBI", "NSI", "ISI", "NQI1", "Shannon")]
# plot the NMDS
pdf("NMDS1.pdf", width=12,height=7)
par(mar=c(4, 4, 4, 24), xpd=TRUE)
plot(mds$points, col=as.numeric(MET$col_plot), pch=MET$col_plot, main = "NMDS - Bray-Curtis matrix")
leg <- unique(as.vector(MET$Locality))
legend("topright", inset=c(-0.3,0), leg, col=unique(MET$col_plot), pch=unique(MET$col_plot), box.lty=0)
legend("topright", legend = paste("stress =", round(mds$stress, 3)), box.lty=0, cex=.8)
fit <- envfit(mds, env_dat, perm = 999, display = "sites", na.rm =T)
tab <- cbind(round(fit$vectors$r,3), fit$vectors$pvals)
plot(fit, p.max = 0.05, col = "chartreuse2")
dev.off()
# Focus on pollution gradient
col_gradient <- colorRampPalette(c("blue", "green","red"), bias=1, interpolate = "linear")(600)
pdf("NMDS2.pdf", width=12,height=7)
par(mar=c(4, 4, 4, 24), xpd=TRUE)
plot(mds$points, col=col_gradient[as.numeric(floor(MET$AMBI*100))], pch=16, main = "NMDS - Bray-Curtis matrix")
leg <- c("low pollution", "low to moderate", "moderate to high")
legend("topright", inset=c(-0.3,0), leg, col=c("blue", "green","red"), pch=16, box.lty=0)
dev.off()
# taxonomic barplot
# high taxonomic rank
pdf("Taxo_barplot1.pdf")
bar_plot(OTUr, MET, aggreg = c("Locality", "Station"), taxo_file = taxo, title_ = "By locality", font_size_ = 0.7, tax_rank = 4)
dev.off()
# high taxonomic rank
pdf("Taxo_barplot2.pdf")
bar_plot(OTUr, MET, aggreg = c("Locality"), taxo_file = taxo, title_ = "By locality", font_size_ = 0.7, tax_rank = 7, metazoan = T)
dev.off()
############ SML
# custom function to perform cross-validation on a fully labelled dataset
preds <- sml_compo(OTU, MET, index = "AMBI", algo = "RF", cross_val = "Locality")
# plot the independant predictions against the real values and collect stats
pdf("Predictions1.pdf", width=8,height=5)
res <- plot_ml(preds, MET, index = "AMBI", title = "Random Forest", aggreg = c("Grab", "Station", "Locality"))
dev.off()
# we can do that within a loop and export the plots
for (BI in c("AMBI", "NSI", "ISI", "NQI1", "Shannon"))
{
## SML on the 10 most abundant for doing it fast...
preds <- sml_compo(OTU[,1:10], MET, index = BI, algo = "RF", cross_val = "Locality")
## PLOT
plot_ml(preds, MET, index = BI, aggreg = c("Grab", "Station", "Locality"), pdf = T,
title = paste("Predictions_", BI, ".pdf", sep=""))
}
### Variables importance - model on the full dataset
mod <- ranger(MET[,"AMBI"] ~ ., data=OTU, mtry=floor(dim(OTU)[2]/3), num.trees = 300, importance= "impurity", write.forest = T)
imp <- tail(sort(mod$variable.importance), 50)
pdf("Variable_importance.pdf", width = 5, height=8)
p <- barplot(imp, horiz = T, xlab="Variable importance", main=paste("OTUs importance for AMBI"), las=2, cex.names = 0.6)
### look at these OTUs
tx <- taxo[names(imp),"taxon"]
taxo[names(imp),"taxon"]
## get the last rank
for (i in 1:length(tx)) tx[i] <- tail(unlist(strsplit(as.character(tx[i]), split=";", fixed=TRUE)), 1)
## and add it on the plot
text(0, p, labels=tx, pos=4, cex=0.6)
dev.off()
######### WORKSHOP day one: additional scripting.
## let's make feature selection to keep only OTUs with good predictive potential and test if this improve performance.
## the random forest algorithm had this feature built-in
## to test that (on only one farm in this exemple), we will make feature selection on a training set of
## of all the farm but Farm_1 and predict BI values on this hold-out farm.
TRAIN <- subset(OTU, MET$Locality != "Farm_1")
TEST <- subset(OTU, MET$Locality == "Farm_1")
MET_tr <- subset(MET, MET$Locality != "Farm_1")
MET_te <- subset(MET, MET$Locality == "Farm_1")
## let's train a model will all the features for the AMBI index
mod_all_feature <- ranger(MET_tr[,"AMBI"] ~ ., data=TRAIN, mtry=floor(dim(TRAIN)[2]/3), num.trees = 300, importance= "impurity", write.forest = T)
## sort the OTUs by decreasing importance
imp <- sort(mod_all_feature$variable.importance, decreasing = T)
## and we will see what it the effect of the amount of feature we keep on the accuracy of predictions, from 10 to all OTUs
feat <- c(10,20,30,40,50,100,250,500,1000, ncol(TRAIN))
## collect the R2 and kappa statistics
output <- array(NA, c(2, length(feat)))
colnames(output) <- feat
rownames(output) <- c("R2", "KAPPA")
##
for (i in feat)
{
# get the i most important OTUs
OTU_list <- names(imp[1:i])
train_reduced <- TRAIN[,OTU_list]
# and keep those OTUs only in the TEST dataset
test_reduced <- TEST[,OTU_list]
## train AMBI predictive model
mod <- ranger(MET_tr[,"AMBI"] ~ ., data=train_reduced, mtry=floor(dim(train_reduced)[2]/3), num.trees = 300, importance= "impurity", write.forest = T)
## now make prediction on the hold-out farm
preds <- predict(mod, test_reduced)
## get stats for each index
stats <- plot_ml(data = preds$predictions, metadata = MET_te, index = "AMBI", title = paste("RF_", i, "features", sep=""),
aggreg = c("Grab", "Station", "Locality"), taxo_group = paste("V1V2_", i, sep=""), pdf = T)
output["R2",paste(i)] <- stats$R2
output["KAPPA",paste(i)] <- stats$KAP
}
### then plot the accuracy as funciton of extracted feature
# remove the '*' from the output file
output[1,] <- as.numeric(gsub('*', '', output[1,], fixed = T))
output[2,] <- as.numeric(gsub('*', '', output[2,], fixed = T))
# and export as pdf
pdf("Features_extract.pdf", width = 5, height=5)
plot(output[1,], col="blue", cex.axis=.8,pch=1,type="o", ylim=as.numeric(c(min(output), max(output))), xaxt="none", ylab = "R2 - KAPPA",
xlab="Features extracted", main="Accuracy as function of extracted feature")
lines(as.numeric(output[2,]), type="o", pch=2, col="red")
axis(1, at=1:ncol(output), cex.axis=.8,lab=colnames(output))
legend("bottomright", rownames(output), cex=0.7, col=c("blue","red"), pch=1:2,bty = "n")
dev.off()