forked from liziyie/scDVA
-
Notifications
You must be signed in to change notification settings - Fork 0
/
dataprepare_utils.R
364 lines (349 loc) · 15.9 KB
/
dataprepare_utils.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
# Load packages----
getPackage <- function(pkg, check = TRUE, load = TRUE, silent = FALSE, github = NULL) {
if(check) {
if(!suppressMessages(suppressWarnings(require(pkg, character.only = TRUE, quietly = TRUE)))) {
if(is.null(github)){
try(install.packages(pkg), silent = TRUE)
}
else{
try(remotes::install_github(github))
}
}
}
if(load) suppressPackageStartupMessages(library(pkg, character.only = TRUE, quietly = TRUE))
if(load & !silent) message("Loaded ", pkg)
}
packages <- c("dplyr","psych","Matrix", "limma")
lapply(packages, getPackage)
# Read the dataset map file----
dataset_map <<- read.csv("./data/dataset_map.csv", row.names = 1, head = T, stringsAsFactors = F)
# Transfer the dataset map to a list tree----
datasetmap2list <- function(dataset_map){
dataset_map <- dataset_map[dataset_map$DatasetName != "Initialize",]
all_datasource <- unique(dataset_map$DatasetSource)
dt <- NULL
for(i in 1:length(all_datasource)){
dt[all_datasource[i]] <- list(NULL)
all_datasetname <- unique(dataset_map[dataset_map$DatasetSource == all_datasource[i], "DatasetName"])
for(j in 1:length(all_datasetname)){
dt[[i]][all_datasetname[j]] <- list(all_datasetname[j])
}
dt[[i]] <- structure(dt[[i]], stid=i, stopened=TRUE, stclass='project')
}
dt <- structure(dt, stopend = TRUE)
return(dt)
}
dataset_tree <<- datasetmap2list(dataset_map)
# Main functions----
loadDataSet <- function(LoadData_selected) {
## Load the required dataset from the LoadData_selected parameter.
##
## Args:
#' @LoadData_selected: Record the dataset to load into the memory.
##
## Returns:
## A dataframe with two columns. The first column is the dataset name. The second column
## is the loading status, which could be "Failed", "Existed" or "Succeeded".
LoadData.status <- data.frame(Dataset = "Dataset", Status = "Status", stringsAsFactors = F)
for(dataset in LoadData_selected){
if(!exists(dataset_map[dataset,"Expression"])){
expression.filepath <- sprintf("./data/%s.rda", dataset_map[dataset,"Expression"])
metadata.filepath <- sprintf("./data/%s.rda", dataset_map[dataset,"Metadata"])
if(file.exists(expression.filepath) & file.exists(metadata.filepath)){
load(expression.filepath, envir = .GlobalEnv)
load(metadata.filepath, envir = .GlobalEnv)
LoadData.status <- rbind(LoadData.status, c(dataset, "Succeeded"))
}else{
LoadData.status <- rbind(LoadData.status, c(dataset, "Failed"))
}
}else{
LoadData.status <- rbind(LoadData.status, c(dataset, "Existed"))
}
}
LoadData.status <- LoadData.status[-1,]
return(LoadData.status)
}
getPanelList <- function(SelectData_dataset) {
## Get the panel list from the dataset selected.
##
## Args:
#' @SelectData_dataset: Dataset used for visulization.
##
## Returns:
## A list including Global_Cluster, Sub_Cluster, Tissue, Sample, Treatment and
## Day list. Sub_Cluster is also a list while the others are vectors.
metadata_used <- get(dataset_map[SelectData_dataset, "Metadata"])
SubCluster_list <- list()
for (global_cluster in levels(as.factor(metadata_used[,"Global_Cluster"]))) {
SubCluster_list[[global_cluster]] <-
metadata_used %>% dplyr::filter(Global_Cluster == global_cluster) %>% pull(Sub_Cluster) %>% as.factor() %>% levels()
}
Tissue_list <- metadata_used %>% pull(Tissue) %>% as.factor() %>% levels()
Sample_list <- metadata_used %>% pull(Sample) %>% as.factor() %>% levels()
Treatment_list <- metadata_used %>% pull(Treatment) %>% as.factor() %>% levels()
Day_list <- metadata_used %>% pull(Day) %>% as.factor() %>% levels()
SubsetData_panel_list <- list(
GlobalCluster = names(SubCluster_list),
SubCluster_list = SubCluster_list,
Tissue = Tissue_list,
Sample = Sample_list,
Treatment = Treatment_list,
Day = Day_list
)
return(SubsetData_panel_list)
}
getGeneList <- function(SelectData_dataset, SelectData_normalization) {
## Get all the genes saved in the expression data from the platform input.
##
## Args:
#' @SelectData_dataset: Data is used for visulization.
#' @SelectData_normalization: "tpm" or "counts".
##
## Returns:
## A vector of gene names in the expression data.
GeneList <- row.names(get(dataset_map[SelectData_dataset,"Expression"])[[SelectData_normalization]])
return(GeneList)
}
checkGeneList <- function(Genes, SelectData_dataset, SelectData_normalization) {
## Check whether all the genes input are in the expression data from the
## platform input.
##
## Args:
#' @Genes: A long character including the genes input, should be separated by ",".
#' @SelectData_dataset: Data is used for visulization.
#' @SelectData_normalization: "tpm" or "counts".
##
## Returns:
## A list including the gene list info: genes input after removing the comma,
## genes number, whethere all the genes are avaliable the expression data
## right genes name and the wrong genes name.
Genes <- toupper(Genes) # All the genes input are converted to uppercase.
genes_input <- unlist(strsplit(Genes, ",| |\n|\t"))
genes_input <- genes_input[genes_input != ""]
all_genes_avaliable_flag = 1
right_gene <- c()
wrong_gene <- c()
for (gene in genes_input) {
if (!(gene %in% getGeneList(SelectData_dataset, SelectData_normalization))) {
all_genes_avaliable_flag = 0
wrong_gene <- c(wrong_gene, gene)
}else{
right_gene <- c(right_gene, gene)
}
}
GeneListInfo <- list(
genes_input = toupper(genes_input),
all_genes_avaliable_flag = all_genes_avaliable_flag,
gene_number = length(right_gene),
right_gene = right_gene,
wrong_gene = wrong_gene
)
return(GeneListInfo)
}
getPlotData <- function(Genes,
SelectData_dataset, SelectData_normalization,
GlobalCluster.selected, SubCluster.selected,
Sample.selected, Tissue.selected,
Treatment.selected, Day.selected) {
## Get the data frame used or plot.
##
## Args:
#' @Genes: A long character including the genes input, should be separated
## by ",".
#' @SelectData_dataset: Data is used for visulization.
#' @SelectData_normalization: "tpm" or "counts"
#' @GlobalCluster.selected: A vector including the Global_Cluster to use.
#' @SubCluster.selected: A vector including the Sub_Cluster to use, the data used
#' was the intersection of GlobalCluster.selected and SubCluster.selected.
#' @Sample.selected: A vector including the patients to use.
#' @Tissue.selected: A vector including the tissues to use.
#' @Treatment.selected: A vector including the tretments to use.
#' @Day.selected: A vector including the days to use.
##
## Returns:
## A data frame including the gene expression data and patients' metadata.
## When multiple genes as input, the expression data was geometric mean of
## the tpm + 1. The expression for each gene was also in the return data.
##
Metadata_PlotData <- get(dataset_map[SelectData_dataset,"Metadata"]) %>%
filter(Sample %in% Sample.selected, Tissue %in% Tissue.selected, Global_Cluster %in% GlobalCluster.selected,Sub_Cluster %in% SubCluster.selected, Treatment %in% Treatment.selected, Day %in% Day.selected)
row.names(Metadata_PlotData) <- Metadata_PlotData$CellName
GenesInfo <- checkGeneList(Genes, SelectData_dataset, SelectData_normalization)
Expression_PlotData <- as.matrix(get(dataset_map[SelectData_dataset,"Expression"])[[SelectData_normalization]][GenesInfo$right_gene, Metadata_PlotData$CellName])
if (GenesInfo$gene_number > 1) {
Mean_expression <- apply(Expression_PlotData + 1, 2,
psych::geometric.mean) - 1
PlotData <- data.frame(
Expression = Mean_expression, Metadata_PlotData,
t(Expression_PlotData), check.names = F
)
}else if(GenesInfo$gene_number == 1){
PlotData <- data.frame(
Expression = Expression_PlotData, Metadata_PlotData,
input_gene = Expression_PlotData)
names(PlotData)[names(PlotData) == "input_gene"] = GenesInfo$right_gene
}
return(PlotData)
}
getSigData <- function(Plot.data, Group.by, Per.cutoff) {
## Get the data frame used or gene significance calculation.
##
## Args:
#' @Plot.data: The plot data created from getPlotData function.
#' @Group.by: To calculated the signifance between which groups, should be one
#' of a categorical variable in Plot.data, such as "Global_Cluster",
#' "Sub_Cluster", "Patient" or "Tissue", "Treatment", "Day".
#' @Per.cutoff: The cutoff of a gene seen as expressed in the cells.
##
## Returns:
## A list including the result of gene significance calculation. Group_def is
## the group name matched with the group number. Fvalue, coeff and pvalue is
## the ANOVA and tukeyHSD results. Percentage is the percentage of cells with
## the gene expression in each group.
##
Expression.Per <- function(x) {
return(sum(x > Per.cutoff) / length(x))
}
group.cha.id <- Plot.data[, Group.by]
group.cha <- levels(as.factor(group.cha.id))
ngroup <- length(group.cha)
group_mapping <- data.frame(
Group.cha = group.cha,
Group.num = paste0("Grp", formatC(
1:ngroup, width = 2, flag = "0"
)),
row.names = group.cha,
stringsAsFactors = F
) # The group number id - group character id
group.num.id <- group_mapping[as.character(group.cha.id), "Group.num"]
temp <- data.frame(
exprs = Plot.data$Expression,
group = group.num.id,
stringsAsFactors = F
) # Use the group number id
exp.percent <- round(c(aggregate(temp$exprs, by = list(temp$group),
Expression.Per)$x), 3)
exp.mean <- c(aggregate(temp$exprs, by = list(temp$group),
mean)$x)
exp.sd <- c(aggregate(temp$exprs, by = list(temp$group),
sd)$x)
row.names(group_mapping) <- group_mapping[, "Group.num"]
group_mapping[, "Exp.percent"] <- exp.percent # Expressed cells percentage
group_mapping[, "Exp.mean"] <- exp.mean # Mean expression in each cluster
group_mapping[, "Exp.sd"] <- exp.sd # Sd of the expression in each cluster
if(length(unique(temp$group)) > 1){
fml <- aov(exprs ~ group, data = temp)
out.tukeyHSD <- TukeyHSD(fml)$group # Do ANOVA
Fvalue <- c(summary(fml)[[1]][1, 'F value'], summary(fml)[[1]][1, 'Pr(>F)'])
names(Fvalue) <- c("F value", "Pr(>F)")
coeff <- matrix(1, nrow = ngroup, ncol = ngroup)
row.names(coeff) <- levels(as.factor(group.num.id))
colnames(coeff) <- levels(as.factor(group.num.id))
pvalue <- matrix(0, nrow = ngroup, ncol = ngroup)
row.names(pvalue) <- levels(as.factor(group.num.id))
colnames(pvalue) <- levels(as.factor(group.num.id))
percentage <- matrix(diag(exp.percent), nrow = ngroup, ncol = ngroup)
row.names(percentage) <- levels(as.factor(group.num.id))
colnames(percentage) <- levels(as.factor(group.num.id)) # Initalize the matrix
for (i in 1:nrow(out.tukeyHSD)) {
lab1_name <- strsplit(row.names(out.tukeyHSD)[i], "-")[[1]][1]
lab2_name <- strsplit(row.names(out.tukeyHSD)[i], "-")[[1]][2]
coeff[lab1_name, lab2_name] <- out.tukeyHSD[i, "diff"]
coeff[lab2_name, lab1_name] <- 0
pvalue[lab1_name, lab2_name] <- out.tukeyHSD[i, "p adj"]
pvalue[lab2_name, lab1_name] <- 1
} # Fill the matrix
result <- list(
group_def = group_mapping,
Fvalue = Fvalue,
coeff = coeff,
pvalue = pvalue,
percentage = percentage
)
return(result)
}
}
getLIMMAData <- function(Plot.data, SelectData_dataset, SelectData_normalization, genes, x.cutoff, y.cutoff, group1, group2) {
## Get the data frame used or gene significance calculation.
##
## Args:
#' @Plot.data: The plot data created from getPlotData function.
#' @genes: The genes used to do in-silico FACS.
#' @x.cutoff: The cutoff of x axis.
#' @y.cutoff: The cutoff of y axis.
#' @group1: The first group used to calculate DEGenes.
#' @group2: The second group used to calculate DEGenes.
##
## Returns:
## A table with the result of gene significance calculation. The differentially
## expressed genes are calculated by LIMMA.
##
de.genes.all <- data.frame(Warning = "There should be two genes as input!", logFC = 0, adj.P.Val = 0)
if(length(genes) == 2){
gene.A <- genes[1]
gene.B <- genes[2]
Plot.data$FACS_groups <- "Group1"
Plot.data[Plot.data[,gene.A] <= x.cutoff & Plot.data[,gene.B] <= y.cutoff, "FACS_groups"] <- "Group2"
Plot.data[Plot.data[,gene.A] > x.cutoff & Plot.data[,gene.B] > y.cutoff, "FACS_groups"] <- "Group3"
Plot.data[Plot.data[,gene.A] > x.cutoff & Plot.data[,gene.B] <= y.cutoff, "FACS_groups"] <- "Group4"
de.genes.all <- data.frame(Warning = "Two groups should not be the same one!", logFC = 0, adj.P.Val = 0)
if(group1 != group2){
group1.used <- Plot.data %>% filter(FACS_groups == group1) %>% pull(CellName)
if(sum(Plot.data$FACS_groups == group1) > 1000){
group1.used <- sample(group1.used, 1000)
}
group2.used <- Plot.data %>% filter(FACS_groups == group2) %>% pull(CellName)
if(sum(Plot.data$FACS_groups == group2) > 1000){
group2.used <- sample(group2.used, 1000)
}
Plot.data <- Plot.data[Plot.data$CellName %in% c(group1.used, group2.used),]
de.genes.all <- data.frame(Warning = "There should be at least one cell in each group!", logFC = 0, adj.P.Val = 0)
if(length(unique(Plot.data$FACS_groups)) == 2){
Expression_Data <- as.matrix(get(dataset_map[SelectData_dataset,"Expression"])[[SelectData_normalization]][, Plot.data$CellName])
expression_matrix <- as.matrix(Expression_Data)
expression_matrix <- expression_matrix[!duplicated(row.names(expression_matrix)),]
groupid <- as.character(Plot.data$FACS_group)
contrast <<- paste0(levels(factor(groupid)), collapse = "-")
design <- model.matrix( ~ 0 + factor(groupid))
colnames(design) <- levels(factor(groupid))
rownames(design) <-
colnames(expression_matrix) # design data used in limma
contrast.matrix <- makeContrasts(contrast, levels = design)
fit <- lmFit(expression_matrix, design)
fit2 <- contrasts.fit(fit, contrast.matrix)
fit2 <- eBayes(fit2)
tempOutput <- topTable(fit2, coef = 1, n = Inf)
nrDEG <- na.omit(tempOutput)
nrDEG$Symbol <- row.names(nrDEG)
positive_group <-
row.names(fit2$contrasts)[fit2$contrasts == 1] # high expression when logFC > 0
negative_group <-
row.names(fit2$contrasts)[fit2$contrasts == -1] # low expression when logFC < 0
nrDEG$Grp <-
c(negative_group, positive_group)[as.numeric(nrDEG$logFC > 0) + 1]
cell.Grp1 <- which(groupid == levels(as.factor(groupid))[1])
cell.Grp2 <- which(groupid == levels(as.factor(groupid))[2])
Exp.Mean.Grp1 <-
rowMeans(expression_matrix[nrDEG$Symbol, cell.Grp1]) # mean expression in the group
Exp.Mean.Grp2 <-
rowMeans(expression_matrix[nrDEG$Symbol, cell.Grp2]) # mean expression out the group
Exp.Per.Grp1 <-
apply(expression_matrix[nrDEG$Symbol, cell.Grp1], 1, Expression.Per) # expression percentage in the group
Exp.Per.Grp2 <-
apply(expression_matrix[nrDEG$Symbol, cell.Grp2], 1, Expression.Per) # expression percentage out the group
de.genes.all <- cbind(nrDEG, Exp.Mean.Grp1, Exp.Per.Grp1, Exp.Mean.Grp2, Exp.Per.Grp2)
colnames(de.genes.all)[9:12] <-
c(paste0(c("Exp.Mean.", "Exp.Per."), levels(as.factor(groupid))[1]), paste0(c("Exp.Mean.", "Exp.Per."), levels(as.factor(groupid))[2]))
if (!is.na(de.genes.all[1, 1])) {
de.genes.all <- de.genes.all %>% arrange(desc(logFC))
row.names(de.genes.all) <- de.genes.all$Symbol
}
}
}
}
return(de.genes.all)
}
Expression.Per <- function(x, cutoff = 0.1) {
# percent of gene-expressed cell, cutoff = 0.1
return(sum(x > cutoff) / length(x))
}