-
Notifications
You must be signed in to change notification settings - Fork 1
/
GSE194040_Analysis.Rmd
296 lines (219 loc) · 13.5 KB
/
GSE194040_Analysis.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
---
title: GSE194040 Notebook"
output: GSE194040
---
```{r}
library(GEOquery)
library(affy)
library(dplyr)
library(tidyverse)
library(survival)
library(survminer)
library(jetset)
library(readxl)
library(ggstatsplot)
library(ggplot2)
library(preprocessCore)
```
```{r}
# Load the gene expression dataset from the GEO database with accession number GSE194040
GSE194040 = getGEO("GSE194040")
# Combine metadata from two different series matrix files
meta_GSE19 = rbind(GSE194040[["GSE194040-GPL20078_series_matrix.txt.gz"]]@phenoData@data,
GSE194040[["GSE194040-GPL30493_series_matrix.txt.gz"]]@phenoData@data)
# Filter the metadata to select specific samples based on certain criteria
meta_GSE19 = filter(meta_GSE19, `her2:ch1` == 0, `hr:ch1` == 0)
meta_GSE19_immune = meta_GSE19[grep("Pembro", meta_GSE19$`arm:ch1`),]
meta_GSE19_pac = meta_GSE19[meta_GSE19$`arm:ch1` == "Paclitaxel",]
# Read the gene expression data from a file and prepare it for further analysis
exprs_GSE19 = read.table("GSE194040_expression_data.txt", sep = '\t', header = TRUE)
rownames(exprs_GSE19) = exprs_GSE19[,1]
exprs_GSE19 = exprs_GSE19[,-1]
# Export the gene expression data for CIBERSORT analysis
exprs_GSE19 = exprs_GSE19[,na.omit(match(paste0("X", meta_GSE19$`patient id:ch1`), colnames(exprs_GSE19)))]
exprs_GSE19 = as.data.frame(exprs_GSE19)
exprs_GSE19 <- exprs_GSE19[!(rownames(exprs_GSE19) == ""), , drop = FALSE]
exprs_GSE19 <- exprs_GSE19[!grepl("^AG_", rownames(exprs_GSE19)),]
exprs_GSE19 = na.omit(exprs_GSE19)
# Adjust the gene expression data for different analytical purposes
exprs_GSE19_immune = exprs_GSE19[,match(paste0("X", meta_GSE19_immune$`patient id:ch1`), colnames(exprs_GSE19))]
exprs_GSE19_pac = exprs_GSE19[,na.omit(match(paste0("X", meta_GSE19_pac$`patient id:ch1`), colnames(exprs_GSE19)))]
```
Perform a mann-whitney/wilcoxon rank sum test for the validated genes
(1) Patients Treated with Immunotherapy
```{r}
# Set the working directory to the location of the data files
#setwd("destination of validated files produced by the validation files")
setwd("GEOValidated")
# Read in validated genes from various CSV files
Validated_Activated_DEG_DEMs <- read.csv("Validated_Activated_DEGs_lfc10_DEMs_118_93_nocox.csv")
Validated_Silenced_DEGs_DEMs <- read.csv("Validated_Silenced_DEGs_lfc10_DEMs_118_93_nocox.csv")
Validated_Activated_DEGs_DMGs <- read.csv("Validated_Activated_DEGs_lfc10_DMGs_118_93_nocox.csv")
Validated_Silenced_DEGs_DMGs <- read.csv("Validated_Silenced_DEGs_lfc10_DMGs_118_93_nocox.csv")
# Combine validated genes from different sources into one dataframe
validated_genes <- rbind(Validated_Activated_DEG_DEMs, Validated_Silenced_DEGs_DEMs,
Validated_Activated_DEGs_DMGs, Validated_Silenced_DEGs_DMGs)
# Extract unique genes from the combined dataframe
validated_genes <- validated_genes[!duplicated(validated_genes$Gene),2]
validated_genes <- data.frame(Gene = validated_genes)
# Merge validated genes with immune-related gene expression data
exprs_GSE19_immune_selected <- merge(x = validated_genes, y = exprs_GSE19_immune,
by.x = "Gene", by.y = 0)
exprs_GSE19_immune_selected <- exprs_GSE19_immune_selected[, -1, drop = FALSE] |>
`rownames<-`(exprs_GSE19_immune_selected[, "Gene"])
# Transpose the merged dataframe
exprs_GSE19_immune_selected <- data.frame(t(exprs_GSE19_immune_selected))
# Verify the consistency of sample IDs between metadata and gene expression data
consistent_ids <- all(paste0("X", meta_GSE19_immune$`patient id:ch1`) == rownames(exprs_GSE19_immune_selected))
# Combine clinical status data with the selected gene expression data
data_immune <- cbind(status = meta_GSE19_immune$`pcr:ch1`, exprs_GSE19_immune_selected)
data_immune$status=gsub("1","PCR",data_immune$status)
data_immune$status=gsub("0","non-PCR",data_immune$status)
# Perform statistical analysis and generate plots for each gene
dir.create("GSE19\\Plots\\immune", recursive = TRUE)
for(i in 2:length(colnames(data_immune))){
if(i==2) {setwd("GSE19\\Plots\\immune")}
# Generate a plot for each gene comparing expression between different clinical statuses
formula <- substitute(ggbetweenstats(data = data_immune, x = status, y = y_var, type = "np"),
list(y_var = colnames(data_immune)[i]))
eval(formula)
colnames(data_immune)[i] <- gsub("/", ".", colnames(data_immune)[i])
plot_filename <- paste("plots", colnames(data_immune)[i], ".png", sep = "")
ggsave(plot_filename)
}
# Separate data based on clinical status for further analysis
data_immune_pcr <- data_immune[data_immune$status == "PCR",]
data_immune_nonpcr <- data_immune[data_immune$status == "non-PCR",]
# Perform statistical tests to compare gene expression between clinical groups
result_of_test_immune_gse19 <- data.frame(matrix(NA, nrow = 1, ncol = 2))
for(i in 2:length(colnames(data_immune))){
whitney_output <- wilcox.test(data_immune_pcr[,i], data_immune_nonpcr[,i], paired = FALSE)
result_of_test_immune_gse19[i-1,1] <- colnames(data_immune_pcr)[i]
result_of_test_immune_gse19[i-1,2] <- whitney_output$p.value
# Additional columns can be added for other statistical measures if needed
}
```
(2) Patients Treated with Paclitaxel Alone
```{r}
# Merge validated genes with gene expression data from the Paclitaxel treatment group
exprs_GSE19_pac_selected <- merge(x = validated_genes, y = exprs_GSE19_pac,
by.x = "Gene", by.y = 0)
# Set gene names as row names and remove redundant gene ID column
exprs_GSE19_pac_selected <- exprs_GSE19_pac_selected[, -1, drop = FALSE] |>
`rownames<-`(exprs_GSE19_pac_selected[, "Gene"])
# Transpose the merged dataframe for further analysis
exprs_GSE19_pac_selected <- data.frame(t(exprs_GSE19_pac_selected))
# Re-ordering Match metadata with the selected gene expression data based on sample IDs
meta_GSE19_pac <- meta_GSE19_pac[match(rownames(exprs_GSE19_pac_selected),
paste0("X", meta_GSE19_pac$`patient id:ch1`)),]
# Verify the consistency of sample IDs between metadata and gene expression data
consistent_ids <- all(paste0("X", meta_GSE19_pac$`patient id:ch1`) == rownames(exprs_GSE19_pac_selected))
# Combine clinical status data with the selected gene expression data for the Paclitaxel treatment group
data_pac_gse19 <- cbind(status = meta_GSE19_pac$`pcr:ch1`, exprs_GSE19_pac_selected)
# Separate data based on clinical status for further analysis
data_pac_pcr_gse19 <- data_pac_gse19[data_pac_gse19$status == "PCR",]
data_pac_nonpcr_gse19 <- data_pac_gse19[data_pac_gse19$status == "non-PCR",]
# Perform statistical tests to compare gene expression between clinical groups in the Paclitaxel treatment group
result_of_test_pac_gse19 <- data.frame(matrix(NA, nrow = 1, ncol = 2))
for(i in 2:length(colnames(data_pac_pcr_gse19))){
whitney_output <- wilcox.test(data_pac_pcr_gse19[,i], data_pac_nonpcr_gse19[,i], paired = FALSE)
result_of_test_pac_gse19[i-1,1] <- colnames(data_pac_pcr_gse19)[i]
result_of_test_pac_gse19[i-1,2] <- whitney_output$p.value
# Additional columns can be added for other statistical measures if needed
}
data_pac_gse19$status=gsub("1","PCR",data_pac_gse19$status)
data_pac_gse19$status=gsub("0","non-PCR",data_pac_gse19$status)
# Generate plots for each gene in the Paclitaxel treatment group comparing expression between different clinical statuses
dir.create("GSE19\\Plots\\pac", recursive = TRUE)
for(i in 2:length(colnames(data_pac_gse19))){
if(i==2) {setwd("GSE19\\Plots\\pac")}
formula <- substitute(ggbetweenstats(data = data_pac_gse19, x = status, y = y_var, type = "np"),
list(y_var = colnames(data_pac_gse19)[i]))
eval(formula)
colnames(data_pac_gse19)[i] <- gsub("/", ".", colnames(data_pac_gse19)[i])
plot_filename <- paste("plots", colnames(data_pac_gse19)[i], ".png", sep = "")
ggsave(plot_filename)
}
```
```{r}
# Import CIBERSORT values from the CSV file
CIBERSORT_GSE19 = read.table("CIBERSORTx_GSE19_notquantile_nobatch.csv", sep = ',', header = TRUE) #no batch effect, no quantile normalization
# Subset CIBERSORT data for immune samples based on patient IDs
CIBERSORT_GSE19_immune = CIBERSORT_GSE19[match(paste0("X", meta_GSE19_immune$`patient id:ch1`), CIBERSORT_GSE19$Mixture),]
# Subset CIBERSORT data for PAC (Pancreatic Cancer) samples based on patient IDs
CIBERSORT_GSE19_pac = CIBERSORT_GSE19[match(paste0("X", meta_GSE19_pac$`patient id:ch1`), CIBERSORT_GSE19$Mixture),]
# Check if the order of patient IDs matches between metadata and CIBERSORT data for immune samples
all(paste0("X", meta_GSE19_immune$`patient id:ch1`) == CIBERSORT_GSE19_immune$Mixture)
# Check if the order of patient IDs matches between metadata and CIBERSORT data for PAC samples
all((paste0("X", meta_GSE19_pac$`patient id:ch1`) == CIBERSORT_GSE19_pac$Mixture))
# Combine status information with CIBERSORT data for immune samples
ciber_immune_gse19 = cbind(status = meta_GSE19_immune$`pcr:ch1`, CIBERSORT_GSE19_immune)
# Combine status information with CIBERSORT data for PAC samples
ciber_pac_gse19 = cbind(status = meta_GSE19_pac$`pcr:ch1`, CIBERSORT_GSE19_pac)
# Apply Mann-Whitney test for PAC samples
ciber_pac_pcr_gse19 = ciber_pac_gse19[ciber_pac_gse19$status == "PCR",]
ciber_pac_nonpcr_gse19 = ciber_pac_gse19[ciber_pac_gse19$status == "non-PCR",]
result_of_cibertest_pac_gse19 = data.frame(matrix(NA, nrow = 1, ncol = 2))
# Loop through each CIBERSORT feature and perform Mann-Whitney test for PAC samples
for(i in 3:length(colnames(ciber_pac_pcr_gse19))){
whitney_output <- wilcox.test(ciber_pac_pcr_gse19[,i], ciber_pac_nonpcr_gse19[,i], paired = FALSE)
result_of_cibertest_pac_gse19[i - 2, 1] = colnames(ciber_pac_pcr_gse19)[i]
result_of_cibertest_pac_gse19[i - 2, 2] = whitney_output$p.value
# Additional columns can be added for other statistics if needed
}
# Apply Mann-Whitney test for immune samples
ciber_immune_pcr_gse19 = ciber_immune_gse19[ciber_immune_gse19$status == "PCR",]
ciber_immune_nonpcr_gse19 = ciber_immune_gse19[ciber_immune_gse19$status == "non-PCR",]
result_of_cibertest_immune_gse19 = data.frame(matrix(NA, nrow = 1, ncol = 2))
# Loop through each CIBERSORT feature and perform Mann-Whitney test for immune samples
for(i in 3:length(colnames(ciber_immune_pcr_gse19))){
whitney_output <- wilcox.test(ciber_immune_pcr_gse19[,i], ciber_immune_nonpcr_gse19[,i], paired = FALSE)
result_of_cibertest_immune_gse19[i - 2, 1] = colnames(ciber_immune_pcr_gse19)[i]
result_of_cibertest_immune_gse19[i - 2, 2] = whitney_output$p.value
# Additional columns can be added for other statistics if needed
}
```
```{r}
library(ggstatsplot)
ciber_immune_gse19$status=gsub("1","PCR",ciber_immune_gse19$status)
ciber_immune_gse19$status=gsub("0","non-PCR",ciber_immune_gse19$status)
ciber_pac_gse19$status=gsub("1","PCR",ciber_pac_gse19$status)
ciber_pac_gse19$status=gsub("0","non-PCR",ciber_pac_gse19$status)
# Loop through each column (CIBERSORT feature) in the CIBERSORT data for immune samples
dir.create("GSE19\\Plots\\immune\\cibersort\\",recursive=TRUE)
for(i in 3:length(colnames(ciber_immune_gse19))){
# Set the working directory for saving plots related to immune samples
if(i==3) {setwd("GSE19\\Plots\\immune\\cibersort\\")}
# Generate the formula for ggstatsplot function dynamically, substituting the current column name
formula <- substitute(ggbetweenstats(data = ciber_immune_gse19, x = status, y = y_var, type = "np"), list(y_var = colnames(ciber_immune_gse19)[i]))
# Evaluate the formula to generate the plot using ggstatsplot
eval(formula)
# Replace any forward slashes in the column name with dots to avoid file system issues
colnames(ciber_immune_gse19)[i] = gsub("/", ".", colnames(ciber_immune_gse19)[i])
# Define the filename for saving the plot
plot_filename <- paste("plots", colnames(ciber_immune_gse19)[i], "_noquantile_nobatch.png", sep = "")
# Save the plot using ggsave
ggsave(plot_filename)
}
library(ggstatsplot)
# Loop through each column (CIBERSORT feature) in the CIBERSORT data for PAC samples
dir.create("GSE19\\Plots\\pac\\cibersort\\",recursive=TRUE)
for(i in 3:length(colnames(ciber_pac_gse19))){
# Set the working directory for saving plots related to PAC samples
if(i==3) { setwd("GSE19\\Plots\\pac\\cibersort\\")}
# Generate the formula for ggstatsplot function dynamically, substituting the current column name
formula <- substitute(ggbetweenstats(data = ciber_pac_gse19, x = status, y = y_var, type = "np"), list(y_var = colnames(ciber_pac_gse19)[i]))
# Evaluate the formula to generate the plot using ggstatsplot
eval(formula)
# Replace any forward slashes in the column name with dots to avoid file system issues
colnames(ciber_pac_gse19)[i] = gsub("/", ".", colnames(ciber_pac_gse19)[i])
# Define the filename for saving the plot
plot_filename <- paste("plots", colnames(ciber_pac_gse19)[i], "_noquantile_nobatch.png", sep = "")
# Save the plot using ggsave
ggsave(plot_filename)
}
write.table(result_of_cibertest_immune_gse19,"result_of_cibertest_immune_gse19.tsv",sep='\t')
write.table(result_of_cibertest_pac_gse19,"result_of_cibertest_pac_gse19.tsv",sep='\t')
write.table(result_of_test_immune_gse19,'result_of_test_immune_gse19.tsv',sep='\t')
write.table(result_of_test_pac_gse19,'result_of_test_pac_gse19.tsv',sep='\t')
```