-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcpm.r
297 lines (248 loc) · 13.1 KB
/
cpm.r
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
## CONNECTOME-BASED PREDICTION MODEL WITH CV-TUNED P-VALUES
## FOR USE IN THE COGNITIVE AND BRAIN HEALTH LABORATORY
##################################################################################################################
##################################################################################################################
##to train CPM models using a fixed p value threshold
cpm.train=function(data,outcome,p=0.05)
{
##checks
#pvalues
data=data.matrix(data)
if(length(p)==1) {p.posneg=c(p,p)}
else if (length(p)==2) {p.posneg=p}
else {stop("Only one or two pvalues should be entered")}
#number of rows match
if(NROW(data)!=NROW(outcome)) {stop(paste("\nThe number of rows in the data (",NROW(data),") and outcome variable (",NROW(outcome),") do not match!\n",sep=""))}
#missing data
idx.missing=which(is.na(outcome)==T)
if(NROW(idx.missing)>0)
{
cat(paste("\n",NROW(idx.missing)," missing values are detected in the outcome variable. Subjects with missing values will be excluded in the training procedure\n",sep=""))
data=data[-idx.missing,]
outcome=outcome[-idx.missing]
}
##feature selection
#critical values
pos.tcrit=qt((p.posneg[1]/2), NROW(outcome)-2, lower=FALSE)
neg.tcrit=qt((p.posneg[2]/2), NROW(outcome)-2, lower=FALSE)
pos.rcrit=sqrt(pos.tcrit^2/(pos.tcrit^2+NROW(outcome)-2))
neg.rcrit=sqrt(neg.tcrit^2/(neg.tcrit^2+NROW(outcome)-2))
#binarizing pearsons r values
r.mat=cor(data,outcome)
weights=rep(0,NCOL(data))
weights[r.mat> pos.rcrit]=1
weights[r.mat< -neg.rcrit]=-1
##network models
#positive model
if(NROW(which(weights==1))>1) ## proceed only if at least 2 edges are selected
{
pos.netstr=rowSums(data[,which(weights==1)])
pos.netstr.coef=lm(outcome~pos.netstr)$coefficients
} else
{
pos.netstr.coef=NA
cat("\nNone of edges are significantly and positively correlated with the outcome. The positive network model cannot be constructed\n")
}
#negative model
if(NROW(which(weights==-1))>1) ## proceed only if at least 2 edges are selected
{
neg.netstr=rowSums(data[,which(weights==-1)])
neg.netstr.coef=lm(outcome~neg.netstr)$coefficients
} else
{
neg.netstr.coef=NA
cat("\nNone of edges are significantly and negatively correlated with the outcome. The negative network model cannot be constructed\n")
}
# positive + negative model
if(NROW(which(weights==-1))>1 & NROW(which(weights==1))>1) ## proceed only if at least 2 edges are selected in each of the earlier models
{both.netstr.coef=lm(outcome~pos.netstr+neg.netstr)$coefficients}
else {both.netstr.coef=NA}
##listing objects to return
model=list(weights,pos.netstr.coef,neg.netstr.coef,both.netstr.coef)
names(model)=c("weights","pos.network.coef","neg.network.coef","both.network.coef")
return(model)
}
##################################################################################################################
##################################################################################################################
##to predict scores from previously generated cpm.train() models
cpm.predict=function(model,test.data, network="both")
{
test.data=data.matrix(test.data)
##checks
#number of rows match
if(NROW(model$weights)!=NCOL(test.data))
{stop(paste("\nThe number of predictors in the training data (",NROW(model$weights),") and testing data(",NCOL(test.data),")do not match!\n",sep=""))}
##select model {compute predscore}
if(network=="positive") {predscore=rowSums(test.data[,which(model$weights==1)])*model$pos.network.coef[2]+model$pos.network.coef[1]}
else if(network=="negative") {predscore=rowSums(test.data[,which(model$weights==-1)])*model$neg.network.coef[2]+model$neg.network.coef[1]}
else if(network=="both")
{
#check if positive and negative models are valid, and proceed accordingly
if(is.na(model$pos.network.coef)[1]) {positive=rep(NA,NROW(test.data))}
else {positive=rowSums(test.data[,which(model$weights==1)])*model$pos.network.coef[2]+model$pos.network.coef[1]}
if(is.na(model$neg.network.coef)[1]) {negative=rep(NA,NROW(test.data))}
else {negative=rowSums(test.data[,which(model$weights==-1)])*model$neg.network.coef[2]+model$neg.network.coef[1]}
if(is.na(model$pos.network.coef)[1] | is.na(model$neg.network.coef)[1]) {both=rep(NA,NROW(test.data))}
else {both=rowSums(test.data[,which(model$weights==1)])*model$both.network.coef[2]+rowSums(test.data[,which(model$weights==-1)])*model$both.network.coef[3]+model$both.network.coef[1]}
predscore=data.frame(positive,negative,both)
}
return(predscore)
}
##################################################################################################################
##################################################################################################################
##cross-validation procedure to identify optimal p values for subsequent use of cpm.train()
cpm.train.cv=function(data,outcome,p,nfolds=5)
{
##checks
#require packages
list.of.packages = c("caret")
new.packages = list.of.packages[!(list.of.packages %in% installed.packages()[,"Package"])]
if(length(new.packages))
{
cat(paste("The following package(s) are required and will be installed:\n",new.packages,"\n"))
install.packages(new.packages)
}
#number of rows match
if(NROW(data)!=NROW(outcome)) {stop(paste("\nThe number of rows in the data (",NROW(data),") and outcome variable (",NROW(outcome),") do not match!\n",sep=""))}
#missing data
idx.missing=which(is.na(outcome)==T)
if(NROW(idx.missing)>0)
{
data=data[-idx.missing,]
outcome=outcome[-idx.missing]
}
##generate geometric sequence of p-values from distribution of p values in the data
if(missing(p))
{
#converting r to p
r_to_p=function(r)
{
t=(r*sqrt(NROW(outcome)-2))/(sqrt(1-(r^2)))
return(2 * (1 - pt(abs(t), NROW(outcome)-2)))
}
r.thresh=0.15 ##lower cutoff
r.mat=cor(data,outcome)
#separating p values for positive and negative rs
pos.r.mat=r.mat[r.mat>r.thresh]
neg.r.mat=r.mat[r.mat< (-r.thresh)]
##generating p values
#order distribution of p values in increasing magnitude
pos.r.mat=pos.r.mat[order(pos.r.mat)]
neg.r.mat=neg.r.mat[order(-neg.r.mat)]
#divide p values across 47 intervals
n.int=47
pos.interval=NROW(pos.r.mat)/n.int
neg.interval=NROW(neg.r.mat)/n.int
#select intervals from a geometric step progression; steps become progressively smaller
intervals=c(9,17,24,30,35,39,42,44,45)
p=matrix(NA,nrow=length(intervals)+1, ncol=2)
#first pair of p values set according to r=+/-0.15 (lower boundary)
p[1,]=c(r_to_p(r.thresh),r_to_p(-r.thresh))
#iterating p values across geometric stepped intervals for positive and negative models
p[2:NROW(p),1]=r_to_p(pos.r.mat[round(pos.interval*intervals)])
p[2:NROW(p),2]=r_to_p(neg.r.mat[round(neg.interval*intervals)])
} else
{
#checks for user-defined p values
if(length(p)==1) {stop("At least 2 p-values should be entered")}
else if (NCOL(p)==1) {p=cbind(p,p)}
}
##setup CV folds
folds=caret::createFolds(outcome,k=nfolds)
##training
rvals=matrix(NA,nrow=NROW(p),ncol=2)
for(iter in 1:NROW(p))
{
p.iter=p[iter,]
for (fold in 1:length(folds))
{
#leaving a fold out
train_outcome=outcome[-folds[[fold]]]
train_dat=data[-folds[[fold]],]
#training the CPM and applying model to predict scores
train.mod=cpm.train(data = train_dat,outcome = train_outcome,p=p.iter)
predscores.fold=cpm.predict(train.mod, test.dat=data[folds[[fold]],])
#saving the scores from each fold to a single vector
if(fold==1) {predscores.all=predscores.fold}
else {predscores.all=rbind(predscores.all,predscores.fold)}
}
#check if positive and negative models are valid, and proceed accordingly
if(anyNA(predscores.all[,1])) {pos.net.r=NA}
else {pos.net.r=cor(outcome[unlist(folds)],predscores.all[,1])}
if(anyNA(predscores.all[,2])) {neg.net.r=NA}
else {neg.net.r=cor(outcome[unlist(folds)],predscores.all[,2])}
#saving the r values for the iteration
rvals[iter,]=c(pos.net.r,neg.net.r)
}
#inform user if p values are too small such that no edges are selected; only if user-defined p values are entered
idx.NA.pos=which(is.na(rvals[,1]))
if(length(idx.NA.pos)>0)
{cat(paste("\nNote: No positive edges were selected when the p value of ",p[min(idx.NA.pos)]," or smaller was used", sep=""))}
idx.NA.neg=which(is.na(rvals[,2]))
if(length(idx.NA.neg)>0)
{cat(paste("\nNote: No negative edges were selected when the p value of ",p[min(idx.NA.neg)]," or smaller was used", sep=""))}
#identify the indices for p-values that produce the most accurate predictions
r.pos.min.idx=which(rvals[,1]==max(rvals[,1],na.rm = T))
r.neg.min.idx=which(rvals[,2]==max(rvals[,2],na.rm = T))
#listing out objects to return
results=list(c(p[r.pos.min.idx,1],p[r.neg.min.idx,2]),rvals,p)
names(results)=c("opt.pvals","results","pvals")
return(results)
}
##################################################################################################################
##################################################################################################################
##Using the lesion approach (leave-one-network out) for CPM
cpm.lesion=function(train.data,test.data,train.outcome, test.outcome,p)
{
data=data.matrix(dat_FC)
##atlas selection
labels.url=c("https://github.com/CogBrainHealthLab/VizConnectome/blob/main/labels/labelsSC_AAL90.csv?raw=TRUE",
"https://github.com/CogBrainHealthLab/VizConnectome/blob/main/labels/labelsFC_schaefer119.csv?raw=TRUE",
"https://github.com/CogBrainHealthLab/VizConnectome/blob/main/labels/labelsFC_schaefer219.csv?raw=TRUE",
"https://github.com/CogBrainHealthLab/VizConnectome/blob/main/labels/labelsFC_brainnetome_yeo.csv?raw=TRUE")
edge_lengths=c(4005,7021,23871,30135)
if(is.na(match(NCOL(data),edge_lengths)))
{stop("The number of columns in the input matrix is not consistent with any of the recognized parcellation schemes. The input matrix should contain 4005, 7021, 23871 or 30135 columns")}
else {atlas=match(NCOL(data),edge_lengths)}
##preparing atlas labels for removing networks of edges
label=read.csv(labels.url[atlas])
nnode=NROW(label)
networks.list=data.frame(unique(cbind(as.numeric(label$region),label$regionlabel)))
names(networks.list)=c("netno","network.name")
networks.list=networks.list[order(networks.list$netno),]
##CPM
results=matrix(NA,nrow=NROW(networks.list)+1,ncol=4)
#training the CPM (without any network exclusions) and applying model to predict scores
model.allnetworks=cpm.train(data=train.data, outcome=train.outcome, p=p)
pred.allnetwork=cpm.predict(model = model.allnetworks, test.data=test.data)
results[1,2:4]=cor(test.outcome,pred.allnetwork)
#CPM with one network removed each time
for (netno in 1:NROW(networks.list))
{
#identifying indice of edges to remove
FC_matrix=array(rep(NA,nnode^2),dim=c(nnode,nnode))
FC_matrix[upper.tri(FC_matrix, diag=FALSE)] = 1:edge_lengths[atlas]
FC_matrix.1net=FC_matrix
FC_matrix.1net[which(label$region==netno),which(label$region==netno)]=NA
edge.column=FC_matrix.1net[upper.tri(FC_matrix.1net, diag=FALSE)]
remove.idx=which(is.na(edge.column)==T)
#training the CPM and applying model to predict scores
model.1net=cpm.train(data=train.data[,-remove.idx], outcome=train.outcome, p=p)
pred.1net=cpm.predict(model = model.1net, test.data=test.data[,-remove.idx])
results[netno+1,2:4]=cor(test.outcome,pred.1net)
}
##saving results in a data.frame object for returning
results[1,1]="none removed"
results[c(2:(NROW(networks.list)+1)),1]=paste("removed",networks.list$network.name, sep=" ")
results=data.frame(results)
results[, c(2:4)]=sapply(results[, c(2:4)], as.numeric)
names(results)=c("lesion.model","positive","negative","both")
return(results)
}
##################################################################################################################
##################################################################################################################
## EXAMPLE:
#source("https://github.com/CogBrainHealthLab/MLtools/blob/main/cpm.r?raw=TRUE")
#cv.model=cpm.train.cv(data=dat, outcome=outcome)
#model=cpm.train(data=dat_FC, outcome=dat_beh$age, p=model$opt.pvals)
#predicted.score=cpm.predict(model = model, test.data=test.dat_FC)