forked from vineet1992/Gene-Selection
-
Notifications
You must be signed in to change notification settings - Fork 0
/
all.R
115 lines (93 loc) · 4.15 KB
/
all.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
library(pathClass)
library(pROC)
##before running this, run loadall.R to set up the datasets and the prior knowledge adjacency matrix
#sampleData = GSE1456
#sampleData = GSE2034
#sampleData = GSE2990
#sampleData = GSE4922
#sampleData = GSE7390
#sampleData = GSE11121
sampleData = rbind(GSE1456,GSE2034,GSE2990,GSE4922,GSE7390,GSE11121)
n = dim(sampleData)[1]
f = floor(n/10)
sampleData = sampleData[sample(1:n,n),]
AUC.nbsvm = c(1:10)
AUC.rrfe = c(1:10)
AUC.diffex = c(1:10)
ad.list <- as.adjacencyList(adjMatrix)
for (i in 0:9){
print(i)
OUT = sampleData[c(1:f+f*i),] #leave one out for testing
IN = sampleData[-c(1:f+f*i),] #use the rest as a training set
colnames(IN) <- col
colnames(OUT) <- col
x_tr <- (IN)
y_tr <- factor(IN$y)
x_te <- (OUT)
y_te <- factor(OUT$y)
##-------------------NBSVM-------------------------------
matched <- matchMatrices(x = x_tr, adjacency = adjMatrix, mapping = mapping)
nbSVM = fit.networkBasedSVM(matched$x, y_tr, DEBUG=FALSE, adjacencyList = ad.list, lambdas = 10^(-1:2),sd.cutoff=1)
m <- matchMatrices(x=x_te, adjacency = adjMatrix, mapping=mapping)
pred = predict(nbSVM, m$x)
print(pred)
A = roc(y_te,pred)
AUC.nbsvm[i+1] = auc(A)
##-------------------RRFE--------------------------------
rrfe = fit.rrfe(x_tr, y_tr, Cs = 10^(-3:3), mapping=mapping, Gsub = adjMatrix, d=1/2)
pred = predict(rrfe, x_te, type="response")
print(pred)
A = roc(y_te,pred)
AUC.rrfe[i+1] = auc(A)
##-------------------t test------------------------------
ind = matrix(,nrow=5,ncol=50) #this will store the top 50 predictors with greatest difference in means between positive and negative samples
for(j in 0:4){
TEST = IN[c(1:s+j*s),] #similarly, leave one out for testing. this time we treat only the training set as if it were the whole data set
TRAIN = IN[-c(1:s+s*j),] #and use rest for training
POS = TRAIN[which(TRAIN[,22284]==1),]
NEG = TRAIN[which(TRAIN[,22284]==0),]
m = c(1:22283)
for (d in 1:22283){ #calculate the difference in means between pos and neg for each predictor
m[d] = abs(mean(POS[,d])-mean(NEG[,d]))
}
ind[j+1,] = order(m, 1:22283, decreasing=TRUE)[1:50] #order their indices highest to lowest, and select only the top 50
}
best_k=0
best_acc=0
for(k in 12:20){
acc = c(0,0,0,0,0) #init a vector to store the AUC for each fold
for (j in 0:4){
m = ind[j+1,1:k]
TRAIN = IN[-c(1:s+s*j),] #these will be the same sets we computed difference of means on, we just did it above so that we didnt have to do it inside the loop. this shiz takes long enough already
TEST = IN[c(1:s+j*s),]
xx = data.frame(TRAIN[,c(22284,m)]) #glm wants a data frame, and predicts the first column on the remaining columns, so put y first and the best predictors (indexed by m) of x
model = glm(xx, family=binomial(link="logit")) #train! #i'm not sure if these are appropriate parameters
xx = data.frame(TEST[,m]) #now we need to exclude the labels; just use the indexed best predictors from the cv test fold
yy = TEST[,22284]
pred = predict(model, xx, type = "response")
A = roc(yy,pred) #create an ROC object
acc[j+1]=auc(A) #get area under curve from it, and record in the array we init'd
}
if(mean(acc)>best_acc){ #once we have an AUC for each cv test fold, compute their average and update the best as needed
best_k=k
best_acc=mean(acc)}
}
#equipped with out best_k, we basically do the same model building and testing as above, but now using the 9 folds from the whole data set to predict the 1 left out fold
POS = IN[which(IN[,22284]==1),]
NEG = IN[which(IN[,22284]==0),]
m = c(1:22283)
for (d in 1:22283) m[d] = abs(mean(POS[,d])-mean(NEG[,d]))
m = order(m, 1:22283, decreasing=TRUE)[1:best_k]
xx = data.frame(IN[,c(22284,m)])
model = glm(xx, family=binomial(link="logit"))
xx = data.frame(OUT[,m])
pred = predict(model, xx, type="response")
print(pred)
yy = OUT[,22284]
A = roc(yy,pred)
AUC.diffex[i+1] = auc(A)
}
AUC = cbind(AUC.nbsvm, AUC.rrfe, AUC.diffex)
png("plot")
boxplot(AUC, ylab = "AUC", ylim = c(0,1), range=1, names = c("nbSVM","RRFE","t-test"))
dev.off()