-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathKruskal Wallis.R
115 lines (90 loc) · 3.33 KB
/
Kruskal Wallis.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
#--------------------
# Kruskal-Wallis test
#--------------------
# 1. example computed manually
# number of samples
k=4
# data
s1 <- c(15.791,12.595,10.405,9.836,8.729,9.608)
s2 <- c(16.193,11.937,11.968,9.376,9.227,8.539)
s3 <- c(11.870,9.400, 9.322, 8.200,8.020,8.671)
s4 <- c(9.681, 9.764, 9.154, 8.330,8.388,7.888)
# total sample size N = sum nj
N <- length(s1)+length(s2)+length(s3)+length(s4)
# matrix containing the ranks Rj in the pooled sample
Rk <- matrix(rank(c(s1,s2,s3,s4)), ncol = k, byrow = FALSE) # vector of ranks in pooled sample
Rk
# mean of the Rj
Rbar <- apply(Rk,2,mean)
Rbar # [1] 17.00 15.50 9.17 8.33 (R_bar_j)
# test statistic
KW <- 12/(N*(N+1)) * sum( 6 * ( Rbar - (N+1)/2 )^2 )
KW # [1] 6.926667
# Tf the sample sizes are large, under H0, K has approximately a chi-squared(k-1) distribution,
# where k is the number of samples.
qchisq(p = 0.95 , df = 3) # [1] 7.814728
# the 95% quantile is 7.814728 and so K = 6.926667 is not in
# the critical region if we take alpha = 5%.
# Thus we don't reject H0.
# 2. using the implemented function in R
# data
s1 <- c(15.791,12.595,10.405,9.836,8.729,9.608)
s2 <- c(16.193,11.937,11.968,9.376,9.227,8.539)
s3 <- c(11.870,9.400, 9.322, 8.200,8.020,8.671)
s4 <- c(9.681, 9.764, 9.154, 8.330,8.388,7.888)
# 0.95 quantile of the Chi-squared distribution
qchisq(p = 0.95 , df = 3)
# [1] 7.814728
# plot chi-squared density for 3 degrees of freedom
set.seed(1986)
par(mar=c(5.1,4.1,4.1,2.1))
draws <- rchisq(10000000, 3)
dens <- density(draws)
plot(dens, xlab=expression(paste(" ", chi^2, )),
xlim=c(0,18),
ylab="PDF",
main="",
cex.main=0.95,)
q95 <- quantile(draws, .95)
q100 <- quantile(draws, 1.00)
x1 <- min(which(dens$x >= q95))
x2 <- max(which(dens$x < q100))
with(dens, polygon(x=c(x[c(x1,x1:x2,x2)]), y= c(0, y[x1:x2], 0), col='darksalmon'))
abline(v = KW)
text(9, 0.12, 'KW = 6.927', cex = 0.8)
text(2.5, 0.05, 'H0', cex = 0.6)
text(9.5, 0.02, 'H1', cex = 0.6)
# test
kruskal.test(list(s1,s2,s3,s4))
# Kruskal-Wallis rank sum test
#
# data: list(s1, s2, s3, s4)
# Kruskal-Wallis chi-squared = 6.9267, df = 3, p-value = 0.07427
# 3. simulate from the exact distribution
# Complementary exercice. Simulate from the exact distribution
# of KW under H0 (which is not exactly a chi-squared)
# we are to simulate 4 samples of the same size 6 with the same distribution (any continuous)
# i.e. normal, uniform... and compute 100000 realizations of the test statistic
kruskal.distribution <- function(nj, nsim) {
N = 4*nj
KW <- rep(0,nsim)
for(i in 1:nsim){
s1 = runif(nj); s2 = runif(nj)
s3 = runif(nj); s4 = runif(nj)
Rk <- matrix(rank(c(s1,s2,s3,s4)), ncol = 4, byrow = FALSE)
Rbarj <- apply(Rk,2,mean)
KW[i] <- 12/(N*(N+1)) * sum(nj *(Rbarj -(N+1)/2)^2 )
}
output <- NULL
output$KW <- KW
output$quantiles <- quantile(x = KW, probs = c(0.05, 0.5, 0.95, 0.99))
return(output)
}
set.seed(2021)
results <- kruskal.distribution(nj = 6, nsim = 100000)
results$quantiles
# 5% 50% 95% 99%
# 0.380000 2.466667 7.446667 10.140000
# we find a 95% quantile of (with simulation) 7.45 which is
# really lower than the previous one (7.814728).
# However the conclusion does not change: we don't reject H0