-
Notifications
You must be signed in to change notification settings - Fork 211
/
Copy pathvalidators.go
367 lines (312 loc) · 16.5 KB
/
validators.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
package e2e
import (
"bytes"
"context"
"fmt"
"net"
"regexp"
"strings"
"time"
"github.com/Azure/agentbaker/e2e/config"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
corev1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/resource"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)
// ValidateDirectoryContent runs `ls -la` on path on the VM and requires that
// every name in files appears somewhere in the listing output.
func ValidateDirectoryContent(ctx context.Context, s *Scenario, path string, files []string) {
	listing := execOnVMForScenarioValidateExitCode(ctx, s, fmt.Sprintf("ls -la %s", path), 0, "could not get directory contents")
	for idx := range files {
		wanted := files[idx]
		require.Contains(s.T, listing.stdout.String(), wanted, "expected to find file %s within directory %s, but did not", wanted, path)
	}
}
// ValidateSysctlConfig queries every key of customSysctls with `sysctl` on the
// VM and requires the output to contain "<name> = <value>" for each entry.
//
// The sysctl output is piped through sed so that any run of whitespace found
// between two digits is collapsed to a single space before comparison.
func ValidateSysctlConfig(ctx context.Context, s *Scenario, customSysctls map[string]string) {
	// Length 0 with reserved capacity: make([]string, n) followed by append
	// would leave n empty strings in front of the real keys.
	keysToCheck := make([]string, 0, len(customSysctls))
	for k := range customSysctls {
		keysToCheck = append(keysToCheck, k)
	}

	command := fmt.Sprintf("sysctl %s | sed -E 's/([0-9])\\s+([0-9])/\\1 \\2/g'", strings.Join(keysToCheck, " "))
	execResult := execOnVMForScenarioValidateExitCode(ctx, s, command, 0, "sysctl command failed")
	for name, value := range customSysctls {
		require.Contains(s.T, execResult.stdout.String(), fmt.Sprintf("%s = %v", name, value), "expected to find %s set to %v, but was not", name, value)
	}
}
// ValidateNvidiaSMINotInstalled runs nvidia-smi on the VM, expects exit code 1,
// and requires stderr to report that the command was not found.
func ValidateNvidiaSMINotInstalled(ctx context.Context, s *Scenario) {
	result := execOnVMForScenarioValidateExitCode(ctx, s, "nvidia-smi", 1, "")
	require.Contains(
		s.T,
		result.stderr.String(),
		"nvidia-smi: command not found",
		"expected stderr to contain 'nvidia-smi: command not found', but got %q",
		result.stderr.String(),
	)
}
// ValidateNvidiaSMIInstalled runs nvidia-smi on the VM and requires it to exit
// with code 0.
func ValidateNvidiaSMIInstalled(ctx context.Context, s *Scenario) {
	const smiCommand = "nvidia-smi"
	execOnVMForScenarioValidateExitCode(ctx, s, smiCommand, 0, "could not execute nvidia-smi command")
}
// ValidateNvidiaModProbeInstalled runs nvidia-modprobe on the VM and requires
// it to exit with code 0.
func ValidateNvidiaModProbeInstalled(ctx context.Context, s *Scenario) {
	const modprobeCommand = "nvidia-modprobe"
	execOnVMForScenarioValidateExitCode(ctx, s, modprobeCommand, 0, "cound not execute nvidia-modprobe command")
}
// ValidateNonEmptyDirectory lists dirName on the VM and pipes the listing into
// grep; the check passes only when the pipeline exits 0, i.e. when the listing
// produced at least one line.
func ValidateNonEmptyDirectory(ctx context.Context, s *Scenario, dirName string) {
	listAndMatch := fmt.Sprintf("ls -1q %s | grep -q '^.*$' && true || false", dirName)
	execOnVMForScenarioValidateExitCode(
		ctx, s, listAndMatch, 0,
		"either could not find expected file, or something went wrong",
	)
}
// ValidateFileHasContent requires that fileName on the VM contains contents as
// a fixed (non-regex) string. The file is listed and dumped first so that the
// command output carries that context alongside the grep result.
func ValidateFileHasContent(ctx context.Context, s *Scenario, fileName string, contents string) {
	listFile := fmt.Sprintf("ls -la %[1]s", fileName)
	dumpFile := fmt.Sprintf("sudo cat %[1]s", fileName)
	searchFile := fmt.Sprintf("(sudo cat %[1]s | grep -q -F -e %[2]q)", fileName, contents)

	script := makeExecutableCommand([]string{listFile, dumpFile, searchFile})
	execOnVMForScenarioValidateExitCode(ctx, s, script, 0, "could not validate file has contents - might mean file does not have contents, might mean something went wrong")
}
// ValidateFileExcludesContent requires that fileName on the VM does not
// contain contents as a fixed (non-regex) string. A file that does not exist
// is treated as excluding the content: the script exits 0 before any further
// step runs. An empty contents argument is rejected as a test setup error.
func ValidateFileExcludesContent(ctx context.Context, s *Scenario, fileName string, contents string) {
	require.NotEqual(s.T, "", contents, "Test setup failure: Can't validate that a file excludes an empty string. Filename: %s", fileName)

	steps := []string{
		fmt.Sprintf("test -f %[1]s || exit 0", fileName),
		fmt.Sprintf("ls -la %[1]s", fileName),
		fmt.Sprintf("sudo cat %[1]s", fileName),
		// Negate a plain match instead of using `grep -v`: `grep -q -v` exits 0
		// as soon as ANY line lacks the pattern, so a multi-line file that does
		// contain the forbidden text would still pass, and an empty file would
		// fail. `! grep -q` succeeds exactly when no line contains the pattern.
		fmt.Sprintf("(! sudo cat %[1]s | grep -q -F -e %[2]q)", fileName, contents),
	}

	command := makeExecutableCommand(steps)
	execOnVMForScenarioValidateExitCode(ctx, s, command, 0, "could not validate file excludes contents - might mean file does have contents, might mean something went wrong")
}
// cleanse drops every single-quote character from str so the text can be
// placed inside a single-quoted `echo '...'` argument when a command is
// echoed to stdout.
func cleanse(str string) string {
	const singleQuote = "'"
	return strings.ReplaceAll(str, singleQuote, "")
}
// makeExecutableCommand turns steps into a single `bash -c '...'` invocation.
// Every step is preceded by an echo of its own text (with single quotes
// stripped by cleanse), and all pieces are chained with " && ".
func makeExecutableCommand(steps []string) string {
	pieces := make([]string, 0, 2*len(steps))
	for _, step := range steps {
		pieces = append(pieces, fmt.Sprintf("echo '%s'", cleanse(step)), step)
	}
	script := strings.Join(pieces, " && ")

	// The script is wrapped in single quotes below, so each single quote inside
	// it is rewritten as '"'"' (close quote, double-quoted quote, reopen quote).
	escaped := strings.ReplaceAll(script, "'", "'\"'\"'")
	return fmt.Sprintf("bash -c '%s'", escaped)
}
// ServiceCanRestartValidator kills serviceName through systemctl, waits
// restartTimeoutInSeconds seconds, and requires that the service is active
// again under a different PID than before the kill.
func ServiceCanRestartValidator(ctx context.Context, s *Scenario, serviceName string, restartTimeoutInSeconds int) {
	// Printing the status never fails the script (|| true); it only puts the
	// service state into the logs ahead of the real is-active assertion.
	printStatus := fmt.Sprintf("(systemctl -n 5 status %s || true)", serviceName)
	requireActive := fmt.Sprintf("systemctl is-active %s", serviceName)

	script := makeExecutableCommand([]string{
		// Before the kill: service must be active; remember its PID.
		printStatus,
		requireActive,
		fmt.Sprintf("INITIAL_PID=`sudo pgrep %s`", serviceName),
		"echo INITIAL_PID: $INITIAL_PID",
		// systemctl kill is used instead of kill -9 because container
		// restrictions prevent sending a kill signal to the process directly.
		fmt.Sprintf("sudo systemctl kill %s", serviceName),
		// Give the service time to come back.
		fmt.Sprintf("sleep %d", restartTimeoutInSeconds),
		// After the kill: service must be active again; capture the new PID.
		printStatus,
		requireActive,
		fmt.Sprintf("POST_PID=`sudo pgrep %s`", serviceName),
		"echo POST_PID: $POST_PID",
		// An unchanged PID means the service was never actually restarted.
		"if [[ \"$INITIAL_PID\" == \"$POST_PID\" ]]; then echo PID did not change after restart, failing validator. ; exit 1; fi",
	})

	execOnVMForScenarioValidateExitCode(ctx, s, script, 0, "command to restart service failed")
}
// ValidateUlimitSettings greps the containerd.service unit (as printed by
// `systemctl cat`) for the keys of ulimits and requires a "<name>=<value>"
// entry in the matched lines for every pair.
func ValidateUlimitSettings(ctx context.Context, s *Scenario, ulimits map[string]string) {
	names := make([]string, 0, len(ulimits))
	for name := range ulimits {
		names = append(names, name)
	}
	// The names are OR-ed together into one case-insensitive extended regex.
	alternation := strings.Join(names, "|")

	unit := execOnVMForScenarioValidateExitCode(
		ctx, s,
		fmt.Sprintf("systemctl cat containerd.service | grep -E -i '%s'", alternation),
		0, "could not read containerd.service file",
	)
	for name, limit := range ulimits {
		wantEntry := fmt.Sprintf("%s=%v", name, limit)
		require.Contains(s.T, unit.stdout.String(), wantEntry, "expected to find %s set to %v, but was not", name, limit)
	}
}
// execOnVMForScenarioOnUnprivilegedPod looks up the pod-network debug pod for
// the scenario's node and executes cmd in it. Failing to find the pod or to
// execute the command fails the test immediately.
func execOnVMForScenarioOnUnprivilegedPod(ctx context.Context, s *Scenario, cmd string) *podExecResult {
	debugPod, lookupErr := s.Runtime.Cluster.Kube.GetPodNetworkDebugPodForNode(ctx, s.Runtime.KubeNodeName, s.T)
	require.NoError(s.T, lookupErr, "failed to get non host debug pod name")

	podResult, execErr := execOnUnprivilegedPod(ctx, s.Runtime.Cluster.Kube, debugPod.Namespace, debugPod.Name, cmd)
	require.NoErrorf(s.T, execErr, "failed to execute command on pod: %v", cmd)

	return podResult
}
// execOnVMForScenario executes cmd on the scenario's VM via execOnVM, using
// the VM private IP, debug host pod and SSH private key held in s.Runtime.
// An error from execOnVM fails the test immediately; the exit code of cmd
// itself is not inspected here.
func execOnVMForScenario(ctx context.Context, s *Scenario, cmd string) *podExecResult {
	vmResult, execErr := execOnVM(
		ctx,
		s.Runtime.Cluster.Kube,
		s.Runtime.VMPrivateIP,
		s.Runtime.DebugHostPod,
		string(s.Runtime.SSHKeyPrivate),
		cmd,
	)
	require.NoError(s.T, execErr, "failed to execute command on VM")
	return vmResult
}
// execOnVMForScenarioValidateExitCode executes cmd on the scenario's VM and
// requires its exit code to equal expectedExitCode. On mismatch the failure
// message includes the command, additionalErrorMessage, stdout and stderr.
// The execution result is returned so callers can inspect the output.
func execOnVMForScenarioValidateExitCode(ctx context.Context, s *Scenario, cmd string, expectedExitCode int, additionalErrorMessage string) *podExecResult {
	result := execOnVMForScenario(ctx, s, cmd)

	// The exit code on the result is compared as a string, so the expected
	// integer is rendered to text first.
	wantCode := fmt.Sprint(expectedExitCode)
	require.Equal(
		s.T, wantCode, result.exitCode,
		"exec command failed with exit code %q, expected exit code %s\nCommand: %s\nAdditional detail: %s\nSTDOUT:\n%s\n\nSTDERR:\n%s",
		result.exitCode, wantCode, cmd, additionalErrorMessage, result.stdout, result.stderr,
	)
	return result
}
// ValidateInstalledPackageVersion lists the installed packages on the VM
// (apt on Ubuntu, dnf on Mariner/AzureLinux) and marks the test as failed when
// no single output line mentions both component and version. An OS without a
// known listing command is a fatal test error.
func ValidateInstalledPackageVersion(ctx context.Context, s *Scenario, component, version string) {
	s.T.Logf("assert %s %s is installed on the VM", component, version)

	var listCommand string
	switch s.VHD.OS {
	case config.OSUbuntu:
		listCommand = "apt list --installed"
	case config.OSMariner, config.OSAzureLinux:
		listCommand = "dnf list installed"
	default:
		s.T.Fatalf("command to get package list isn't implemented for OS %s", s.VHD.OS)
	}

	packages := execOnVMForScenarioValidateExitCode(ctx, s, listCommand, 0, "could not get package list")
	for _, entry := range strings.Split(packages.stdout.String(), "\n") {
		if strings.Contains(entry, component) && strings.Contains(entry, version) {
			// Found a line naming both the package and the version.
			return
		}
	}

	s.T.Logf("expected to find %s %s in the installed packages, but did not", component, version)
	s.T.Fail()
}
// ValidateKubeletNodeIP reads /etc/default/kubelet on the VM and requires that
// it carries a --node-ip flag whose value is one or two comma-separated,
// syntactically valid IP addresses (two in the dual-stack case).
func ValidateKubeletNodeIP(ctx context.Context, s *Scenario) {
	execResult := execOnVMForScenarioValidateExitCode(ctx, s, "cat /etc/default/kubelet", 0, "could not read kubelet config")

	// Search for "--node-ip" flag and its value. ':' is part of the accepted
	// character class so that IPv6 addresses are captured whole; without it the
	// capture stops at the first colon and the truncated value fails ParseIP.
	matches := regexp.MustCompile(`--node-ip=([a-zA-Z0-9.:,]*)`).FindStringSubmatch(execResult.stdout.String())
	require.NotNil(s.T, matches, "could not find kubelet flag --node-ip")
	require.GreaterOrEqual(s.T, len(matches), 2, "could not find kubelet flag --node-ip")

	ipAddresses := strings.Split(matches[1], ",") // Could be multiple for dual-stack.
	require.GreaterOrEqual(s.T, len(ipAddresses), 1, "expected at least one --node-ip address, but got none")
	require.LessOrEqual(s.T, len(ipAddresses), 2, "expected at most two --node-ip addresses, but got %d", len(ipAddresses))

	// Check that each IP is a valid address.
	for _, ipAddress := range ipAddresses {
		require.NotNil(s.T, net.ParseIP(ipAddress), "--node-ip value %q is not a valid IP address", ipAddress)
	}
}
// ValidateIMDSRestrictionRule dumps the rules of the given iptables table on
// the VM and requires that one of them carries the AgentBaker IMDS restriction
// marker text.
func ValidateIMDSRestrictionRule(ctx context.Context, s *Scenario, table string) {
	findRule := fmt.Sprintf(
		"iptables -t %s -S | grep -q 'AKS managed: added by AgentBaker ensureIMDSRestriction for IMDS restriction feature'",
		table,
	)
	execOnVMForScenarioValidateExitCode(ctx, s, findRule, 0, "expected to find IMDS restriction rule, but did not")
}
// ValidateMultipleKubeProxyVersionsExist lists the kube-proxy images in the
// k8s.io containerd namespace, extracts their x.y.z version strings, and
// reports a test error unless at least two distinct versions are present.
func ValidateMultipleKubeProxyVersionsExist(ctx context.Context, s *Scenario) {
	const listVersions = "ctr --namespace k8s.io images list | grep kube-proxy | awk '{print $1}' | grep -oE '[0-9]+\\.[0-9]+\\.[0-9]+'"

	result := execOnVMForScenario(ctx, s, listVersions)
	if result.exitCode != "0" {
		s.T.Errorf("Failed to list kube-proxy images: %s", result.stderr)
		return
	}

	// One version per output line; the map de-duplicates repeated versions.
	trimmed := bytes.NewBufferString(strings.TrimSpace(result.stdout.String()))
	distinct := make(map[string]struct{})
	for _, line := range strings.Split(trimmed.String(), "\n") {
		if line == "" {
			continue
		}
		distinct[line] = struct{}{}
	}

	switch count := len(distinct); {
	case count == 0:
		s.T.Errorf("No kube-proxy versions found.")
	case count == 1:
		s.T.Errorf("Only one kube-proxy version exists: %v", distinct)
	default:
		s.T.Logf("Multiple kube-proxy versions exist: %v", distinct)
	}
}
// ValidateContainerdWASMShims reads /etc/containerd/config.toml on the VM and
// requires that it contains, for every expected WASM shim, both the runtime
// section header and the associated runtime_type line.
func ValidateContainerdWASMShims(ctx context.Context, s *Scenario) {
	config := execOnVMForScenarioValidateExitCode(ctx, s, "cat /etc/containerd/config.toml", 0, "could not get containerd config content")

	// Each entry pairs a section header with the runtime_type expected for it.
	shims := []struct {
		section     string
		runtimeType string
	}{
		{`[plugins."io.containerd.grpc.v1.cri".containerd.runtimes.spin]`, `runtime_type = "io.containerd.spin.v2"`},
		{`[plugins."io.containerd.grpc.v1.cri".containerd.runtimes.slight]`, `runtime_type = "io.containerd.slight-v0-3-0.v1"`},
		{`[plugins."io.containerd.grpc.v1.cri".containerd.runtimes.spin-v0-3-0]`, `runtime_type = "io.containerd.spin-v0-3-0.v1"`},
		{`[plugins."io.containerd.grpc.v1.cri".containerd.runtimes.slight-v0-3-0]`, `runtime_type = "io.containerd.slight-v0-3-0.v1"`},
		{`[plugins."io.containerd.grpc.v1.cri".containerd.runtimes.spin-v0-5-1]`, `runtime_type = "io.containerd.spin-v0-5-1.v1"`},
		{`[plugins."io.containerd.grpc.v1.cri".containerd.runtimes.slight-v0-5-1]`, `runtime_type = "io.containerd.slight-v0-5-1.v1"`},
		{`[plugins."io.containerd.grpc.v1.cri".containerd.runtimes.spin-v0-8-0]`, `runtime_type = "io.containerd.spin-v0-8-0.v1"`},
		{`[plugins."io.containerd.grpc.v1.cri".containerd.runtimes.slight-v0-8-0]`, `runtime_type = "io.containerd.slight-v0-8-0.v1"`},
		{`[plugins."io.containerd.grpc.v1.cri".containerd.runtimes.wws-v0-8-0]`, `runtime_type = "io.containerd.wws-v0-8-0.v1"`},
		{`[plugins."io.containerd.grpc.v1.cri".containerd.runtimes.spin-v0-15-1]`, `runtime_type = "io.containerd.spin.v2"`},
	}

	for _, shim := range shims {
		require.Contains(s.T, config.stdout.String(), shim.section, "expected to find section in containerd config.toml, but it was not found")
		require.Contains(s.T, config.stdout.String(), shim.runtimeType, "expected to find section in containerd config.toml, but it was not found")
	}
}
// ValidateKubeletHasNotStopped reads the kubelet unit's journal on the VM and
// asserts that it contains "Started Kubelet" and does not contain
// "Stopped Kubelet". Both checks are non-fatal assertions.
func ValidateKubeletHasNotStopped(ctx context.Context, s *Scenario) {
	journal := execOnVMForScenarioValidateExitCode(ctx, s, "journalctl -u kubelet", 0, "could not retrieve kubelet logs")
	logs := journal.stdout.String()
	assert.NotContains(s.T, logs, "Stopped Kubelet")
	assert.Contains(s.T, logs, "Started Kubelet")
}
// ValidateServicesDoNotRestartKubelet recursively greps /etc/systemd/system/
// on the VM for "restart<whitespace>kubelet" and requires exit code 1, which
// is what grep returns when no file matches.
func ValidateServicesDoNotRestartKubelet(ctx context.Context, s *Scenario) {
	const findRestarts = "grep -rl 'restart[[:space:]]\\+kubelet' /etc/systemd/system/"
	execOnVMForScenarioValidateExitCode(
		ctx, s, findRestarts, 1,
		"expected to find no services containing 'restart kubelet' in /etc/systemd/system/",
	)
}
// ValidateKubeletHasFlags requires that the kubelet journal on the VM contains
// a `FLAG: --config="<filePath>"` line, i.e. that kubelet logged being started
// with the given config file.
func ValidateKubeletHasFlags(ctx context.Context, s *Scenario, filePath string) {
	wantFlagLine := fmt.Sprintf("FLAG: --config=\"%s\"", filePath)
	journal := execOnVMForScenarioValidateExitCode(ctx, s, "journalctl -u kubelet", 0, "could not get kubelet logs")
	require.Containsf(s.T, journal.stdout.String(), wantFlagLine, "expected to find flag %s, but not found", "config")
}
// ValidatePodUsingNVidiaGPU waits for the nvidia.com/gpu resource to become
// allocatable on the scenario's node, pauses for a fixed delay, and then
// ensures the NVIDIA workload pod.
func ValidatePodUsingNVidiaGPU(ctx context.Context, s *Scenario) {
	// A device can be allocatable without being healthy yet; this fixed pause
	// is a workaround for the lack of a better readiness signal.
	const settleDelay = 20 * time.Second

	s.T.Logf("validating pod using nvidia GPU")

	// The NVIDIA pod being ready does not guarantee the resource is available,
	// so wait for it explicitly to make the workload pod schedulable.
	waitUntilResourceAvailable(ctx, s, "nvidia.com/gpu")
	time.Sleep(settleDelay)

	workload := podRunNvidiaWorkload(s)
	ensurePod(ctx, s, workload)
}
// waitUntilResourceAvailable polls the scenario's node once per second until
// isResourceAvailable reports true for resourceName.
// Nothing is returned: the test is failed via s.T.Fatalf when ctx is done, and
// via require.NoError when the node cannot be fetched.
func waitUntilResourceAvailable(ctx context.Context, s *Scenario, resourceName string) {
	target := s.Runtime.KubeNodeName

	poll := time.NewTicker(time.Second)
	defer poll.Stop()

	for {
		select {
		case <-poll.C:
			current, getErr := s.Runtime.Cluster.Kube.Typed.CoreV1().Nodes().Get(ctx, target, metav1.GetOptions{})
			require.NoError(s.T, getErr, "failed to get node %q", target)
			if !isResourceAvailable(current, resourceName) {
				continue
			}
			s.T.Logf("resource %q is available", resourceName)
			return
		case <-ctx.Done():
			s.T.Fatalf("context cancelled: %v", ctx.Err())
		}
	}
}
// isResourceAvailable reports whether node lists resourceName among its
// allocatable resources with a quantity of at least 1.
func isResourceAvailable(node *corev1.Node, resourceName string) bool {
	allocatable, listed := node.Status.Allocatable[corev1.ResourceName(resourceName)]
	if !listed {
		return false
	}
	return allocatable.Cmp(resource.MustParse("1")) >= 0
}
// ValidateContainerd2Properties checks the properties expected of a
// containerd 2.x install: exactly one moby-containerd version, starting with
// "2.", is given and installed; config.toml and the containerd unit file are
// free of settings that must be absent; and the NRI socket exists.
func ValidateContainerd2Properties(ctx context.Context, s *Scenario, versions []string) {
	require.Lenf(s.T, versions, 1, "Expected exactly one version for moby-containerd but got %d", len(versions))

	containerdVersion := versions[0]
	require.Truef(s.T, strings.HasPrefix(containerdVersion, "2."), "expected moby-containerd version to start with '2.', got %v", containerdVersion)
	ValidateInstalledPackageVersion(ctx, s, "moby-containerd", containerdVersion)

	// CriuPath is a deprecated 1.7 property and must not be in config.toml.
	ValidateFileExcludesContent(ctx, s, "/etc/containerd/config.toml", "CriuPath")

	// The containerd service file must not set LimitNOFILE, see
	// https://github.com/containerd/containerd/blob/main/docs/containerd-2.0.md#limitnofile-configuration-has-been-removed
	ValidateFileExcludesContent(ctx, s, "/etc/systemd/system/containerd.service", "LimitNOFILE")

	// The NRI plugin is enabled by default, so its socket should be present.
	nriFiles := []string{"nri.sock"}
	ValidateDirectoryContent(ctx, s, "/var/run/nri", nriFiles)
}
// ValidateRunc12Properties requires that exactly one moby-runc version is
// given, that it starts with "1.2.", and that this version is installed on
// the VM.
func ValidateRunc12Properties(ctx context.Context, s *Scenario, versions []string) {
	require.Lenf(s.T, versions, 1, "Expected exactly one version for moby-runc but got %d", len(versions))

	runcVersion := versions[0]
	isRunc12 := strings.HasPrefix(runcVersion, "1.2.")
	require.Truef(s.T, isRunc12, "expected moby-runc version to start with '1.2.', got %v", runcVersion)

	ValidateInstalledPackageVersion(ctx, s, "moby-runc", runcVersion)
}