From 96c150ab375fb69c2a465c7291d60bc290f55fea Mon Sep 17 00:00:00 2001 From: Tom Wieczorek Date: Thu, 13 Jul 2023 09:52:04 +0200 Subject: [PATCH] Revisit device controller detection for cgroup v2 The cgroup v2 device controller is not listed in the cgroup.controllers file and is solely available via the kernel's BPF interface. Therefore, k0s sysinfo currently determines its presence based on the Linux kernel version. This is problematic for old kernels that have many backported features, such as those shipped with RHEL and its derivatives. However, it is still possible to detect the device controller by trying to attach a dummy device filter to an empty, temporary cgroup. In case k0s is unable to create the cgroup or attach the device filter because of missing permissions, the presence of the device controller is simply assumed. See: 0655941 ("Add pre-flight checks and probes module") Signed-off-by: Tom Wieczorek --- go.mod | 6 +- go.sum | 10 +- .../pkg/sysinfo/probes/linux/cgroup_v2.go | 140 +++++++++++++----- 3 files changed, 113 insertions(+), 43 deletions(-) diff --git a/go.mod b/go.mod index 721fc3ff5bd5..8882103d8662 100644 --- a/go.mod +++ b/go.mod @@ -13,6 +13,7 @@ require ( github.com/avast/retry-go v3.0.0+incompatible github.com/bombsimon/logrusr/v2 v2.0.1 github.com/cavaliergopher/grab/v3 v3.0.1 + github.com/cilium/ebpf v0.11.0 github.com/cloudflare/cfssl v1.6.4 github.com/containerd/cgroups/v3 v3.0.1 github.com/containerd/containerd v1.7.2 @@ -31,6 +32,7 @@ require ( github.com/mitchellh/go-homedir v1.1.0 github.com/olekukonko/tablewriter v0.0.5 github.com/opencontainers/image-spec v1.1.0-rc4 + github.com/opencontainers/runtime-spec v1.1.0-rc.2 github.com/otiai10/copy v1.12.0 github.com/pelletier/go-toml v1.9.5 github.com/robfig/cron v1.2.0 @@ -52,7 +54,7 @@ require ( go.uber.org/multierr v1.11.0 go.uber.org/zap v1.24.0 golang.org/x/crypto v0.11.0 - golang.org/x/exp v0.0.0-20220827204233-334a2380cb91 + golang.org/x/exp v0.0.0-20230711153332-06a737ee72cb golang.org/x/mod v0.12.0 
golang.org/x/sync v0.3.0 golang.org/x/sys v0.10.0 @@ -102,7 +104,6 @@ require ( github.com/cenkalti/backoff/v4 v4.2.1 // indirect github.com/cespare/xxhash/v2 v2.2.0 // indirect github.com/chai2010/gettext-go v1.0.2 // indirect - github.com/cilium/ebpf v0.9.1 // indirect github.com/containerd/cgroups v1.1.0 // indirect github.com/containerd/console v1.0.3 // indirect github.com/containerd/continuity v0.4.1 // indirect @@ -208,7 +209,6 @@ require ( github.com/mxk/go-flowrate v0.0.0-20140419014527-cca7078d478f // indirect github.com/opencontainers/go-digest v1.0.0 // indirect github.com/opencontainers/runc v1.1.7 // indirect - github.com/opencontainers/runtime-spec v1.1.0-rc.2 // indirect github.com/opencontainers/selinux v1.11.0 // indirect github.com/peterbourgon/diskv v2.0.1+incompatible // indirect github.com/pkg/errors v0.9.1 // indirect diff --git a/go.sum b/go.sum index 89172ea72d9b..f4d700768e90 100644 --- a/go.sum +++ b/go.sum @@ -137,8 +137,8 @@ github.com/chai2010/gettext-go v1.0.2/go.mod h1:y+wnP2cHYaVj19NZhYKAwEMH2CI1gNHe github.com/chzyer/logex v1.1.10/go.mod h1:+Ywpsq7O8HXn0nuIou7OrIPyXbp3wmkHB+jjWRnGsAI= github.com/chzyer/readline v0.0.0-20180603132655-2972be24d48e/go.mod h1:nSuG5e5PlCu98SY8svDHJxuZscDgtXS6KTTbou5AhLI= github.com/chzyer/test v0.0.0-20180213035817-a1ea475d72b1/go.mod h1:Q3SI9o4m/ZMnBNeIyt5eFwwo7qiLfzFZmjNmxjkiQlU= -github.com/cilium/ebpf v0.9.1 h1:64sn2K3UKw8NbP/blsixRpF3nXuyhz/VjRlRzvlBRu4= -github.com/cilium/ebpf v0.9.1/go.mod h1:+OhNOIXx/Fnu1IE8bJz2dzOA+VSfyTfdNUVdlQnxUFY= +github.com/cilium/ebpf v0.11.0 h1:V8gS/bTCCjX9uUnkUFUpPsksM8n1lXBAvHcpiFk1X2Y= +github.com/cilium/ebpf v0.11.0/go.mod h1:WE7CZAnqOL2RouJ4f1uyNhqr2P4CCvXFIqdRDUgWsVs= github.com/client9/misspell v0.3.4/go.mod h1:qj6jICC3Q7zFZvVWo7KLAzC3yx5G7kyvSDkc90ppPyw= github.com/cloudflare/cfssl v1.6.4 h1:NMOvfrEjFfC63K3SGXgAnFdsgkmiq4kATme5BfcqrO8= github.com/cloudflare/cfssl v1.6.4/go.mod h1:8b3CQMxfWPAeom3zBnGJ6sd+G1NkL5TXqmDXacb+1J0= @@ -260,8 +260,8 @@ 
github.com/felixge/httpsnoop v1.0.3 h1:s/nj+GCswXYzN5v2DpNMuMQYe+0DDwt5WVCU6CWBd github.com/felixge/httpsnoop v1.0.3/go.mod h1:m8KPJKqk1gH5J9DgRY2ASl2lWCfGKXixSwevea8zH2U= github.com/flowstack/go-jsonschema v0.1.1/go.mod h1:yL7fNggx1o8rm9RlgXv7hTBWxdBM0rVwpMwimd3F3N0= github.com/foxcpp/go-mockdns v1.0.0 h1:7jBqxd3WDWwi/6WhDvacvH1XsN3rOLXyHM1uhvIx6FI= -github.com/frankban/quicktest v1.14.3 h1:FJKSZTDHjyhriyC81FLQ0LY93eSai0ZyR/ZIkd3ZUKE= github.com/frankban/quicktest v1.14.3/go.mod h1:mgiwOwqx65TmIk1wJ6Q7wvnVMocbUorkibMOrVTHZps= +github.com/frankban/quicktest v1.14.5 h1:dfYrrRyLtiqT9GyKXgdh+k4inNeTvmGbuSgZ3lx3GhA= github.com/fsnotify/fsnotify v1.4.7/go.mod h1:jwhsz4b93w/PPRr/qN1Yymfu8t87LnFCMoQvtojpjFo= github.com/fsnotify/fsnotify v1.4.9/go.mod h1:znqG4EE+3YCdAaPaxE2ZRY/06pZUdp0tY4IgpuI1SZQ= github.com/fsnotify/fsnotify v1.6.0 h1:n+5WquG0fcWoWp6xPWfHdbskMCQaFnG6PfBrh1Ky4HY= @@ -993,8 +993,8 @@ golang.org/x/exp v0.0.0-20191227195350-da58074b4299/go.mod h1:2RIsYlXP63K8oxa1u0 golang.org/x/exp v0.0.0-20200119233911-0405dc783f0a/go.mod h1:2RIsYlXP63K8oxa1u096TMicItID8zy7Y6sNkU49FU4= golang.org/x/exp v0.0.0-20200207192155-f17229e696bd/go.mod h1:J/WKrq2StrnmMY6+EHIKF9dgMWnmCNThgcyBT1FY9mM= golang.org/x/exp v0.0.0-20200224162631-6cc2880d07d6/go.mod h1:3jZMyOhIsHpP37uCMkUooju7aAi5cS1Q23tOzKc+0MU= -golang.org/x/exp v0.0.0-20220827204233-334a2380cb91 h1:tnebWN09GYg9OLPss1KXj8txwZc6X6uMr6VFdcGNbHw= -golang.org/x/exp v0.0.0-20220827204233-334a2380cb91/go.mod h1:cyybsKvd6eL0RnXn6p/Grxp8F5bW7iYuBgsNCOHpMYE= +golang.org/x/exp v0.0.0-20230711153332-06a737ee72cb h1:xIApU0ow1zwMa2uL1VDNeQlNVFTWMQxZUZCMDy0Q4Us= +golang.org/x/exp v0.0.0-20230711153332-06a737ee72cb/go.mod h1:FXUEEKJgO7OQYeo8N01OfiKP8RXMtf6e8aTskBGqWdc= golang.org/x/image v0.0.0-20190227222117-0694c2d4d067/go.mod h1:kZ7UVZpmo3dzQBMxlp+ypCbDeSB+sBbTgSJuh5dn5js= golang.org/x/image v0.0.0-20190802002840-cff245a6509b/go.mod h1:FeLwcggjj3mMvU+oOTbSwawSJRM1uh48EjtB4UJZlP0= golang.org/x/lint 
v0.0.0-20181026193005-c67002cb31c3/go.mod h1:UVdnD1Gm6xHRNCYTkRU2/jEulfH38KcIWyp/GAMgvoE= diff --git a/internal/pkg/sysinfo/probes/linux/cgroup_v2.go b/internal/pkg/sysinfo/probes/linux/cgroup_v2.go index a0d6b5eb543e..947082ccea58 100644 --- a/internal/pkg/sysinfo/probes/linux/cgroup_v2.go +++ b/internal/pkg/sysinfo/probes/linux/cgroup_v2.go @@ -22,12 +22,17 @@ package linux import ( "errors" "fmt" + "io/fs" "os" "path/filepath" - "regexp" - "strconv" + "strings" + "k8s.io/utils/pointer" + + "github.com/cilium/ebpf/rlimit" "github.com/containerd/cgroups/v3/cgroup2" + "github.com/opencontainers/runtime-spec/specs-go" + "golang.org/x/sys/unix" ) type cgroupV2 struct { @@ -54,22 +59,110 @@ func (g *cgroupV2) loadControllers(seen func(string, string)) error { return g.detectListedRootControllers(seen) } -// The device controller has no interface files. Its availability is assumed -// based on the kernel version, as it is hard to detect it directly. +// Detects the device controller by trying to attach a dummy program of type +// BPF_CGROUP_DEVICE to a cgroup. Since the controller has no interface files +// and is implemented purely on top of BPF, this is the only reliable way to +// detect it. A best-guess detection via the kernel version has the major +// drawback of not working with kernels that have a lot of backported features, +// such as RHEL and friends. +// // https://github.com/torvalds/linux/blob/v5.3/Documentation/admin-guide/cgroup-v2.rst#device-controller func (g *cgroupV2) detectDevicesController() (cgroupControllerAvailable, error) { - major, minor, err := parseKernelRelease(g.probeUname) + err := attachDummyDeviceFilter(g.mountPoint) + switch { + case err == nil: + return cgroupControllerAvailable{true, "device filters attachable", ""}, nil + + // EACCES occurs when not allowed to create cgroups. + // EPERM occurs when not allowed to load eBPF programs. 
+ case errors.Is(err, os.ErrPermission) && os.Geteuid() != 0: + return cgroupControllerAvailable{true, "assumed", "insufficient permissions, try with elevated permissions"}, nil + + case eBPFProgramUnsupported(err): + return cgroupControllerAvailable{false, err.Error(), ""}, nil + } + + return cgroupControllerAvailable{}, err +} + +// Attaches a dummy program of type BPF_CGROUP_DEVICE to a randomly created +// cgroup and removes the program and cgroup again. +func attachDummyDeviceFilter(mountPoint string) (err error) { + insts, license, err := cgroup2.DeviceFilter([]specs.LinuxDeviceCgroup{{ + Allow: true, + Type: "a", + Major: pointer.Int64(-1), + Minor: pointer.Int64(-1), + Access: "rwm", + }}) if err != nil { - return cgroupControllerAvailable{}, err + return fmt.Errorf("failed to create eBPF device filter program: %w", err) + } + + tmpCgroupPath, err := os.MkdirTemp(mountPoint, "k0s-devices-detection-*") + if err != nil { + return fmt.Errorf("failed to create temporary cgroup: %w", err) + } + defer func() { err = errors.Join(err, os.Remove(tmpCgroupPath)) }() + + dirFD, err := unix.Open(tmpCgroupPath, unix.O_DIRECTORY|unix.O_RDONLY|unix.O_CLOEXEC, 0) + if err != nil { + return fmt.Errorf("failed to open temporary cgroup: %w", &fs.PathError{Op: "open", Path: tmpCgroupPath, Err: err}) + } + defer func() { + if closeErr := unix.Close(dirFD); closeErr != nil { + err = errors.Join(err, &fs.PathError{Op: "close", Path: tmpCgroupPath, Err: closeErr}) + } + }() + + close, err := cgroup2.LoadAttachCgroupDeviceFilter(insts, license, dirFD) + if err != nil { + // RemoveMemlock may be required on kernels < 5.11 + // observed on debian 11: 5.10.0-21-armmp-lpae #1 SMP Debian 5.10.162-1 (2023-01-21) armv7l + // https://github.com/cilium/ebpf/blob/v0.11.0/prog.go#L356-L360 + if errors.Is(err, unix.EPERM) && strings.Contains(err.Error(), "RemoveMemlock") { + if err2 := rlimit.RemoveMemlock(); err2 != nil { + err = errors.Join(err, err2) + } else { + // Try again, MEMLOCK 
should be removed by now. + close, err2 = cgroup2.LoadAttachCgroupDeviceFilter(insts, license, dirFD) + if err2 != nil { + err = errors.Join(err, err2) + } else { + err = nil + } + } + } } + if err != nil { + if eBPFProgramUnsupported(err) { + return err + } + return fmt.Errorf("failed to load/attach eBPF device filter program: %w", err) + } + + return close() +} - // since 4.15 - available, op := false, "<" - if major > 4 || (major == 4 && minor >= 15) { - available, op = true, ">=" +// Returns true if the given error indicates that an eBPF program is unsupported +// by the kernel. +func eBPFProgramUnsupported(err error) bool { + // https://github.com/cilium/ebpf/blob/v0.11.0/features/prog.go#L43-L49 + + switch { + // EINVAL occurs when attempting to create a program with an unknown type. + case errors.Is(err, unix.EINVAL): + return true + + // E2BIG occurs when ProgLoadAttr contains non-zero bytes past the end of + // the struct known by the running kernel, meaning the kernel is too old to + // support the given prog type. + case errors.Is(err, unix.E2BIG): + return true + + default: + return false } - msg := fmt.Sprintf("kernel %d.%d %s 4.15", major, minor, op) - return cgroupControllerAvailable{available, msg, ""}, nil } // Detect the freezer controller. 
It doesn't appear in the cgroup.controllers @@ -137,26 +230,3 @@ func (g *cgroupV2) detectListedRootControllers(seen func(string, string)) (err e return nil } - -func parseKernelRelease(probeUname unameProber) (int64, int64, error) { - uname, err := probeUname() - if err != nil { - return 0, 0, err - } - - var major, minor int64 - r := regexp.MustCompile(`^(\d+)\.(\d+)(\.|$)`) - if matches := r.FindStringSubmatch(uname.osRelease.value); matches == nil { - err = errors.New("unsupported format") - } else { - if major, err = strconv.ParseInt(matches[1], 10, 16); err == nil { - minor, err = strconv.ParseInt(matches[2], 10, 16) - } - } - - if err != nil { - err = fmt.Errorf("failed to parse kernel release %q: %w", uname.osRelease, err) - } - - return major, minor, err -}