From b1449fd5102dfd58859c18ce5aae7bbcc980e61d Mon Sep 17 00:00:00 2001 From: Kir Kolyshkin Date: Mon, 9 Sep 2024 18:25:28 -0700 Subject: [PATCH 1/3] libct: use Namespaces.IsPrivate more In these cases, this is exactly what we want to find out. Slightly improves performance and readability. Signed-off-by: Kir Kolyshkin --- libcontainer/configs/validate/rootless.go | 2 +- libcontainer/specconv/spec_linux.go | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/libcontainer/configs/validate/rootless.go b/libcontainer/configs/validate/rootless.go index faceea04c09..4507d4495e6 100644 --- a/libcontainer/configs/validate/rootless.go +++ b/libcontainer/configs/validate/rootless.go @@ -34,7 +34,7 @@ func rootlessEUIDMappings(config *configs.Config) error { return errors.New("rootless container requires user namespaces") } // We only require mappings if we are not joining another userns. - if path := config.Namespaces.PathOf(configs.NEWUSER); path == "" { + if config.Namespaces.IsPrivate(configs.NEWUSER) { if len(config.UIDMappings) == 0 { return errors.New("rootless containers requires at least one UID mapping") } diff --git a/libcontainer/specconv/spec_linux.go b/libcontainer/specconv/spec_linux.go index ae08a827c1f..e7c6faae347 100644 --- a/libcontainer/specconv/spec_linux.go +++ b/libcontainer/specconv/spec_linux.go @@ -415,7 +415,7 @@ func CreateLibcontainerConfig(opts *CreateOpts) (*configs.Config, error) { } config.Namespaces.Add(t, ns.Path) } - if config.Namespaces.Contains(configs.NEWNET) && config.Namespaces.PathOf(configs.NEWNET) == "" { + if config.Namespaces.IsPrivate(configs.NEWNET) { config.Networks = []*configs.Network{ { Type: "loopback", @@ -481,7 +481,7 @@ func CreateLibcontainerConfig(opts *CreateOpts) (*configs.Config, error) { // Only set it if the container will have its own cgroup // namespace and the cgroupfs will be mounted read/write. // - hasCgroupNS := config.Namespaces.Contains(configs.NEWCGROUP) && config.Namespaces.PathOf(configs.NEWCGROUP) == "" + hasCgroupNS := config.Namespaces.IsPrivate(configs.NEWCGROUP) hasRwCgroupfs := false if hasCgroupNS { for _, m := range config.Mounts { From 21c61165572353bbbde13f0208d59d7bbdd36864 Mon Sep 17 00:00:00 2001 From: Kir Kolyshkin Date: Thu, 12 Sep 2024 16:44:40 -0700 Subject: [PATCH 2/3] tests/int: log when teardown starts This aids in failed test analysis by allowing to distinguish the output of various commands being run as part of the test case from the output of teardown command like runc delete. Signed-off-by: Kir Kolyshkin --- tests/integration/helpers.bash | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/integration/helpers.bash b/tests/integration/helpers.bash index 76fbab3f5d1..fb8abb9052e 100755 --- a/tests/integration/helpers.bash +++ b/tests/integration/helpers.bash @@ -727,6 +727,8 @@ function teardown_bundle() { [ ! -v ROOT ] && return 0 # nothing to teardown cd "$INTEGRATION_ROOT" || return + echo "--- teardown ---" >&2 + teardown_recvtty local ct for ct in $(__runc list -q); do From 30f8f51eab4c01ed083a590214da9d17394a9205 Mon Sep 17 00:00:00 2001 From: Kir Kolyshkin Date: Wed, 11 Sep 2024 17:57:01 -0700 Subject: [PATCH 3/3] runc create/run: warn on rootless + shared pidns + no cgroup Shared pid namespace means `runc kill` (or `runc delete -f`) have to kill all container processes, not just init. To do so, it needs a cgroup to read the PIDs from. If there is no cgroup, processes will be leaked, and so such configuration is bad and should not be allowed. To keep backward compatibility, though, let's merely warn about this for now. Alas, the only way to know if cgroup access is available is by returning an error from Manager.Apply. Amend fs cgroup managers to do so (systemd doesn't need it, since v1 can't work with rootless, and cgroup v2 does not have a special rootless case). Signed-off-by: Kir Kolyshkin --- libcontainer/cgroups/cgroups.go | 5 +++++ libcontainer/cgroups/fs/fs.go | 5 +++-- libcontainer/cgroups/fs2/fs2.go | 2 +- libcontainer/process_linux.go | 13 ++++++++++++- tests/integration/create.bats | 28 ++++++++++++++++++++++++++++ 5 files changed, 49 insertions(+), 4 deletions(-) diff --git a/libcontainer/cgroups/cgroups.go b/libcontainer/cgroups/cgroups.go index 811f2d26e00..53e194c74dc 100644 --- a/libcontainer/cgroups/cgroups.go +++ b/libcontainer/cgroups/cgroups.go @@ -11,6 +11,11 @@ var ( // is not configured to set device rules. ErrDevicesUnsupported = errors.New("cgroup manager is not configured to set device rules") + // ErrRootless is returned by [Manager.Apply] when there is an error + // creating cgroup directory, and cgroup.Rootless is set. In general, + // this error is to be ignored. + ErrRootless = errors.New("cgroup manager can not access cgroup (rootless container)") + // DevicesSetV1 and DevicesSetV2 are functions to set devices for // cgroup v1 and v2, respectively. Unless // [github.com/opencontainers/runc/libcontainer/cgroups/devices] diff --git a/libcontainer/cgroups/fs/fs.go b/libcontainer/cgroups/fs/fs.go index d2decb127ca..ba15bfc4077 100644 --- a/libcontainer/cgroups/fs/fs.go +++ b/libcontainer/cgroups/fs/fs.go @@ -105,7 +105,7 @@ func isIgnorableError(rootless bool, err error) bool { return false } -func (m *Manager) Apply(pid int) (err error) { +func (m *Manager) Apply(pid int) (retErr error) { m.mu.Lock() defer m.mu.Unlock() @@ -129,6 +129,7 @@ func (m *Manager) Apply(pid int) (err error) { // later by Set, which fails with a friendly error (see // if path == "" in Set). if isIgnorableError(c.Rootless, err) && c.Path == "" { + retErr = cgroups.ErrRootless delete(m.paths, name) continue } @@ -136,7 +137,7 @@ func (m *Manager) Apply(pid int) (err error) { } } - return nil + return retErr } func (m *Manager) Destroy() error { diff --git a/libcontainer/cgroups/fs2/fs2.go b/libcontainer/cgroups/fs2/fs2.go index b1be7df5ccc..93f81bf8dae 100644 --- a/libcontainer/cgroups/fs2/fs2.go +++ b/libcontainer/cgroups/fs2/fs2.go @@ -71,7 +71,7 @@ func (m *Manager) Apply(pid int) error { if m.config.Rootless { if m.config.Path == "" { if blNeed, nErr := needAnyControllers(m.config.Resources); nErr == nil && !blNeed { - return nil + return cgroups.ErrRootless } return fmt.Errorf("rootless needs no limits + no cgrouppath when no permission is granted for cgroups: %w", err) } diff --git a/libcontainer/process_linux.go b/libcontainer/process_linux.go index 9a2473f716e..493dbeaac5a 100644 --- a/libcontainer/process_linux.go +++ b/libcontainer/process_linux.go @@ -580,7 +580,18 @@ func (p *initProcess) start() (retErr error) { // cgroup. We don't need to worry about not doing this and not being root // because we'd be using the rootless cgroup manager in that case. if err := p.manager.Apply(p.pid()); err != nil { - return fmt.Errorf("unable to apply cgroup configuration: %w", err) + if errors.Is(err, cgroups.ErrRootless) { + // ErrRootless is to be ignored except when + // the container doesn't have private pidns. + if !p.config.Config.Namespaces.IsPrivate(configs.NEWPID) { + // TODO: make this an error in runc 1.3. + logrus.Warn("Creating a rootless container with no cgroup and no private pid namespace. " + + "Such configuration is strongly discouraged (as it is impossible to properly kill all container's processes) " + + "and will result in an error in a future runc version.") + } + } else { + return fmt.Errorf("unable to apply cgroup configuration: %w", err) + } } if p.intelRdtManager != nil { if err := p.intelRdtManager.Apply(p.pid()); err != nil { diff --git a/tests/integration/create.bats b/tests/integration/create.bats index 938ac8b63dc..f12291ca9d9 100644 --- a/tests/integration/create.bats +++ b/tests/integration/create.bats @@ -81,3 +81,31 @@ function teardown() { testcontainer test_busybox running } + +# https://github.com/opencontainers/runc/issues/4394#issuecomment-2334926257 +@test "runc create [shared pidns + rootless]" { + # Remove pidns so it's shared with the host. + update_config ' .linux.namespaces -= [{"type": "pid"}]' + if [ $EUID -ne 0 ]; then + if rootless_cgroup; then + # Rootless containers have empty cgroup path by default. + set_cgroups_path + fi + # Can't mount real /proc when rootless + no pidns, + # so change it to a bind-mounted one from the host. + update_config ' .mounts |= map((select(.type == "proc") + | .type = "none" + | .source = "/proc" + | .options = ["rbind", "nosuid", "nodev", "noexec"] + ) // .)' + fi + + exp="Such configuration is strongly discouraged" + runc create --console-socket "$CONSOLE_SOCKET" test + [ "$status" -eq 0 ] + if [ $EUID -ne 0 ] && ! rootless_cgroup; then + [[ "$output" = *"$exp"* ]] + else + [[ "$output" != *"$exp"* ]] + fi +}