From 67bc4bc2409faa0137dad48b822e8ce572302991 Mon Sep 17 00:00:00 2001 From: Kir Kolyshkin Date: Fri, 12 May 2023 10:40:14 -0700 Subject: [PATCH 1/7] tests/rootless.sh: drop set -x It seems that set -x was temporarily added as a debug measure, but slipped into the final commit. Remove it, for the sake of test logs brevity. Fixes: 9f656dbb11c42fbe3 Signed-off-by: Kir Kolyshkin --- tests/rootless.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/rootless.sh b/tests/rootless.sh index 83be248a108..7f0408508ce 100755 --- a/tests/rootless.sh +++ b/tests/rootless.sh @@ -1,4 +1,4 @@ -#!/bin/bash -x +#!/bin/bash # Copyright (C) 2017 SUSE LLC # # Licensed under the Apache License, Version 2.0 (the "License"); From e0e8d9c8860c2c4b7c844daff3a25884ce2c8ecf Mon Sep 17 00:00:00 2001 From: Kir Kolyshkin Date: Thu, 11 May 2023 19:47:26 -0700 Subject: [PATCH 2/7] tests/int/kill: add kill -a with host pidns test This is roughly the same as TestPIDHostInitProcessWait in libct/int, except that here we use separate processes to create and to kill a container, so the processes inside a container are not children of "runc kill", and also we hit different codepaths (nonChildProcess.signal rather than initProcess.signal). One other thing is, rootless is also tested. Signed-off-by: Kir Kolyshkin --- tests/integration/kill.bats | 48 +++++++++++++++++++++++++++++++++++++ 1 file changed, 48 insertions(+) diff --git a/tests/integration/kill.bats b/tests/integration/kill.bats index 590ddd5ef07..2a78778bdee 100644 --- a/tests/integration/kill.bats +++ b/tests/integration/kill.bats @@ -29,3 +29,51 @@ function teardown() { runc delete test_busybox [ "$status" -eq 0 ] } + +# This is roughly the same as TestPIDHostInitProcessWait in libcontainer/integration. +@test "kill --all KILL [host pidns]" { + # kill -a currently requires cgroup freezer. + requires cgroups_freezer + + update_config ' .linux.namespaces -= [{"type": "pid"}]' + set_cgroups_path + if [ $EUID -ne 0 ]; then + # kill --all requires working cgroups. + requires rootless_cgroup + update_config ' .mounts |= map((select(.type == "proc") + | .type = "none" + | .source = "/proc" + | .options = ["rbind", "nosuid", "nodev", "noexec"] + ) // .)' + fi + + runc run -d --console-socket "$CONSOLE_SOCKET" test_busybox + [ "$status" -eq 0 ] + + # Start a few more processes. + for _ in 1 2 3 4 5; do + __runc exec -d test_busybox sleep 1h + [ "$status" -eq 0 ] + done + + # Get the list of all container processes. + path=$(get_cgroup_path "pids") + pids=$(cat "$path"/cgroup.procs) + echo "pids: $pids" + # Sanity check -- make sure all processes exist. + for p in $pids; do + kill -0 "$p" + done + + runc kill -a test_busybox KILL + [ "$status" -eq 0 ] + wait_for_container 10 1 test_busybox stopped + + # Make sure all processes are gone. + pids=$(cat "$path"/cgroup.procs) || true # OK if cgroup is gone + echo "pids: $pids" + for p in $pids; do + run ! kill -0 "$p" + [[ "$output" = *"No such process" ]] + done +} From 5b8f8712a41514d8777ee2aeb28ea4c2d9bb0d8a Mon Sep 17 00:00:00 2001 From: Kir Kolyshkin Date: Fri, 12 May 2023 11:04:21 -0700 Subject: [PATCH 3/7] libct: signalAllProcesses: remove child reaping There are two very distinct usage scenarios for signalAllProcesses: * when used from the runc binary ("runc kill" command), the processes that it kills are not the children of "runc kill", and so calling wait(2) on each process is totally useless, as it will return ECHLD; * when used from a program that have created the container (such as libcontainer/integration test suite), that program can and should call wait(2), not the signalling code. So, the child reaping code is totally useless in the first case, and should be implemented by the program using libcontainer in the second case. I was not able to track down how this code was added, my best guess is it happened when this code was part of dockerd, which did not have a proper child reaper implemented at that time. Remove it, and add a proper documentation piece. Change the integration test accordingly. PS the first attempt to disable the child reaping code in signalAllProcesses was made in commit bb912eb00c51f, which used a questionable heuristic to figure out whether wait(2) should be called. This heuristic worked for a particular use case, but is not correct in general. While at it: - simplify signalAllProcesses to use unix.Kill; - document (container).Signal. Signed-off-by: Kir Kolyshkin --- libcontainer/container_linux.go | 12 +++- libcontainer/init_linux.go | 80 ++------------------------- libcontainer/integration/exec_test.go | 13 +++-- 3 files changed, 22 insertions(+), 83 deletions(-) diff --git a/libcontainer/container_linux.go b/libcontainer/container_linux.go index 3c0857d0604..73264bc90e1 100644 --- a/libcontainer/container_linux.go +++ b/libcontainer/container_linux.go @@ -357,6 +357,12 @@ func (c *Container) start(process *Process) (retErr error) { return nil } +// Signal sends a specified signal to container's init, or, if all is true, +// true, to all container's processes (as determined by container's cgroup). +// +// Setting all=true is useful when s is SIGKILL and the container does not have +// its own PID namespace. In this scenario, the libcontainer user may be required +// to implement a proper child reaper. func (c *Container) Signal(s os.Signal, all bool) error { c.m.Lock() defer c.m.Unlock() @@ -365,12 +371,16 @@ func (c *Container) Signal(s os.Signal, all bool) error { return err } if all { + sig, ok := s.(unix.Signal) + if !ok { + return errors.New("unsupported signal type") + } if status == Stopped && !c.cgroupManager.Exists() { // Avoid calling signalAllProcesses which may print // a warning trying to freeze a non-existing cgroup. return nil } - return c.ignoreCgroupError(signalAllProcesses(c.cgroupManager, s)) + return c.ignoreCgroupError(signalAllProcesses(c.cgroupManager, sig)) } // to avoid a PID reuse attack if status == Running || status == Created || status == Paused { diff --git a/libcontainer/init_linux.go b/libcontainer/init_linux.go index 52cb7d57374..9bf13d9d5d4 100644 --- a/libcontainer/init_linux.go +++ b/libcontainer/init_linux.go @@ -11,7 +11,6 @@ import ( "runtime/debug" "strconv" "strings" - "unsafe" "github.com/containerd/console" "github.com/opencontainers/runtime-spec/specs-go" @@ -575,39 +574,9 @@ func setupRlimits(limits []configs.Rlimit, pid int) error { return nil } -const _P_PID = 1 - -//nolint:structcheck,unused -type siginfo struct { - si_signo int32 - si_errno int32 - si_code int32 - // below here is a union; si_pid is the only field we use - si_pid int32 - // Pad to 128 bytes as detailed in blockUntilWaitable - pad [96]byte -} - -// isWaitable returns true if the process has exited false otherwise. -// Its based off blockUntilWaitable in src/os/wait_waitid.go -func isWaitable(pid int) (bool, error) { - si := &siginfo{} - _, _, e := unix.Syscall6(unix.SYS_WAITID, _P_PID, uintptr(pid), uintptr(unsafe.Pointer(si)), unix.WEXITED|unix.WNOWAIT|unix.WNOHANG, 0, 0) - if e != 0 { - return false, &os.SyscallError{Syscall: "waitid", Err: e} - } - - return si.si_pid != 0, nil -} - // signalAllProcesses freezes then iterates over all the processes inside the // manager's cgroups sending the signal s to them. -// If s is SIGKILL and subreaper is not enabled then it will wait for each -// process to exit. -// For all other signals it will check if the process is ready to report its -// exit status and only if it is will a wait be performed. -func signalAllProcesses(m cgroups.Manager, s os.Signal) error { - var procs []*os.Process +func signalAllProcesses(m cgroups.Manager, s unix.Signal) error { if err := m.Freeze(configs.Frozen); err != nil { logrus.Warn(err) } @@ -619,55 +588,14 @@ func signalAllProcesses(m cgroups.Manager, s os.Signal) error { return err } for _, pid := range pids { - p, err := os.FindProcess(pid) - if err != nil { - logrus.Warn(err) - continue - } - procs = append(procs, p) - if err := p.Signal(s); err != nil { - logrus.Warn(err) + err := unix.Kill(pid, s) + if err != nil && err != unix.ESRCH { //nolint:errorlint // unix errors are bare + logrus.Warnf("kill %d: %v", pid, err) } } if err := m.Freeze(configs.Thawed); err != nil { logrus.Warn(err) } - subreaper, err := system.GetSubreaper() - if err != nil { - // The error here means that PR_GET_CHILD_SUBREAPER is not - // supported because this code might run on a kernel older - // than 3.4. We don't want to throw an error in that case, - // and we simplify things, considering there is no subreaper - // set. - subreaper = 0 - } - - for _, p := range procs { - if s != unix.SIGKILL { - if ok, err := isWaitable(p.Pid); err != nil { - if !errors.Is(err, unix.ECHILD) { - logrus.Warn("signalAllProcesses: ", p.Pid, err) - } - continue - } else if !ok { - // Not ready to report so don't wait - continue - } - } - - // In case a subreaper has been setup, this code must not - // wait for the process. Otherwise, we cannot be sure the - // current process will be reaped by the subreaper, while - // the subreaper might be waiting for this process in order - // to retrieve its exit code. - if subreaper == 0 { - if _, err := p.Wait(); err != nil { - if !errors.Is(err, unix.ECHILD) { - logrus.Warn("wait: ", err) - } - } - } - } return nil } diff --git a/libcontainer/integration/exec_test.go b/libcontainer/integration/exec_test.go index 530faf1aeaa..376dd0f1f3b 100644 --- a/libcontainer/integration/exec_test.go +++ b/libcontainer/integration/exec_test.go @@ -1373,7 +1373,7 @@ func TestPIDHostInitProcessWait(t *testing.T) { process1 := &libcontainer.Process{ Cwd: "/", - Args: []string{"sleep", "100"}, + Args: []string{"sleep", "1h"}, Env: standardEnvironment, Init: true, } @@ -1382,7 +1382,7 @@ func TestPIDHostInitProcessWait(t *testing.T) { process2 := &libcontainer.Process{ Cwd: "/", - Args: []string{"sleep", "100"}, + Args: []string{"sleep", "1h"}, Env: standardEnvironment, Init: false, } @@ -1397,10 +1397,11 @@ func TestPIDHostInitProcessWait(t *testing.T) { t.Fatal("expected Wait to indicate failure") } - // The non-init process must've been killed. - err = process2.Signal(syscall.Signal(0)) - if err == nil || err.Error() != "no such process" { - t.Fatalf("expected process to have been killed: %v", err) + // The non-init process must've also been killed. If not, + // the test will time out. + _, err = process2.Wait() + if err == nil { + t.Fatal("expected Wait to indicate failure") } } From 2a7dcbbb4039f51c21b2051124ff2ac40a785b46 Mon Sep 17 00:00:00 2001 From: Kir Kolyshkin Date: Fri, 12 May 2023 16:04:11 -0700 Subject: [PATCH 4/7] libct: fix shared pidns detection When someone is using libcontainer to start and kill containers from a long lived process (i.e. the same process creates and removes the container), initProcess.wait method is used, which has a kludge to work around killing containers that do not have their own PID namespace. The code that checks for own PID namespace is not entirely correct. To be exact, it does not set sharePidns flag when the host/caller PID namespace is implicitly used. As a result, the above mentioned kludge does not work. Fix the issue, add a test case (which fails without the fix). Signed-off-by: Kir Kolyshkin --- libcontainer/configs/namespaces_syscall.go | 12 ++++++++++++ libcontainer/container_linux.go | 3 +-- libcontainer/integration/exec_test.go | 20 +++++++++++++++----- 3 files changed, 28 insertions(+), 7 deletions(-) diff --git a/libcontainer/configs/namespaces_syscall.go b/libcontainer/configs/namespaces_syscall.go index 0516dba8d09..543e059aa67 100644 --- a/libcontainer/configs/namespaces_syscall.go +++ b/libcontainer/configs/namespaces_syscall.go @@ -31,3 +31,15 @@ func (n *Namespaces) CloneFlags() uintptr { } return uintptr(flag) } + +// IsPrivate tells whether the namespace of type t is configured as private +// (i.e. it exists and is not shared). +func (n Namespaces) IsPrivate(t NamespaceType) bool { + for _, v := range n { + if v.Type == t { + return v.Path == "" + } + } + // Not found, so implicitly sharing a parent namespace. + return false +} diff --git a/libcontainer/container_linux.go b/libcontainer/container_linux.go index 73264bc90e1..59262d3ab22 100644 --- a/libcontainer/container_linux.go +++ b/libcontainer/container_linux.go @@ -549,7 +549,6 @@ func (c *Container) newInitProcess(p *Process, cmd *exec.Cmd, messageSockPair, l nsMaps[ns.Type] = ns.Path } } - _, sharePidns := nsMaps[configs.NEWPID] data, err := c.bootstrapData(c.config.Namespaces.CloneFlags(), nsMaps, initStandard) if err != nil { return nil, err @@ -594,7 +593,7 @@ func (c *Container) newInitProcess(p *Process, cmd *exec.Cmd, messageSockPair, l container: c, process: p, bootstrapData: data, - sharePidns: sharePidns, + sharePidns: !c.config.Namespaces.IsPrivate(configs.NEWPID), } c.initProcess = init return init, nil diff --git a/libcontainer/integration/exec_test.go b/libcontainer/integration/exec_test.go index 376dd0f1f3b..4ac0f466bd9 100644 --- a/libcontainer/integration/exec_test.go +++ b/libcontainer/integration/exec_test.go @@ -1355,16 +1355,26 @@ func TestPIDHost(t *testing.T) { } } -func TestPIDHostInitProcessWait(t *testing.T) { +func TestHostPidnsInitKill(t *testing.T) { + config := newTemplateConfig(t, nil) + // Implicitly use host pid ns. + config.Namespaces.Remove(configs.NEWPID) + testPidnsInitKill(t, config) +} + +func TestSharedPidnsInitKill(t *testing.T) { + config := newTemplateConfig(t, nil) + // Explicitly use host pid ns. + config.Namespaces.Add(configs.NEWPID, "/proc/1/ns/pid") + testPidnsInitKill(t, config) +} + +func testPidnsInitKill(t *testing.T, config *configs.Config) { if testing.Short() { return } - pidns := "/proc/1/ns/pid" - // Run a container with two long-running processes. - config := newTemplateConfig(t, nil) - config.Namespaces.Add(configs.NEWPID, pidns) container, err := newContainer(t, config) ok(t, err) defer func() { From 9583b3d1c297021109081872c52302316ede15b1 Mon Sep 17 00:00:00 2001 From: Kir Kolyshkin Date: Thu, 13 Apr 2023 11:13:13 -0700 Subject: [PATCH 5/7] libct: move killing logic to container.Signal By default, the container has its own PID namespace, and killing (with SIGKILL) its init process from the parent PID namespace also kills all the other processes. Obviously, it does not work that way when the container is sharing its PID namespace with the host or another container, since init is no longer special (it's not PID 1). In this case, killing container's init will result in a bunch of other processes left running (and thus the inability to remove the cgroup). The solution to the above problem is killing all the container processes, not just init. The problem with the current implementation is, the killing logic is implemented in libcontainer's initProcess.wait, and thus only available to libcontainer users, but not the runc kill command (which uses nonChildProcess.kill and does not use wait at all). So, some workarounds exist: - func destroy(c *Container) calls signalAllProcesses; - runc kill implements -a flag. This code became very tangled over time. Let's simplify things by moving the killing all processes from initProcess.wait to container.Signal, and documents the new behavior. In essence, this also makes `runc kill` to automatically kill all container processes when the container does not have its own PID namespace. Document that as well. Signed-off-by: Kir Kolyshkin --- libcontainer/container_linux.go | 52 ++++++++++++++++----------- libcontainer/integration/exec_test.go | 4 +-- libcontainer/process_linux.go | 5 --- libcontainer/state_linux.go | 7 ---- man/runc-kill.8.md | 4 +-- 5 files changed, 36 insertions(+), 36 deletions(-) diff --git a/libcontainer/container_linux.go b/libcontainer/container_linux.go index 59262d3ab22..1fa7e2b60c1 100644 --- a/libcontainer/container_linux.go +++ b/libcontainer/container_linux.go @@ -358,9 +358,9 @@ func (c *Container) start(process *Process) (retErr error) { } // Signal sends a specified signal to container's init, or, if all is true, -// true, to all container's processes (as determined by container's cgroup). +// to all container's processes (as determined by container's cgroup). // -// Setting all=true is useful when s is SIGKILL and the container does not have +// Note all=true is implied when s is SIGKILL and the container does not have // its own PID namespace. In this scenario, the libcontainer user may be required // to implement a proper child reaper. func (c *Container) Signal(s os.Signal, all bool) error { @@ -382,22 +382,36 @@ func (c *Container) Signal(s os.Signal, all bool) error { } return c.ignoreCgroupError(signalAllProcesses(c.cgroupManager, sig)) } - // to avoid a PID reuse attack - if status == Running || status == Created || status == Paused { - if err := c.initProcess.signal(s); err != nil { - return fmt.Errorf("unable to signal init: %w", err) - } - if status == Paused { - // For cgroup v1, killing a process in a frozen cgroup - // does nothing until it's thawed. Only thaw the cgroup - // for SIGKILL. - if s, ok := s.(unix.Signal); ok && s == unix.SIGKILL { - _ = c.cgroupManager.Freeze(configs.Thawed) - } - } - return nil + + // To avoid a PID reuse attack, don't kill non-running container. + switch status { + case Running, Created, Paused: + default: + return ErrNotRunning } - return ErrNotRunning + + // When a container has its own PID namespace, inside it the init PID + // is 1, and thus it is handled specially by the kernel. In particular, + // killing init with SIGKILL from an ancestor namespace will also kill + // all other processes in that PID namespace (see pid_namespaces(7)). + // + // OTOH, if PID namespace is shared, we should kill all pids to avoid + // leftover processes. + if s == unix.SIGKILL && !c.config.Namespaces.IsPrivate(configs.NEWPID) { + err = signalAllProcesses(c.cgroupManager, unix.SIGKILL) + } else { + err = c.initProcess.signal(s) + } + if err != nil { + return fmt.Errorf("unable to signal init: %w", err) + } + if status == Paused && s == unix.SIGKILL { + // For cgroup v1, killing a process in a frozen cgroup + // does nothing until it's thawed. Only thaw the cgroup + // for SIGKILL. + _ = c.cgroupManager.Freeze(configs.Thawed) + } + return nil } func (c *Container) createExecFifo() error { @@ -593,7 +607,6 @@ func (c *Container) newInitProcess(p *Process, cmd *exec.Cmd, messageSockPair, l container: c, process: p, bootstrapData: data, - sharePidns: !c.config.Namespaces.IsPrivate(configs.NEWPID), } c.initProcess = init return init, nil @@ -696,8 +709,7 @@ func (c *Container) newInitConfig(process *Process) *initConfig { return cfg } -// Destroy destroys the container, if its in a valid state, after killing any -// remaining running processes. +// Destroy destroys the container, if its in a valid state. // // Any event registrations are removed before the container is destroyed. // No error is returned if the container is already destroyed. diff --git a/libcontainer/integration/exec_test.go b/libcontainer/integration/exec_test.go index 4ac0f466bd9..e1ee5f7b4ed 100644 --- a/libcontainer/integration/exec_test.go +++ b/libcontainer/integration/exec_test.go @@ -1399,8 +1399,8 @@ func testPidnsInitKill(t *testing.T, config *configs.Config) { err = container.Run(process2) ok(t, err) - // Kill the init process and Wait for it. - err = process1.Signal(syscall.SIGKILL) + // Kill the container. + err = container.Signal(syscall.SIGKILL, false) ok(t, err) _, err = process1.Wait() if err == nil { diff --git a/libcontainer/process_linux.go b/libcontainer/process_linux.go index ba2990a951b..48861406dba 100644 --- a/libcontainer/process_linux.go +++ b/libcontainer/process_linux.go @@ -304,7 +304,6 @@ type initProcess struct { fds []string process *Process bootstrapData io.Reader - sharePidns bool } func (p *initProcess) pid() int { @@ -616,10 +615,6 @@ func (p *initProcess) start() (retErr error) { func (p *initProcess) wait() (*os.ProcessState, error) { err := p.cmd.Wait() - // we should kill all processes in cgroup when init is died if we use host PID namespace - if p.sharePidns { - _ = signalAllProcesses(p.manager, unix.SIGKILL) - } return p.cmd.ProcessState, err } diff --git a/libcontainer/state_linux.go b/libcontainer/state_linux.go index 4895612e257..442d0baaf5b 100644 --- a/libcontainer/state_linux.go +++ b/libcontainer/state_linux.go @@ -7,7 +7,6 @@ import ( "github.com/opencontainers/runc/libcontainer/configs" "github.com/opencontainers/runtime-spec/specs-go" - "github.com/sirupsen/logrus" "golang.org/x/sys/unix" ) @@ -36,12 +35,6 @@ type containerState interface { } func destroy(c *Container) error { - if !c.config.Namespaces.Contains(configs.NEWPID) || - c.config.Namespaces.PathOf(configs.NEWPID) != "" { - if err := signalAllProcesses(c.cgroupManager, unix.SIGKILL); err != nil { - logrus.Warn(err) - } - } err := c.cgroupManager.Destroy() if c.intelRdtManager != nil { if ierr := c.intelRdtManager.Destroy(); err == nil { diff --git a/man/runc-kill.8.md b/man/runc-kill.8.md index 5f0f51ed9b9..684666396bd 100644 --- a/man/runc-kill.8.md +++ b/man/runc-kill.8.md @@ -18,8 +18,8 @@ to list available signals. # OPTIONS **--all**|**-a** : Send the signal to all processes inside the container, rather than -the container's init only. This option is useful when the container does not -have its own PID namespace. +the container's init only. This option is implied when the _signal_ is **KILL** +and the container does not have its own PID namespace. : When this option is set, no error is returned if the container is stopped or does not exist. From f8ad20f500bf75edd86041657ee762bce116f8f5 Mon Sep 17 00:00:00 2001 From: Kir Kolyshkin Date: Fri, 12 May 2023 12:55:17 -0700 Subject: [PATCH 6/7] runc kill: drop -a option As of previous commit, this is implied in a particular scenario. In fact, this is the one and only scenario that justifies the use of -a. Drop the option from the documentation. For backward compatibility, do recognize it, and retain the feature of ignoring the "container is stopped" error when set. Signed-off-by: Kir Kolyshkin --- delete.go | 4 ++-- kill.go | 13 ++++++++++--- libcontainer/container_linux.go | 24 +++++------------------- libcontainer/integration/exec_test.go | 2 +- man/runc-kill.8.md | 11 +---------- tests/integration/kill.bats | 11 ++++++++--- 6 files changed, 27 insertions(+), 38 deletions(-) diff --git a/delete.go b/delete.go index dd3041f8722..682101ccad8 100644 --- a/delete.go +++ b/delete.go @@ -14,10 +14,10 @@ import ( ) func killContainer(container *libcontainer.Container) error { - _ = container.Signal(unix.SIGKILL, false) + _ = container.Signal(unix.SIGKILL) for i := 0; i < 100; i++ { time.Sleep(100 * time.Millisecond) - if err := container.Signal(unix.Signal(0), false); err != nil { + if err := container.Signal(unix.Signal(0)); err != nil { destroy(container) return nil } diff --git a/kill.go b/kill.go index e5b13b1230d..ac1e47a5b19 100644 --- a/kill.go +++ b/kill.go @@ -1,10 +1,12 @@ package main import ( + "errors" "fmt" "strconv" "strings" + "github.com/opencontainers/runc/libcontainer" "github.com/urfave/cli" "golang.org/x/sys/unix" ) @@ -24,8 +26,9 @@ signal to the init process of the "ubuntu01" container: # runc kill ubuntu01 KILL`, Flags: []cli.Flag{ cli.BoolFlag{ - Name: "all, a", - Usage: "send the specified signal to all processes inside the container", + Name: "all, a", + Usage: "(obsoleted, do not use)", + Hidden: true, }, }, Action: func(context *cli.Context) error { @@ -49,7 +52,11 @@ signal to the init process of the "ubuntu01" container: if err != nil { return err } - return container.Signal(signal, context.Bool("all")) + err = container.Signal(signal) + if errors.Is(err, libcontainer.ErrNotRunning) && context.Bool("all") { + err = nil + } + return err }, } diff --git a/libcontainer/container_linux.go b/libcontainer/container_linux.go index 1fa7e2b60c1..eac17027af8 100644 --- a/libcontainer/container_linux.go +++ b/libcontainer/container_linux.go @@ -357,32 +357,18 @@ func (c *Container) start(process *Process) (retErr error) { return nil } -// Signal sends a specified signal to container's init, or, if all is true, -// to all container's processes (as determined by container's cgroup). +// Signal sends a specified signal to container's init. // -// Note all=true is implied when s is SIGKILL and the container does not have -// its own PID namespace. In this scenario, the libcontainer user may be required -// to implement a proper child reaper. -func (c *Container) Signal(s os.Signal, all bool) error { +// When s is SIGKILL and the container does not have its own PID namespace, all +// the container's processes are killed. In this scenario, the libcontainer +// user may be required to implement a proper child reaper. +func (c *Container) Signal(s os.Signal) error { c.m.Lock() defer c.m.Unlock() status, err := c.currentStatus() if err != nil { return err } - if all { - sig, ok := s.(unix.Signal) - if !ok { - return errors.New("unsupported signal type") - } - if status == Stopped && !c.cgroupManager.Exists() { - // Avoid calling signalAllProcesses which may print - // a warning trying to freeze a non-existing cgroup. - return nil - } - return c.ignoreCgroupError(signalAllProcesses(c.cgroupManager, sig)) - } - // To avoid a PID reuse attack, don't kill non-running container. switch status { case Running, Created, Paused: diff --git a/libcontainer/integration/exec_test.go b/libcontainer/integration/exec_test.go index e1ee5f7b4ed..03d16639067 100644 --- a/libcontainer/integration/exec_test.go +++ b/libcontainer/integration/exec_test.go @@ -1400,7 +1400,7 @@ func testPidnsInitKill(t *testing.T, config *configs.Config) { ok(t, err) // Kill the container. - err = container.Signal(syscall.SIGKILL, false) + err = container.Signal(syscall.SIGKILL) ok(t, err) _, err = process1.Wait() if err == nil { diff --git a/man/runc-kill.8.md b/man/runc-kill.8.md index 684666396bd..0d4cb68e2ff 100644 --- a/man/runc-kill.8.md +++ b/man/runc-kill.8.md @@ -4,7 +4,7 @@ **runc-kill** - send a specified signal to container # SYNOPSIS -**runc kill** [**--all**|**-a**] _container-id_ [_signal_] +**runc kill** _container-id_ [_signal_] # DESCRIPTION @@ -15,15 +15,6 @@ A different signal can be specified either by its name (with or without the **SIG** prefix), or its numeric value. Use **kill**(1) with **-l** option to list available signals. -# OPTIONS -**--all**|**-a** -: Send the signal to all processes inside the container, rather than -the container's init only. This option is implied when the _signal_ is **KILL** -and the container does not have its own PID namespace. - -: When this option is set, no error is returned if the container is stopped -or does not exist. - # EXAMPLES The following will send a **KILL** signal to the init process of the diff --git a/tests/integration/kill.bats b/tests/integration/kill.bats index 2a78778bdee..8f2f4a7b4e8 100644 --- a/tests/integration/kill.bats +++ b/tests/integration/kill.bats @@ -22,7 +22,12 @@ function teardown() { [ "$status" -eq 0 ] wait_for_container 10 1 test_busybox stopped - # we should ensure kill work after the container stopped + # Check that kill errors on a stopped container. + runc kill test_busybox 0 + [ "$status" -ne 0 ] + [[ "$output" == *"container not running"* ]] + + # Check that -a (now obsoleted) makes kill return no error for a stopped container. runc kill -a test_busybox 0 [ "$status" -eq 0 ] @@ -31,7 +36,7 @@ function teardown() { } # This is roughly the same as TestPIDHostInitProcessWait in libcontainer/integration. -@test "kill --all KILL [host pidns]" { +@test "kill KILL [host pidns]" { # kill -a currently requires cgroup freezer. requires cgroups_freezer @@ -65,7 +70,7 @@ function teardown() { kill -0 "$p" done - runc kill -a test_busybox KILL + runc kill test_busybox KILL [ "$status" -eq 0 ] wait_for_container 10 1 test_busybox stopped From 7d09ba10cc873f3332dd3de5304fbcd6814d72eb Mon Sep 17 00:00:00 2001 From: Kir Kolyshkin Date: Fri, 12 May 2023 12:57:50 -0700 Subject: [PATCH 7/7] libct: implement support for cgroup.kill Signed-off-by: Kir Kolyshkin --- libcontainer/init_linux.go | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/libcontainer/init_linux.go b/libcontainer/init_linux.go index 9bf13d9d5d4..ed84ab59e86 100644 --- a/libcontainer/init_linux.go +++ b/libcontainer/init_linux.go @@ -577,6 +577,17 @@ func setupRlimits(limits []configs.Rlimit, pid int) error { // signalAllProcesses freezes then iterates over all the processes inside the // manager's cgroups sending the signal s to them. func signalAllProcesses(m cgroups.Manager, s unix.Signal) error { + // Use cgroup.kill, if available. + if s == unix.SIGKILL { + if p := m.Path(""); p != "" { // Either cgroup v2 or hybrid. + err := cgroups.WriteFile(p, "cgroup.kill", "1") + if err == nil || !errors.Is(err, os.ErrNotExist) { + return err + } + // Fallback to old implementation. + } + } + if err := m.Freeze(configs.Frozen); err != nil { logrus.Warn(err) }