From 321aa20c49568ea2e2a3f708a50ccffb1bd67c79 Mon Sep 17 00:00:00 2001
From: Aleksa Sarai <cyphar@cyphar.com>
Date: Sat, 19 Aug 2023 12:18:08 +1000
Subject: [PATCH 1/8] scripts: add proper 386 and amd64 target triples and
 builds

We need these to match the Makefile detection of the right gcc for
runc-dmz, as well as making sure that everything builds properly for our
cross-i386 tests. While we're at it, add x86 to the list of build
targets for release builds (presumably nobody will use it, but since we
do test builds of this anyway it probably won't hurt).

In addition, clean up the handling of the native architecture build by
treating it the same as any other build (ensuring that building runc
from a different platform will work the same way regardless of the
native architecture). In practice, the build works the same way as
before.

Signed-off-by: Aleksa Sarai <cyphar@cyphar.com>
---
 Dockerfile              | 20 ++++++++++-------
 Makefile                |  2 +-
 script/lib.sh           | 50 +++++++++++++++++++++++++++++++++--------
 script/release_build.sh | 42 +++++++++++++++++-----------------
 script/seccomp.sh       |  9 ++++++--
 5 files changed, 83 insertions(+), 40 deletions(-)

diff --git a/Dockerfile b/Dockerfile
index 9fd29a59371..6fa8752b5e3 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -9,19 +9,15 @@ ARG CRIU_REPO=https://download.opensuse.org/repositories/devel:/tools:/criu/Debi
 RUN KEYFILE=/usr/share/keyrings/criu-repo-keyring.gpg; \
     wget -nv $CRIU_REPO/Release.key -O- | gpg --dearmor > "$KEYFILE" \
     && echo "deb [signed-by=$KEYFILE] $CRIU_REPO/ /" > /etc/apt/sources.list.d/criu.list \
+    && dpkg --add-architecture i386 \
     && apt-get update \
     && apt-get install -y --no-install-recommends \
         build-essential \
         criu \
-        gcc-aarch64-linux-gnu libc-dev-arm64-cross \
-        gcc-arm-linux-gnueabi libc-dev-armel-cross \
-        gcc-arm-linux-gnueabihf libc-dev-armhf-cross \
-        gcc-powerpc64le-linux-gnu libc-dev-ppc64el-cross \
-        gcc-s390x-linux-gnu libc-dev-s390x-cross \
-        gcc-riscv64-linux-gnu libc-dev-riscv64-cross \
+        gcc \
+        gcc-multilib \
         curl \
         gawk \
-        gcc \
         gperf \
         iptables \
         jq \
@@ -32,6 +28,14 @@ RUN KEYFILE=/usr/share/keyrings/criu-repo-keyring.gpg; \
         sudo \
         uidmap \
         iproute2 \
+    && apt-get install -y --no-install-recommends \
+        libc-dev:i386 libgcc-s1:i386 \
+        gcc-aarch64-linux-gnu libc-dev-arm64-cross \
+        gcc-arm-linux-gnueabi libc-dev-armel-cross \
+        gcc-arm-linux-gnueabihf libc-dev-armhf-cross \
+        gcc-powerpc64le-linux-gnu libc-dev-ppc64el-cross \
+        gcc-s390x-linux-gnu libc-dev-s390x-cross \
+        gcc-riscv64-linux-gnu libc-dev-riscv64-cross \
     && apt-get clean \
     && rm -rf /var/cache/apt /var/lib/apt/lists/* /etc/apt/sources.list.d/*.list
 
@@ -54,7 +58,7 @@ RUN cd /tmp \
 ARG LIBSECCOMP_VERSION
 COPY script/seccomp.sh script/lib.sh /tmp/script/
 RUN mkdir -p /opt/libseccomp \
-    && /tmp/script/seccomp.sh "$LIBSECCOMP_VERSION" /opt/libseccomp arm64 armel armhf ppc64le riscv64 s390x
+    && /tmp/script/seccomp.sh "$LIBSECCOMP_VERSION" /opt/libseccomp 386 amd64 arm64 armel armhf ppc64le riscv64 s390x
 ENV LIBSECCOMP_VERSION=$LIBSECCOMP_VERSION
 ENV LD_LIBRARY_PATH=/opt/libseccomp/lib
 ENV PKG_CONFIG_PATH=/opt/libseccomp/lib/pkgconfig
diff --git a/Makefile b/Makefile
index 0d48fe8c521..4e90e1c7c4f 100644
--- a/Makefile
+++ b/Makefile
@@ -68,7 +68,7 @@ recvtty sd-helper seccompagent fs-idmap:
 static:
 	$(GO_BUILD_STATIC) -o runc .
 
-releaseall: RELEASE_ARGS := "-a arm64 -a armel -a armhf -a ppc64le -a riscv64 -a s390x"
+releaseall: RELEASE_ARGS := "-a 386 -a amd64 -a arm64 -a armel -a armhf -a ppc64le -a riscv64 -a s390x"
 releaseall: release
 
 release: runcimage
diff --git a/script/lib.sh b/script/lib.sh
index 9fee8e29f38..f79dc3c2335 100644
--- a/script/lib.sh
+++ b/script/lib.sh
@@ -1,33 +1,65 @@
 #!/bin/bash
 
+# NOTE: Make sure you keep this file in sync with cc_platform.mk.
+
 # set_cross_vars sets a few environment variables used for cross-compiling,
 # based on the architecture specified in $1.
 function set_cross_vars() {
 	GOARCH="$1" # default, may be overridden below
 	unset GOARM
 
+	PLATFORM=linux-gnu
+	# openSUSE has a custom PLATFORM
+	if grep -iq "ID_LIKE=.*suse" /etc/os-release; then
+		PLATFORM=suse-linux
+		is_suse=1
+	fi
+
 	case $1 in
+	386)
+		# Always use the 64-bit compiler to build the 386 binary, which works
+		# for the more common cross-build method for x86 (namely, the
+		# equivalent of dpkg --add-architecture).
+		local cpu_type
+		if [ -v is_suse ]; then
+			# There is no x86_64-suse-linux-gcc, so use the native one.
+			HOST=
+			cpu_type=i586
+		else
+			HOST=x86_64-${PLATFORM}
+			cpu_type=i686
+		fi
+		CFLAGS="-m32 -march=$cpu_type ${CFLAGS[*]}"
+		;;
+	amd64)
+		if [ -n "${is_suse:-}" ]; then
+			# There is no x86_64-suse-linux-gcc, so use the native one.
+			HOST=
+		else
+			HOST=x86_64-${PLATFORM}
+		fi
+		;;
 	arm64)
-		HOST=aarch64-linux-gnu
+		HOST=aarch64-${PLATFORM}
 		;;
 	armel)
-		HOST=arm-linux-gnueabi
+		HOST=arm-${PLATFORM}eabi
 		GOARCH=arm
 		GOARM=6
 		;;
 	armhf)
-		HOST=arm-linux-gnueabihf
+		HOST=arm-${PLATFORM}eabihf
 		GOARCH=arm
 		GOARM=7
 		;;
 	ppc64le)
-		HOST=powerpc64le-linux-gnu
+		HOST=powerpc64le-${PLATFORM}
 		;;
 	riscv64)
-		HOST=riscv64-linux-gnu
+		HOST=riscv64-${PLATFORM}
 		;;
 	s390x)
-		HOST=s390x-linux-gnu
+		HOST=s390x-${PLATFORM}
 		;;
 	*)
 		echo "set_cross_vars: unsupported architecture: $1" >&2
@@ -35,8 +67,8 @@ function set_cross_vars() {
 		;;
 	esac
 
-	CC=$HOST-gcc
-	STRIP=$HOST-strip
+	CC="${HOST:+$HOST-}gcc"
+	STRIP="${HOST:+$HOST-}strip"
 
-	export HOST GOARM GOARCH CC STRIP
+	export HOST CFLAGS GOARM GOARCH CC STRIP
 }
diff --git a/script/release_build.sh b/script/release_build.sh
index af238628cbd..6c7aee88b23 100755
--- a/script/release_build.sh
+++ b/script/release_build.sh
@@ -60,24 +60,14 @@ function build_project() {
 	# it can reuse cached pkg-config results).
 	local make_args=(COMMIT_NO= EXTRA_FLAGS="-a" EXTRA_LDFLAGS="${ldflags}" static)
 
-	# Build natively.
-	make -C "$root" \
-		PKG_CONFIG_PATH="$seccompdir/lib/pkgconfig" \
-		"${make_args[@]}"
-	strip "$root/$project"
-	# Sanity check: make sure libseccomp version is as expected.
-	local ver
-	ver=$("$root/$project" --version | awk '$1 == "libseccomp:" {print $2}')
-	if [ "$ver" != "$LIBSECCOMP_VERSION" ]; then
-		echo >&2 "libseccomp version mismatch: want $LIBSECCOMP_VERSION, got $ver"
-		exit 1
-	fi
+	# Save the original cflags.
+	local original_cflags="${CFLAGS:-}"
 
-	mv "$root/$project" "$builddir/$project.$native_arch"
-
-	# Cross-build for for other architectures.
+	# Build for all requested architectures.
 	local arch
 	for arch in "${arches[@]}"; do
+		# Reset CFLAGS.
+		CFLAGS="$original_cflags"
 		set_cross_vars "$arch"
 		make -C "$root" \
 			PKG_CONFIG_PATH="$seccompdir/$arch/lib/pkgconfig" \
@@ -86,6 +76,14 @@ function build_project() {
 		mv "$root/$project" "$builddir/$project.$arch"
 	done
 
+	# Sanity check: make sure libseccomp version is as expected.
+	local ver
+	ver=$("$builddir/$project.$native_arch" --version | awk '$1 == "libseccomp:" {print $2}')
+	if [ "$ver" != "$LIBSECCOMP_VERSION" ]; then
+		echo >&2 "libseccomp version mismatch: want $LIBSECCOMP_VERSION, got $ver"
+		exit 1
+	fi
+
 	# Copy libseccomp source tarball.
 	cp "$seccompdir"/src/* "$builddir"
 
@@ -122,12 +120,17 @@ commit="HEAD"
 version=""
 releasedir=""
 hashcmd=""
-declare -a add_arches
+# Always build a native binary.
+native_arch="$(go env GOARCH || echo "amd64")"
+arches=("$native_arch")
 
 while getopts "a:c:H:hr:v:" opt; do
 	case "$opt" in
 	a)
-		add_arches+=("$OPTARG")
+		# Add architecture if not already present in arches.
+		if ! (printf "%s\0" "${arches[@]}" | grep -zqxF "$OPTARG"); then
+			arches+=("$OPTARG")
+		fi
 		;;
 	c)
 		commit="$OPTARG"
@@ -158,9 +161,8 @@ done
 version="${version:-$(<"$root/VERSION")}"
 releasedir="${releasedir:-release/$version}"
 hashcmd="${hashcmd:-sha256sum}"
-native_arch="$(go env GOARCH || echo "amd64")"
 # Suffixes of files to checksum/sign.
-suffixes=("$native_arch" "${add_arches[@]}" tar.xz)
+suffixes=("${arches[@]}" tar.xz)
 
 log "creating $project release in '$releasedir'"
 log "  version: $version"
@@ -174,7 +176,7 @@ set -x
 rm -rf "$releasedir" && mkdir -p "$releasedir"
 
 # Build project.
-build_project "$releasedir/$project" "$native_arch" "${add_arches[@]}"
+build_project "$releasedir/$project" "$native_arch" "${arches[@]}"
 
 # Generate new archive.
 git archive --format=tar --prefix="$project-$version/" "$commit" | xz >"$releasedir/$project.tar.xz"
diff --git a/script/seccomp.sh b/script/seccomp.sh
index beea612ac83..955437c2fb4 100755
--- a/script/seccomp.sh
+++ b/script/seccomp.sh
@@ -33,16 +33,21 @@ function build_libseccomp() {
 	tar xf "$tar" -C "$srcdir"
 	pushd "$srcdir/libseccomp-$ver" || return
 
-	# Build natively and install to /usr/local.
+	# Install native version for Dockerfile builds.
 	./configure \
 		--prefix="$dest" --libdir="$dest/lib" \
 		--enable-static --enable-shared
 	make install
 	make clean
 
-	# Build and install for additional architectures.
+	# Save the original cflags.
+	local original_cflags="${CFLAGS:-}"
+
+	# Build and install for all requested architectures.
 	local arch
 	for arch in "${arches[@]}"; do
+		# Reset CFLAGS.
+		CFLAGS="$original_cflags"
 		set_cross_vars "$arch"
 		./configure --host "$HOST" \
 			--prefix="$dest/$arch" --libdir="$dest/$arch/lib" \

From 0e9a3358f84813120b38f01cd8ad153ddf3a1694 Mon Sep 17 00:00:00 2001
From: Aleksa Sarai <cyphar@cyphar.com>
Date: Fri, 4 Aug 2023 11:59:31 +1000
Subject: [PATCH 2/8] nsexec: migrate memfd /proc/self/exe logic to Go code

This allow us to remove the amount of C code in runc quite
substantially, as well as removing a whole execve(2) from the nsexec
path because we no longer spawn "runc init" only to re-exec "runc init"
after doing the clone.

Signed-off-by: Aleksa Sarai <cyphar@cyphar.com>
---
 libcontainer/container_linux.go         |  74 +++-
 libcontainer/dmz/cloned_binary_linux.go | 192 ++++++++
 libcontainer/nsenter/cloned_binary.c    | 567 ------------------------
 libcontainer/nsenter/nsexec.c           |  11 -
 libcontainer/process.go                 |  12 +
 libcontainer/system/linux.go            |  28 ++
 6 files changed, 286 insertions(+), 598 deletions(-)
 create mode 100644 libcontainer/dmz/cloned_binary_linux.go
 delete mode 100644 libcontainer/nsenter/cloned_binary.c

diff --git a/libcontainer/container_linux.go b/libcontainer/container_linux.go
index c941239b841..9eb72f86600 100644
--- a/libcontainer/container_linux.go
+++ b/libcontainer/container_linux.go
@@ -24,6 +24,7 @@ import (
 
 	"github.com/opencontainers/runc/libcontainer/cgroups"
 	"github.com/opencontainers/runc/libcontainer/configs"
+	"github.com/opencontainers/runc/libcontainer/dmz"
 	"github.com/opencontainers/runc/libcontainer/intelrdt"
 	"github.com/opencontainers/runc/libcontainer/system"
 	"github.com/opencontainers/runc/libcontainer/utils"
@@ -316,6 +317,8 @@ func (c *Container) start(process *Process) (retErr error) {
 	if err != nil {
 		return fmt.Errorf("unable to create new parent process: %w", err)
 	}
+	// We do not need the cloned binaries once the process is spawned.
+	defer process.closeClonedExes()
 
 	logsDone := parent.forwardChildLogs()
 	if logsDone != nil {
@@ -454,24 +457,30 @@ func (c *Container) newParentProcess(p *Process) (parentProcess, error) {
 	}
 	logFilePair := filePair{parentLogPipe, childLogPipe}
 
-	cmd := c.commandTemplate(p, childInitPipe, childLogPipe)
-	if !p.Init {
-		return c.newSetnsProcess(p, cmd, messageSockPair, logFilePair)
+	// Make sure we use a new safe copy of /proc/self/exe each time this is
+	// called, to make sure that if a container manages to overwrite the file
+	// it cannot affect other containers on the system. For runc, this code
+	// will only ever be called once, but libcontainer users might call this
+	// more than once.
+	p.closeClonedExes()
+	var (
+		exePath string
+		safeExe *os.File
+	)
+	if dmz.IsSelfExeCloned() {
+		// /proc/self/exe is already a cloned binary -- no need to do anything
+		logrus.Debug("skipping binary cloning -- /proc/self/exe is already cloned!")
+		exePath = "/proc/self/exe"
+	} else {
+		safeExe, err = dmz.CloneSelfExe(c.root)
+		if err != nil {
+			return nil, fmt.Errorf("unable to create safe /proc/self/exe clone for runc init: %w", err)
+		}
+		exePath = "/proc/self/fd/" + strconv.Itoa(int(safeExe.Fd()))
+		p.clonedExes = append(p.clonedExes, safeExe)
 	}
 
-	// We only set up fifoFd if we're not doing a `runc exec`. The historic
-	// reason for this is that previously we would pass a dirfd that allowed
-	// for container rootfs escape (and not doing it in `runc exec` avoided
-	// that problem), but we no longer do that. However, there's no need to do
-	// this for `runc exec` so we just keep it this way to be safe.
-	if err := c.includeExecFifo(cmd); err != nil {
-		return nil, fmt.Errorf("unable to setup exec fifo: %w", err)
-	}
-	return c.newInitProcess(p, cmd, messageSockPair, logFilePair)
-}
-
-func (c *Container) commandTemplate(p *Process, childInitPipe *os.File, childLogPipe *os.File) *exec.Cmd {
-	cmd := exec.Command("/proc/self/exe", "init")
+	cmd := exec.Command(exePath, "init")
 	cmd.Args[0] = os.Args[0]
 	cmd.Stdin = p.Stdin
 	cmd.Stdout = p.Stdout
@@ -501,13 +510,38 @@ func (c *Container) commandTemplate(p *Process, childInitPipe *os.File, childLog
 		cmd.Env = append(cmd.Env, "_LIBCONTAINER_LOGLEVEL="+p.LogLevel)
 	}
 
-	// NOTE: when running a container with no PID namespace and the parent process spawning the container is
-	// PID1 the pdeathsig is being delivered to the container's init process by the kernel for some reason
-	// even with the parent still running.
+	if safeExe != nil {
+		// Due to a Go stdlib bug, we need to add safeExe to the set of
+		// ExtraFiles otherwise it is possible for the stdlib to clobber the fd
+		// during forkAndExecInChild1 and replace it with some other file that
+		// might be malicious. This is less than ideal (because the descriptor
+		// will be non-O_CLOEXEC) however we have protections in "runc init" to
+		// stop us from leaking extra file descriptors.
+		//
+		// See <https://github.com/golang/go/issues/61751>.
+		cmd.ExtraFiles = append(cmd.ExtraFiles, safeExe)
+	}
+
+	// NOTE: when running a container with no PID namespace and the parent
+	//       process spawning the container is PID1 the pdeathsig is being
+	//       delivered to the container's init process by the kernel for some
+	//       reason even with the parent still running.
 	if c.config.ParentDeathSignal > 0 {
 		cmd.SysProcAttr.Pdeathsig = unix.Signal(c.config.ParentDeathSignal)
 	}
-	return cmd
+
+	if p.Init {
+		// We only set up fifoFd if we're not doing a `runc exec`. The historic
+		// reason for this is that previously we would pass a dirfd that allowed
+		// for container rootfs escape (and not doing it in `runc exec` avoided
+		// that problem), but we no longer do that. However, there's no need to do
+		// this for `runc exec` so we just keep it this way to be safe.
+		if err := c.includeExecFifo(cmd); err != nil {
+			return nil, fmt.Errorf("unable to setup exec fifo: %w", err)
+		}
+		return c.newInitProcess(p, cmd, messageSockPair, logFilePair)
+	}
+	return c.newSetnsProcess(p, cmd, messageSockPair, logFilePair)
 }
 
 // shouldSendMountSources says whether the child process must setup bind mounts with
diff --git a/libcontainer/dmz/cloned_binary_linux.go b/libcontainer/dmz/cloned_binary_linux.go
new file mode 100644
index 00000000000..c962ea6fcba
--- /dev/null
+++ b/libcontainer/dmz/cloned_binary_linux.go
@@ -0,0 +1,192 @@
+package dmz
+
+import (
+	"errors"
+	"fmt"
+	"io"
+	"os"
+
+	"github.com/sirupsen/logrus"
+	"golang.org/x/sys/unix"
+
+	"github.com/opencontainers/runc/libcontainer/system"
+)
+
+type SealFunc func(**os.File) error
+
+var (
+	_ SealFunc = sealMemfd
+	_ SealFunc = sealFile
+)
+
+const baseMemfdSeals = unix.F_SEAL_SEAL | unix.F_SEAL_SHRINK | unix.F_SEAL_GROW | unix.F_SEAL_WRITE
+
+func sealMemfd(f **os.File) error {
+	if err := (*f).Chmod(0o511); err != nil {
+		return err
+	}
+	// Try to set the newer memfd sealing flags, but we ignore
+	// errors because they are not needed and we want to continue
+	// to work on older kernels.
+	fd := (*f).Fd()
+	// F_SEAL_FUTURE_WRITE -- Linux 5.1
+	_, _ = unix.FcntlInt(fd, unix.F_ADD_SEALS, unix.F_SEAL_FUTURE_WRITE)
+	// F_SEAL_EXEC -- Linux 6.3
+	const F_SEAL_EXEC = 0x20 //nolint:revive // this matches the unix.* name
+	_, _ = unix.FcntlInt(fd, unix.F_ADD_SEALS, F_SEAL_EXEC)
+	// Apply all original memfd seals.
+	_, err := unix.FcntlInt(fd, unix.F_ADD_SEALS, baseMemfdSeals)
+	return os.NewSyscallError("fcntl(F_ADD_SEALS)", err)
+}
+
+// Memfd creates a sealable executable memfd (supported since Linux 3.17).
+func Memfd(comment string) (*os.File, SealFunc, error) {
+	file, err := system.ExecutableMemfd("runc_cloned:"+comment, unix.MFD_ALLOW_SEALING|unix.MFD_CLOEXEC)
+	return file, sealMemfd, err
+}
+
+func sealFile(f **os.File) error {
+	if err := (*f).Chmod(0o511); err != nil {
+		return err
+	}
+	// When sealing an O_TMPFILE-style descriptor we need to
+	// re-open the path as O_PATH to clear the existing write
+	// handle we have.
+	opath, err := os.OpenFile(fmt.Sprintf("/proc/self/fd/%d", (*f).Fd()), unix.O_PATH|unix.O_CLOEXEC, 0)
+	if err != nil {
+		return fmt.Errorf("reopen tmpfile: %w", err)
+	}
+	_ = (*f).Close()
+	*f = opath
+	return nil
+}
+
+// otmpfile creates an open(O_TMPFILE) file in the given directory (supported
+// since Linux 3.11).
+func otmpfile(dir string) (*os.File, SealFunc, error) {
+	file, err := os.OpenFile(dir, unix.O_TMPFILE|unix.O_RDWR|unix.O_EXCL|unix.O_CLOEXEC, 0o700)
+	if err != nil {
+		return nil, nil, fmt.Errorf("O_TMPFILE creation failed: %w", err)
+	}
+	// Make sure we actually got an unlinked O_TMPFILE descriptor.
+	var stat unix.Stat_t
+	if err := unix.Fstat(int(file.Fd()), &stat); err != nil {
+		file.Close()
+		return nil, nil, fmt.Errorf("cannot fstat O_TMPFILE fd: %w", err)
+	} else if stat.Nlink != 0 {
+		file.Close()
+		return nil, nil, errors.New("O_TMPFILE has non-zero nlink")
+	}
+	return file, sealFile, err
+}
+
+// mktemp creates a classic unlinked file in the given directory.
+func mktemp(dir string) (*os.File, SealFunc, error) {
+	file, err := os.CreateTemp(dir, "runc.")
+	if err != nil {
+		return nil, nil, err
+	}
+	// Unlink the file and verify it was unlinked.
+	if err := os.Remove(file.Name()); err != nil {
+		return nil, nil, fmt.Errorf("unlinking classic tmpfile: %w", err)
+	}
+	var stat unix.Stat_t
+	if err := unix.Fstat(int(file.Fd()), &stat); err != nil {
+		return nil, nil, fmt.Errorf("cannot fstat classic tmpfile: %w", err)
+	} else if stat.Nlink != 0 {
+		return nil, nil, fmt.Errorf("classic tmpfile %s has non-zero nlink after unlink", file.Name())
+	}
+	return file, sealFile, err
+}
+
+func getSealableFile(comment, tmpDir string) (file *os.File, sealFn SealFunc, err error) {
+	// First, try an executable memfd (supported since Linux 3.17).
+	file, sealFn, err = Memfd(comment)
+	if err == nil {
+		return
+	}
+	logrus.Debugf("memfd cloned binary failed, falling back to O_TMPFILE: %v", err)
+	// Try to fallback to O_TMPFILE (supported since Linux 3.11).
+	file, sealFn, err = otmpfile(tmpDir)
+	if err == nil {
+		return
+	}
+	logrus.Debugf("O_TMPFILE cloned binary failed, falling back to mktemp(): %v", err)
+	// Finally, try a classic unlinked temporary file.
+	file, sealFn, err = mktemp(tmpDir)
+	if err == nil {
+		return
+	}
+	return nil, nil, fmt.Errorf("could not create sealable file for cloned binary: %w", err)
+}
+
+// CloneBinary creates a "sealed" clone of a given binary, which can be used to
+// thwart attempts by the container process to gain access to host binaries
+// through procfs magic-link shenanigans. For more details on why this is
+// necessary, see CVE-2019-5736.
+func CloneBinary(src io.Reader, size int64, name, tmpDir string) (*os.File, error) {
+	logrus.Debugf("cloning %s binary (%d bytes)", name, size)
+	file, sealFn, err := getSealableFile(name, tmpDir)
+	if err != nil {
+		return nil, err
+	}
+	copied, err := io.Copy(file, src)
+	if err != nil {
+		file.Close()
+		return nil, fmt.Errorf("copy binary: %w", err)
+	} else if copied != size {
+		file.Close()
+		return nil, fmt.Errorf("copied binary size mismatch: %d != %d", copied, size)
+	}
+	if err := sealFn(&file); err != nil {
+		file.Close()
+		return nil, fmt.Errorf("could not seal fd: %w", err)
+	}
+	return file, nil
+}
+
+// IsCloned returns whether the given file can be guaranteed to be a safe exe.
+func IsCloned(exe *os.File) bool {
+	seals, err := unix.FcntlInt(exe.Fd(), unix.F_GET_SEALS, 0)
+	if err != nil {
+		// /proc/self/exe is probably not a memfd
+		logrus.Debugf("F_GET_SEALS on %s failed: %v", exe.Name(), err)
+		return false
+	}
+	// The memfd must have all of the base seals applied.
+	logrus.Debugf("checking %s memfd seals: 0x%x", exe.Name(), seals)
+	return seals&baseMemfdSeals == baseMemfdSeals
+}
+
+// CloneSelfExe makes a clone of the current process's binary (through
+// /proc/self/exe). This binary can then be used for "runc init" in order to
+// make sure the container process can never resolve the original runc binary.
+// For more details on why this is necessary, see CVE-2019-5736.
+func CloneSelfExe(tmpDir string) (*os.File, error) {
+	selfExe, err := os.Open("/proc/self/exe")
+	if err != nil {
+		return nil, fmt.Errorf("opening current binary: %w", err)
+	}
+	defer selfExe.Close()
+
+	stat, err := selfExe.Stat()
+	if err != nil {
+		return nil, fmt.Errorf("checking /proc/self/exe size: %w", err)
+	}
+	size := stat.Size()
+
+	return CloneBinary(selfExe, size, "/proc/self/exe", tmpDir)
+}
+
+// IsSelfExeCloned returns whether /proc/self/exe is a cloned binary that can
+// be guaranteed to be safe. This means that it must be a sealed memfd. Other
+// types of clones cannot be completely verified as safe.
+func IsSelfExeCloned() bool {
+	selfExe, err := os.Open("/proc/self/exe")
+	if err != nil {
+		logrus.Debugf("open /proc/self/exe failed: %v", err)
+		return false
+	}
+	defer selfExe.Close()
+	return IsCloned(selfExe)
+}
diff --git a/libcontainer/nsenter/cloned_binary.c b/libcontainer/nsenter/cloned_binary.c
deleted file mode 100644
index a7f992fddd7..00000000000
--- a/libcontainer/nsenter/cloned_binary.c
+++ /dev/null
@@ -1,567 +0,0 @@
-// SPDX-License-Identifier: Apache-2.0 OR LGPL-2.1-or-later
-/*
- * Copyright (C) 2019 Aleksa Sarai <cyphar@cyphar.com>
- * Copyright (C) 2019 SUSE LLC
- *
- * This work is dual licensed under the following licenses. You may use,
- * redistribute, and/or modify the work under the conditions of either (or
- * both) licenses.
- *
- * === Apache-2.0 ===
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- *
- * === LGPL-2.1-or-later ===
- *
- * This library is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * This library is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with this library. If not, see
- * <https://www.gnu.org/licenses/>.
- *
- */
-
-#define _GNU_SOURCE
-#include <unistd.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <stdbool.h>
-#include <string.h>
-#include <limits.h>
-#include <fcntl.h>
-#include <errno.h>
-
-#include <sched.h>
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <sys/statfs.h>
-#include <sys/vfs.h>
-#include <sys/mman.h>
-#include <sys/mount.h>
-#include <sys/sendfile.h>
-#include <sys/socket.h>
-#include <sys/syscall.h>
-#include <sys/wait.h>
-
-#include "ipc.h"
-#include "log.h"
-
-/* Use our own wrapper for memfd_create. */
-#ifndef SYS_memfd_create
-#  ifdef __NR_memfd_create
-#    define SYS_memfd_create __NR_memfd_create
-#  else
-/* These values come from <https://fedora.juszkiewicz.com.pl/syscalls.html>. */
-#    warning "libc is outdated -- using hard-coded SYS_memfd_create"
-#    if defined(__x86_64__)
-#      define SYS_memfd_create 319
-#    elif defined(__i386__)
-#      define SYS_memfd_create 356
-#    elif defined(__ia64__)
-#      define SYS_memfd_create 1340
-#    elif defined(__arm__)
-#      define SYS_memfd_create 385
-#    elif defined(__aarch64__)
-#      define SYS_memfd_create 279
-#    elif defined(__ppc__) || defined(__PPC64__) || defined(__powerpc64__)
-#      define SYS_memfd_create 360
-#    elif defined(__s390__) || defined(__s390x__)
-#      define SYS_memfd_create 350
-#    else
-#      warning "unknown architecture -- cannot hard-code SYS_memfd_create"
-#    endif
-#  endif
-#endif
-
-/* memfd_create(2) flags -- copied from <linux/memfd.h>. */
-#ifndef MFD_CLOEXEC
-#  define MFD_CLOEXEC       0x0001U
-#  define MFD_ALLOW_SEALING 0x0002U
-#endif
-#ifndef MFD_EXEC
-#  define MFD_EXEC          0x0010U
-#endif
-
-int memfd_create(const char *name, unsigned int flags)
-{
-#ifdef SYS_memfd_create
-	return syscall(SYS_memfd_create, name, flags);
-#else
-	errno = ENOSYS;
-	return -1;
-#endif
-}
-
-/* This comes directly from <linux/fcntl.h>. */
-#ifndef F_LINUX_SPECIFIC_BASE
-#  define F_LINUX_SPECIFIC_BASE 1024
-#endif
-#ifndef F_ADD_SEALS
-#  define F_ADD_SEALS (F_LINUX_SPECIFIC_BASE + 9)
-#  define F_GET_SEALS (F_LINUX_SPECIFIC_BASE + 10)
-#endif
-#ifndef F_SEAL_SEAL
-#  define F_SEAL_SEAL          0x0001	/* prevent further seals from being set */
-#  define F_SEAL_SHRINK        0x0002	/* prevent file from shrinking */
-#  define F_SEAL_GROW          0x0004	/* prevent file from growing */
-#  define F_SEAL_WRITE         0x0008	/* prevent writes */
-#endif
-#ifndef F_SEAL_FUTURE_WRITE
-#  define F_SEAL_FUTURE_WRITE  0x0010	/* prevent future writes while mapped */
-#endif
-#ifndef F_SEAL_EXEC
-#  define F_SEAL_EXEC          0x0020	/* prevent chmod modifying exec bits */
-#endif
-
-#define CLONED_BINARY_ENV "_LIBCONTAINER_CLONED_BINARY"
-#define RUNC_MEMFD_COMMENT "runc_cloned:/proc/self/exe"
-/*
- * There are newer memfd seals (such as F_SEAL_FUTURE_WRITE and F_SEAL_EXEC),
- * which we use opportunistically. However, this set is the original set of
- * memfd seals, and we require them all to be set to trust our /proc/self/exe
- * if it is a memfd.
- */
-#define RUNC_MEMFD_MIN_SEALS \
-	(F_SEAL_SEAL | F_SEAL_SHRINK | F_SEAL_GROW | F_SEAL_WRITE)
-
-static void *must_realloc(void *ptr, size_t size)
-{
-	void *old = ptr;
-	do {
-		ptr = realloc(old, size);
-	} while (!ptr);
-	return ptr;
-}
-
-/*
- * Verify whether we are currently in a self-cloned program (namely, is
- * /proc/self/exe a memfd). F_GET_SEALS will only succeed for memfds (or rather
- * for shmem files), and we want to be sure it's actually sealed.
- */
-static int is_self_cloned(void)
-{
-	int fd, seals = 0, is_cloned = false;
-	struct stat statbuf = { };
-	struct statfs fsbuf = { };
-
-	fd = open("/proc/self/exe", O_RDONLY | O_CLOEXEC);
-	if (fd < 0) {
-		write_log(ERROR, "cannot open runc binary for reading: open /proc/self/exe: %m");
-		return -ENOTRECOVERABLE;
-	}
-
-	/*
-	 * Is the binary a fully-sealed memfd? We don't need CLONED_BINARY_ENV for
-	 * this, because you cannot write to a sealed memfd no matter what.
-	 */
-	seals = fcntl(fd, F_GET_SEALS);
-	if (seals >= 0) {
-		write_log(DEBUG, "checking /proc/self/exe memfd seals: 0x%x", seals);
-		is_cloned = (seals & RUNC_MEMFD_MIN_SEALS) == RUNC_MEMFD_MIN_SEALS;
-		if (is_cloned)
-			goto out;
-	}
-
-	/*
-	 * All other forms require CLONED_BINARY_ENV, since they are potentially
-	 * writeable (or we can't tell if they're fully safe) and thus we must
-	 * check the environment as an extra layer of defence.
-	 */
-	if (!getenv(CLONED_BINARY_ENV)) {
-		is_cloned = false;
-		goto out;
-	}
-
-	/*
-	 * Is the binary on a read-only filesystem? We can't detect bind-mounts in
-	 * particular (in-kernel they are identical to regular mounts) but we can
-	 * at least be sure that it's read-only. In addition, to make sure that
-	 * it's *our* bind-mount we check CLONED_BINARY_ENV.
-	 */
-	if (fstatfs(fd, &fsbuf) >= 0)
-		is_cloned |= (fsbuf.f_flags & MS_RDONLY);
-
-	/*
-	 * Okay, we're a tmpfile -- or we're currently running on RHEL <=7.6
-	 * which appears to have a borked backport of F_GET_SEALS. Either way,
-	 * having a file which has no hardlinks indicates that we aren't using
-	 * a host-side "runc" binary and this is something that a container
-	 * cannot fake (because unlinking requires being able to resolve the
-	 * path that you want to unlink).
-	 */
-	if (fstat(fd, &statbuf) >= 0)
-		is_cloned |= (statbuf.st_nlink == 0);
-
-out:
-	close(fd);
-	return is_cloned;
-}
-
-/* Read a given file into a new buffer, and providing the length. */
-static char *read_file(char *path, size_t *length)
-{
-	int fd;
-	char buf[4096], *copy = NULL;
-
-	if (!length)
-		return NULL;
-
-	fd = open(path, O_RDONLY | O_CLOEXEC);
-	if (fd < 0)
-		return NULL;
-
-	*length = 0;
-	for (;;) {
-		ssize_t n;
-
-		n = read(fd, buf, sizeof(buf));
-		if (n < 0)
-			goto error;
-		if (!n)
-			break;
-
-		copy = must_realloc(copy, (*length + n) * sizeof(*copy));
-		memcpy(copy + *length, buf, n);
-		*length += n;
-	}
-	close(fd);
-	return copy;
-
-error:
-	close(fd);
-	free(copy);
-	return NULL;
-}
-
-/*
- * A poor-man's version of "xargs -0". Basically parses a given block of
- * NUL-delimited data, within the given length and adds a pointer to each entry
- * to the array of pointers.
- */
-static int parse_xargs(char *data, int data_length, char ***output)
-{
-	int num = 0;
-	char *cur = data;
-
-	if (!data || *output != NULL)
-		return -1;
-
-	while (cur < data + data_length) {
-		num++;
-		*output = must_realloc(*output, (num + 1) * sizeof(**output));
-		(*output)[num - 1] = cur;
-		cur += strlen(cur) + 1;
-	}
-	(*output)[num] = NULL;
-	return num;
-}
-
-/*
- * "Parse" out argv from /proc/self/cmdline.
- * This is necessary because we are running in a context where we don't have a
- * main() that we can just get the arguments from.
- */
-static int fetchve(char ***argv)
-{
-	char *cmdline = NULL;
-	size_t cmdline_size;
-
-	cmdline = read_file("/proc/self/cmdline", &cmdline_size);
-	if (!cmdline)
-		goto error;
-
-	if (parse_xargs(cmdline, cmdline_size, argv) <= 0)
-		goto error;
-
-	return 0;
-
-error:
-	free(cmdline);
-	return -EINVAL;
-}
-
-enum {
-	EFD_NONE = 0,
-	EFD_MEMFD,
-	EFD_FILE,
-};
-
-/*
- * This comes from <linux/fcntl.h>. We can't hard-code __O_TMPFILE because it
- * changes depending on the architecture. If we don't have O_TMPFILE we always
- * have the mkostemp(3) fallback.
- */
-#ifndef O_TMPFILE
-#  if defined(__O_TMPFILE) && defined(O_DIRECTORY)
-#    define O_TMPFILE (__O_TMPFILE | O_DIRECTORY)
-#  endif
-#endif
-
-static inline bool is_memfd_unsupported_error(int err)
-{
-	/*
-	 * - ENOSYS is obviously an "unsupported" error.
-	 *
-	 * - EINVAL could be hit if MFD_EXEC is not supported (pre-6.3 kernel),
-	 *   but it can also be hit if vm.memfd_noexec=2 (in kernels without
-	 *   [1] applied) and the flags does not contain MFD_EXEC. However,
-	 *   there was a bug in the original 6.3 implementation of
-	 *   vm.memfd_noexec=2, which meant that MFD_EXEC would work even in
-	 *   the "strict" mode. Because we try MFD_EXEC first, we won't get
-	 *   EINVAL in the vm.memfd_noexec=2 case (which means we don't need to
-	 *   figure out whether to log the message about memfd_create).
-	 *
-	 * - EACCES is returned in kernels that contain [1] in the
-	 *   vm.memfd_noexec=2 case.
-	 *
-	 * At time of writing, [1] is not in Linus's tree and it't not clear if
-	 * it will be backported to stable, so what exact versions apply here
-	 * is unclear. But the bug is present in 6.3-6.5 at the very least.
-	 *
-	 * [1]: https://lore.kernel.org/all/20230705063315.3680666-2-jeffxu@google.com/
-	 */
-	if (err == EACCES)
-		write_log(INFO,
-			  "memfd_create(MFD_EXEC) failed, possibly due to vm.memfd_noexec=2 -- falling back to less secure O_TMPFILE");
-	return err == ENOSYS || err == EINVAL || err == EACCES;
-}
-
-static int make_execfd(int *fdtype)
-{
-	int fd = -1;
-	char template[PATH_MAX] = { 0 };
-	char *prefix = getenv("_LIBCONTAINER_STATEDIR");
-
-	if (!prefix || *prefix != '/')
-		prefix = "/tmp";
-	if (snprintf(template, sizeof(template), "%s/runc.XXXXXX", prefix) < 0)
-		return -1;
-
-	/*
-	 * Now try memfd, it's much nicer than actually creating a file in STATEDIR
-	 * since it's easily detected thanks to sealing and also doesn't require
-	 * assumptions about STATEDIR.
-	 */
-	*fdtype = EFD_MEMFD;
-	/*
-	 * On newer kernels we should set MFD_EXEC to indicate we need +x
-	 * permissions. Otherwise an admin with vm.memfd_noexec=1 would subtly
-	 * break runc. vm.memfd_noexec=2 is a little bit more complicated, see the
-	 * comment in is_memfd_unsupported_error() -- the upshot is that doing it
-	 * this way works, but only because of two overlapping bugs in the sysctl
-	 * implementation.
-	 */
-	fd = memfd_create(RUNC_MEMFD_COMMENT, MFD_EXEC | MFD_CLOEXEC | MFD_ALLOW_SEALING);
-	if (fd < 0 && is_memfd_unsupported_error(errno))
-		fd = memfd_create(RUNC_MEMFD_COMMENT, MFD_CLOEXEC | MFD_ALLOW_SEALING);
-	if (fd >= 0)
-		return fd;
-	if (!is_memfd_unsupported_error(errno))
-		goto error;
-
-#ifdef O_TMPFILE
-	/*
-	 * Try O_TMPFILE to avoid races where someone might snatch our file. Note
-	 * that O_EXCL isn't actually a security measure here (since you can just
-	 * fd re-open it and clear O_EXCL).
-	 */
-	*fdtype = EFD_FILE;
-	fd = open(prefix, O_TMPFILE | O_EXCL | O_RDWR | O_CLOEXEC, 0700);
-	if (fd >= 0) {
-		struct stat statbuf = { };
-		bool working_otmpfile = false;
-
-		/*
-		 * open(2) ignores unknown O_* flags -- yeah, I was surprised when I
-		 * found this out too. As a result we can't check for EINVAL. However,
-		 * if we get nlink != 0 (or EISDIR) then we know that this kernel
-		 * doesn't support O_TMPFILE.
-		 */
-		if (fstat(fd, &statbuf) >= 0)
-			working_otmpfile = (statbuf.st_nlink == 0);
-
-		if (working_otmpfile)
-			return fd;
-
-		/* Pretend that we got EISDIR since O_TMPFILE failed. */
-		close(fd);
-		errno = EISDIR;
-	}
-	if (errno != EISDIR)
-		goto error;
-#endif /* defined(O_TMPFILE) */
-
-	/*
-	 * Our final option is to create a temporary file the old-school way, and
-	 * then unlink it so that nothing else sees it by accident.
-	 */
-	*fdtype = EFD_FILE;
-	fd = mkostemp(template, O_CLOEXEC);
-	if (fd >= 0) {
-		if (unlink(template) >= 0)
-			return fd;
-		close(fd);
-	}
-
-error:
-	*fdtype = EFD_NONE;
-	return -1;
-}
-
-static int seal_execfd(int *fd, int fdtype)
-{
-	switch (fdtype) {
-	case EFD_MEMFD:{
-			/*
-			 * Try to seal with newer seals, but we ignore errors because older
-			 * kernels don't support some of them. For container security only
-			 * RUNC_MEMFD_MIN_SEALS are strictly required, but the rest are
-			 * nice-to-haves. We apply RUNC_MEMFD_MIN_SEALS at the end because it
-			 * contains F_SEAL_SEAL.
-			 */
-			int __attribute__((unused)) _err1 = fcntl(*fd, F_ADD_SEALS, F_SEAL_FUTURE_WRITE);	// Linux 5.1
-			int __attribute__((unused)) _err2 = fcntl(*fd, F_ADD_SEALS, F_SEAL_EXEC);	// Linux 6.3
-			return fcntl(*fd, F_ADD_SEALS, RUNC_MEMFD_MIN_SEALS);
-		}
-	case EFD_FILE:{
-			/* Need to re-open our pseudo-memfd as an O_PATH to avoid execve(2) giving -ETXTBSY. */
-			int newfd;
-			char fdpath[PATH_MAX] = { 0 };
-
-			if (fchmod(*fd, 0100) < 0)
-				return -1;
-
-			if (snprintf(fdpath, sizeof(fdpath), "/proc/self/fd/%d", *fd) < 0)
-				return -1;
-
-			newfd = open(fdpath, O_PATH | O_CLOEXEC);
-			if (newfd < 0)
-				return -1;
-
-			close(*fd);
-			*fd = newfd;
-			return 0;
-		}
-	default:
-		break;
-	}
-	return -1;
-}
-
-static ssize_t fd_to_fd(int outfd, int infd)
-{
-	ssize_t total = 0;
-	char buffer[4096];
-
-	for (;;) {
-		ssize_t nread, nwritten = 0;
-
-		nread = read(infd, buffer, sizeof(buffer));
-		if (nread < 0)
-			return -1;
-		if (!nread)
-			break;
-
-		do {
-			ssize_t n = write(outfd, buffer + nwritten, nread - nwritten);
-			if (n < 0)
-				return -1;
-			nwritten += n;
-		} while (nwritten < nread);
-
-		total += nwritten;
-	}
-
-	return total;
-}
-
-static int clone_binary(void)
-{
-	int binfd, execfd;
-	struct stat statbuf = { };
-	size_t sent = 0;
-	int fdtype = EFD_NONE;
-
-	execfd = make_execfd(&fdtype);
-	if (execfd < 0 || fdtype == EFD_NONE)
-		return -ENOTRECOVERABLE;
-
-	binfd = open("/proc/self/exe", O_RDONLY | O_CLOEXEC);
-	if (binfd < 0)
-		goto error;
-
-	if (fstat(binfd, &statbuf) < 0)
-		goto error_binfd;
-
-	while (sent < statbuf.st_size) {
-		int n = sendfile(execfd, binfd, NULL, statbuf.st_size - sent);
-		if (n < 0) {
-			/* sendfile can fail so we fallback to a dumb user-space copy. */
-			n = fd_to_fd(execfd, binfd);
-			if (n < 0)
-				goto error_binfd;
-		}
-		sent += n;
-	}
-	close(binfd);
-	if (sent != statbuf.st_size)
-		goto error;
-
-	if (seal_execfd(&execfd, fdtype) < 0)
-		goto error;
-
-	return execfd;
-
-error_binfd:
-	close(binfd);
-error:
-	close(execfd);
-	return -EIO;
-}
-
-/* Get cheap access to the environment. */
-extern char **environ;
-
-int ensure_cloned_binary(void)
-{
-	int execfd;
-	char **argv = NULL;
-
-	/* Check that we're not self-cloned, and if we are then bail. */
-	int cloned = is_self_cloned();
-	if (cloned > 0 || cloned == -ENOTRECOVERABLE)
-		return cloned;
-
-	if (fetchve(&argv) < 0)
-		return -EINVAL;
-
-	execfd = clone_binary();
-	if (execfd < 0)
-		return -EIO;
-
-	if (putenv(CLONED_BINARY_ENV "=1"))
-		goto error;
-
-	fexecve(execfd, argv, environ);
-error:
-	close(execfd);
-	return -ENOEXEC;
-}
diff --git a/libcontainer/nsenter/nsexec.c b/libcontainer/nsenter/nsexec.c
index 17e0468c6af..9b10b232528 100644
--- a/libcontainer/nsenter/nsexec.c
+++ b/libcontainer/nsenter/nsexec.c
@@ -536,9 +536,6 @@ void join_namespaces(char *nslist)
 	free(namespaces);
 }
 
-/* Defined in cloned_binary.c. */
-extern int ensure_cloned_binary(void);
-
 static inline int sane_kill(pid_t pid, int signum)
 {
 	if (pid > 0)
@@ -791,14 +788,6 @@ void nsexec(void)
 		return;
 	}
 
-	/*
-	 * We need to re-exec if we are not in a cloned binary. This is necessary
-	 * to ensure that containers won't be able to access the host binary
-	 * through /proc/self/exe. See CVE-2019-5736.
-	 */
-	if (ensure_cloned_binary() < 0)
-		bail("could not ensure we are a cloned binary");
-
 	/*
 	 * Inform the parent we're past initial setup.
 	 * For the other side of this, see initWaiter.
diff --git a/libcontainer/process.go b/libcontainer/process.go
index 4de4a9e75c2..d2c7bfcda36 100644
--- a/libcontainer/process.go
+++ b/libcontainer/process.go
@@ -49,6 +49,9 @@ type Process struct {
 	// ExtraFiles specifies additional open files to be inherited by the container
 	ExtraFiles []*os.File
 
+	// open handles to cloned binaries -- see dmz.ClonedBinary for more details
+	clonedExes []*os.File
+
 	// Initial sizings for the console
 	ConsoleWidth  uint16
 	ConsoleHeight uint16
@@ -121,6 +124,15 @@ func (p Process) Signal(sig os.Signal) error {
 	return p.ops.signal(sig)
 }
 
+// closeClonedExes cleans up any existing cloned binaries associated with the
+// Process.
+func (p *Process) closeClonedExes() {
+	for _, exe := range p.clonedExes {
+		_ = exe.Close()
+	}
+	p.clonedExes = nil
+}
+
 // IO holds the process's STDIO
 type IO struct {
 	Stdin  io.WriteCloser
diff --git a/libcontainer/system/linux.go b/libcontainer/system/linux.go
index d2ad5cea229..5acaa4df384 100644
--- a/libcontainer/system/linux.go
+++ b/libcontainer/system/linux.go
@@ -4,10 +4,12 @@
 package system
 
 import (
+	"fmt"
 	"os"
 	"os/exec"
 	"unsafe"
 
+	"github.com/sirupsen/logrus"
 	"golang.org/x/sys/unix"
 )
 
@@ -102,3 +104,29 @@ func GetSubreaper() (int, error) {
 
 	return int(i), nil
 }
+
+func ExecutableMemfd(comment string, flags int) (*os.File, error) {
+	// Try to use MFD_EXEC first. On pre-6.3 kernels we get -EINVAL for this
+	// flag. On post-6.3 kernels, with vm.memfd_noexec=1 this ensures we get an
+	// executable memfd. For vm.memfd_noexec=2 this is a bit more complicated.
+	// The original vm.memfd_noexec=2 implementation incorrectly silently
+	// allowed MFD_EXEC[1] -- this should be fixed in 6.6. On 6.6 and newer
+	// kernels, we will get -EACCES if we try to use MFD_EXEC with
+	// vm.memfd_noexec=2 (for 6.3-6.5, -EINVAL was the intended return value).
+	//
+	// The upshot is we only need to retry without MFD_EXEC on -EINVAL because
+	// it just so happens that passing MFD_EXEC bypasses vm.memfd_noexec=2 on
+	// kernels where -EINVAL is actually a security denial.
+	memfd, err := unix.MemfdCreate(comment, flags|unix.MFD_EXEC)
+	if err == unix.EINVAL {
+		memfd, err = unix.MemfdCreate(comment, flags)
+	}
+	if err != nil {
+		if err == unix.EACCES {
+			logrus.Info("memfd_create(MFD_EXEC) failed, possibly due to vm.memfd_noexec=2 -- falling back to less secure O_TMPFILE")
+		}
+		err := os.NewSyscallError("memfd_create", err)
+		return nil, fmt.Errorf("failed to create executable memfd: %w", err)
+	}
+	return os.NewFile(uintptr(memfd), "/memfd:"+comment), nil
+}

From e089db3b4a31a51c176cc2e26cce9ad0730a8a41 Mon Sep 17 00:00:00 2001
From: Aleksa Sarai <cyphar@cyphar.com>
Date: Sat, 19 Aug 2023 02:12:24 +1000
Subject: [PATCH 3/8] dmz: add fallbacks to handle noexec for O_TMPFILE and
 mktemp()

Previously, if /var/run was mounted noexec, our cloned binary logic
would not work if memfd_create(2) was not available because we would try
to exec a binary that is on a noexec filesystem.

We cannot guarantee there will be an executable filesystem on the system
(other than mounting one ourselves, which would cause a bunch of other
headaches) but we can at least try the obvious options (/tmp, /bin, and
/). If none of these work, we will have to fail.

Reported-by: lifubang <lifubang@acmcoder.com>
Signed-off-by: Aleksa Sarai <cyphar@cyphar.com>
---
 libcontainer/dmz/cloned_binary_linux.go | 57 +++++++++++++++++++++++--
 1 file changed, 53 insertions(+), 4 deletions(-)

diff --git a/libcontainer/dmz/cloned_binary_linux.go b/libcontainer/dmz/cloned_binary_linux.go
index c962ea6fcba..133bdefaff7 100644
--- a/libcontainer/dmz/cloned_binary_linux.go
+++ b/libcontainer/dmz/cloned_binary_linux.go
@@ -5,6 +5,7 @@ import (
 	"fmt"
 	"io"
 	"os"
+	"strconv"
 
 	"github.com/sirupsen/logrus"
 	"golang.org/x/sys/unix"
@@ -19,6 +20,23 @@ var (
 	_ SealFunc = sealFile
 )
 
+func isExecutable(f *os.File) bool {
+	if err := unix.Faccessat(int(f.Fd()), "", unix.X_OK, unix.AT_EACCESS|unix.AT_EMPTY_PATH); err == nil {
+		return true
+	} else if err == unix.EACCES {
+		return false
+	}
+	path := "/proc/self/fd/" + strconv.Itoa(int(f.Fd()))
+	if err := unix.Access(path, unix.X_OK); err == nil {
+		return true
+	} else if err == unix.EACCES {
+		return false
+	}
+	// Cannot check -- assume it's executable (if not, exec will fail).
+	logrus.Debugf("cannot do X_OK check on binary %s -- assuming it's executable", f.Name())
+	return true
+}
+
 const baseMemfdSeals = unix.F_SEAL_SEAL | unix.F_SEAL_SHRINK | unix.F_SEAL_GROW | unix.F_SEAL_WRITE
 
 func sealMemfd(f **os.File) error {
@@ -106,15 +124,46 @@ func getSealableFile(comment, tmpDir string) (file *os.File, sealFn SealFunc, er
 		return
 	}
 	logrus.Debugf("memfd cloned binary failed, falling back to O_TMPFILE: %v", err)
+
+	// The tmpDir here (c.root) might be mounted noexec, so we need a couple of
+	// fallbacks to try. It's possible that none of these are writable and
+	// executable, in which case there's nothing we can practically do (other
+	// than mounting our own executable tmpfs, which would have its own
+	// issues).
+	tmpDirs := []string{
+		tmpDir,
+		os.TempDir(),
+		"/tmp",
+		".",
+		"/bin",
+		"/",
+	}
+
 	// Try to fallback to O_TMPFILE (supported since Linux 3.11).
-	file, sealFn, err = otmpfile(tmpDir)
-	if err == nil {
+	for _, dir := range tmpDirs {
+		file, sealFn, err = otmpfile(dir)
+		if err != nil {
+			continue
+		}
+		if !isExecutable(file) {
+			logrus.Debugf("tmpdir %s is noexec -- trying a different tmpdir", dir)
+			file.Close()
+			continue
+		}
 		return
 	}
 	logrus.Debugf("O_TMPFILE cloned binary failed, falling back to mktemp(): %v", err)
 	// Finally, try a classic unlinked temporary file.
-	file, sealFn, err = mktemp(tmpDir)
-	if err == nil {
+	for _, dir := range tmpDirs {
+		file, sealFn, err = mktemp(dir)
+		if err != nil {
+			continue
+		}
+		if !isExecutable(file) {
+			logrus.Debugf("tmpdir %s is noexec -- trying a different tmpdir", dir)
+			file.Close()
+			continue
+		}
 		return
 	}
 	return nil, nil, fmt.Errorf("could not create sealable file for cloned binary: %w", err)

From dac41717465462b21fab5b5942fe4cb3f47d7e53 Mon Sep 17 00:00:00 2001
From: lifubang <lifubang@acmcoder.com>
Date: Tue, 15 Aug 2023 17:00:22 +0800
Subject: [PATCH 4/8] runc-dmz: reduce memfd binary cloning cost with small C
 binary

The idea is to remove the need for cloning the entire runc binary by
replacing the final execve() call of the container process with an
execve() call to a clone of a small C binary which just does an execve()
of its arguments.

This provides similar protection against CVE-2019-5736 but without
requiring a >10MB binary copy for each "runc init". When compiled with
musl, runc-dmz is 13kB (though unfortunately with glibc, it is 1.1MB
which is still quite large).

It should be noted that there is still a window where the container
processes could get access to the host runc binary, but because we set
ourselves as non-dumpable the container would need CAP_SYS_PTRACE (which
is not enabled by default in Docker) in order to get around the
proc_fd_access_allowed() checks. In addition, since Linux 4.10[1] the
kernel blocks access entirely for user namespaced containers in this
scenario. For those cases we cannot use runc-dmz, but most containers
won't have this issue.

This new runc-dmz binary can be opted out of at compile time by setting
the "runc_nodmz" buildtag, and at runtime by setting the RUNC_DMZ=legacy
environment variable. In both cases, runc will fall back to the classic
/proc/self/exe-based cloning trick. If /proc/self/exe is already a
sealed memfd (namely if the user is using contrib/cmd/memfd-bind to
create a persistent sealed memfd for runc), neither runc-dmz nor
/proc/self/exe cloning will be used because they are not necessary.

[1]: https://github.com/torvalds/linux/commit/bfedb589252c01fa505ac9f6f2a3d5d68d707ef4

Co-authored-by: lifubang <lifubang@acmcoder.com>
Signed-off-by: lifubang <lifubang@acmcoder.com>
[cyphar: address various review nits]
[cyphar: fix runc-dmz cross-compilation]
[cyphar: embed runc-dmz into runc binary and clone in Go code]
[cyphar: make runc-dmz optional, with fallback to /proc/self/exe cloning]
[cyphar: do not use runc-dmz when the container has certain privs]
Co-authored-by: Aleksa Sarai <cyphar@cyphar.com>
Signed-off-by: Aleksa Sarai <cyphar@cyphar.com>
---
 .github/workflows/test.yml                    |  17 ++-
 .golangci-extra.yml                           |   1 +
 .golangci.yml                                 |   1 +
 Makefile                                      |  35 ++++-
 README.md                                     |   7 +-
 cc_platform.mk                                |  61 ++++++++
 libcontainer/container_linux.go               |  98 ++++++++++--
 libcontainer/dmz/.gitignore                   |   1 +
 libcontainer/dmz/Makefile                     |   6 +
 libcontainer/dmz/_dmz.c                       |  10 ++
 libcontainer/dmz/dmz.go                       |   9 ++
 libcontainer/dmz/dmz_fallback_linux.go        |   1 +
 libcontainer/dmz/dmz_linux.go                 |  48 ++++++
 libcontainer/dmz/dmz_unsupported.go           |  12 ++
 libcontainer/init_linux.go                    |  18 ++-
 libcontainer/setns_init_linux.go              |  21 ++-
 libcontainer/standard_init_linux.go           |   7 +-
 .../system/kernelversion/kernel_linux.go      |  94 ++++++++++++
 .../system/kernelversion/kernel_linux_test.go | 140 ++++++++++++++++++
 libcontainer/system/linux.go                  |  46 +++++-
 20 files changed, 608 insertions(+), 25 deletions(-)
 create mode 100644 cc_platform.mk
 create mode 100644 libcontainer/dmz/.gitignore
 create mode 100644 libcontainer/dmz/Makefile
 create mode 100644 libcontainer/dmz/_dmz.c
 create mode 100644 libcontainer/dmz/dmz.go
 create mode 100644 libcontainer/dmz/dmz_fallback_linux.go
 create mode 100644 libcontainer/dmz/dmz_linux.go
 create mode 100644 libcontainer/dmz/dmz_unsupported.go
 create mode 100644 libcontainer/system/kernelversion/kernel_linux.go
 create mode 100644 libcontainer/system/kernelversion/kernel_linux_test.go

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 7f6799f18f6..5a53f1de1eb 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -28,6 +28,7 @@ jobs:
         rootless: ["rootless", ""]
         race: ["-race", ""]
         criu: ["", "criu-dev"]
+        dmz: ["", "runc_nodmz"]
         exclude:
           - criu: criu-dev
             rootless: rootless
@@ -35,6 +36,10 @@ jobs:
             go-version: 1.20.x
           - criu: criu-dev
             race: -race
+          - dmz: runc_nodmz
+            criu: criu-dev
+          - dmz: runc_nodmz
+            os: ubuntu-20.04
     runs-on: ${{ matrix.os }}
 
     steps:
@@ -71,6 +76,8 @@ jobs:
         go-version: ${{ matrix.go-version }}
 
     - name: build
+      env:
+        EXTRA_BUILDTAGS: ${{ matrix.dmz }}
       run: sudo -E PATH="$PATH" make EXTRA_FLAGS="${{ matrix.race }}" all
 
     - name: install bats
@@ -80,6 +87,8 @@ jobs:
 
     - name: unit test
       if: matrix.rootless != 'rootless'
+      env:
+        EXTRA_BUILDTAGS: ${{ matrix.dmz }}
       run: sudo -E PATH="$PATH" -- make TESTFLAGS="${{ matrix.race }}" localunittest
 
     - name: add rootless user
@@ -113,8 +122,12 @@ jobs:
   # However, we do not have 32-bit ARM CI, so we use i386 for testing 32bit stuff.
   # We are not interested in providing official support for i386.
   cross-i386:
-    runs-on: ubuntu-22.04
     timeout-minutes: 15
+    strategy:
+      fail-fast: false
+      matrix:
+        dmz: ["", "runc_nodmz"]
+    runs-on: ubuntu-22.04
 
     steps:
 
@@ -136,4 +149,6 @@ jobs:
         go-version: 1.x # Latest stable
 
     - name: unit test
+      env:
+        EXTRA_BUILDTAGS: ${{ matrix.dmz }}
       run: sudo -E PATH="$PATH" -- make GOARCH=386 localunittest
diff --git a/.golangci-extra.yml b/.golangci-extra.yml
index be33f90d7f9..23b57e040b6 100644
--- a/.golangci-extra.yml
+++ b/.golangci-extra.yml
@@ -7,6 +7,7 @@
 run:
   build-tags:
     - seccomp
+    - runc_nodmz
 
 linters:
   disable-all: true
diff --git a/.golangci.yml b/.golangci.yml
index 96b321019e4..c088117d2ca 100644
--- a/.golangci.yml
+++ b/.golangci.yml
@@ -3,6 +3,7 @@
 run:
   build-tags:
     - seccomp
+    - runc_nodmz
 
 linters:
   enable:
diff --git a/Makefile b/Makefile
index 4e90e1c7c4f..d39df7d0752 100644
--- a/Makefile
+++ b/Makefile
@@ -1,6 +1,11 @@
+SHELL = /bin/bash
+
 CONTAINER_ENGINE := docker
 GO ?= go
 
+# Get CC values for cross-compilation.
+include cc_platform.mk
+
 PREFIX ?= /usr/local
 BINDIR := $(PREFIX)/sbin
 MANDIR := $(PREFIX)/share/man
@@ -10,6 +15,7 @@ GIT_BRANCH_CLEAN := $(shell echo $(GIT_BRANCH) | sed -e "s/[^[:alnum:]]/-/g")
 RUNC_IMAGE := runc_dev$(if $(GIT_BRANCH_CLEAN),:$(GIT_BRANCH_CLEAN))
 PROJECT := github.com/opencontainers/runc
 BUILDTAGS ?= seccomp urfave_cli_no_docs
+BUILDTAGS += $(EXTRA_BUILDTAGS)
 
 COMMIT ?= $(shell git describe --dirty --long --always)
 VERSION := $(shell cat ./VERSION)
@@ -57,16 +63,23 @@ endif
 
 .DEFAULT: runc
 
-runc:
+runc: runc-dmz
 	$(GO_BUILD) -o runc .
+	make verify-dmz-arch
 
 all: runc recvtty sd-helper seccompagent fs-idmap
 
 recvtty sd-helper seccompagent fs-idmap:
 	$(GO_BUILD) -o contrib/cmd/$@/$@ ./contrib/cmd/$@
 
-static:
+static: runc-dmz
 	$(GO_BUILD_STATIC) -o runc .
+	make verify-dmz-arch
+
+.PHONY: runc-dmz
+runc-dmz:
+	rm -f libcontainer/dmz/runc-dmz
+	$(GO) generate -tags "$(BUILDTAGS)" ./libcontainer/dmz
 
 releaseall: RELEASE_ARGS := "-a 386 -a amd64 -a arm64 -a armel -a armhf -a ppc64le -a riscv64 -a s390x"
 releaseall: release
@@ -147,12 +160,12 @@ install-man: man
 	install -D -m 644 man/man8/*.8 $(DESTDIR)$(MANDIR)/man8
 
 clean:
-	rm -f runc runc-*
+	rm -f runc runc-* libcontainer/dmz/runc-dmz
 	rm -f contrib/cmd/recvtty/recvtty
 	rm -f contrib/cmd/sd-helper/sd-helper
 	rm -f contrib/cmd/seccompagent/seccompagent
 	rm -f contrib/cmd/fs-idmap/fs-idmap
-	rm -rf release
+	sudo rm -rf release
 	rm -rf man/man8
 
 cfmt: C_SRC=$(shell git ls-files '*.c' | grep -v '^vendor/')
@@ -188,6 +201,18 @@ verify-dependencies: vendor
 	@test -z "$$(git status --porcelain -- go.mod go.sum vendor/)" \
 		|| (echo -e "git status:\n $$(git status -- go.mod go.sum vendor/)\nerror: vendor/, go.mod and/or go.sum not up to date. Run \"make vendor\" to update"; exit 1) \
 		&& echo "all vendor files are up to date."
+verify-dmz-arch:
+	@test -s libcontainer/dmz/runc-dmz || exit 0; \
+		set -Eeuo pipefail; \
+		export LC_ALL=C; \
+		echo "readelf -h runc"; \
+		readelf -h runc | grep -E "(Machine|Flags):"; \
+		echo "readelf -h libcontainer/dmz/runc-dmz"; \
+		readelf -h libcontainer/dmz/runc-dmz | grep -E "(Machine|Flags):"; \
+		diff -u \
+			<(readelf -h runc | grep -E "(Machine|Flags):") \
+			<(readelf -h libcontainer/dmz/runc-dmz | grep -E "(Machine|Flags):") \
+		&& echo "runc-dmz architecture matches runc binary."
 
 validate-keyring:
 	script/keyring_validate.sh
@@ -197,4 +222,4 @@ validate-keyring:
 	test localtest unittest localunittest integration localintegration \
 	rootlessintegration localrootlessintegration shell install install-bash \
 	install-man clean cfmt shfmt localshfmt shellcheck \
-	vendor verify-changelog verify-dependencies validate-keyring
+	vendor verify-changelog verify-dependencies verify-dmz-arch validate-keyring
diff --git a/README.md b/README.md
index b209c7dcd55..da9786632da 100644
--- a/README.md
+++ b/README.md
@@ -65,9 +65,10 @@ e.g. to disable seccomp:
 make BUILDTAGS=""
 ```
 
-| Build Tag | Feature                            | Enabled by default | Dependency |
-|-----------|------------------------------------|--------------------|------------|
-| seccomp   | Syscall filtering                  | yes                | libseccomp |
+| Build Tag     | Feature                               | Enabled by Default | Dependencies        |
+|---------------|---------------------------------------|--------------------|---------------------|
+| `seccomp`     | Syscall filtering using `libseccomp`. | yes                | `libseccomp`        |
+| `!runc_nodmz` | Reduce memory usage for CVE-2019-5736 protection by using a small C binary. `runc_nodmz` disables this feature and causes runc to use a different protection mechanism which will further increases memory usage temporarily during container startup. This feature can also be disabled at runtime by setting the `RUNC_DMZ=legacy` environment variable. | yes ||
 
 The following build tags were used earlier, but are now obsoleted:
  - **nokmem** (since runc v1.0.0-rc94 kernel memory settings are ignored)
diff --git a/cc_platform.mk b/cc_platform.mk
new file mode 100644
index 00000000000..6aa2b5ecb8b
--- /dev/null
+++ b/cc_platform.mk
@@ -0,0 +1,61 @@
+# NOTE: Make sure you keep this file in sync with scripts/lib.sh.
+
+GO ?= go
+GOARCH ?= $(shell $(GO) env GOARCH)
+
+ifneq ($(shell grep -i "ID_LIKE=.*suse" /etc/os-release),)
+	# openSUSE has a custom PLATFORM
+	PLATFORM ?= suse-linux
+	IS_SUSE := 1
+else
+	PLATFORM ?= linux-gnu
+endif
+
+ifeq ($(GOARCH),$(shell GOARCH= $(GO) env GOARCH))
+	# use the native CC and STRIP
+	HOST :=
+else ifeq ($(GOARCH),386)
+	# Always use the 64-bit compiler to build the 386 binary, which works for
+	# the more common cross-build method for x86 (namely, the equivalent of
+	# dpkg --add-architecture).
+	ifdef IS_SUSE
+		# There is no x86_64-suse-linux-gcc, so use the native one.
+		HOST :=
+		CPU_TYPE := i586
+	else
+		HOST := x86_64-$(PLATFORM)-
+		CPU_TYPE := i686
+	endif
+	CFLAGS := -m32 -march=$(CPU_TYPE) $(CFLAGS)
+else ifeq ($(GOARCH),amd64)
+	ifdef IS_SUSE
+		# There is no x86_64-suse-linux-gcc, so use the native one.
+		HOST :=
+	else
+		HOST := x86_64-$(PLATFORM)-
+	endif
+else ifeq ($(GOARCH),arm64)
+	HOST := aarch64-$(PLATFORM)-
+else ifeq ($(GOARCH),arm)
+	# HOST already configured by release_build.sh in this case.
+else ifeq ($(GOARCH),armel)
+	HOST := arm-$(PLATFORM)eabi-
+else ifeq ($(GOARCH),armhf)
+	HOST := arm-$(PLATFORM)eabihf-
+else ifeq ($(GOARCH),ppc64le)
+	HOST := powerpc64le-$(PLATFORM)-
+else ifeq ($(GOARCH),riscv64)
+	HOST := riscv64-$(PLATFORM)-
+else ifeq ($(GOARCH),s390x)
+	HOST := s390x-$(PLATFORM)-
+else
+$(error Unsupported GOARCH $(GOARCH))
+endif
+
+ifeq ($(origin CC),$(filter $(origin CC),undefined default))
+	# Override CC if it's undefined or just the default value set by Make.
+	CC := $(HOST)gcc
+	export CC
+endif
+STRIP ?= $(HOST)strip
+export STRIP
diff --git a/libcontainer/container_linux.go b/libcontainer/container_linux.go
index 9eb72f86600..e10ca2332ff 100644
--- a/libcontainer/container_linux.go
+++ b/libcontainer/container_linux.go
@@ -27,6 +27,7 @@ import (
 	"github.com/opencontainers/runc/libcontainer/dmz"
 	"github.com/opencontainers/runc/libcontainer/intelrdt"
 	"github.com/opencontainers/runc/libcontainer/system"
+	"github.com/opencontainers/runc/libcontainer/system/kernelversion"
 	"github.com/opencontainers/runc/libcontainer/utils"
 )
 
@@ -444,6 +445,48 @@ func (c *Container) includeExecFifo(cmd *exec.Cmd) error {
 	return nil
 }
 
+// No longer needed in Go 1.21.
+func slicesContains[S ~[]E, E comparable](slice S, needle E) bool {
+	for _, val := range slice {
+		if val == needle {
+			return true
+		}
+	}
+	return false
+}
+
+func isDmzBinarySafe(c *configs.Config) bool {
+	// Because we set the dumpable flag in nsexec, the only time when it is
+	// unsafe to use runc-dmz is when the container process would be able to
+	// race against "runc init" and bypass the ptrace_may_access() checks.
+	//
+	// This is only the case if the container processes could have
+	// CAP_SYS_PTRACE somehow (i.e. the capability is present in the bounding,
+	// inheritable, or ambient sets). Luckily, most containers do not have this
+	// capability.
+	if c.Capabilities == nil ||
+		(!slicesContains(c.Capabilities.Bounding, "CAP_SYS_PTRACE") &&
+			!slicesContains(c.Capabilities.Inheritable, "CAP_SYS_PTRACE") &&
+			!slicesContains(c.Capabilities.Ambient, "CAP_SYS_PTRACE")) {
+		return true
+	}
+
+	// Since Linux 4.10 (see bfedb589252c0) user namespaced containers cannot
+	// access /proc/$pid/exe of runc after it joins the namespace (until it
+	// does an exec), regardless of the capability set. This has been
+	// backported to other distribution kernels, but there's no way of checking
+	// this cheaply -- better to be safe than sorry here.
+	linux410 := kernelversion.KernelVersion{Kernel: 4, Major: 10}
+	if ok, err := kernelversion.GreaterEqualThan(linux410); ok && err == nil {
+		if c.Namespaces.Contains(configs.NEWUSER) {
+			return true
+		}
+	}
+
+	// Assume it's unsafe otherwise.
+	return false
+}
+
 func (c *Container) newParentProcess(p *Process) (parentProcess, error) {
 	parentInitPipe, childInitPipe, err := utils.NewSockPair("init")
 	if err != nil {
@@ -457,27 +500,54 @@ func (c *Container) newParentProcess(p *Process) (parentProcess, error) {
 	}
 	logFilePair := filePair{parentLogPipe, childLogPipe}
 
-	// Make sure we use a new safe copy of /proc/self/exe each time this is
-	// called, to make sure that if a container manages to overwrite the file
-	// it cannot affect other containers on the system. For runc, this code
-	// will only ever be called once, but libcontainer users might call this
-	// more than once.
+	// Make sure we use a new safe copy of /proc/self/exe or the runc-dmz
+	// binary each time this is called, to make sure that if a container
+	// manages to overwrite the file it cannot affect other containers on the
+	// system. For runc, this code will only ever be called once, but
+	// libcontainer users might call this more than once.
 	p.closeClonedExes()
 	var (
 		exePath string
-		safeExe *os.File
+		// only one of dmzExe or safeExe are used at a time
+		dmzExe, safeExe *os.File
 	)
 	if dmz.IsSelfExeCloned() {
 		// /proc/self/exe is already a cloned binary -- no need to do anything
 		logrus.Debug("skipping binary cloning -- /proc/self/exe is already cloned!")
 		exePath = "/proc/self/exe"
 	} else {
-		safeExe, err = dmz.CloneSelfExe(c.root)
-		if err != nil {
-			return nil, fmt.Errorf("unable to create safe /proc/self/exe clone for runc init: %w", err)
+		var err error
+		if isDmzBinarySafe(c.config) {
+			dmzExe, err = dmz.Binary(c.root)
+			if err == nil {
+				// We can use our own executable without cloning if we are using
+				// runc-dmz.
+				exePath = "/proc/self/exe"
+				p.clonedExes = append(p.clonedExes, dmzExe)
+			} else if errors.Is(err, dmz.ErrNoDmzBinary) {
+				logrus.Debug("runc-dmz binary not embedded in runc binary, falling back to /proc/self/exe clone")
+			} else if err != nil {
+				return nil, fmt.Errorf("failed to create runc-dmz binary clone: %w", err)
+			}
+		} else {
+			// If the configuration makes it unsafe to use runc-dmz, pretend we
+			// don't have it embedded so we do /proc/self/exe cloning.
+			logrus.Debug("container configuration unsafe for runc-dmz, falling back to /proc/self/exe clone")
+			err = dmz.ErrNoDmzBinary
+		}
+		if errors.Is(err, dmz.ErrNoDmzBinary) {
+			safeExe, err = dmz.CloneSelfExe(c.root)
+			if err != nil {
+				return nil, fmt.Errorf("unable to create safe /proc/self/exe clone for runc init: %w", err)
+			}
+			exePath = "/proc/self/fd/" + strconv.Itoa(int(safeExe.Fd()))
+			p.clonedExes = append(p.clonedExes, safeExe)
+		}
+		// Just to make sure we don't run without protection.
+		if dmzExe == nil && safeExe == nil {
+			// This should never happen.
+			return nil, fmt.Errorf("[internal error] attempted to spawn a container with no /proc/self/exe protection")
 		}
-		exePath = "/proc/self/fd/" + strconv.Itoa(int(safeExe.Fd()))
-		p.clonedExes = append(p.clonedExes, safeExe)
 	}
 
 	cmd := exec.Command(exePath, "init")
@@ -503,6 +573,12 @@ func (c *Container) newParentProcess(p *Process) (parentProcess, error) {
 		"_LIBCONTAINER_STATEDIR="+c.root,
 	)
 
+	if dmzExe != nil {
+		cmd.ExtraFiles = append(cmd.ExtraFiles, dmzExe)
+		cmd.Env = append(cmd.Env,
+			"_LIBCONTAINER_DMZEXEFD="+strconv.Itoa(stdioFdCount+len(cmd.ExtraFiles)-1))
+	}
+
 	cmd.ExtraFiles = append(cmd.ExtraFiles, childLogPipe)
 	cmd.Env = append(cmd.Env,
 		"_LIBCONTAINER_LOGPIPE="+strconv.Itoa(stdioFdCount+len(cmd.ExtraFiles)-1))
diff --git a/libcontainer/dmz/.gitignore b/libcontainer/dmz/.gitignore
new file mode 100644
index 00000000000..f163ef41c1f
--- /dev/null
+++ b/libcontainer/dmz/.gitignore
@@ -0,0 +1 @@
+/runc-dmz
diff --git a/libcontainer/dmz/Makefile b/libcontainer/dmz/Makefile
new file mode 100644
index 00000000000..24e92db716b
--- /dev/null
+++ b/libcontainer/dmz/Makefile
@@ -0,0 +1,6 @@
+# Get CC values for cross-compilation.
+include ../../cc_platform.mk
+
+runc-dmz: _dmz.c
+	$(CC) $(CFLAGS) -static -o $@ $^
+	$(STRIP) -gs $@
diff --git a/libcontainer/dmz/_dmz.c b/libcontainer/dmz/_dmz.c
new file mode 100644
index 00000000000..6e91b0f90a9
--- /dev/null
+++ b/libcontainer/dmz/_dmz.c
@@ -0,0 +1,10 @@
+#include <unistd.h>
+
+extern char **environ;
+
+int main(int argc, char **argv)
+{
+	if (argc < 1)
+		return 127;
+	return execve(argv[0], argv, environ);
+}
diff --git a/libcontainer/dmz/dmz.go b/libcontainer/dmz/dmz.go
new file mode 100644
index 00000000000..9b6b500807c
--- /dev/null
+++ b/libcontainer/dmz/dmz.go
@@ -0,0 +1,9 @@
+package dmz
+
+import (
+	"errors"
+)
+
+// ErrNoDmzBinary is returned by Binary when there is no runc-dmz binary
+// embedded in the runc program.
+var ErrNoDmzBinary = errors.New("runc-dmz binary not embedded in this program")
diff --git a/libcontainer/dmz/dmz_fallback_linux.go b/libcontainer/dmz/dmz_fallback_linux.go
new file mode 100644
index 00000000000..4f624e048b9
--- /dev/null
+++ b/libcontainer/dmz/dmz_fallback_linux.go
@@ -0,0 +1 @@
+package dmz
diff --git a/libcontainer/dmz/dmz_linux.go b/libcontainer/dmz/dmz_linux.go
new file mode 100644
index 00000000000..12f9709a269
--- /dev/null
+++ b/libcontainer/dmz/dmz_linux.go
@@ -0,0 +1,48 @@
+//go:build !runc_nodmz
+// +build !runc_nodmz
+
+package dmz
+
+import (
+	"bytes"
+	"debug/elf"
+	_ "embed"
+	"os"
+
+	"github.com/sirupsen/logrus"
+)
+
+// Try to build the runc-dmz binary. If it fails, replace it with an empty file
+// (this will trigger us to fall back to a clone of /proc/self/exe). Yeah, this
+// is a bit ugly but it makes sure that weird cross-compilation setups don't
+// break because of runc-dmz.
+//
+//go:generate sh -c "make -B runc-dmz || echo -n >runc-dmz"
+//go:embed runc-dmz
+var runcDmzBinary []byte
+
+// Binary returns a cloned copy (see CloneBinary) of a very minimal C program
+// that just does an execve() of its arguments. This is used in the final
+// execution step of the container execution as an intermediate process before
+// the container process is execve'd. This allows for protection against
+// CVE-2019-5736 without requiring a complete copy of the runc binary. Each
+// call to Binary will return a new copy.
+//
+// If the runc-dmz binary is not embedded into the runc binary, Binary will
+// return ErrNoDmzBinary as the error.
+func Binary(tmpDir string) (*os.File, error) {
+	rdr := bytes.NewBuffer(runcDmzBinary)
+	// Verify that our embedded binary has a standard ELF header.
+	if !bytes.HasPrefix(rdr.Bytes(), []byte(elf.ELFMAG)) {
+		if rdr.Len() != 0 {
+			logrus.Infof("misconfigured build: embedded runc-dmz binary is non-empty but is missing a proper ELF header")
+		}
+		return nil, ErrNoDmzBinary
+	}
+	// Setting RUNC_DMZ=legacy disables this dmz method.
+	if os.Getenv("RUNC_DMZ") == "legacy" {
+		logrus.Debugf("RUNC_DMZ=legacy set -- switching back to classic /proc/self/exe cloning")
+		return nil, ErrNoDmzBinary
+	}
+	return CloneBinary(rdr, int64(rdr.Len()), "runc-dmz", tmpDir)
+}
diff --git a/libcontainer/dmz/dmz_unsupported.go b/libcontainer/dmz/dmz_unsupported.go
new file mode 100644
index 00000000000..2ba67270495
--- /dev/null
+++ b/libcontainer/dmz/dmz_unsupported.go
@@ -0,0 +1,12 @@
+//go:build !linux || runc_nodmz
+// +build !linux runc_nodmz
+
+package dmz
+
+import (
+	"os"
+)
+
+func Binary(_ string) (*os.File, error) {
+	return nil, ErrNoDmzBinary
+}
diff --git a/libcontainer/init_linux.go b/libcontainer/init_linux.go
index 732f64dc660..a24be276878 100644
--- a/libcontainer/init_linux.go
+++ b/libcontainer/init_linux.go
@@ -182,6 +182,17 @@ func startInitialization() (retErr error) {
 		return err
 	}
 
+	// Get runc-dmz fds.
+	var dmzExe *os.File
+	if dmzFdStr := os.Getenv("_LIBCONTAINER_DMZEXEFD"); dmzFdStr != "" {
+		dmzFd, err := strconv.Atoi(dmzFdStr)
+		if err != nil {
+			return fmt.Errorf("unable to convert _LIBCONTAINER_DMZEXEFD: %w", err)
+		}
+		unix.CloseOnExec(dmzFd)
+		dmzExe = os.NewFile(uintptr(dmzFd), "runc-dmz")
+	}
+
 	// clear the current process's environment to clean any libcontainer
 	// specific env vars.
 	os.Clearenv()
@@ -197,10 +208,10 @@ func startInitialization() (retErr error) {
 	}()
 
 	// If init succeeds, it will not return, hence none of the defers will be called.
-	return containerInit(it, pipe, consoleSocket, fifofd, logFD, mountFds{sourceFds: mountSrcFds, idmapFds: idmapFds})
+	return containerInit(it, pipe, consoleSocket, fifofd, logFD, dmzExe, mountFds{sourceFds: mountSrcFds, idmapFds: idmapFds})
 }
 
-func containerInit(t initType, pipe *os.File, consoleSocket *os.File, fifoFd, logFd int, mountFds mountFds) error {
+func containerInit(t initType, pipe *os.File, consoleSocket *os.File, fifoFd, logFd int, dmzExe *os.File, mountFds mountFds) error {
 	var config *initConfig
 	if err := json.NewDecoder(pipe).Decode(&config); err != nil {
 		return err
@@ -208,6 +219,7 @@ func containerInit(t initType, pipe *os.File, consoleSocket *os.File, fifoFd, lo
 	if err := populateProcessEnvironment(config.Env); err != nil {
 		return err
 	}
+
 	switch t {
 	case initSetns:
 		// mount and idmap fds must be nil in this case. We don't mount while doing runc exec.
@@ -220,6 +232,7 @@ func containerInit(t initType, pipe *os.File, consoleSocket *os.File, fifoFd, lo
 			consoleSocket: consoleSocket,
 			config:        config,
 			logFd:         logFd,
+			dmzExe:        dmzExe,
 		}
 		return i.Init()
 	case initStandard:
@@ -230,6 +243,7 @@ func containerInit(t initType, pipe *os.File, consoleSocket *os.File, fifoFd, lo
 			config:        config,
 			fifoFd:        fifoFd,
 			logFd:         logFd,
+			dmzExe:        dmzExe,
 			mountFds:      mountFds,
 		}
 		return i.Init()
diff --git a/libcontainer/setns_init_linux.go b/libcontainer/setns_init_linux.go
index 40a47a2e95c..7709219300b 100644
--- a/libcontainer/setns_init_linux.go
+++ b/libcontainer/setns_init_linux.go
@@ -4,6 +4,7 @@ import (
 	"errors"
 	"fmt"
 	"os"
+	"os/exec"
 	"strconv"
 
 	"github.com/opencontainers/selinux/go-selinux"
@@ -23,6 +24,7 @@ type linuxSetnsInit struct {
 	consoleSocket *os.File
 	config        *initConfig
 	logFd         int
+	dmzExe        *os.File
 }
 
 func (l *linuxSetnsInit) getSessionRingName() string {
@@ -85,6 +87,18 @@ func (l *linuxSetnsInit) Init() error {
 	if err := apparmor.ApplyProfile(l.config.AppArmorProfile); err != nil {
 		return err
 	}
+	// Check for the arg early to make sure it exists.
+	name, err := exec.LookPath(l.config.Args[0])
+	if err != nil {
+		return err
+	}
+	// exec.LookPath in Go < 1.20 might return no error for an executable
+	// residing on a file system mounted with noexec flag, so perform this
+	// extra check now while we can still return a proper error.
+	// TODO: remove this once go < 1.20 is not supported.
+	if err := eaccess(name); err != nil {
+		return &os.PathError{Op: "eaccess", Path: name, Err: err}
+	}
 	// Set seccomp as close to execve as possible, so as few syscalls take
 	// place afterward (reducing the amount of syscalls that users need to
 	// enable in their seccomp profiles).
@@ -98,10 +112,15 @@ func (l *linuxSetnsInit) Init() error {
 		}
 	}
 	logrus.Debugf("setns_init: about to exec")
+
 	// Close the log pipe fd so the parent's ForwardLogs can exit.
 	if err := unix.Close(l.logFd); err != nil {
 		return &os.PathError{Op: "close log pipe", Path: "fd " + strconv.Itoa(l.logFd), Err: err}
 	}
 
-	return system.Execv(l.config.Args[0], l.config.Args[0:], os.Environ())
+	if l.dmzExe != nil {
+		l.config.Args[0] = name
+		return system.Fexecve(l.dmzExe.Fd(), l.config.Args, os.Environ())
+	}
+	return system.Exec(name, l.config.Args, os.Environ())
 }
diff --git a/libcontainer/standard_init_linux.go b/libcontainer/standard_init_linux.go
index c64173ecfc3..4eb3d8db435 100644
--- a/libcontainer/standard_init_linux.go
+++ b/libcontainer/standard_init_linux.go
@@ -25,6 +25,7 @@ type linuxStandardInit struct {
 	parentPid     int
 	fifoFd        int
 	logFd         int
+	dmzExe        *os.File
 	mountFds      mountFds
 	config        *initConfig
 }
@@ -262,5 +263,9 @@ func (l *linuxStandardInit) Init() error {
 		return err
 	}
 
-	return system.Exec(name, l.config.Args[0:], os.Environ())
+	if l.dmzExe != nil {
+		l.config.Args[0] = name
+		return system.Fexecve(l.dmzExe.Fd(), l.config.Args, os.Environ())
+	}
+	return system.Exec(name, l.config.Args, os.Environ())
 }
diff --git a/libcontainer/system/kernelversion/kernel_linux.go b/libcontainer/system/kernelversion/kernel_linux.go
new file mode 100644
index 00000000000..ca5d4130d0c
--- /dev/null
+++ b/libcontainer/system/kernelversion/kernel_linux.go
@@ -0,0 +1,94 @@
+/*
+   Copyright The containerd Authors.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+
+   File copied and customized based on
+   https://github.com/moby/moby/tree/v20.10.14/profiles/seccomp/kernel_linux.go
+
+   File copied from
+   https://github.com/containerd/containerd/blob/v1.7.5/contrib/seccomp/kernelversion/kernel_linux.go
+*/
+
+package kernelversion
+
+import (
+	"bytes"
+	"fmt"
+	"sync"
+
+	"golang.org/x/sys/unix"
+)
+
+// KernelVersion holds information about the kernel.
+type KernelVersion struct {
+	Kernel uint64 // Version of the Kernel (i.e., the "4" in "4.1.2-generic")
+	Major  uint64 // Major revision of the Kernel (i.e., the "1" in "4.1.2-generic")
+}
+
+func (k *KernelVersion) String() string {
+	if k.Kernel > 0 || k.Major > 0 {
+		return fmt.Sprintf("%d.%d", k.Kernel, k.Major)
+	}
+	return ""
+}
+
+var (
+	currentKernelVersion *KernelVersion
+	kernelVersionError   error
+	once                 sync.Once
+)
+
+// getKernelVersion gets the current kernel version.
+func getKernelVersion() (*KernelVersion, error) {
+	once.Do(func() {
+		var uts unix.Utsname
+		if err := unix.Uname(&uts); err != nil {
+			return
+		}
+		// Remove the \x00 from the release for Atoi to parse correctly
+		currentKernelVersion, kernelVersionError = parseRelease(string(uts.Release[:bytes.IndexByte(uts.Release[:], 0)]))
+	})
+	return currentKernelVersion, kernelVersionError
+}
+
+// parseRelease parses a string and creates a KernelVersion based on it.
+func parseRelease(release string) (*KernelVersion, error) {
+	var version KernelVersion
+
+	// We're only make sure we get the "kernel" and "major revision". Sometimes we have
+	// 3.12.25-gentoo, but sometimes we just have 3.12-1-amd64.
+	_, err := fmt.Sscanf(release, "%d.%d", &version.Kernel, &version.Major)
+	if err != nil {
+		return nil, fmt.Errorf("failed to parse kernel version %q: %w", release, err)
+	}
+	return &version, nil
+}
+
+// GreaterEqualThan checks if the host's kernel version is greater than, or
+// equal to the given kernel version v. Only "kernel version" and "major revision"
+// can be specified (e.g., "3.12") and will be taken into account, which means
+// that 3.12.25-gentoo and 3.12-1-amd64 are considered equal (kernel: 3, major: 12).
+func GreaterEqualThan(minVersion KernelVersion) (bool, error) {
+	kv, err := getKernelVersion()
+	if err != nil {
+		return false, err
+	}
+	if kv.Kernel > minVersion.Kernel {
+		return true, nil
+	}
+	if kv.Kernel == minVersion.Kernel && kv.Major >= minVersion.Major {
+		return true, nil
+	}
+	return false, nil
+}
diff --git a/libcontainer/system/kernelversion/kernel_linux_test.go b/libcontainer/system/kernelversion/kernel_linux_test.go
new file mode 100644
index 00000000000..a18f1f2226f
--- /dev/null
+++ b/libcontainer/system/kernelversion/kernel_linux_test.go
@@ -0,0 +1,140 @@
+/*
+   Copyright The containerd Authors.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+
+   File copied and customized based on
+   https://github.com/moby/moby/tree/v20.10.14/profiles/seccomp/kernel_linux_test.go
+*/
+
+package kernelversion
+
+import (
+	"fmt"
+	"testing"
+)
+
+func TestGetKernelVersion(t *testing.T) {
+	version, err := getKernelVersion()
+	if err != nil {
+		t.Fatal(err)
+	}
+	if version == nil {
+		t.Fatal("version is nil")
+	}
+	if version.Kernel == 0 {
+		t.Fatal("no kernel version")
+	}
+}
+
+func TestParseRelease(t *testing.T) {
+	tests := []struct {
+		in          string
+		out         KernelVersion
+		expectedErr error
+	}{
+		{in: "3.8", out: KernelVersion{Kernel: 3, Major: 8}},
+		{in: "3.8.0", out: KernelVersion{Kernel: 3, Major: 8}},
+		{in: "3.8.0-19-generic", out: KernelVersion{Kernel: 3, Major: 8}},
+		{in: "3.4.54.longterm-1", out: KernelVersion{Kernel: 3, Major: 4}},
+		{in: "3.10.0-862.2.3.el7.x86_64", out: KernelVersion{Kernel: 3, Major: 10}},
+		{in: "3.12.8tag", out: KernelVersion{Kernel: 3, Major: 12}},
+		{in: "3.12-1-amd64", out: KernelVersion{Kernel: 3, Major: 12}},
+		{in: "3.12foobar", out: KernelVersion{Kernel: 3, Major: 12}},
+		{in: "99.999.999-19-generic", out: KernelVersion{Kernel: 99, Major: 999}},
+		{in: "", expectedErr: fmt.Errorf(`failed to parse kernel version "": EOF`)},
+		{in: "3", expectedErr: fmt.Errorf(`failed to parse kernel version "3": unexpected EOF`)},
+		{in: "3.", expectedErr: fmt.Errorf(`failed to parse kernel version "3.": EOF`)},
+		{in: "3a", expectedErr: fmt.Errorf(`failed to parse kernel version "3a": input does not match format`)},
+		{in: "3.a", expectedErr: fmt.Errorf(`failed to parse kernel version "3.a": expected integer`)},
+		{in: "a", expectedErr: fmt.Errorf(`failed to parse kernel version "a": expected integer`)},
+		{in: "a.a", expectedErr: fmt.Errorf(`failed to parse kernel version "a.a": expected integer`)},
+		{in: "a.a.a-a", expectedErr: fmt.Errorf(`failed to parse kernel version "a.a.a-a": expected integer`)},
+		{in: "-3", expectedErr: fmt.Errorf(`failed to parse kernel version "-3": expected integer`)},
+		{in: "-3.", expectedErr: fmt.Errorf(`failed to parse kernel version "-3.": expected integer`)},
+		{in: "-3.8", expectedErr: fmt.Errorf(`failed to parse kernel version "-3.8": expected integer`)},
+		{in: "-3.-8", expectedErr: fmt.Errorf(`failed to parse kernel version "-3.-8": expected integer`)},
+		{in: "3.-8", expectedErr: fmt.Errorf(`failed to parse kernel version "3.-8": expected integer`)},
+	}
+	for _, tc := range tests {
+		tc := tc
+		t.Run(tc.in, func(t *testing.T) {
+			version, err := parseRelease(tc.in)
+			if tc.expectedErr != nil {
+				if err == nil {
+					t.Fatal("expected an error")
+				}
+				if err.Error() != tc.expectedErr.Error() {
+					t.Fatalf("expected: %s, got: %s", tc.expectedErr, err)
+				}
+				return
+			}
+			if err != nil {
+				t.Fatal("unexpected error:", err)
+			}
+			if version == nil {
+				t.Fatal("version is nil")
+			}
+			if version.Kernel != tc.out.Kernel || version.Major != tc.out.Major {
+				t.Fatalf("expected: %d.%d, got: %d.%d", tc.out.Kernel, tc.out.Major, version.Kernel, version.Major)
+			}
+		})
+	}
+}
+
+func TestGreaterEqualThan(t *testing.T) {
+	// Get the current kernel version, so that we can make test relative to that
+	v, err := getKernelVersion()
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	tests := []struct {
+		doc      string
+		in       KernelVersion
+		expected bool
+	}{
+		{
+			doc:      "same version",
+			in:       KernelVersion{v.Kernel, v.Major},
+			expected: true,
+		},
+		{
+			doc:      "kernel minus one",
+			in:       KernelVersion{v.Kernel - 1, v.Major},
+			expected: true,
+		},
+		{
+			doc:      "kernel plus one",
+			in:       KernelVersion{v.Kernel + 1, v.Major},
+			expected: false,
+		},
+		{
+			doc:      "major plus one",
+			in:       KernelVersion{v.Kernel, v.Major + 1},
+			expected: false,
+		},
+	}
+	for _, tc := range tests {
+		tc := tc
+		t.Run(tc.doc+": "+tc.in.String(), func(t *testing.T) {
+			ok, err := GreaterEqualThan(tc.in)
+			if err != nil {
+				t.Fatal("unexpected error:", err)
+			}
+			if ok != tc.expected {
+				t.Fatalf("expected: %v, got: %v", tc.expected, ok)
+			}
+		})
+	}
+}
diff --git a/libcontainer/system/linux.go b/libcontainer/system/linux.go
index 5acaa4df384..8f52496e9bb 100644
--- a/libcontainer/system/linux.go
+++ b/libcontainer/system/linux.go
@@ -7,6 +7,8 @@ import (
 	"fmt"
 	"os"
 	"os/exec"
+	"strconv"
+	"syscall"
 	"unsafe"
 
 	"github.com/sirupsen/logrus"
@@ -38,7 +40,6 @@ func Execv(cmd string, args []string, env []string) error {
 	if err != nil {
 		return err
 	}
-
 	return Exec(name, args, env)
 }
 
@@ -51,6 +52,49 @@ func Exec(cmd string, args []string, env []string) error {
 	}
 }
 
+func execveat(fd uintptr, pathname string, args []string, env []string, flags int) error {
+	pathnamep, err := syscall.BytePtrFromString(pathname)
+	if err != nil {
+		return err
+	}
+
+	argvp, err := syscall.SlicePtrFromStrings(args)
+	if err != nil {
+		return err
+	}
+
+	envp, err := syscall.SlicePtrFromStrings(env)
+	if err != nil {
+		return err
+	}
+
+	_, _, errno := syscall.Syscall6(
+		unix.SYS_EXECVEAT,
+		fd,
+		uintptr(unsafe.Pointer(pathnamep)),
+		uintptr(unsafe.Pointer(&argvp[0])),
+		uintptr(unsafe.Pointer(&envp[0])),
+		uintptr(flags),
+		0,
+	)
+	return errno
+}
+
+func Fexecve(fd uintptr, args []string, env []string) error {
+	var err error
+	for {
+		err = execveat(fd, "", args, env, unix.AT_EMPTY_PATH)
+		if err != unix.EINTR { // nolint:errorlint // unix errors are bare
+			break
+		}
+	}
+	if err == unix.ENOSYS { // nolint:errorlint // unix errors are bare
+		// Fallback to classic /proc/self/fd/... exec.
+		return Exec("/proc/self/fd/"+strconv.Itoa(int(fd)), args, env)
+	}
+	return os.NewSyscallError("execveat", err)
+}
+
 func SetParentDeathSignal(sig uintptr) error {
 	if err := unix.Prctl(unix.PR_SET_PDEATHSIG, sig, 0, 0, 0); err != nil {
 		return err

From b9a4727f54c02bda3e3ad87b6340980cc73f9b42 Mon Sep 17 00:00:00 2001
From: Aleksa Sarai <cyphar@cyphar.com>
Date: Tue, 11 Jul 2023 22:20:48 +1000
Subject: [PATCH 5/8] contrib: memfd-bind: add helper for memfd-sealed-bind
 trick

This really isn't ideal but it can be used to avoid the largest issues
with the memfd-based runc binary protection. There are several caveats
with using this tool, see the help page for the new binary for details.

Signed-off-by: Aleksa Sarai <cyphar@cyphar.com>
---
 .gitignore                                 |   9 +-
 Makefile                                   |   7 +-
 README.md                                  |   4 +-
 contrib/cmd/memfd-bind/README.md           |  67 ++++++
 contrib/cmd/memfd-bind/memfd-bind.go       | 240 +++++++++++++++++++++
 contrib/cmd/memfd-bind/memfd-bind@.service |  11 +
 6 files changed, 330 insertions(+), 8 deletions(-)
 create mode 100644 contrib/cmd/memfd-bind/README.md
 create mode 100644 contrib/cmd/memfd-bind/memfd-bind.go
 create mode 100644 contrib/cmd/memfd-bind/memfd-bind@.service

diff --git a/.gitignore b/.gitignore
index 4df0d6abfde..f022ed275cd 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,10 +1,11 @@
 vendor/pkg
 /runc
 /runc-*
-contrib/cmd/recvtty/recvtty
-contrib/cmd/sd-helper/sd-helper
-contrib/cmd/seccompagent/seccompagent
-contrib/cmd/fs-idmap/fs-idmap
+/contrib/cmd/recvtty/recvtty
+/contrib/cmd/sd-helper/sd-helper
+/contrib/cmd/seccompagent/seccompagent
+/contrib/cmd/fs-idmap/fs-idmap
+/contrib/cmd/memfd-bind/memfd-bind
 man/man8
 release
 Vagrantfile
diff --git a/Makefile b/Makefile
index d39df7d0752..d3c1c11cb86 100644
--- a/Makefile
+++ b/Makefile
@@ -67,9 +67,9 @@ runc: runc-dmz
 	$(GO_BUILD) -o runc .
 	make verify-dmz-arch
 
-all: runc recvtty sd-helper seccompagent fs-idmap
+all: runc recvtty sd-helper seccompagent fs-idmap memfd-bind
 
-recvtty sd-helper seccompagent fs-idmap:
+recvtty sd-helper seccompagent fs-idmap memfd-bind:
 	$(GO_BUILD) -o contrib/cmd/$@/$@ ./contrib/cmd/$@
 
 static: runc-dmz
@@ -161,10 +161,11 @@ install-man: man
 
 clean:
 	rm -f runc runc-* libcontainer/dmz/runc-dmz
+	rm -f contrib/cmd/fs-idmap/fs-idmap
 	rm -f contrib/cmd/recvtty/recvtty
 	rm -f contrib/cmd/sd-helper/sd-helper
 	rm -f contrib/cmd/seccompagent/seccompagent
-	rm -f contrib/cmd/fs-idmap/fs-idmap
+	rm -f contrib/cmd/memfd-bind/memfd-bind
 	sudo rm -rf release
 	rm -rf man/man8
 
diff --git a/README.md b/README.md
index da9786632da..827f837e06f 100644
--- a/README.md
+++ b/README.md
@@ -68,13 +68,15 @@ make BUILDTAGS=""
 | Build Tag     | Feature                               | Enabled by Default | Dependencies        |
 |---------------|---------------------------------------|--------------------|---------------------|
 | `seccomp`     | Syscall filtering using `libseccomp`. | yes                | `libseccomp`        |
-| `!runc_nodmz` | Reduce memory usage for CVE-2019-5736 protection by using a small C binary. `runc_nodmz` disables this feature and causes runc to use a different protection mechanism which will further increases memory usage temporarily during container startup. This feature can also be disabled at runtime by setting the `RUNC_DMZ=legacy` environment variable. | yes ||
+| `!runc_nodmz` | Reduce memory usage for CVE-2019-5736 protection by using a small C binary, [see `memfd-bind` for more details][contrib-memfd-bind]. `runc_nodmz` disables this feature and causes runc to use a different protection mechanism which will further increases memory usage temporarily during container startup. This feature can also be disabled at runtime by setting the `RUNC_DMZ=legacy` environment variable. | yes ||
 
 The following build tags were used earlier, but are now obsoleted:
  - **nokmem** (since runc v1.0.0-rc94 kernel memory settings are ignored)
  - **apparmor** (since runc v1.0.0-rc93 the feature is always enabled)
  - **selinux**  (since runc v1.0.0-rc93 the feature is always enabled)
 
+ [contrib-memfd-bind]: /contrib/memfd-bind/README.md
+
 ### Running the test suite
 
 `runc` currently supports running its test suite via Docker.
diff --git a/contrib/cmd/memfd-bind/README.md b/contrib/cmd/memfd-bind/README.md
new file mode 100644
index 00000000000..f2ceae2fa78
--- /dev/null
+++ b/contrib/cmd/memfd-bind/README.md
@@ -0,0 +1,67 @@
+## memfd-bind ##
+
+`runc` normally has to make a binary copy of itself (or of a smaller helper
+binary called `runc-dmz`) when constructing a container process in order to
+defend against certain container runtime attacks such as CVE-2019-5736.
+
+This cloned binary only exists until the container process starts (this means
+for `runc run` and `runc exec`, it only exists for a few hundred milliseconds
+-- for `runc create` it exists until `runc start` is called). However, because
+the clone is done using a memfd (or by creating files in directories that are
+likely to be a `tmpfs`), this can lead to temporary increases in *host* memory
+usage. Unless you are running on a cgroupv1 system with the cgroupv1 memory
+controller enabled and the (deprecated) `memory.move_charge_at_immigrate`
+enabled, there is no effect on the container's memory.
+
+However, for certain configurations this can still be undesirable. This daemon
+allows you to create a sealed memfd copy of the `runc` binary, which will cause
+`runc` to skip all binary copying, resulting in no additional memory usage for
+each container process (instead there is a single in-memory copy of the
+binary). It should be noted that (strictly speaking) this is slightly less
+secure if you are concerned about Dirty Cow-like 0-day kernel vulnerabilities,
+but for most users the security benefit is identical.
+
+The provided `memfd-bind@.service` file can be used to get systemd to manage
+this daemon. You can supply the path like so:
+
+```
+% systemctl start memfd-bind@/usr/bin/runc
+```
+
+Thus, there are three ways of protecting against CVE-2019-5736, in order of how
+much memory usage they can use:
+
+* `memfd-bind` only creates a single in-memory copy of the `runc` binary (about
+  10MB), regardless of how many containers are running.
+
+* `runc-dmz` is (depending on which libc it was compiled with) between 10kB and
+  1MB in size, and a copy is created once per process spawned inside a
+  container by runc (both the pid1 and every `runc exec`). There are
+  circumstances where using `runc-dmz` will fail in ways that runc cannot
+  predict ahead of time (such as restrictive LSMs applied to containers), in
+  which case users can disable it with the `RUNC_DMZ=legacy` setting.
+  `runc-dmz` also requires an additional `execve` over the other options,
+  though since the binary is so small the cost is probably not even noticeable.
+
+* The classic method of making a copy of the entire `runc` binary during
+  container process setup takes up about 10MB per process spawned inside the
+  container by runc (both pid1 and `runc exec`).
+
+### Caveats ###
+
+There are several downsides with using `memfd-bind` on the `runc` binary:
+
+* The `memfd-bind` process needs to continue to run indefinitely in order for
+  the memfd reference to stay alive. If the process is forcefully killed, the
+  bind-mount on top of the `runc` binary will become stale and nobody will be
+  able to execute it (you can use `memfd-bind --cleanup` to clean up the stale
+  mount).
+
+* Only root can execute the cloned binary due to permission restrictions on
+  accessing other process's files. More specifically, only users with ptrace
+  privileges over the memfd-bind daemon can access the file (but in practice
+  this is usually only root).
+
+* When updating `runc`, the daemon needs to be stopped before the update (so
+  the package manager can access the underlying file) and then restarted after
+  the update.
diff --git a/contrib/cmd/memfd-bind/memfd-bind.go b/contrib/cmd/memfd-bind/memfd-bind.go
new file mode 100644
index 00000000000..e73739f0c4d
--- /dev/null
+++ b/contrib/cmd/memfd-bind/memfd-bind.go
@@ -0,0 +1,240 @@
+/*
+ * Copyright (c) 2023 SUSE LLC
+ * Copyright (c) 2023 Aleksa Sarai <cyphar@cyphar.com>
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package main
+
+import (
+	"errors"
+	"fmt"
+	"io"
+	"os"
+	"os/signal"
+	"runtime"
+	"strings"
+	"time"
+
+	"github.com/opencontainers/runc/libcontainer/dmz"
+
+	"github.com/sirupsen/logrus"
+	"github.com/urfave/cli"
+	"golang.org/x/sys/unix"
+)
+
+// version will be populated by the Makefile, read from
+// VERSION file of the source code.
+var version = ""
+
+// gitCommit will be the hash that the binary was built from
+// and will be populated by the Makefile.
+var gitCommit = ""
+
+const (
+	usage = `Open Container Initiative contrib/cmd/memfd-bind
+
+In order to protect against certain container attacks, every runc invocation
+that involves creating or joining a container will cause runc to make a copy of
+the runc binary in memory (usually to a memfd). While "runc init" is very
+short-lived, this extra memory usage can cause problems for containers with
+very small memory limits (or containers that have many "runc exec" invocations
+applied to them at the same time).
+
+memfd-bind is a tool to create a persistent memfd-sealed-copy of the runc binary,
+which will cause runc to not make its own copy. This means you can get the
+benefits of using a sealed memfd as runc's binary (even in a container breakout
+attack to get write access to the runc binary, neither the underlying binary
+nor the memfd copy can be changed).
+
+To use memfd-bind, just specify which path you want to create a socket path at
+which you want to receive terminals:
+
+    $ sudo memfd-bind /usr/bin/runc
+
+Note that (due to kernel restrictions on bind-mounts), this program must remain
+running on the host in order for the binary to be readable (it is recommended
+you use a systemd unit to keep this process around).
+
+If this program dies, there will be a leftover mountpoint that always returns
+-EINVAL when attempting to access it. You need to use memfd-bind --cleanup on the
+path in order to unmount the path (regular umount(8) will not work):
+
+    $ sudo memfd-bind --cleanup /usr/bin/runc
+
+Note that (due to restrictions on /proc/$pid/fd/$fd magic-link resolution),
+only privileged users (specifically, those that have ptrace privileges over the
+memfd-bind daemon) can access the memfd bind-mount. This means that using this
+tool to harden your /usr/bin/runc binary would result in unprivileged users
+being unable to execute the binary. If this is an issue, you could make all
+privileged process use a different copy of runc (by making a copy in somewhere
+like /usr/sbin/runc) and only using memfd-bind for the version used by
+privileged users.
+`
+)
+
+func cleanup(path string) error {
+	file, err := os.OpenFile(path, unix.O_PATH|unix.O_NOFOLLOW|unix.O_CLOEXEC, 0)
+	if err != nil {
+		return fmt.Errorf("cleanup: failed to open runc binary path: %w", err)
+	}
+	defer file.Close()
+	fdPath := fmt.Sprintf("/proc/self/fd/%d", file.Fd())
+
+	// Keep umounting until we hit a umount error.
+	for unix.Unmount(fdPath, unix.MNT_DETACH) == nil {
+		// loop...
+		logrus.Debugf("memfd-bind: path %q unmount succeeded...", path)
+	}
+	logrus.Infof("memfd-bind: path %q has been cleared of all old bind-mounts", path)
+	return nil
+}
+
+// memfdClone is a memfd-only implementation of dmz.CloneBinary.
+func memfdClone(path string) (*os.File, error) {
+	binFile, err := os.Open(path)
+	if err != nil {
+		return nil, fmt.Errorf("failed to open runc binary path: %w", err)
+	}
+	defer binFile.Close()
+	stat, err := binFile.Stat()
+	if err != nil {
+		return nil, fmt.Errorf("checking %s size: %w", path, err)
+	}
+	size := stat.Size()
+	memfd, sealFn, err := dmz.Memfd("/proc/self/exe")
+	if err != nil {
+		return nil, fmt.Errorf("creating memfd failed: %w", err)
+	}
+	copied, err := io.Copy(memfd, binFile)
+	if err != nil {
+		return nil, fmt.Errorf("copy binary: %w", err)
+	} else if copied != size {
+		return nil, fmt.Errorf("copied binary size mismatch: %d != %d", copied, size)
+	}
+	if err := sealFn(&memfd); err != nil {
+		return nil, fmt.Errorf("could not seal fd: %w", err)
+	}
+	if !dmz.IsCloned(memfd) {
+		return nil, fmt.Errorf("cloned memfd is not properly sealed")
+	}
+	return memfd, nil
+}
+
+func mount(path string) error {
+	memfdFile, err := memfdClone(path)
+	if err != nil {
+		return fmt.Errorf("memfd clone: %w", err)
+	}
+	defer memfdFile.Close()
+	memfdPath := fmt.Sprintf("/proc/self/fd/%d", memfdFile.Fd())
+
+	// We have to open an O_NOFOLLOW|O_PATH to the memfd magic-link because we
+	// cannot bind-mount the memfd itself (it's in the internal kernel mount
+	// namespace and cross-mount-namespace bind-mounts are not allowed). This
+	// also requires that this program stay alive continuously for the
+	// magic-link to stay alive...
+	memfdLink, err := os.OpenFile(memfdPath, unix.O_PATH|unix.O_NOFOLLOW|unix.O_CLOEXEC, 0)
+	if err != nil {
+		return fmt.Errorf("mount: failed to /proc/self/fd magic-link for memfd: %w", err)
+	}
+	defer memfdLink.Close()
+	memfdLinkFdPath := fmt.Sprintf("/proc/self/fd/%d", memfdLink.Fd())
+
+	exeFile, err := os.OpenFile(path, unix.O_PATH|unix.O_NOFOLLOW|unix.O_CLOEXEC, 0)
+	if err != nil {
+		return fmt.Errorf("mount: failed to open target runc binary path: %w", err)
+	}
+	defer exeFile.Close()
+	exeFdPath := fmt.Sprintf("/proc/self/fd/%d", exeFile.Fd())
+
+	err = unix.Mount(memfdLinkFdPath, exeFdPath, "", unix.MS_BIND, "")
+	if err != nil {
+		return fmt.Errorf("mount: failed to mount memfd on top of runc binary path target: %w", err)
+	}
+
+	// If there is a signal we want to do cleanup.
+	sigCh := make(chan os.Signal, 1)
+	signal.Notify(sigCh, os.Interrupt, unix.SIGTERM, unix.SIGINT)
+	go func() {
+		<-sigCh
+		logrus.Infof("memfd-bind: exit signal caught! cleaning up the bind-mount on %q...", path)
+		_ = cleanup(path)
+		os.Exit(0)
+	}()
+
+	// Clean up things we don't need...
+	_ = exeFile.Close()
+	_ = memfdLink.Close()
+
+	// We now have to stay alive to keep the magic-link alive...
+	logrus.Infof("memfd-bind: bind-mount of memfd over %q created -- looping forever!", path)
+	for {
+		// loop forever...
+		time.Sleep(time.Duration(1<<63 - 1))
+		// make sure the memfd isn't gc'd
+		runtime.KeepAlive(memfdFile)
+	}
+}
+
+func main() {
+	app := cli.NewApp()
+	app.Name = "memfd-bind"
+	app.Usage = usage
+
+	// Set version to be the same as runC.
+	var v []string
+	if version != "" {
+		v = append(v, version)
+	}
+	if gitCommit != "" {
+		v = append(v, "commit: "+gitCommit)
+	}
+	app.Version = strings.Join(v, "\n")
+
+	// Set the flags.
+	app.Flags = []cli.Flag{
+		cli.BoolFlag{
+			Name:  "cleanup",
+			Usage: "Do not create a new memfd-sealed file, only clean up an existing one at <path>.",
+		},
+		cli.BoolFlag{
+			Name:  "debug",
+			Usage: "Enable debug logging.",
+		},
+	}
+
+	app.Action = func(ctx *cli.Context) error {
+		args := ctx.Args()
+		if len(args) != 1 {
+			return errors.New("need to specify a single path to the runc binary")
+		}
+		path := ctx.Args()[0]
+
+		if ctx.Bool("debug") {
+			logrus.SetLevel(logrus.DebugLevel)
+		}
+
+		err := cleanup(path)
+		// We only care about cleanup errors when doing --cleanup.
+		if ctx.Bool("cleanup") {
+			return err
+		}
+		return mount(path)
+	}
+	if err := app.Run(os.Args); err != nil {
+		fmt.Fprintf(os.Stderr, "memfd-bind: %v\n", err)
+		os.Exit(1)
+	}
+}
diff --git a/contrib/cmd/memfd-bind/memfd-bind@.service b/contrib/cmd/memfd-bind/memfd-bind@.service
new file mode 100644
index 00000000000..591548ea4d9
--- /dev/null
+++ b/contrib/cmd/memfd-bind/memfd-bind@.service
@@ -0,0 +1,11 @@
+[Unit]
+Description=Manage memfd-bind of %I
+Documentation=https://github.com/opencontainers/runc
+
+[Service]
+Type=simple
+ExecStart=memfd-bind "%I"
+ExecStop=memfd-bind --cleanup "%I"
+
+[Install]
+WantedBy=multi-user.target

From 6be763eeaa89df33d54f865ffc21372262b4806e Mon Sep 17 00:00:00 2001
From: Aleksa Sarai <cyphar@cyphar.com>
Date: Sat, 19 Aug 2023 00:01:55 +1000
Subject: [PATCH 6/8] tests: integration: fix capability setting for
 CAP_DAC_OVERRIDE

Due to the way capabilities have to be set by runc, capabilities need to
be included in the inheritable and ambient sets anyway. Otherwise, the
container process would not have the correct privileges. This test only
functioned because adding CAP_DAC_OVERRIDE to the inherited,
permissible, and bounding sets means that only "runc init" has these
capabilities -- everything other than the bounding set is cleared on the
first execve(). This breaks with runc-dmz, but the behaviour was broken
from the outset.

Docker appears to not handle this properly at all (the logic for
capability sets changed with the introduction of ambient capabilities,
and while Docker was updated it seems the behaviour is still incorrect
for non-root users).

Signed-off-by: Aleksa Sarai <cyphar@cyphar.com>
---
 tests/integration/start_hello.bats | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tests/integration/start_hello.bats b/tests/integration/start_hello.bats
index 87005484748..6fbb893e695 100644
--- a/tests/integration/start_hello.bats
+++ b/tests/integration/start_hello.bats
@@ -58,6 +58,8 @@ function teardown() {
 	# Enable CAP_DAC_OVERRIDE.
 	update_config '	  .process.capabilities.bounding += ["CAP_DAC_OVERRIDE"]
 			| .process.capabilities.effective += ["CAP_DAC_OVERRIDE"]
+			| .process.capabilities.inheritable += ["CAP_DAC_OVERRIDE"]
+			| .process.capabilities.ambient += ["CAP_DAC_OVERRIDE"]
 			| .process.capabilities.permitted += ["CAP_DAC_OVERRIDE"]'
 
 	runc run test_busybox

From f8348f64ae2789edd1a930b449cc7e11bbc54b0e Mon Sep 17 00:00:00 2001
From: Aleksa Sarai <cyphar@cyphar.com>
Date: Sat, 16 Sep 2023 15:28:52 +1000
Subject: [PATCH 7/8] tests: integration: add runc-dmz smoke tests

Signed-off-by: Aleksa Sarai <cyphar@cyphar.com>
---
 libcontainer/container_linux.go              |  2 ++
 tests/integration/helpers.bash               | 10 ++++--
 tests/integration/run.bats                   | 34 ++++++++++++++++++++
 tests/integration/seccomp-notify-compat.bats |  4 +--
 4 files changed, 45 insertions(+), 5 deletions(-)

diff --git a/libcontainer/container_linux.go b/libcontainer/container_linux.go
index e10ca2332ff..ae5d4fb46b4 100644
--- a/libcontainer/container_linux.go
+++ b/libcontainer/container_linux.go
@@ -524,6 +524,7 @@ func (c *Container) newParentProcess(p *Process) (parentProcess, error) {
 				// runc-dmz.
 				exePath = "/proc/self/exe"
 				p.clonedExes = append(p.clonedExes, dmzExe)
+				logrus.Debug("runc-dmz: using runc-dmz") // used for tests
 			} else if errors.Is(err, dmz.ErrNoDmzBinary) {
 				logrus.Debug("runc-dmz binary not embedded in runc binary, falling back to /proc/self/exe clone")
 			} else if err != nil {
@@ -542,6 +543,7 @@ func (c *Container) newParentProcess(p *Process) (parentProcess, error) {
 			}
 			exePath = "/proc/self/fd/" + strconv.Itoa(int(safeExe.Fd()))
 			p.clonedExes = append(p.clonedExes, safeExe)
+			logrus.Debug("runc-dmz: using /proc/self/exe clone") // used for tests
 		}
 		// Just to make sure we don't run without protection.
 		if dmzExe == nil && safeExe == nil {
diff --git a/tests/integration/helpers.bash b/tests/integration/helpers.bash
index cd08fb2459f..7e6399a47b8 100755
--- a/tests/integration/helpers.bash
+++ b/tests/integration/helpers.bash
@@ -646,12 +646,16 @@ function teardown_bundle() {
 	remove_parent
 }
 
-function requires_kernel() {
+function is_kernel_gte() {
 	local major_required minor_required
 	major_required=$(echo "$1" | cut -d. -f1)
 	minor_required=$(echo "$1" | cut -d. -f2)
-	if [[ "$KERNEL_MAJOR" -lt $major_required || ("$KERNEL_MAJOR" -eq $major_required && "$KERNEL_MINOR" -lt $minor_required) ]]; then
-		skip "requires kernel $1"
+	[[ "$KERNEL_MAJOR" -gt $major_required || ("$KERNEL_MAJOR" -eq $major_required && "$KERNEL_MINOR" -ge $minor_required) ]]
+}
+
+function requires_kernel() {
+	if ! is_kernel_gte "$@"; then
+		skip "requires kernel >= $1"
 	fi
 }
 
diff --git a/tests/integration/run.bats b/tests/integration/run.bats
index 9f1f1d8bc74..baf91fb00cd 100644
--- a/tests/integration/run.bats
+++ b/tests/integration/run.bats
@@ -126,3 +126,37 @@ function teardown() {
 	[ "$status" -eq 0 ]
 	[ "$output" = "410" ]
 }
+
+@test "runc run [runc-dmz]" {
+	runc --debug run test_hello
+	[ "$status" -eq 0 ]
+	[[ "$output" = *"Hello World"* ]]
+	# We use runc-dmz if we can.
+	[[ "$output" = *"runc-dmz: using runc-dmz"* ]]
+}
+
+@test "runc run [cap_sys_ptrace -> /proc/self/exe clone]" {
+	# Add CAP_SYS_PTRACE to the bounding set, the minimum needed to indicate a
+	# container process _could_ get CAP_SYS_PTRACE.
+	update_config '.process.capabilities.bounding += ["CAP_SYS_PTRACE"]'
+
+	runc --debug run test_hello
+	[ "$status" -eq 0 ]
+	[[ "$output" = *"Hello World"* ]]
+	if [ "$EUID" -ne 0 ] && is_kernel_gte 4.10; then
+		# For Linux 4.10 and later, rootless containers will use runc-dmz
+		# because they are running in a user namespace. See isDmzBinarySafe().
+		[[ "$output" = *"runc-dmz: using runc-dmz"* ]]
+	else
+		# If the container has CAP_SYS_PTRACE and is not rootless, we use
+		# /proc/self/exe cloning.
+		[[ "$output" = *"runc-dmz: using /proc/self/exe clone"* ]]
+	fi
+}
+
+@test "RUNC_DMZ=legacy runc run [/proc/self/exe clone]" {
+	RUNC_DMZ=legacy runc --debug run test_hello
+	[ "$status" -eq 0 ]
+	[[ "$output" = *"Hello World"* ]]
+	[[ "$output" = *"runc-dmz: using /proc/self/exe clone"* ]]
+}
diff --git a/tests/integration/seccomp-notify-compat.bats b/tests/integration/seccomp-notify-compat.bats
index 8d663edda51..6ca3449bffa 100644
--- a/tests/integration/seccomp-notify-compat.bats
+++ b/tests/integration/seccomp-notify-compat.bats
@@ -3,8 +3,8 @@
 load helpers
 
 function setup() {
-	if [[ "$KERNEL_MAJOR" -gt 5 || ("$KERNEL_MAJOR" -eq 5 && "$KERNEL_MINOR" -ge 6) ]]; then
-		skip "requires kernel less than 5.6"
+	if is_kernel_gte 5.6; then
+		skip "requires kernel < 5.6"
 	fi
 
 	requires arch_x86_64

From 90c8d36afe5c5b6ff181d27c1389990497ffc6ab Mon Sep 17 00:00:00 2001
From: Aleksa Sarai <cyphar@cyphar.com>
Date: Fri, 22 Sep 2023 15:51:36 +1000
Subject: [PATCH 8/8] dmz: use sendfile(2) when cloning /proc/self/exe

This results in a 5-20% speedup of dmz.CloneBinary(), depending on the
machine.

io.Copy:

  goos: linux
  goarch: amd64
  pkg: github.com/opencontainers/runc/libcontainer/dmz
  cpu: Intel(R) Core(TM) i5-10210U CPU @ 1.60GHz
  BenchmarkCloneBinary
  BenchmarkCloneBinary-8               139           8075074 ns/op
  PASS
  ok      github.com/opencontainers/runc/libcontainer/dmz 2.286s

unix.Sendfile:

  goos: linux
  goarch: amd64
  pkg: github.com/opencontainers/runc/libcontainer/dmz
  cpu: Intel(R) Core(TM) i5-10210U CPU @ 1.60GHz
  BenchmarkCloneBinary
  BenchmarkCloneBinary-8               192           6382121 ns/op
  PASS
  ok      github.com/opencontainers/runc/libcontainer/dmz 2.415s

Signed-off-by: Aleksa Sarai <cyphar@cyphar.com>
---
 libcontainer/dmz/cloned_binary_linux.go |  2 +-
 libcontainer/system/linux.go            | 40 +++++++++++++++++++++++++
 2 files changed, 41 insertions(+), 1 deletion(-)

diff --git a/libcontainer/dmz/cloned_binary_linux.go b/libcontainer/dmz/cloned_binary_linux.go
index 133bdefaff7..db5e18a3260 100644
--- a/libcontainer/dmz/cloned_binary_linux.go
+++ b/libcontainer/dmz/cloned_binary_linux.go
@@ -179,7 +179,7 @@ func CloneBinary(src io.Reader, size int64, name, tmpDir string) (*os.File, erro
 	if err != nil {
 		return nil, err
 	}
-	copied, err := io.Copy(file, src)
+	copied, err := system.Copy(file, src)
 	if err != nil {
 		file.Close()
 		return nil, fmt.Errorf("copy binary: %w", err)
diff --git a/libcontainer/system/linux.go b/libcontainer/system/linux.go
index 8f52496e9bb..318b6edfe81 100644
--- a/libcontainer/system/linux.go
+++ b/libcontainer/system/linux.go
@@ -5,6 +5,7 @@ package system
 
 import (
 	"fmt"
+	"io"
 	"os"
 	"os/exec"
 	"strconv"
@@ -174,3 +175,42 @@ func ExecutableMemfd(comment string, flags int) (*os.File, error) {
 	}
 	return os.NewFile(uintptr(memfd), "/memfd:"+comment), nil
 }
+
+// Copy is like io.Copy except it uses sendfile(2) if the source and sink are
+// both (*os.File) as an optimisation to make copies faster.
+func Copy(dst io.Writer, src io.Reader) (copied int64, err error) {
+	dstFile, _ := dst.(*os.File)
+	srcFile, _ := src.(*os.File)
+
+	if dstFile != nil && srcFile != nil {
+		fi, err := srcFile.Stat()
+		if err != nil {
+			goto fallback
+		}
+		size := fi.Size()
+		for size > 0 {
+			n, err := unix.Sendfile(int(dstFile.Fd()), int(srcFile.Fd()), nil, int(size))
+			if n > 0 {
+				size -= int64(n)
+				copied += int64(n)
+			}
+			if err == unix.EINTR {
+				continue
+			}
+			if err != nil {
+				if copied == 0 {
+					// If we haven't copied anything so far, we can safely just
+					// fallback to io.Copy. We could always do the fallback but
+					// it's safer to error out in the case of a partial copy
+					// followed by an error (which should never happen).
+					goto fallback
+				}
+				return copied, fmt.Errorf("partial sendfile copy: %w", err)
+			}
+		}
+		return copied, nil
+	}
+
+fallback:
+	return io.Copy(dst, src)
+}