Skip to content

Commit

Permalink
merge pr #3599 into opencontainers/runc:main
Browse files Browse the repository at this point in the history
Cory Snider (5):
  libct/nsenter: namespace the bindfd shuffle
  libct/nsenter: set FD_CLOEXEC on received fd
  libct/nsenter: refactor ipc funcs for reusability
  libct/nsenter: annotate write_log() prototype
  chore(libct/nsenter): extract utility code

LGTMs: AkihiroSuda kolyshkin cyphar
Closes #3599
  • Loading branch information
cyphar committed Jul 4, 2023
2 parents 164e4bc + 017d699 commit 35eff7c
Show file tree
Hide file tree
Showing 8 changed files with 433 additions and 244 deletions.
186 changes: 165 additions & 21 deletions libcontainer/nsenter/cloned_binary.c
Original file line number Diff line number Diff line change
Expand Up @@ -49,14 +49,20 @@
#include <fcntl.h>
#include <errno.h>

#include <sched.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/statfs.h>
#include <sys/vfs.h>
#include <sys/mman.h>
#include <sys/mount.h>
#include <sys/sendfile.h>
#include <sys/socket.h>
#include <sys/syscall.h>
#include <sys/wait.h>

#include "ipc.h"
#include "log.h"

/* Use our own wrapper for memfd_create. */
#ifndef SYS_memfd_create
Expand Down Expand Up @@ -394,6 +400,123 @@ static int seal_execfd(int *fd, int fdtype)
return -1;
}

/*
 * Arguments handed to bindfd_in_subprocess() through the opaque argument
 * pointer of clone().
 */
struct bindfd_child_args {
	/* Socket on which the child passes the resulting fd with send_fd(). */
	int sockfd;
	/* Path the child bind-mounts the executable onto before opening it. */
	const char *mount_target;
};

/*
 * Child-side half of the bindfd dance; run via clone() in a new mount
 * namespace.
 *
 * arg points to a struct bindfd_child_args. The function bind-mounts the
 * running executable read-only onto args->mount_target, opens an O_PATH
 * handle to that mount, detaches the mount and passes the handle to the
 * other end of args->sockfd with send_fd().
 *
 * Returns 0 on success, or a positive errno value describing the first step
 * that failed.
 */
static int bindfd_in_subprocess(void *arg)
{
	/*
	 * In the interests of efficiency (read: minimizing the syscall count)
	 * and conciseness, no attempt is made to release resources which would
	 * be cleaned up automatically on process exit, i.e. when this function
	 * returns. This includes filesystem mounts, as this function is
	 * executed in a dedicated mount namespace.
	 */

	/*
	 * For obvious reasons this won't work in rootless mode because we
	 * haven't created a userns -- but getting that to work will be a bit
	 * complicated and it's only worth doing if someone actually needs it.
	 */
	if (mount("none", "/", NULL, MS_SLAVE | MS_REC, NULL) < 0)
		return errno;
	/*
	 * The kernel resolves the magic symlink /proc/self/exe to the real file
	 * _in the original mount namespace_. Cross-namespace bind mounts are
	 * not allowed, so we must locate the file inside the current mount
	 * namespace to be able to bind-mount it. (The mount(8) command resolves
	 * symlinks, which is why it appears to work at first glance.)
	 */
	char linkbuf[PATH_MAX + 1] = { 0 };
	ssize_t linkpathlen = readlink("/proc/self/exe", linkbuf, sizeof(linkbuf));
	if (linkpathlen < 0)
		return errno;
	if (linkpathlen == sizeof(linkbuf)) {
		/*
		 * The link path is longer than PATH_MAX, and the contents of
		 * linkbuf might have been truncated. A truncated path could
		 * happen to be a valid path to a different file, which could
		 * allow for local privilege escalation if we were to exec it.
		 * The mount syscall doesn't accept paths longer than PATH_MAX,
		 * anyway.
		 */
		return ENAMETOOLONG;
	}

	int srcfd = open(linkbuf, O_PATH | O_CLOEXEC);
	if (srcfd < 0)
		return errno;
	/*
	 * linkbuf holds the path to the binary which the parent process was
	 * launched from. Someone could have moved a different file to that path
	 * in the interim, in which case srcfd is not the file we want to
	 * bind-mount. Guard against this situation by verifying srcfd is the
	 * same file as /proc/self/exe.
	 */
	struct stat realexe = { 0 };
	if (stat("/proc/self/exe", &realexe) < 0)
		return errno;
	struct stat resolved = { 0 };
	if (fstat(srcfd, &resolved) < 0)
		return errno;
	if (resolved.st_dev != realexe.st_dev || resolved.st_ino != realexe.st_ino)
		return ENOENT;
	/*
	 * snprintf() returns the length the full string would have had, so
	 * truncation is signalled by any value >= the buffer size (not only
	 * an exact match), and a negative value signals an output error.
	 */
	int fdpathlen = snprintf(linkbuf, sizeof(linkbuf), "/proc/self/fd/%d", srcfd);
	if (fdpathlen < 0 || (size_t)fdpathlen >= sizeof(linkbuf))
		return ENAMETOOLONG;

	const struct bindfd_child_args *args = arg;
	if (mount(linkbuf, args->mount_target, "", MS_BIND, "") < 0)
		return errno;
	if (mount("", args->mount_target, "", MS_REMOUNT | MS_BIND | MS_RDONLY, "") < 0)
		return errno;

	int fd = open(args->mount_target, O_PATH | O_CLOEXEC);
	if (fd < 0)
		return errno;

	/*
	 * Make sure the MNT_DETACH works, otherwise we could get remounted
	 * read-write and that would be quite bad.
	 */
	if (umount2(args->mount_target, MNT_DETACH) < 0)
		return errno;

	if (send_fd(args->sockfd, fd) < 0)
		return errno;
	return 0;
}

static int spawn_bindfd_child(const struct bindfd_child_args *args) __attribute__((noinline));

/*
 * Start bindfd_in_subprocess(args) in a child that lives in a new mount
 * namespace. Returns the value clone() reported for the child on success, or
 * a negated errno value if clone() failed.
 */
static int spawn_bindfd_child(const struct bindfd_child_args *args)
{
	/*
	 * The child's stack is carved out of our own call stack, which is
	 * guaranteed to be mapped appropriately for use as a stack.
	 * (Technically only the libc clone() wrapper writes to this buffer.
	 * The child process operates on a copy of the parent's virtual memory
	 * space and so can safely overflow into the rest of the stack memory
	 * region without consequence.)
	 */
	char child_stack[4 * 1024] __attribute__((aligned(16)));

	/*
	 * Hand clone() the high end of the buffer: we assume the stack grows
	 * down, as HP-PA, the only Linux platform where stack grows up, is
	 * obsolete.
	 */
	char *stack_top = child_stack + sizeof(child_stack);

	/*
	 * CLONE_VFORK suspends the parent until the child has exited, saving
	 * an unnecessary context switch as we'd just be waiting for the child
	 * process to exit anyway.
	 */
	const int clone_flags = CLONE_NEWNS | CLONE_VFORK;

	int child = clone(bindfd_in_subprocess, stack_top, clone_flags, (void *)args);
	return child < 0 ? -errno : child;
}

static int try_bindfd(void)
{
int fd, ret = -1;
Expand All @@ -415,32 +538,53 @@ static int try_bindfd(void)
close(fd);

/*
* For obvious reasons this won't work in rootless mode because we haven't
* created a userns+mntns -- but getting that to work will be a bit
* complicated and it's only worth doing if someone actually needs it.
* Daemons such as systemd and udisks2 watch /proc/self/mountinfo and
* re-parse it on every change, which gets expensive when the mount table
* is large and/or changes frequently. Perform the mount operations in a
* new, private mount namespace so as not to wake up those processes
* every time we nsexec into a container. We clone a child process into
* a new mount namespace to do the dirty work so the side effects of
* unsharing the mount namespace do not leak into the current process.
*/
ret = -EPERM;
if (mount("/proc/self/exe", template, "", MS_BIND, "") < 0)
goto out;
if (mount("", template, "", MS_REMOUNT | MS_BIND | MS_RDONLY, "") < 0)
goto out_umount;
int sock[2];
if (socketpair(AF_LOCAL, SOCK_STREAM, 0, sock) < 0) {
ret = -errno;
goto cleanup_unlink;
}

/* Get read-only handle that we're sure can't be made read-write. */
ret = open(template, O_PATH | O_CLOEXEC);
struct bindfd_child_args args = {
.sockfd = sock[0],
.mount_target = template,
};
int cpid = spawn_bindfd_child(&args);
close(sock[0]);
if (cpid < 0) {
ret = cpid;
goto cleanup_socketpair;
}

out_umount:
/*
* Make sure the MNT_DETACH works, otherwise we could get remounted
* read-write and that would be quite bad (the fd would be made read-write
* too, invalidating the protection).
*/
if (umount2(template, MNT_DETACH) < 0) {
if (ret >= 0)
close(ret);
ret = -ENOTRECOVERABLE;
int wstatus = 0;
if (waitpid(cpid, &wstatus, __WCLONE) < 0)
bail("error waiting for bindfd child process to exit");
if (WIFEXITED(wstatus)) {
if (WEXITSTATUS(wstatus)) {
ret = -WEXITSTATUS(wstatus);
goto cleanup_socketpair;
}
} else if (WIFSIGNALED(wstatus)) {
int sig = WTERMSIG(wstatus);
bail("bindfd child process terminated by signal %d (%s)", sig, strsignal(sig));
} else {
/* Should never happen... */
bail("unexpected waitpid() status for bindfd child process: 0x%x", wstatus);
}

out:
ret = receive_fd(sock[1]);

cleanup_socketpair:
close(sock[1]);

cleanup_unlink:
/*
* We don't care about unlink errors, the worst that happens is that
* there's an empty file left around in STATEDIR.
Expand Down
27 changes: 27 additions & 0 deletions libcontainer/nsenter/getenv.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
#define _GNU_SOURCE
#include <errno.h>
#include <limits.h>
#include <stdlib.h>
#include "getenv.h"
#include "log.h"

/*
 * Look up the environment variable `name` and return its value parsed as a
 * base-10, non-negative int.
 *
 * Returns -ENOENT when the variable is unset or set to the empty string.
 * Bails out when the value is not a complete base-10 integer, does not fit
 * in an int, or is negative.
 */
int getenv_int(const char *name)
{
	char *val, *endptr;
	long parsed;
	int ret;

	val = getenv(name);
	/* Treat empty value as unset variable. */
	if (val == NULL || *val == '\0')
		return -ENOENT;

	/* strtol() reports overflow only through errno, so clear it first. */
	errno = 0;
	parsed = strtol(val, &endptr, 10);
	if (val == endptr || *endptr != '\0')
		bail("unable to parse %s=%s", name, val);
	/*
	 * Reject values that overflowed long or do not fit in the int we
	 * return, rather than silently truncating them.
	 */
	if (errno == ERANGE || parsed > INT_MAX || parsed < INT_MIN)
		bail("value out of range for %s=%s", name, val);
	ret = (int)parsed;
	/*
	 * Sanity check: this must be a non-negative number.
	 */
	if (ret < 0)
		bail("bad value for %s=%s (%d)", name, val, ret);

	return ret;
}
13 changes: 13 additions & 0 deletions libcontainer/nsenter/getenv.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
#ifndef NSENTER_GETENV_H
#define NSENTER_GETENV_H

/*
 * Returns the value of the environment variable `name` parsed as a base-10,
 * non-negative integer, or -ENOENT if the variable was not found or has an
 * empty value.
 *
 * If the value cannot be converted to an integer (no digits, or trailing
 * characters after the number), or the result is out of range (negative
 * values are rejected), the function bails out.
 */
int getenv_int(const char *name);

#endif /* NSENTER_GETENV_H */
84 changes: 84 additions & 0 deletions libcontainer/nsenter/ipc.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
#define _GNU_SOURCE
#include <alloca.h>
#include <stdlib.h>
#include <string.h>
#include <sys/socket.h>
#include "ipc.h"
#include "log.h"

/*
 * Receive exactly one file descriptor from the UNIX domain socket sockfd.
 *
 * Reads a single one-byte message and expects it to carry one SOL_SOCKET /
 * SCM_RIGHTS control message holding exactly one descriptor. The descriptor
 * is received with MSG_CMSG_CLOEXEC. Returns the received descriptor; any
 * failure or unexpected message bails out.
 */
int receive_fd(int sockfd)
{
	/*
	 * Fixed-size control buffer; the union gives it the alignment of
	 * struct cmsghdr. Being on the stack, it needs no allocation-failure
	 * path and no free().
	 */
	union {
		char buf[CMSG_SPACE(sizeof(int))];
		struct cmsghdr align;
	} control;
	char null_byte = '\0';
	struct iovec iov = {
		.iov_base = &null_byte,
		.iov_len = 1,
	};
	struct msghdr msg = {
		.msg_iov = &iov,
		.msg_iovlen = 1,
		.msg_control = control.buf,
		.msg_controllen = sizeof(control.buf),
	};
	struct cmsghdr *cmsg;
	ssize_t bytes_read;
	int fd_count;
	int ret;

	memset(&control, 0, sizeof(control));

	bytes_read = recvmsg(sockfd, &msg, MSG_CMSG_CLOEXEC);
	if (bytes_read != 1)
		bail("failed to receive fd from unix socket %d", sockfd);
	if (msg.msg_flags & MSG_CTRUNC)
		bail("received truncated control message from unix socket %d", sockfd);

	cmsg = CMSG_FIRSTHDR(&msg);
	if (!cmsg)
		bail("received message from unix socket %d without control message", sockfd);

	if (cmsg->cmsg_level != SOL_SOCKET)
		bail("received unknown control message from unix socket %d: cmsg_level=%d", sockfd, cmsg->cmsg_level);

	if (cmsg->cmsg_type != SCM_RIGHTS)
		bail("received unknown control message from unix socket %d: cmsg_type=%d", sockfd, cmsg->cmsg_type);

	fd_count = (cmsg->cmsg_len - CMSG_LEN(0)) / sizeof(int);
	if (fd_count != 1)
		bail("received control message from unix socket %d with too many fds: %d", sockfd, fd_count);

	/*
	 * CMSG_DATA() is not guaranteed to be suitably aligned for an int, so
	 * copy the descriptor out instead of dereferencing a cast pointer
	 * (mirrors how send_fd() stores it).
	 */
	memcpy(&ret, CMSG_DATA(cmsg), sizeof(ret));
	return ret;
}

/*
 * Pass the open file descriptor fd to the peer of the UNIX domain socket
 * sockfd as an SCM_RIGHTS control message attached to a single NUL byte of
 * payload. Returns whatever sendmsg(2) returns.
 */
int send_fd(int sockfd, int fd)
{
	/*
	 * Fixed-size control buffer instead of alloca(): the size is a
	 * compile-time constant, and the union gives the buffer the alignment
	 * of struct cmsghdr.
	 */
	union {
		char buf[CMSG_SPACE(sizeof(int))];
		struct cmsghdr align;
	} control;
	char null_byte = '\0';
	struct iovec iov = {
		.iov_base = &null_byte,
		.iov_len = 1,
	};
	struct msghdr msg = {
		.msg_iov = &iov,
		.msg_iovlen = 1,
		/* We send only one fd as specified by cmsg->cmsg_len below, even
		 * though msg.msg_controllen might have more space due to alignment. */
		.msg_control = control.buf,
		.msg_controllen = sizeof(control.buf),
	};
	struct cmsghdr *cmsg;

	memset(&control, 0, sizeof(control));

	cmsg = CMSG_FIRSTHDR(&msg);
	cmsg->cmsg_level = SOL_SOCKET;
	cmsg->cmsg_type = SCM_RIGHTS;
	cmsg->cmsg_len = CMSG_LEN(sizeof(int));
	memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));

	return sendmsg(sockfd, &msg, 0);
}
12 changes: 12 additions & 0 deletions libcontainer/nsenter/ipc.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
#ifndef NSENTER_IPC_H
#define NSENTER_IPC_H

/*
 * receive_fd reads one message from the UNIX domain socket sockfd and returns
 * the single file descriptor carried in its SCM_RIGHTS control message. The
 * descriptor is received with MSG_CMSG_CLOEXEC. If the message cannot be
 * received, or does not carry exactly one descriptor, the function bails out.
 */
int receive_fd(int sockfd);

/*
 * send_fd passes the open file descriptor fd to another process via the UNIX
 * domain socket sockfd. The return value of the sendmsg(2) call is returned.
 */
int send_fd(int sockfd, int fd);

#endif /* NSENTER_IPC_H */
Loading

0 comments on commit 35eff7c

Please sign in to comment.