From 64bb59f5920b15d886cb2be52aede641fd4a047b Mon Sep 17 00:00:00 2001 From: Aleksa Sarai Date: Mon, 8 Jun 2020 01:24:30 +1000 Subject: [PATCH 1/2] nsenter: improve debug logging In order to make 'runc --debug' actually useful for debugging nsexec bugs, provide information about all the internal operations when in debug mode. [@kolyshkin: rebasing; fix formatting via indent for make validate to pass] Signed-off-by: Aleksa Sarai Signed-off-by: Kir Kolyshkin --- libcontainer/init_linux.go | 4 +- libcontainer/nsenter/nsexec.c | 292 +++++++++++++++++++++------------- tests/integration/debug.bats | 2 +- tests/integration/exec.bats | 2 +- 4 files changed, 186 insertions(+), 114 deletions(-) diff --git a/libcontainer/init_linux.go b/libcontainer/init_linux.go index 91fb794663b..d6b50ff3b60 100644 --- a/libcontainer/init_linux.go +++ b/libcontainer/init_linux.go @@ -35,8 +35,8 @@ const ( ) type pid struct { - Pid int `json:"pid"` - PidFirstChild int `json:"pid_first"` + Pid int `json:"stage2_pid"` + PidFirstChild int `json:"stage1_pid"` } // network is an internal struct used to setup container networks. diff --git a/libcontainer/nsenter/nsexec.c b/libcontainer/nsenter/nsexec.c index e33a60edd43..dbf1725a7e0 100644 --- a/libcontainer/nsenter/nsexec.c +++ b/libcontainer/nsenter/nsexec.c @@ -36,7 +36,7 @@ enum sync_t { SYNC_RECVPID_PLS = 0x42, /* Tell parent we're sending the PID. */ SYNC_RECVPID_ACK = 0x43, /* PID was correctly received by parent. */ SYNC_GRANDCHILD = 0x44, /* The grandchild is ready to run. */ - SYNC_CHILD_READY = 0x45, /* The child or grandchild is ready to return. */ + SYNC_CHILD_FINISH = 0x45, /* The child or grandchild has finished. */ }; /* @@ -45,10 +45,14 @@ enum sync_t { */ #define CREATECGROUPNS 0x80 +#define STAGE_SETUP -1 /* longjmp() arguments. */ -#define JUMP_PARENT 0x00 -#define JUMP_CHILD 0xA0 -#define JUMP_INIT 0xA1 +#define STAGE_PARENT 0 +#define STAGE_CHILD 1 +#define STAGE_INIT 2 + +/* Stores the current stage of nsexec. */ +int current_stage = STAGE_SETUP; /* Assume the stack grows down, so arguments should be above it. */ struct clone_t { @@ -134,33 +138,41 @@ int setns(int fd, int nstype) } #endif -static void write_log_with_info(const char *level, const char *function, int line, const char *format, ...) +static void write_log(const char *level, const char *format, ...) { - char message[1024] = { }; - + char *message = NULL, *stage = NULL; va_list args; + int ret; if (logfd < 0 || level == NULL) - return; + goto out; va_start(args, format); - if (vsnprintf(message, sizeof(message), format, args) < 0) - goto done; - - dprintf(logfd, "{\"level\":\"%s\", \"msg\": \"%s:%d %s\"}\n", level, function, line, message); -done: + ret = vasprintf(&message, format, args); va_end(args); -} + if (ret < 0) + goto out; + + if (current_stage == STAGE_SETUP) + stage = strdup("nsexec"); + else + ret = asprintf(&stage, "nsexec-%d", current_stage); + if (ret < 0) + goto out; -#define write_log(level, fmt, ...) \ - write_log_with_info((level), __FUNCTION__, __LINE__, (fmt), ##__VA_ARGS__) + dprintf(logfd, "{\"level\":\"%s\", \"msg\": \"%s[%d]: %s\"}\n", level, stage, getpid(), message); + +out: + free(message); + free(stage); +} /* XXX: This is ugly. */ static int syncfd = -1; #define bail(fmt, ...) \ do { \ - write_log(FATAL, "nsenter: " fmt ": %m", ##__VA_ARGS__); \ + write_log(FATAL, fmt ": %m", ##__VA_ARGS__); \ exit(1); \ } while(0) @@ -297,9 +309,11 @@ static void update_uidmap(const char *path, int pid, char *map, size_t map_len) if (map == NULL || map_len <= 0) return; + write_log(DEBUG, "update /proc/%d/uid_map to '%s'", pid, map); if (write_file(map, map_len, "/proc/%d/uid_map", pid) < 0) { if (errno != EPERM) bail("failed to update /proc/%d/uid_map", pid); + write_log(DEBUG, "update /proc/%d/uid_map got -EPERM (trying %s)", pid, path); if (try_mapping_tool(path, pid, map, map_len)) bail("failed to use newuid map on %d", pid); } @@ -310,9 +324,11 @@ static void update_gidmap(const char *path, int pid, char *map, size_t map_len) if (map == NULL || map_len <= 0) return; + write_log(DEBUG, "update /proc/%d/gid_map to '%s'", pid, map); if (write_file(map, map_len, "/proc/%d/gid_map", pid) < 0) { if (errno != EPERM) bail("failed to update /proc/%d/gid_map", pid); + write_log(DEBUG, "update /proc/%d/gid_map got -EPERM (trying %s)", pid, path); if (try_mapping_tool(path, pid, map, map_len)) bail("failed to use newgid map on %d", pid); } @@ -323,6 +339,7 @@ static void update_oom_score_adj(char *data, size_t len) if (data == NULL || len <= 0) return; + write_log(DEBUG, "update /proc/self/oom_score_adj to '%s'", data); if (write_file(data, len, "/proc/self/oom_score_adj") < 0) bail("failed to update /proc/self/oom_score_adj"); } @@ -507,7 +524,6 @@ void join_namespaces(char *nslist) char *namespace = strtok_r(nslist, ",", &saveptr); struct namespace_t { int fd; - int ns; char type[PATH_MAX]; char path[PATH_MAX]; } *namespaces = NULL; @@ -542,7 +558,7 @@ void join_namespaces(char *nslist) bail("failed to open %s", path); ns->fd = fd; - ns->ns = nsflag(namespace); + strncpy(ns->type, namespace, PATH_MAX - 1); strncpy(ns->path, path, PATH_MAX - 1); ns->path[PATH_MAX - 1] = '\0'; } while ((namespace = strtok_r(NULL, ",", &saveptr)) != NULL); @@ -555,12 +571,14 @@ void join_namespaces(char *nslist) */ for (i = 0; i < num; i++) { - struct namespace_t ns = namespaces[i]; + struct namespace_t *ns = &namespaces[i]; + int flag = nsflag(ns->type); - if (setns(ns.fd, ns.ns) < 0) - bail("failed to setns to %s", ns.path); + write_log(DEBUG, "setns(%#x) into %s namespace (with path %s)", flag, ns->type, ns->path); + if (setns(ns->fd, flag) < 0) + bail("failed to setns into %s namespace", ns->type); - close(ns.fd); + close(ns->fd); } free(namespaces); @@ -569,6 +587,14 @@ void join_namespaces(char *nslist) /* Defined in cloned_binary.c. */ extern int ensure_cloned_binary(void); +static inline int sane_kill(pid_t pid, int signum) +{ + if (pid > 0) + return kill(pid, signum); + else + return 0; +} + void nsexec(void) { int pipenum; @@ -605,7 +631,7 @@ void nsexec(void) if (write(pipenum, "", 1) != 1) bail("could not inform the parent we are past initial setup"); - write_log(DEBUG, "nsexec started"); + write_log(DEBUG, "=> nsexec container setup"); /* Parse all of the netlink configuration. */ nl_parse(pipenum, &config); @@ -629,6 +655,7 @@ void nsexec(void) * containers), which is the recommendation from the kernel folks. */ if (config.namespaces) { + write_log(DEBUG, "set process as non-dumpable"); if (prctl(PR_SET_DUMPABLE, 0, 0, 0, 0) < 0) bail("failed to set process as non-dumpable"); } @@ -693,45 +720,49 @@ void nsexec(void) * -- Aleksa "what has my life come to?" Sarai */ - switch (setjmp(env)) { + current_stage = setjmp(env); + switch (current_stage) { /* * Stage 0: We're in the parent. Our job is just to create a new child - * (stage 1: JUMP_CHILD) process and write its uid_map and + * (stage 1: STAGE_CHILD) process and write its uid_map and * gid_map. That process will go on to create a new process, then * it will send us its PID which we will send to the bootstrap * process. */ - case JUMP_PARENT:{ + case STAGE_PARENT:{ int len; - pid_t child, first_child = -1; - bool ready = false; + pid_t stage1_pid = -1, stage2_pid = -1; + bool stage1_complete, stage2_complete; /* For debugging. */ prctl(PR_SET_NAME, (unsigned long)"runc:[0:PARENT]", 0, 0, 0); + write_log(DEBUG, "~> nsexec stage-0"); /* Start the process of getting a container. */ - child = clone_parent(&env, JUMP_CHILD); - if (child < 0) - bail("unable to fork: child_func"); + write_log(DEBUG, "spawn stage-1"); + stage1_pid = clone_parent(&env, STAGE_CHILD); + if (stage1_pid < 0) + bail("unable to spawn stage-1"); - /* - * State machine for synchronisation with the children. - * - * Father only return when both child and grandchild are - * ready, so we can receive all possible error codes - * generated by children. - */ syncfd = sync_child_pipe[1]; close(sync_child_pipe[0]); - while (!ready) { + /* + * State machine for synchronisation with the children. We only + * return once both the child and grandchild are ready. + */ + write_log(DEBUG, "-> stage-1 synchronisation loop"); + stage1_complete = false; + while (!stage1_complete) { enum sync_t s; if (read(syncfd, &s, sizeof(s)) != sizeof(s)) - bail("failed to sync with child: next state"); + bail("failed to sync with stage-1: next state"); switch (s) { case SYNC_USERMAP_PLS: + write_log(DEBUG, "stage-1 requested userns mappings"); + /* * Enable setgroups(2) if we've been asked to. But we also * have to explicitly disable setgroups(2) if we're @@ -742,70 +773,78 @@ void nsexec(void) * For rootless multi-entry mapping, config.is_setgroup shall be true and * newuidmap/newgidmap shall be used. */ - if (config.is_rootless_euid && !config.is_setgroup) - update_setgroups(child, SETGROUPS_DENY); + update_setgroups(stage1_pid, SETGROUPS_DENY); /* Set up mappings. */ - update_uidmap(config.uidmappath, child, config.uidmap, config.uidmap_len); - update_gidmap(config.gidmappath, child, config.gidmap, config.gidmap_len); + update_uidmap(config.uidmappath, stage1_pid, config.uidmap, config.uidmap_len); + update_gidmap(config.gidmappath, stage1_pid, config.gidmap, config.gidmap_len); s = SYNC_USERMAP_ACK; if (write(syncfd, &s, sizeof(s)) != sizeof(s)) { - kill(child, SIGKILL); - bail("failed to sync with child: write(SYNC_USERMAP_ACK)"); + sane_kill(stage1_pid, SIGKILL); + sane_kill(stage2_pid, SIGKILL); + bail("failed to sync with stage-1: write(SYNC_USERMAP_ACK)"); } break; case SYNC_RECVPID_PLS: - first_child = child; + write_log(DEBUG, "stage-1 requested pid to be forwarded"); - /* Get the init_func pid. */ - if (read(syncfd, &child, sizeof(child)) != sizeof(child)) { - kill(first_child, SIGKILL); - bail("failed to sync with child: read(childpid)"); + /* Get the stage-2 pid. */ + if (read(syncfd, &stage2_pid, sizeof(stage2_pid)) != sizeof(stage2_pid)) { + sane_kill(stage1_pid, SIGKILL); + sane_kill(stage2_pid, SIGKILL); + bail("failed to sync with stage-1: read(stage2_pid)"); } /* Send ACK. */ s = SYNC_RECVPID_ACK; if (write(syncfd, &s, sizeof(s)) != sizeof(s)) { - kill(first_child, SIGKILL); - kill(child, SIGKILL); - bail("failed to sync with child: write(SYNC_RECVPID_ACK)"); + sane_kill(stage1_pid, SIGKILL); + sane_kill(stage2_pid, SIGKILL); + bail("failed to sync with stage-1: write(SYNC_RECVPID_ACK)"); } - /* Send the init_func pid back to our parent. - * - * Send the init_func pid and the pid of the first child back to our parent. - * We need to send both back because we can't reap the first child we created (CLONE_PARENT). - * It becomes the responsibility of our parent to reap the first child. + /* + * Send both the stage-1 and stage-2 pids back to runc. + * runc needs the stage-2 to continue process management, + * but because stage-1 was spawned with CLONE_PARENT we + * cannot reap it within stage-0 and thus we need to ask + * runc to reap the zombie for us. */ - len = dprintf(pipenum, "{\"pid\": %d, \"pid_first\": %d}\n", - child, first_child); + write_log(DEBUG, "forward stage-1 (%d) and stage-2 (%d) pids to runc", + stage1_pid, stage2_pid); + len = + dprintf(pipenum, "{\"stage1_pid\":%d,\"stage2_pid\":%d}\n", stage1_pid, + stage2_pid); if (len < 0) { - kill(child, SIGKILL); - bail("unable to generate JSON for child pid"); + sane_kill(stage1_pid, SIGKILL); + sane_kill(stage2_pid, SIGKILL); + bail("failed to sync with runc: write(pid-JSON)"); } break; - case SYNC_CHILD_READY: - ready = true; + case SYNC_CHILD_FINISH: + write_log(DEBUG, "stage-1 complete"); + stage1_complete = true; break; default: bail("unexpected sync value: %u", s); } } + write_log(DEBUG, "<- stage-1 synchronisation loop"); /* Now sync with grandchild. */ - syncfd = sync_grandchild_pipe[1]; close(sync_grandchild_pipe[0]); - - ready = false; - while (!ready) { + write_log(DEBUG, "-> stage-2 synchronisation loop"); + stage2_complete = false; + while (!stage2_complete) { enum sync_t s; + write_log(DEBUG, "signalling stage-2 to run"); s = SYNC_GRANDCHILD; if (write(syncfd, &s, sizeof(s)) != sizeof(s)) { - kill(child, SIGKILL); + sane_kill(stage2_pid, SIGKILL); bail("failed to sync with child: write(SYNC_GRANDCHILD)"); } @@ -813,27 +852,31 @@ void nsexec(void) bail("failed to sync with child: next state"); switch (s) { - case SYNC_CHILD_READY: - ready = true; + case SYNC_CHILD_FINISH: + write_log(DEBUG, "stage-2 complete"); + stage2_complete = true; break; default: bail("unexpected sync value: %u", s); } } + write_log(DEBUG, "<- stage-2 synchronisation loop"); + write_log(DEBUG, "<~ nsexec stage-0"); exit(0); } + break; /* * Stage 1: We're in the first child process. Our job is to join any - * provided namespaces in the netlink payload and unshare all - * of the requested namespaces. If we've been asked to - * CLONE_NEWUSER, we will ask our parent (stage 0) to set up - * our user mappings for us. Then, we create a new child - * (stage 2: JUMP_INIT) for PID namespace. We then send the - * child's PID to our parent (stage 0). + * provided namespaces in the netlink payload and unshare all of + * the requested namespaces. If we've been asked to CLONE_NEWUSER, + * we will ask our parent (stage 0) to set up our user mappings + * for us. Then, we create a new child (stage 2: STAGE_INIT) for + * PID namespace. We then send the child's PID to our parent + * (stage 0). */ - case JUMP_CHILD:{ - pid_t child; + case STAGE_CHILD:{ + pid_t stage2_pid = -1; enum sync_t s; /* We're in a child and thus need to tell the parent if we die. */ @@ -842,11 +885,12 @@ void nsexec(void) /* For debugging. */ prctl(PR_SET_NAME, (unsigned long)"runc:[1:CHILD]", 0, 0, 0); + write_log(DEBUG, "~> nsexec stage-1"); /* * We need to setns first. We cannot do this earlier (in stage 0) * because of the fact that we forked to get here (the PID of - * [stage 2: JUMP_INIT]) would be meaningless). We could send it + * [stage 2: STAGE_INIT]) would be meaningless). We could send it * using cmsg(3) but that's just annoying. */ if (config.namespaces) @@ -872,40 +916,50 @@ void nsexec(void) * problem. */ if (config.cloneflags & CLONE_NEWUSER) { + write_log(DEBUG, "unshare user namespace"); if (unshare(CLONE_NEWUSER) < 0) bail("failed to unshare user namespace"); config.cloneflags &= ~CLONE_NEWUSER; /* - * We don't have the privileges to do any mapping here (see the - * clone_parent rant). So signal our parent to hook us up. + * We need to set ourselves as dumpable temporarily so that the + * parent process can write to our procfs files. */ - - /* Switching is only necessary if we joined namespaces. */ if (config.namespaces) { + write_log(DEBUG, "temporarily set process as dumpable"); if (prctl(PR_SET_DUMPABLE, 1, 0, 0, 0) < 0) - bail("failed to set process as dumpable"); + bail("failed to temporarily set process as dumpable"); } + + /* + * We don't have the privileges to do any mapping here (see the + * clone_parent rant). So signal stage-0 to do the mapping for + * us. + */ + write_log(DEBUG, "request stage-0 to map user namespace"); s = SYNC_USERMAP_PLS; if (write(syncfd, &s, sizeof(s)) != sizeof(s)) bail("failed to sync with parent: write(SYNC_USERMAP_PLS)"); /* ... wait for mapping ... */ - + write_log(DEBUG, "request stage-0 to map user namespace"); if (read(syncfd, &s, sizeof(s)) != sizeof(s)) bail("failed to sync with parent: read(SYNC_USERMAP_ACK)"); if (s != SYNC_USERMAP_ACK) bail("failed to sync with parent: SYNC_USERMAP_ACK: got %u", s); - /* Switching is only necessary if we joined namespaces. */ + + /* Revert temporary re-dumpable setting. */ if (config.namespaces) { + write_log(DEBUG, "re-set process as non-dumpable"); if (prctl(PR_SET_DUMPABLE, 0, 0, 0, 0) < 0) - bail("failed to set process as dumpable"); + bail("failed to re-set process as non-dumpable"); } /* Become root in the namespace proper. */ if (setresuid(0, 0, 0) < 0) bail("failed to become root in user namespace"); } + /* * Unshare all of the namespaces. Now, it should be noted that this * ordering might break in the future (especially with rootless @@ -916,8 +970,9 @@ void nsexec(void) * some old kernel versions where clone(CLONE_PARENT | CLONE_NEWPID) * was broken, so we'll just do it the long way anyway. */ + write_log(DEBUG, "unshare remaining namespace (except cgroupns)"); if (unshare(config.cloneflags & ~CLONE_NEWCGROUP) < 0) - bail("failed to unshare namespaces"); + bail("failed to unshare remaining namespaces (except cgroupns)"); /* * TODO: What about non-namespace clone flags that we're dropping here? @@ -928,41 +983,45 @@ void nsexec(void) * which would break many applications and libraries, so we must fork * to actually enter the new PID namespace. */ - child = clone_parent(&env, JUMP_INIT); - if (child < 0) - bail("unable to fork: init_func"); + write_log(DEBUG, "spawn stage-2"); + stage2_pid = clone_parent(&env, STAGE_INIT); + if (stage2_pid < 0) + bail("unable to spawn stage-2"); /* Send the child to our parent, which knows what it's doing. */ + write_log(DEBUG, "request stage-0 to forward stage-2 pid (%d)", stage2_pid); s = SYNC_RECVPID_PLS; if (write(syncfd, &s, sizeof(s)) != sizeof(s)) { - kill(child, SIGKILL); + sane_kill(stage2_pid, SIGKILL); bail("failed to sync with parent: write(SYNC_RECVPID_PLS)"); } - if (write(syncfd, &child, sizeof(child)) != sizeof(child)) { - kill(child, SIGKILL); - bail("failed to sync with parent: write(childpid)"); + if (write(syncfd, &stage2_pid, sizeof(stage2_pid)) != sizeof(stage2_pid)) { + sane_kill(stage2_pid, SIGKILL); + bail("failed to sync with parent: write(stage2_pid)"); } /* ... wait for parent to get the pid ... */ - if (read(syncfd, &s, sizeof(s)) != sizeof(s)) { - kill(child, SIGKILL); + sane_kill(stage2_pid, SIGKILL); bail("failed to sync with parent: read(SYNC_RECVPID_ACK)"); } if (s != SYNC_RECVPID_ACK) { - kill(child, SIGKILL); + sane_kill(stage2_pid, SIGKILL); bail("failed to sync with parent: SYNC_RECVPID_ACK: got %u", s); } - s = SYNC_CHILD_READY; + write_log(DEBUG, "signal completion to stage-0"); + s = SYNC_CHILD_FINISH; if (write(syncfd, &s, sizeof(s)) != sizeof(s)) { - kill(child, SIGKILL); - bail("failed to sync with parent: write(SYNC_CHILD_READY)"); + sane_kill(stage2_pid, SIGKILL); + bail("failed to sync with parent: write(SYNC_CHILD_FINISH)"); } - /* Our work is done. [Stage 2: JUMP_INIT] is doing the rest of the work. */ + /* Our work is done. [Stage 2: STAGE_INIT] is doing the rest of the work. */ + write_log(DEBUG, "<~ nsexec stage-1"); exit(0); } + break; /* * Stage 2: We're the final child process, and the only process that will @@ -970,7 +1029,7 @@ void nsexec(void) * final cleanup steps and then return to the Go runtime to allow * init_linux.go to run. */ - case JUMP_INIT:{ + case STAGE_INIT:{ /* * We're inside the child now, having jumped from the * start_child() code after forking in the parent. @@ -985,6 +1044,7 @@ void nsexec(void) /* For debugging. */ prctl(PR_SET_NAME, (unsigned long)"runc:[2:INIT]", 0, 0, 0); + write_log(DEBUG, "~> nsexec stage-2"); if (read(syncfd, &s, sizeof(s)) != sizeof(s)) bail("failed to sync with parent: read(SYNC_GRANDCHILD)"); @@ -1005,21 +1065,30 @@ void nsexec(void) bail("setgroups failed"); } - /* ... wait until our topmost parent has finished cgroup setup in p.manager.Apply() ... */ + /* + * Wait until our topmost parent has finished cgroup setup in + * p.manager.Apply(). + * + * TODO(cyphar): Check if this code is actually needed because we + * should be in the cgroup even from stage-0, so + * waiting until now might not make sense. + */ if (config.cloneflags & CLONE_NEWCGROUP) { uint8_t value; if (read(pipenum, &value, sizeof(value)) != sizeof(value)) bail("read synchronisation value failed"); if (value == CREATECGROUPNS) { + write_log(DEBUG, "unshare cgroup namespace"); if (unshare(CLONE_NEWCGROUP) < 0) bail("failed to unshare cgroup namespace"); } else bail("received unknown synchronisation value"); } - s = SYNC_CHILD_READY; + write_log(DEBUG, "signal completion to stage-0"); + s = SYNC_CHILD_FINISH; if (write(syncfd, &s, sizeof(s)) != sizeof(s)) - bail("failed to sync with patent: write(SYNC_CHILD_READY)"); + bail("failed to sync with patent: write(SYNC_CHILD_FINISH)"); /* Close sync pipes. */ close(sync_grandchild_pipe[0]); @@ -1028,10 +1097,13 @@ void nsexec(void) nl_free(&config); /* Finish executing, let the Go runtime take over. */ + write_log(DEBUG, "<= nsexec container setup"); + write_log(DEBUG, "booting up go runtime ..."); return; } + break; default: - bail("unexpected jump value"); + bail("unknown stage '%d' for jump value", current_stage); } /* Should never be reached. */ diff --git a/tests/integration/debug.bats b/tests/integration/debug.bats index b2f87cf5f59..333745e3969 100644 --- a/tests/integration/debug.bats +++ b/tests/integration/debug.bats @@ -11,7 +11,7 @@ function teardown() { } function check_debug() { - [[ "$*" == *"nsexec started"* ]] + [[ "$*" == *"nsexec container setup"* ]] [[ "$*" == *"child process in init()"* ]] [[ "$*" == *"init: closing the pipe to signal completion"* ]] } diff --git a/tests/integration/exec.bats b/tests/integration/exec.bats index f80754a821e..039041cc65a 100644 --- a/tests/integration/exec.bats +++ b/tests/integration/exec.bats @@ -138,7 +138,7 @@ function teardown() { } function check_exec_debug() { - [[ "$*" == *"nsexec started"* ]] + [[ "$*" == *"nsexec container setup"* ]] [[ "$*" == *"child process in init()"* ]] [[ "$*" == *"setns_init: about to exec"* ]] } From 928ef7afac24bfa574eb58fb9ea1d3cfaf0f965c Mon Sep 17 00:00:00 2001 From: Kir Kolyshkin Date: Wed, 24 Jun 2020 18:00:41 -0700 Subject: [PATCH 2/2] libct/nsenter: add json msg escaping Since the previous commit, some strings logged by write_log() contain a literal newline, which leads to errors like this one: > # time="2020-06-07T15:41:37Z" level=error msg="failed to decode \"{\\\"level\\\":\\\"debug\\\", \\\"msg\\\": \\\"nsexec-0[2265]: update /proc/2266/uid_map to '0 1000 1\\n\" to json: invalid character '\\n' in string literal" The fix is to escape such characters. Add a simple (as much as it can be) routine which implements JSON string escaping as required by RFC4627, section 2.5, plus escaping of DEL (0x7f) character (not required, but allowed by the standard, and usually done by tools such as jq). As much as I hate to code something like this, I was not able to find a ready to consume and decent C implementation (not using glib). Added a test case (and some additional asserts in C code, conditionally enabled by the test case) to make sure the implementation is correct. The test case have to live in a separate directory so we can use different C flags to compile the test, and use C from go test. [v2: try to simplify the code, add more tests] [v3: don't do exit(1), try returning an error instead] Signed-off-by: Kir Kolyshkin --- libcontainer/nsenter/escape.c | 142 +++++++++++++++++++++++ libcontainer/nsenter/nsexec.c | 4 + libcontainer/nsenter/test/escape.c | 1 + libcontainer/nsenter/test/escape.go | 53 +++++++++ libcontainer/nsenter/test/escape_test.go | 11 ++ 5 files changed, 211 insertions(+) create mode 100644 libcontainer/nsenter/escape.c create mode 120000 libcontainer/nsenter/test/escape.c create mode 100644 libcontainer/nsenter/test/escape.go create mode 100644 libcontainer/nsenter/test/escape_test.go diff --git a/libcontainer/nsenter/escape.c b/libcontainer/nsenter/escape.c new file mode 100644 index 00000000000..78e7e9f4893 --- /dev/null +++ b/libcontainer/nsenter/escape.c @@ -0,0 +1,142 @@ +#include +#include + +#ifdef ESCAPE_TEST +# include +# define test_assert(arg) assert(arg) +#else +# define test_assert(arg) +#endif + +#define DEL '\x7f' + +/* + * Poor man version of itoa with base=16 and input number from 0 to 15, + * represented by a char. Converts it to a single hex digit ('0' to 'f'). + */ +static char hex(char i) +{ + test_assert(i >= 0 && i < 16); + + if (i >= 0 && i < 10) { + return '0' + i; + } + if (i >= 10 && i < 16) { + return 'a' + i - 10; + } + return '?'; +} + +/* + * Given the character, tells how many _extra_ characters are needed + * to JSON-escape it. If 0 is returned, the character does not need to + * be escaped. + */ +static int need_escape(char c) +{ + switch (c) { + case '\\': + case '"': + case '\b': + case '\n': + case '\r': + case '\t': + case '\f': + return 1; + case DEL: // -> \u007f + return 5; + default: + if (c > 0 && c < ' ') { + // ASCII decimal 01 to 31 -> \u00xx + return 5; + } + return 0; + } +} + +/* + * Escape the string so it can be used as a JSON string (per RFC4627, + * section 2.5 minimal requirements, plus the DEL (0x7f) character). + * + * It is expected that the argument is a string allocated via malloc. + * In case no escaping is needed, the original string is returned as is; + * otherwise, the original string is free'd, and the newly allocated + * escaped string is returned. Thus, in any case, the value returned + * need to be free'd by the caller. + */ +char *escape_json_string(char *s) +{ + int i, j, len; + char *c, *out; + + /* + * First, check if escaping is at all needed -- if not, we can avoid + * malloc and return the argument as is. While at it, count how much + * extra space is required. + * + * XXX: the counting code must be in sync with the escaping code + * (checked by test_assert()s below). + */ + for (i = j = 0; s[i] != '\0'; i++) { + j += need_escape(s[i]); + } + if (j == 0) { + // nothing to escape + return s; + } + + len = i + j + 1; + out = malloc(len); + if (!out) { + free(s); + // As malloc failed, strdup can fail, too, so in the worst case + // scenario NULL will be returned from here. + return strdup("escape_json_string: out of memory"); + } + for (c = s, j = 0; *c != '\0'; c++) { + switch (*c) { + case '"': + case '\\': + test_assert(need_escape(*c) == 1); + out[j++] = '\\'; + out[j++] = *c; + continue; + } + if ((*c < 0 || *c >= ' ') && (*c != DEL)) { + // no escape needed + test_assert(need_escape(*c) == 0); + out[j++] = *c; + continue; + } + out[j++] = '\\'; + switch (*c) { + case '\b': + out[j++] = 'b'; + break; + case '\n': + out[j++] = 'n'; + break; + case '\r': + out[j++] = 'r'; + break; + case '\t': + out[j++] = 't'; + break; + case '\f': + out[j++] = 'f'; + break; + default: + test_assert(need_escape(*c) == 5); + out[j++] = 'u'; + out[j++] = '0'; + out[j++] = '0'; + out[j++] = hex(*c >> 4); + out[j++] = hex(*c & 0x0f); + } + } + test_assert(j + 1 == len); + out[j] = '\0'; + + free(s); + return out; +} diff --git a/libcontainer/nsenter/nsexec.c b/libcontainer/nsenter/nsexec.c index dbf1725a7e0..bee0042942c 100644 --- a/libcontainer/nsenter/nsexec.c +++ b/libcontainer/nsenter/nsexec.c @@ -29,6 +29,8 @@ /* Get all of the CLONE_NEW* flags. */ #include "namespace.h" +extern char *escape_json_string(char *str); + /* Synchronisation values. */ enum sync_t { SYNC_USERMAP_PLS = 0x40, /* Request parent to map our users. */ @@ -153,6 +155,8 @@ static void write_log(const char *level, const char *format, ...) if (ret < 0) goto out; + message = escape_json_string(message); + if (current_stage == STAGE_SETUP) stage = strdup("nsexec"); else diff --git a/libcontainer/nsenter/test/escape.c b/libcontainer/nsenter/test/escape.c new file mode 120000 index 00000000000..c53e316a22c --- /dev/null +++ b/libcontainer/nsenter/test/escape.c @@ -0,0 +1 @@ +../escape.c \ No newline at end of file diff --git a/libcontainer/nsenter/test/escape.go b/libcontainer/nsenter/test/escape.go new file mode 100644 index 00000000000..4accf967a47 --- /dev/null +++ b/libcontainer/nsenter/test/escape.go @@ -0,0 +1,53 @@ +package escapetest + +// This file is part of escape_json_string unit test. +// It is in a separate package so cgo can be used together +// with go test. + +// #include +// extern char *escape_json_string(char *str); +// #cgo CFLAGS: -DESCAPE_TEST=1 +import "C" + +import ( + "testing" + "unsafe" +) + +func testEscapeJsonString(t *testing.T, input, want string) { + in := C.CString(input) + out := C.escape_json_string(in) + got := C.GoString(out) + C.free(unsafe.Pointer(out)) + t.Logf("input: %q, output: %q", input, got) + if got != want { + t.Errorf("Failed on input: %q, want %q, got %q", input, want, got) + } +} + +func testEscapeJson(t *testing.T) { + testCases := []struct { + input, output string + }{ + {"", ""}, + {"abcdef", "abcdef"}, + {`\\\\\\`, `\\\\\\\\\\\\`}, + {`with"quote`, `with\"quote`}, + {"\n\r\b\t\f\\", `\n\r\b\t\f\\`}, + {"\007", "\\u0007"}, + {"\017 \020 \037", "\\u000f \\u0010 \\u001f"}, + {"\033", "\\u001b"}, + {`<->`, `<->`}, + {"\176\177\200", "~\\u007f\200"}, + {"\000", ""}, + {"a\x7fxc", "a\\u007fxc"}, + {"a\033xc", "a\\u001bxc"}, + {"a\nxc", "a\\nxc"}, + {"a\\xc", "a\\\\xc"}, + {"Barney B\303\244r", "Barney B\303\244r"}, + } + + for _, tc := range testCases { + testEscapeJsonString(t, tc.input, tc.output) + } +} diff --git a/libcontainer/nsenter/test/escape_test.go b/libcontainer/nsenter/test/escape_test.go new file mode 100644 index 00000000000..eefd5ecf11a --- /dev/null +++ b/libcontainer/nsenter/test/escape_test.go @@ -0,0 +1,11 @@ +package escapetest + +import "testing" + +// The actual test function is in escape.go +// so that it can use cgo (import "C"). +// This wrapper is here for gotest to find. + +func TestEscapeJson(t *testing.T) { + testEscapeJson(t) +}