Skip to content

Commit

Permalink
chore: write log shipper in python
Browse files Browse the repository at this point in the history
Instead of orchestrating multiple processes with bash and
confusing/complex process substitution, with all the trapping of signals
and `printf x > ...` nonsense, just write a wrapper process in a real
programming language.  This reduces the complexity of our logging
solution, and prevents the out-of-rder bugs inherent in having two
separate log shippers, one for stdout and one for stderr.

The new ship_logs.py has the following features:
- it launches and monitors a child command
- it shares a log buffer for shipping stdout and stderr
- it has the same log parsing regexes as enrich_logging.py
- it converts carriage returns to newlines
- it forwards signals to its child process
- it exits after a maximum of DET_LOG_WAIT_TIME or 30 seconds
- it depends on the python interpreter, but only the interpreter (all
  imports are from the standard library)
- in the special case that the child process can't be started, it ships
  an explanation of what happened to the master and exits with standard
  bash exit codes
  • Loading branch information
rb-determined-ai committed Sep 27, 2023
1 parent e031a2f commit 258a09d
Show file tree
Hide file tree
Showing 15 changed files with 573 additions and 367 deletions.
2 changes: 0 additions & 2 deletions master/internal/rm/kubernetesrm/spec.go
Original file line number Diff line number Diff line change
Expand Up @@ -411,8 +411,6 @@ func (p *pod) createPodSpec(scheduler string) error {

var sidecars []k8sV1.Container

envVars = append(envVars, k8sV1.EnvVar{Name: "DET_K8S_LOG_TO_FILE", Value: "true"})

container := k8sV1.Container{
Name: model.DeterminedK8ContainerName,
Command: spec.Entrypoint,
Expand Down
6 changes: 4 additions & 2 deletions master/pkg/etc/etc.go
Original file line number Diff line number Diff line change
Expand Up @@ -30,8 +30,10 @@ const (
NotebookIdleCheckResource = "check_idle.py"
// TaskCheckReadyLogsResource is the script to parse logs to check if a task is ready.
TaskCheckReadyLogsResource = "check_ready_logs.py"
// TaskEnrichLogsResource is the script to enrich logs for slurm (which doesn't run fluent).
TaskEnrichLogsResource = "enrich_task_logs.py"
// TaskShipLogsShellResource is the shell script to call the python script to ship logs.
TaskShipLogsShellResource = "ship_logs.sh"
// TaskShipLogsPythonResource is the python script to ship logs.
TaskShipLogsPythonResource = "ship_logs.py"
// TensorboardEntryScriptResource is the script to set up TensorBoard.
TensorboardEntryScriptResource = "tensorboard-entrypoint.sh"
// TrialEntrypointScriptResource is the script to set up a trial.
Expand Down
7 changes: 5 additions & 2 deletions master/pkg/tasks/const.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,11 @@ const (
taskSignalHandlingScript = "task-signal-handling.sh"
taskSignalHandlingMode = 0o744

taskEnrichLogsScript = "enrich_task_logs.py"
taskEnrichLogsScriptMode = 0o744
taskShipLogsShell = "ship_logs.sh"
taskShipLogsShellMode = 0o755

taskShipLogsPython = "ship_logs.py"
taskShipLogsPythonMode = 0o755

// Put as many ssh-related files in /run/determined as possible. In particular, it is very
// important that we don't overwrite the user's host $HOME/.ssh/id_rsa, if the user happens to
Expand Down
26 changes: 13 additions & 13 deletions master/pkg/tasks/task.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ import (
"encoding/json"
"fmt"
"strings"
"path/filepath"

docker "github.com/docker/docker/api/types/container"
"github.com/docker/docker/api/types/mount"
Expand Down Expand Up @@ -257,6 +258,11 @@ func (t *TaskSpec) ToDockerSpec() cproto.Spec {
})
}

// Prepend the entrypoint like: `ship_logs.sh "$@"`.
shipLogsShell := filepath.Join(RunDir, taskShipLogsShell)
shipLogsPython := filepath.Join(RunDir, taskShipLogsPython)
entrypoint := append([]string{shipLogsShell, shipLogsPython}, t.Entrypoint...)

runArchives, rootArchives := t.Archives()
spec := cproto.Spec{
TaskType: string(t.TaskType),
Expand All @@ -269,7 +275,7 @@ func (t *TaskSpec) ToDockerSpec() cproto.Spec {
User: getUser(t.AgentUserGroup),
ExposedPorts: toPortSet(env.Ports()),
Env: envVars,
Cmd: t.Entrypoint,
Cmd: entrypoint,
Image: env.Image().For(deviceType),
WorkingDir: t.WorkDir,
},
Expand Down Expand Up @@ -319,21 +325,15 @@ func runDirHelpersArchive(aug *model.AgentUserGroup) cproto.RunArchive {
tar.TypeReg,
),
aug.OwnedArchiveItem(
taskEnrichLogsScript,
etc.MustStaticFile(etc.TaskEnrichLogsResource),
taskEnrichLogsScriptMode,
tar.TypeReg,
),
aug.OwnedArchiveItem(
taskLoggingTeardownScript,
etc.MustStaticFile(etc.TaskLoggingTeardownScriptResource),
taskLoggingTeardownMode,
taskShipLogsShell,
etc.MustStaticFile(etc.TaskShipLogsShellResource),
taskShipLogsShellMode,
tar.TypeReg,
),
aug.OwnedArchiveItem(
taskSignalHandlingScript,
etc.MustStaticFile(etc.TaskSignalHandlingScriptResource),
taskSignalHandlingMode,
taskShipLogsPython,
etc.MustStaticFile(etc.TaskShipLogsPythonResource),
taskShipLogsPythonMode,
tar.TypeReg,
),
aug.OwnedArchiveItem(
Expand Down
7 changes: 2 additions & 5 deletions master/static/srv/command-entrypoint.sh
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
#!/usr/bin/env bash

source /run/determined/task-signal-handling.sh
source /run/determined/task-logging-setup.sh

set -e
Expand All @@ -15,10 +14,8 @@ fi
# to register the proxy with the Determined master.
"$DET_PYTHON_EXECUTABLE" -m determined.exec.prep_container --proxy

trap_and_forward_signals
if [ "$#" -eq 1 ]; then
/bin/sh -c "$@" &
exec /bin/sh -c "$@"
else
"$@" &
exec "$@"
fi
wait_and_handle_signals $!
206 changes: 0 additions & 206 deletions master/static/srv/enrich_task_logs.py

This file was deleted.

5 changes: 1 addition & 4 deletions master/static/srv/entrypoint.sh
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
#!/bin/bash

source /run/determined/task-signal-handling.sh
source /run/determined/task-logging-setup.sh

set -e
Expand Down Expand Up @@ -34,6 +33,4 @@ set +x
# Do rendezvous last, to ensure all launch layers start around the same time.
"$DET_PYTHON_EXECUTABLE" -m determined.exec.prep_container --rendezvous

trap_and_forward_signals
"$DET_PYTHON_EXECUTABLE" -m determined.exec.launch "$@" &
wait_and_handle_signals $!
exec "$DET_PYTHON_EXECUTABLE" -m determined.exec.launch "$@"
5 changes: 1 addition & 4 deletions master/static/srv/gc-checkpoints-entrypoint.sh
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
#!/usr/bin/env bash

source /run/determined/task-signal-handling.sh
source /run/determined/task-logging-setup.sh

set -e
Expand All @@ -12,6 +11,4 @@ fi

"$DET_PYTHON_EXECUTABLE" -m determined.exec.prep_container

trap_and_forward_signals
"$DET_PYTHON_EXECUTABLE" -m determined.exec.gc_checkpoints "$@" &
wait_and_handle_signals $!
exec "$DET_PYTHON_EXECUTABLE" -m determined.exec.gc_checkpoints "$@"
5 changes: 1 addition & 4 deletions master/static/srv/notebook-entrypoint.sh
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
#!/usr/bin/env bash

source /run/determined/task-signal-handling.sh
source /run/determined/task-logging-setup.sh

set -e
Expand Down Expand Up @@ -43,8 +42,7 @@ set +x
JUPYTER_LAB_LOG_FORMAT="%(levelname)s: [%(name)s] %(message)s"
READINESS_REGEX='^.*Jupyter Server .* is running.*$'

trap_and_forward_signals
jupyter lab --ServerApp.port=${NOTEBOOK_PORT} \
exec jupyter lab --ServerApp.port=${NOTEBOOK_PORT} \
--ServerApp.allow_origin="*" \
--ServerApp.base_url="/proxy/${DET_TASK_ID}/" \
--ServerApp.allow_root=True \
Expand All @@ -61,4 +59,3 @@ jupyter lab --ServerApp.port=${NOTEBOOK_PORT} \
--LabApp.log_format="$JUPYTER_LAB_LOG_FORMAT" \
--ServerApp.log_format="$JUPYTER_LAB_LOG_FORMAT" \
2> >(tee -p >("$DET_PYTHON_EXECUTABLE" /run/determined/check_ready_logs.py --ready-regex "${READINESS_REGEX}") >&2)
wait_and_handle_signals $!
5 changes: 1 addition & 4 deletions master/static/srv/shell-entrypoint.sh
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
#!/usr/bin/env bash

source /run/determined/task-signal-handling.sh
source /run/determined/task-logging-setup.sh

set -e
Expand Down Expand Up @@ -93,7 +92,5 @@ chmod 600 "$modified"

READINESS_REGEX="Server listening on"

trap_and_forward_signals
/usr/sbin/sshd "$@" \
2> >(tee -p >("$DET_PYTHON_EXECUTABLE" /run/determined/check_ready_logs.py --ready-regex "$READINESS_REGEX") >&2) &
wait_and_handle_signals $!
2> >(tee -p >("$DET_PYTHON_EXECUTABLE" /run/determined/check_ready_logs.py --ready-regex "$READINESS_REGEX") >&2)
Loading

0 comments on commit 258a09d

Please sign in to comment.