From 25dd59da4de0b54b0f0b66b11c19c13e4b202486 Mon Sep 17 00:00:00 2001 From: Eric Liu Date: Mon, 31 Aug 2020 10:49:05 -0700 Subject: [PATCH] adding documentation --- cli/determined_cli/tensorboard.py | 10 ++++--- docs/how-to/tensorboard.txt | 26 +++++++++++++++++ docs/reference/command-notebook-config.txt | 10 ++++++- .../internal/command/tensorboard_manager.go | 7 +++-- tools/run-server.py | 28 +++++++++---------- 5 files changed, 60 insertions(+), 21 deletions(-) diff --git a/cli/determined_cli/tensorboard.py b/cli/determined_cli/tensorboard.py index 14aa5a349731..2c54ab44d3c2 100644 --- a/cli/determined_cli/tensorboard.py +++ b/cli/determined_cli/tensorboard.py @@ -10,7 +10,7 @@ from determined_common.check import check_eq from . import render -from .command import Command, launch_command, parse_config, render_event_stream +from .command import Command, parse_config, render_event_stream from .declarative_argparse import Arg, Cmd Tensorboard = namedtuple( @@ -33,14 +33,16 @@ def to_tensorboard(command: Command) -> Tensorboard: @authentication_required def start_tensorboard(args: Namespace) -> None: - # this is the place where you add some stuff related to importing the config - # think about if you need to add the option for manual config options if args.trial_ids is None and args.experiment_ids is None: print("Either experiment_ids or trial_ids must be specified.") sys.exit(1) config = parse_config(args.config_file, None, [], []) - req_body = {"config": config, "trial_ids": args.trial_ids, "experiment_ids": args.experiment_ids} + req_body = { + "config": config, + "trial_ids": args.trial_ids, + "experiment_ids": args.experiment_ids, + } resp = api.post(args.master, "tensorboard", body=req_body).json() if args.detach: diff --git a/docs/how-to/tensorboard.txt b/docs/how-to/tensorboard.txt index 31b0db007219..0b91e2d23ecc 100644 --- a/docs/how-to/tensorboard.txt +++ b/docs/how-to/tensorboard.txt @@ -51,6 +51,32 @@ TensorBoard for multiple experiments use metrics 
from persistent storage. It may take up to 5 minutes for TensorBoard to receive data and render visualizations. +Customizing TensorBoards +------------------------ + +Determined supports initializing TensorBoard with a YAML configuration file. +For example, this feature can be useful for running TensorBoard with a +specific container image or for enabling access to additional data with a +bind-mount. + +.. code:: yaml + + environment: + image: determinedai/environments:cuda-10.0-pytorch-1.4-tf-1.15-cpu-0.5.0 + bind_mounts: + - host_path: /my/agent/path + container_path: /my/container/path + read_only: true + +Details of configuration settings can be found in the +:ref:`command-notebook-configuration`. + +To launch TensorBoard with a config file, use +``det tensorboard start <experiment_id> --config-file=my_config.yaml``. + +To view the configuration of a running TensorBoard instance, use +``det tensorboard config <tensorboard_id>``. + Analyzing Specific Trials ------------------------- diff --git a/docs/reference/command-notebook-config.txt b/docs/reference/command-notebook-config.txt index b87e37f42467..b806c5401f57 100644 --- a/docs/reference/command-notebook-config.txt +++ b/docs/reference/command-notebook-config.txt @@ -24,6 +24,13 @@ when the workload is launched: Options set via ``--config`` take precedence over values specified in the configuration file. +TensorBoard workloads also support YAML configuration files, but do not +allow configuration variables to be passed directly to them: + +.. code:: + + $ det tensorboard start experiment_id --config-file=my_config.yaml + Configuration Settings ********************** @@ -69,7 +76,8 @@ The following configuration settings are supported: number of slots on the agent in the cluster with the most slots. For example, Determined will be unable to schedule a command that requests 4 slots if the Determined cluster is composed of agents with 2 - slots each. + slots each. The number of slots for TensorBoard is fixed at ``0`` and + may not be changed. 
- ``agent_label``: If set, the command/notebook will _only_ be scheduled on agents that have the given label set. If this is not set (the default diff --git a/master/internal/command/tensorboard_manager.go b/master/internal/command/tensorboard_manager.go index be6fc38e32ac..fe6e21dce146 100644 --- a/master/internal/command/tensorboard_manager.go +++ b/master/internal/command/tensorboard_manager.go @@ -299,8 +299,11 @@ func (t *tensorboardManager) newTensorBoard( ) config.Entrypoint = []string{tensorboardEntrypointFile, "--logdir", strings.Join(logDirs, ",")} config.Resources.Slots = tensorboardResourcesSlots - config.Environment.EnvironmentVariables = model.RuntimeItems{CPU: envVars, GPU: envVars} - config.BindMounts = getMounts(uniqMounts) + + cpuEnvVars := append(config.Environment.EnvironmentVariables.CPU, envVars...) + gpuEnvVars := append(config.Environment.EnvironmentVariables.GPU, envVars...) + config.Environment.EnvironmentVariables = model.RuntimeItems{CPU: cpuEnvVars, GPU: gpuEnvVars} + config.BindMounts = append(config.BindMounts, getMounts(uniqMounts)...) 
setPodSpec(&config, t.taskContainerDefaults) diff --git a/tools/run-server.py b/tools/run-server.py index 291dbba3a312..c26edf85462b 100644 --- a/tools/run-server.py +++ b/tools/run-server.py @@ -50,7 +50,7 @@ def tail_db_logs(): def run_master(): return proc( "master", - ["../master/build/determined-master", "--config-file", "/usr/local/determined/etc/master.yaml"], + ["../master/build/determined-master", "--config-file", "master.yaml"], logs_handler=lambda line: f"{MAGENTA}determined-master |{CLEAR} {line}" ) @@ -83,26 +83,26 @@ def main(): db, master, agent, db_logs = False, None, None, None try: master = run_master() - # agent = run_agent() - # db_logs = tail_db_logs() - # if not is_db_running(): - # db = True - # subprocess.check_call(["docker-compose", "up", "-d"]) - - # wait_for_server(5432) - # db_logs.start() + agent = run_agent() + db_logs = tail_db_logs() + if not is_db_running(): + db = True + subprocess.check_call(["docker-compose", "up", "-d"]) + + wait_for_server(5432) + db_logs.start() master.start() wait_for_server(8080) - # agent.start() + agent.start() # Join the agent first so we can exit if the agent fails to connect to # the master. - # agent.join() - # if agent.exitcode != 0: - # raise Exception(f"agent failed with non-zero exit code {agent.exitcode}") + agent.join() + if agent.exitcode != 0: + raise Exception(f"agent failed with non-zero exit code {agent.exitcode}") master.join() - # db_logs.join() + db_logs.join() except KeyboardInterrupt: pass finally: