From 03c6727eb6d572fa18e2c588f145c53248a0f07c Mon Sep 17 00:00:00 2001 From: Neil Conway Date: Thu, 20 Aug 2020 08:56:12 -0700 Subject: [PATCH] docs: improve docs for tensorboard_timeout. (#1124) Document default value, update TensorBoard how-to. --- docs/how-to/tensorboard.txt | 16 +++++++++++----- docs/reference/cluster-config.txt | 8 +++++--- master/internal/command/tensorboard_manager.go | 2 +- 3 files changed, 17 insertions(+), 9 deletions(-) diff --git a/docs/how-to/tensorboard.txt b/docs/how-to/tensorboard.txt index ca5a91229c3..f2d1563811c 100644 --- a/docs/how-to/tensorboard.txt +++ b/docs/how-to/tensorboard.txt @@ -142,9 +142,15 @@ add a :class:`~determined.tensorpack.TFEventWriter` callback to your trial: Lifecycle Management -------------------- -Once a new TensorBoard instance has been scheduled onto the cluster, it -will remain running until you explicitly terminate it. This can be done -with ``det tensorboard kill ``: +Determined will automatically terminate idle TensorBoard instances. A +TensorBoard instance is considered idle if it does not receive HTTP +traffic (a TensorBoard that is still being viewed by a web browser will not be +considered idle). By default, idle TensorBoards will be terminated after 5 minutes; the +timeout duration can be changed by editing ``tensorboard_timeout`` in the +:ref:`master config file `. + +You can also terminate TensorBoard instances by hand using ``det tensorboard +kill ``: .. code:: @@ -159,5 +165,5 @@ Implementation Details Determined schedules TensorBoard instances in containers that run on agent machines. The Determined master will proxy HTTP requests to and from the -TensorBoard container. Although TensorBoard instances are hosted on -agent machines, they do not occupy GPUs. +TensorBoard container. TensorBoard instances are hosted on agent machines but +they do not occupy GPUs. 
diff --git a/docs/reference/cluster-config.txt b/docs/reference/cluster-config.txt index 48bc31889c3..414d262799e 100644 --- a/docs/reference/cluster-config.txt +++ b/docs/reference/cluster-config.txt @@ -233,8 +233,10 @@ The master supports the following configuration settings: - ``root``: Specifies the root directory of the state files. Defaults to ``/usr/share/determined/master``. -- ``tensorboard_timeout``: Specifies the duration in seconds a TensorBoard - instance can be idle before it is automatically killed. +- ``tensorboard_timeout``: Specifies the duration in seconds before idle + TensorBoard instances are automatically terminated. A TensorBoard instance is + considered to be idle if it does not receive any HTTP traffic. The default + timeout is ``300`` (5 minutes). - ``provisioner``: Specifies the configuration of dynamic agents. @@ -246,7 +248,7 @@ The master supports the following configuration settings: ``public-ipv4``, ``local-hostname``, or ``public-hostname``. If the master is deployed on GCP, rather than hardcoding the IP address, we advise you use one of the following to set the host as - an alias: ``internal-ip`` or\ ``external-ip``. Which one you + an alias: ``internal-ip`` or ``external-ip``. Which one you should select is based on your network configuration. On master startup, we will replace the above alias host with its real value. 
Defaults to ``http`` as scheme, local IP address as host, and diff --git a/master/internal/command/tensorboard_manager.go b/master/internal/command/tensorboard_manager.go index 48c9e8ebf9e..70ea65e5876 100644 --- a/master/internal/command/tensorboard_manager.go +++ b/master/internal/command/tensorboard_manager.go @@ -90,7 +90,7 @@ func (t *tensorboardManager) Receive(ctx *actor.Context) error { } if time.Now().After(service.LastRequested.Add(t.timeout)) { - ctx.Log().Infof("Killing %s due to inactivity", boardSummary.Config.Description) + ctx.Log().Infof("killing %s due to inactivity", boardSummary.Config.Description) ctx.Ask(boardRef, &apiv1.KillTensorboardRequest{}) } }