Skip to content

Commit

Permalink
feat: make GCP operation tracker timeout configuration [DET-3182] (#598)
Browse files Browse the repository at this point in the history
* feat: make GCP provider operation timeout configurable

* docs: update the cluster configuration doc for gcp operation timeout

* docs: update gcp topic guide with operation timeout

* docs: update default master configuration for gcp operation timeout

* feat: update GCP terraform script for operation timeout

* feat: update GCP deploy tool for operation timeout
  • Loading branch information
shiyuann authored Jun 2, 2020
1 parent 0011218 commit e0d0447
Show file tree
Hide file tree
Showing 12 changed files with 38 additions and 8 deletions.
6 changes: 6 additions & 0 deletions deploy/determined_deploy/gcp/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,12 @@ def make_up_subparser(subparsers: argparse._SubParsersAction) -> None:
default="false",
help="whether to use preemptible instances for agents",
)
optional_named.add_argument(
"--operation-timeout-period",
type=str,
default=constants.defaults.OPERATION_TIMEOUT_PERIOD,
help="operation timeout before retrying, e.g. 5m for 5 minutes",
)
optional_named.add_argument(
"--master-instance-type",
type=str,
Expand Down
1 change: 1 addition & 0 deletions deploy/determined_deploy/gcp/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ class defaults:
GPU_TYPE = "nvidia-tesla-k80"
MASTER_INSTANCE_TYPE = "n1-standard-2"
MAX_IDLE_AGENT_PERIOD = "10m"
OPERATION_TIMEOUT_PERIOD = "5m"
MAX_DYNAMIC_AGENTS = 5
STATIC_AGENTS = 0
MIN_CPU_PLATFORM_MASTER = "Intel Skylake"
Expand Down
1 change: 1 addition & 0 deletions deploy/determined_deploy/gcp/terraform/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -132,6 +132,7 @@ module "compute" {
min_cpu_platform_master = var.min_cpu_platform_master
min_cpu_platform_agent = var.min_cpu_platform_agent
preemptible = var.preemptible
operation_timeout_period = var.operation_timeout_period
db_username = var.db_username
db_password = var.db_password

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,7 @@ resource "google_compute_instance" "master_instance" {
gpu_num: ${var.gpu_num}
preemptible: ${var.preemptible}
max_instances: ${var.max_dynamic_agents}
operation_timeout_period: ${var.operation_timeout_period}
base_config:
minCpuPlatform: ${var.min_cpu_platform_agent}
EOF
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,9 @@ variable "static_agents" {
variable "preemptible" {
}

variable "operation_timeout_period" {
}

variable "gcs_bucket" {
}

Expand Down
5 changes: 5 additions & 0 deletions deploy/determined_deploy/gcp/terraform/variables.tf
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,11 @@ variable "preemptible" {
default = false
}

variable "operation_timeout_period" {
type = string
default = "5m"
}

variable "agent_docker_network" {
type = string
default = "host"
Expand Down
5 changes: 5 additions & 0 deletions docs/reference/cluster-config.txt
Original file line number Diff line number Diff line change
Expand Up @@ -421,6 +421,11 @@ The master supports the following configuration settings:
- ``max_instances``: Max number of Determined agent instances. Defaults
to 5.

- ``operation_timeout_period``: The timeout period for tracking a GCP
operation. This string is a sequence of decimal numbers, each
with optional fraction and a unit suffix, such as "30s", "1h", or
"1m30s". Valid time units are "s", "m", "h". The default value is ``5m``.

- ``checkpoint_storage``: Specifies where model checkpoints will be
stored. This can be overridden on a per-experiment basis in the
:ref:`experiment-configuration`. A checkpoint contains the
Expand Down
1 change: 1 addition & 0 deletions docs/topic-guides/elastic-infra/dynamic-agents-gcp.txt
Original file line number Diff line number Diff line change
Expand Up @@ -129,6 +129,7 @@ an example configuration. See :ref:`cluster-configuration` for details.
gpu_num: 4
preemptible: false
max_instances: 5
operation_timeout_period: 5m

.. _gcp-attach-disk:

Expand Down
6 changes: 5 additions & 1 deletion master/internal/provisioner/gcp_config.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ import (
"encoding/json"
"fmt"
"strings"
"time"

"cloud.google.com/go/compute/metadata"
"github.com/pkg/errors"
Expand Down Expand Up @@ -40,6 +41,8 @@ type GCPClusterConfig struct {

InstanceType gceInstanceType `json:"instance_type"`
MaxInstances int `json:"max_instances"`

OperationTimeoutPeriod Duration `json:"operation_timeout_period"`
}

// DefaultGCPClusterConfig returns the default configuration of the gcp cluster.
Expand All @@ -52,7 +55,8 @@ func DefaultGCPClusterConfig() *GCPClusterConfig {
GPUType: "nvidia-tesla-v100",
GPUNum: 4,
},
MaxInstances: 5,
MaxInstances: 5,
OperationTimeoutPeriod: Duration(5 * time.Minute),
}
}

Expand Down
7 changes: 5 additions & 2 deletions master/internal/provisioner/gcp_config_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ package provisioner
import (
"encoding/json"
"testing"
"time"

"gotest.tools/assert"

Expand Down Expand Up @@ -53,7 +54,8 @@ func TestUnmarshalGCPClusterConfig(t *testing.T) {
"gpu_type": "nvidia-tesla-v100",
"gpu_num": 2
},
"max_instances": 100
"max_instances": 100,
"operation_timeout_period": "5m"
}`,
unmarshaled: GCPClusterConfig{
Project: "test-project",
Expand All @@ -78,7 +80,8 @@ func TestUnmarshalGCPClusterConfig(t *testing.T) {
GPUType: "nvidia-tesla-v100",
GPUNum: 2,
},
MaxInstances: 100,
MaxInstances: 100,
OperationTimeoutPeriod: Duration(5 * time.Minute),
},
}

Expand Down
9 changes: 4 additions & 5 deletions master/internal/provisioner/gcp_operation_tracker.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,9 +14,6 @@ import (
"github.com/determined-ai/determined/master/pkg/actor/actors"
)

const operationTimeoutDuration = 300 * time.Second
const batchOperationTimeoutDuration = 400 * time.Second

type (
trackerTimeout struct{}
trackerTick struct{}
Expand Down Expand Up @@ -125,7 +122,9 @@ type gcpBatchOperationTracker struct {
func (t *gcpBatchOperationTracker) Receive(ctx *actor.Context) error {
switch msg := ctx.Message().(type) {
case actor.PreStart:
actors.NotifyAfter(ctx, batchOperationTimeoutDuration, trackerTimeout{})
batchOperationTimeoutPeriod :=
time.Duration(len(t.ops)) * time.Duration(t.config.OperationTimeoutPeriod)
actors.NotifyAfter(ctx, batchOperationTimeoutPeriod, trackerTimeout{})
t.doneOps = make([]trackOperationDone, 0, len(t.ops))
for _, op := range t.ops {
if _, ok := ctx.ActorOf(
Expand All @@ -134,7 +133,7 @@ func (t *gcpBatchOperationTracker) Receive(ctx *actor.Context) error {
config: t.config,
client: t.client,
op: op,
timeout: operationTimeoutDuration,
timeout: time.Duration(t.config.OperationTimeoutPeriod),
},
); !ok {
return errors.New("internal error tracking GCP operation")
Expand Down
1 change: 1 addition & 0 deletions master/packaging/master.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,7 @@
# gpu_type: nvidia-tesla-v100
# gpu_num: 4
# max_instances: 5
# operation_timeout_period: 5m


## Configure default checkpoint storage configuration.
Expand Down

0 comments on commit e0d0447

Please sign in to comment.