feat: make GCP operation tracker timeout configuration [DET-3182] (#598)

* feat: make GCP provider operation timeout configurable * docs: update the cluster configuration doc for gcp operation timeout * docs: update gcp topic guide with operation timeout * docs: update default master configuration for gcp operation timeout * feat: update GCP terraform script for operation timeout * feat: update GCP deploy tool for operation timeout
determined-ai · Jun 2, 2020 · e0d0447 · e0d0447
1 parent 0011218
commit e0d0447
Show file tree

Hide file tree

Showing 12 changed files with 38 additions and 8 deletions.
diff --git a/deploy/determined_deploy/gcp/cli.py b/deploy/determined_deploy/gcp/cli.py
@@ -87,6 +87,12 @@ def make_up_subparser(subparsers: argparse._SubParsersAction) -> None:
         default="false",
         help="whether to use preemptible instances for agents",
     )
+    optional_named.add_argument(
+        "--operation-timeout-period",
+        type=str,
+        default=constants.defaults.OPERATION_TIMEOUT_PERIOD,
+        help="operation timeout before retrying, e.g. 5m for 5 minutes",
+    )
     optional_named.add_argument(
         "--master-instance-type",
         type=str,

diff --git a/deploy/determined_deploy/gcp/constants.py b/deploy/determined_deploy/gcp/constants.py
@@ -7,6 +7,7 @@ class defaults:
     GPU_TYPE = "nvidia-tesla-k80"
     MASTER_INSTANCE_TYPE = "n1-standard-2"
     MAX_IDLE_AGENT_PERIOD = "10m"
+    OPERATION_TIMEOUT_PERIOD = "5m"
     MAX_DYNAMIC_AGENTS = 5
     STATIC_AGENTS = 0
     MIN_CPU_PLATFORM_MASTER = "Intel Skylake"

diff --git a/deploy/determined_deploy/gcp/terraform/main.tf b/deploy/determined_deploy/gcp/terraform/main.tf
@@ -132,6 +132,7 @@ module "compute" {
   min_cpu_platform_master = var.min_cpu_platform_master
   min_cpu_platform_agent = var.min_cpu_platform_agent
   preemptible = var.preemptible
+  operation_timeout_period = var.operation_timeout_period
   db_username = var.db_username
   db_password = var.db_password
 

diff --git a/deploy/determined_deploy/gcp/terraform/modules/compute/main.tf b/deploy/determined_deploy/gcp/terraform/modules/compute/main.tf
@@ -59,6 +59,7 @@ resource "google_compute_instance" "master_instance" {
         gpu_num: ${var.gpu_num}
         preemptible: ${var.preemptible}
       max_instances: ${var.max_dynamic_agents}
+      operation_timeout_period: ${var.operation_timeout_period}
       base_config:
         minCpuPlatform: ${var.min_cpu_platform_agent}
     EOF

diff --git a/deploy/determined_deploy/gcp/terraform/modules/compute/variables.tf b/deploy/determined_deploy/gcp/terraform/modules/compute/variables.tf
@@ -67,6 +67,9 @@ variable "static_agents" {
 variable "preemptible" {
 }
 
+variable "operation_timeout_period" {
+}
+
 variable "gcs_bucket" {
 }
 

diff --git a/deploy/determined_deploy/gcp/terraform/variables.tf b/deploy/determined_deploy/gcp/terraform/variables.tf
@@ -85,6 +85,11 @@ variable "preemptible" {
   default = false
 }
 
+variable "operation_timeout_period" {
+  type = string
+  default = "5m"
+}
+
 variable "agent_docker_network" {
   type = string
   default = "host"

diff --git a/docs/reference/cluster-config.txt b/docs/reference/cluster-config.txt
@@ -421,6 +421,11 @@ The master supports the following configuration settings:
     - ``max_instances``: Max number of Determined agent instances. Defaults
       to 5.
 
+    - ``operation_timeout_period``: The timeout period for tracking a GCP
+      operation. This string is a sequence of decimal numbers, each
+      with optional fraction and a unit suffix, such as "30s", "1h", or
+      "1m30s". Valid time units are "s", "m", "h". The default value is ``5m``.
+
 - ``checkpoint_storage``: Specifies where model checkpoints will be
   stored. This can be overridden on a per-experiment basis in the
   :ref:`experiment-configuration`. A checkpoint contains the

diff --git a/docs/topic-guides/elastic-infra/dynamic-agents-gcp.txt b/docs/topic-guides/elastic-infra/dynamic-agents-gcp.txt
@@ -129,6 +129,7 @@ an example configuration. See :ref:`cluster-configuration` for details.
        gpu_num: 4
        preemptible: false
      max_instances: 5
+     operation_timeout_period: 5m
 
 .. _gcp-attach-disk:
 

diff --git a/master/internal/provisioner/gcp_config.go b/master/internal/provisioner/gcp_config.go
@@ -4,6 +4,7 @@ import (
 	"encoding/json"
 	"fmt"
 	"strings"
+	"time"
 
 	"cloud.google.com/go/compute/metadata"
 	"github.com/pkg/errors"
@@ -40,6 +41,8 @@ type GCPClusterConfig struct {
 
 	InstanceType gceInstanceType `json:"instance_type"`
 	MaxInstances int             `json:"max_instances"`
+
+	OperationTimeoutPeriod Duration `json:"operation_timeout_period"`
 }
 
 // DefaultGCPClusterConfig returns the default configuration of the gcp cluster.
@@ -52,7 +55,8 @@ func DefaultGCPClusterConfig() *GCPClusterConfig {
 			GPUType:     "nvidia-tesla-v100",
 			GPUNum:      4,
 		},
-		MaxInstances: 5,
+		MaxInstances:           5,
+		OperationTimeoutPeriod: Duration(5 * time.Minute),
 	}
 }
 

diff --git a/master/internal/provisioner/gcp_config_test.go b/master/internal/provisioner/gcp_config_test.go
@@ -3,6 +3,7 @@ package provisioner
 import (
 	"encoding/json"
 	"testing"
+	"time"
 
 	"gotest.tools/assert"
 
@@ -53,7 +54,8 @@ func TestUnmarshalGCPClusterConfig(t *testing.T) {
 		"gpu_type": "nvidia-tesla-v100",
 		"gpu_num": 2
 	},
-	"max_instances": 100
+	"max_instances": 100,
+	"operation_timeout_period": "5m"
 }`,
 		unmarshaled: GCPClusterConfig{
 			Project:             "test-project",
@@ -78,7 +80,8 @@ func TestUnmarshalGCPClusterConfig(t *testing.T) {
 				GPUType:     "nvidia-tesla-v100",
 				GPUNum:      2,
 			},
-			MaxInstances: 100,
+			MaxInstances:           100,
+			OperationTimeoutPeriod: Duration(5 * time.Minute),
 		},
 	}
 

diff --git a/master/internal/provisioner/gcp_operation_tracker.go b/master/internal/provisioner/gcp_operation_tracker.go
@@ -14,9 +14,6 @@ import (
 	"github.com/determined-ai/determined/master/pkg/actor/actors"
 )
 
-const operationTimeoutDuration = 300 * time.Second
-const batchOperationTimeoutDuration = 400 * time.Second
-
 type (
 	trackerTimeout     struct{}
 	trackerTick        struct{}
@@ -125,7 +122,9 @@ type gcpBatchOperationTracker struct {
 func (t *gcpBatchOperationTracker) Receive(ctx *actor.Context) error {
 	switch msg := ctx.Message().(type) {
 	case actor.PreStart:
-		actors.NotifyAfter(ctx, batchOperationTimeoutDuration, trackerTimeout{})
+		batchOperationTimeoutPeriod :=
+			time.Duration(len(t.ops)) * time.Duration(t.config.OperationTimeoutPeriod)
+		actors.NotifyAfter(ctx, batchOperationTimeoutPeriod, trackerTimeout{})
 		t.doneOps = make([]trackOperationDone, 0, len(t.ops))
 		for _, op := range t.ops {
 			if _, ok := ctx.ActorOf(
@@ -134,7 +133,7 @@ func (t *gcpBatchOperationTracker) Receive(ctx *actor.Context) error {
 					config:  t.config,
 					client:  t.client,
 					op:      op,
-					timeout: operationTimeoutDuration,
+					timeout: time.Duration(t.config.OperationTimeoutPeriod),
 				},
 			); !ok {
 				return errors.New("internal error tracking GCP operation")

diff --git a/master/packaging/master.yaml b/master/packaging/master.yaml
@@ -85,6 +85,7 @@
 #    gpu_type: nvidia-tesla-v100
 #    gpu_num: 4
 #  max_instances: 5
+#  operation_timeout_period: 5m
 
 
 ## Configure default checkpoint storage configuration.