Skip to content

Commit

Permalink
revert: make agent starting period configurable [DET-3219] (#623)
Browse files Browse the repository at this point in the history
This reverts commit 7f83e97.
  • Loading branch information
shiyuann authored Jun 3, 2020
1 parent 7f83e97 commit 897f2f6
Show file tree
Hide file tree
Showing 25 changed files with 459 additions and 587 deletions.
3 changes: 0 additions & 3 deletions deploy/determined_deploy/aws/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,9 +63,6 @@ def make_up_subparser(subparsers: argparse._SubParsersAction):
subparser.add_argument(
"--max-idle-agent-period", type=str, help="max agent idle time",
)
subparser.add_argument(
"--max-agent-starting-period", type=str, help="max agent starting time",
)
subparser.add_argument(
"--max-dynamic-agents",
type=int,
Expand Down
1 change: 0 additions & 1 deletion deploy/determined_deploy/aws/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,6 @@ class cloudformation:
BOTO3_SESSION = "Boto3Session"
AGENT_TAG_NAME = "AgentTagName"
MAX_IDLE_AGENT_PERIOD = "MaxIdleAgentPeriod"
MAX_AGENT_STARTING_PERIOD = "MaxAgentStartingPeriod"
MAX_DYNAMIC_AGENTS = "MaxDynamicAgents"
LOG_GROUP = "LogGroup"
REGION = "Region"
Expand Down
1 change: 0 additions & 1 deletion deploy/determined_deploy/aws/deployment_types/secure.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,6 @@ class Secure(base.DeterminedDeployment):
constants.cloudformation.VERSION,
constants.cloudformation.DB_PASSWORD,
constants.cloudformation.MAX_IDLE_AGENT_PERIOD,
constants.cloudformation.MAX_AGENT_STARTING_PERIOD,
constants.cloudformation.MAX_DYNAMIC_AGENTS,
]

Expand Down
1 change: 0 additions & 1 deletion deploy/determined_deploy/aws/deployment_types/simple.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,6 @@ class Simple(base.DeterminedDeployment):
constants.cloudformation.VERSION,
constants.cloudformation.DB_PASSWORD,
constants.cloudformation.MAX_IDLE_AGENT_PERIOD,
constants.cloudformation.MAX_AGENT_STARTING_PERIOD,
constants.cloudformation.MAX_DYNAMIC_AGENTS,
]

Expand Down
1 change: 0 additions & 1 deletion deploy/determined_deploy/aws/deployment_types/vpc.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,6 @@ class VPC(base.DeterminedDeployment):
constants.cloudformation.VERSION,
constants.cloudformation.DB_PASSWORD,
constants.cloudformation.MAX_IDLE_AGENT_PERIOD,
constants.cloudformation.MAX_AGENT_STARTING_PERIOD,
constants.cloudformation.MAX_DYNAMIC_AGENTS,
]

Expand Down
6 changes: 0 additions & 6 deletions deploy/determined_deploy/aws/templates/secure.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -71,11 +71,6 @@ Parameters:
Description: How long before idle agents are shutdown
Default: 10m

MaxAgentStartingPeriod:
Type: String
Decription: How long for agent starting before retrying
Default: 10m

MaxDynamicAgents:
Type: Number
Description: Maximum number of agents to launch simultaneously
Expand Down Expand Up @@ -591,7 +586,6 @@ Resources:
log_stream: determined-agent
master_url: http://local-ipv4:8080
max_idle_agent_period: ${MaxIdleAgentPeriod}
max_agent_starting_period: ${MaxAgentStartingPeriod}
max_instances: ${MaxDynamicAgents}
network_interface:
public_ip: false
Expand Down
6 changes: 0 additions & 6 deletions deploy/determined_deploy/aws/templates/simple.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -45,11 +45,6 @@ Parameters:
Description: How long before idle agents are shutdown
Default: 10m

MaxAgentStartingPeriod:
Type: String
Decription: How long for agent starting before retrying
Default: 10m

MaxDynamicAgents:
Type: Number
Description: Maximum number of agents to launch simultaneously
Expand Down Expand Up @@ -387,7 +382,6 @@ Resources:
log_stream: determined-agent
master_url: http://local-ipv4:8080
max_idle_agent_period: ${MaxIdleAgentPeriod}
max_agent_starting_period: ${MaxAgentStartingPeriod}
max_instances: ${MaxDynamicAgents}
network_interface:
public_ip: true
Expand Down
6 changes: 0 additions & 6 deletions deploy/determined_deploy/aws/templates/vpc.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -59,11 +59,6 @@ Parameters:
Description: How long before idle agents are shutdown
Default: 10m

MaxAgentStartingPeriod:
Type: String
Decription: How long for agent starting before retrying
Default: 10m

MaxDynamicAgents:
Type: Number
Description: Maximum number of agents to launch simultaneously
Expand Down Expand Up @@ -513,7 +508,6 @@ Resources:
log_stream: determined-agent
master_url: http://local-ipv4:8080
max_idle_agent_period: ${MaxIdleAgentPeriod}
max_agent_starting_period: ${MaxAgentStartingPeriod}
max_instances: ${MaxDynamicAgents}
network_interface:
public_ip: true
Expand Down
6 changes: 0 additions & 6 deletions deploy/determined_deploy/gcp/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -117,12 +117,6 @@ def make_up_subparser(subparsers: argparse._SubParsersAction) -> None:
default=constants.defaults.MAX_IDLE_AGENT_PERIOD,
help="max agent idle time before it is shut down, e.g. 30m for 30 minutes",
)
optional_named.add_argument(
"--max-agent-starting-period",
type=str,
default=constants.defaults.MAX_AGENT_STARTING_PERIOD,
help="max agent starting time before retrying, e.g. 30m for 30 minutes",
)
optional_named.add_argument(
"--port",
type=int,
Expand Down
1 change: 0 additions & 1 deletion deploy/determined_deploy/gcp/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@ class defaults:
GPU_TYPE = "nvidia-tesla-k80"
MASTER_INSTANCE_TYPE = "n1-standard-2"
MAX_IDLE_AGENT_PERIOD = "10m"
MAX_AGENT_STARTING_PERIOD = "10m"
OPERATION_TIMEOUT_PERIOD = "5m"
MAX_DYNAMIC_AGENTS = 5
STATIC_AGENTS = 0
Expand Down
1 change: 0 additions & 1 deletion deploy/determined_deploy/gcp/terraform/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -125,7 +125,6 @@ module "compute" {
agent_docker_network = var.agent_docker_network
agent_instance_type = var.agent_instance_type
max_idle_agent_period = var.max_idle_agent_period
max_agent_starting_period: var.max_agent_starting_period
gpu_type = var.gpu_type
gpu_num = var.gpu_num
max_dynamic_agents = var.max_dynamic_agents
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,6 @@ resource "google_compute_instance" "master_instance" {
master_url: ${var.scheme}://internal-ip:${var.port}
agent_docker_network: ${var.agent_docker_network}
max_idle_agent_period: ${var.max_idle_agent_period}
max_agent_starting_period: ${var.max_agent_starting_period}
provider: gcp
name_prefix: det-dynamic-agent-${var.unique_id}-
network_interface:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -40,9 +40,6 @@ variable "agent_docker_network" {
variable "max_idle_agent_period" {
}

variable "max_agent_starting_period" {
}

variable "tag_master_port" {
}

Expand Down
5 changes: 0 additions & 5 deletions deploy/determined_deploy/gcp/terraform/variables.tf
Original file line number Diff line number Diff line change
Expand Up @@ -105,11 +105,6 @@ variable "max_idle_agent_period" {
default = "10m"
}

variable "max_agent_starting_period" {
type = string
default = "10m"
}

variable "min_cpu_platform_master" {
type = string
default = "Intel Skylake"
Expand Down
5 changes: 0 additions & 5 deletions docs/reference/cluster-config.txt
Original file line number Diff line number Diff line change
Expand Up @@ -274,11 +274,6 @@ The master supports the following configuration settings:
with optional fraction and a unit suffix, such as "30s", "1h", or
"1m30s". Valid time units are "s", "m", "h". The default value is ``5m``.

- ``max_agent_starting_period``: How long to wait for agents starting before
retrying. This string is a sequence of decimal numbers, each
with optional fraction and a unit suffix, such as "30s", "1h", or
"1m30s". Valid time units are "s", "m", "h". The default value is ``5m``.

- ``provider: aws``: Specifies running dynamic agents on AWS.
(*Required*)

Expand Down
4 changes: 1 addition & 3 deletions docs/topic-guides/elastic-infra/dynamic-agents-aws.txt
Original file line number Diff line number Diff line change
Expand Up @@ -126,10 +126,8 @@ an example configuration. See :ref:`cluster-configuration` for details.
master_url: <scheme://host:port>
startup_script: <startup script>
container_startup_script: <container startup script>
agent_docker_network: host
agent_docker_image: <agent docker image>
agent_docker_network: determined
max_idle_agent_period: 5m
max_agent_starting_period: 5m

provider: aws
region: <region>
Expand Down
4 changes: 1 addition & 3 deletions docs/topic-guides/elastic-infra/dynamic-agents-gcp.txt
Original file line number Diff line number Diff line change
Expand Up @@ -103,10 +103,8 @@ an example configuration. See :ref:`cluster-configuration` for details.
master_url: <scheme://host:port>
startup_script: <startup script>
container_startup_script: <container startup script>
agent_docker_network: host
agent_docker_image: <agent docker image>
agent_docker_network: determined
max_idle_agent_period: 5m
max_agent_starting_period: 5m

provider: gcp
base_config: <instance resource base configuration>
Expand Down
8 changes: 3 additions & 5 deletions master/internal/config_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,6 @@ scheduler:
provisioner:
max_idle_agent_period: 30s
max_agent_starting_period: 30s
`
expected := Config{
Log: logger.Config{
Expand All @@ -46,10 +45,9 @@ provisioner:
},
Scheduler: scheduler.Config{Fit: "best"},
Provisioner: &provisioner.Config{
AgentDockerRuntime: "runc",
AgentDockerNetwork: "default",
MaxIdleAgentPeriod: provisioner.Duration(30 * time.Second),
MaxAgentStartingPeriod: provisioner.Duration(30 * time.Second),
AgentDockerRuntime: "runc",
AgentDockerNetwork: "default",
MaxIdleAgentPeriod: provisioner.Duration(30 * time.Second),
},
}

Expand Down
10 changes: 3 additions & 7 deletions master/internal/provisioner/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -53,16 +53,14 @@ type Config struct {
AWS *AWSClusterConfig `union:"provider,aws" json:"-"`
GCP *GCPClusterConfig `union:"provider,gcp" json:"-"`
MaxIdleAgentPeriod Duration `json:"max_idle_agent_period"`
MaxAgentStartingPeriod Duration `json:"max_agent_starting_period"`
}

// DefaultConfig returns the default configuration of the provisioner.
func DefaultConfig() *Config {
return &Config{
AgentDockerRuntime: "runc",
AgentDockerNetwork: "default",
MaxIdleAgentPeriod: Duration(300 * time.Second),
MaxAgentStartingPeriod: Duration(300 * time.Second),
AgentDockerRuntime: "runc",
AgentDockerNetwork: "default",
MaxIdleAgentPeriod: Duration(300 * time.Second),
}
}

Expand Down Expand Up @@ -102,8 +100,6 @@ func (c Config) Validate() []error {
check.False(c.AWS == nil && c.GCP == nil, "must configure aws or gcp cluster"),
check.GreaterThan(
int64(c.MaxIdleAgentPeriod), int64(0), "max idle agent period must be greater than 0"),
check.GreaterThan(
int64(c.MaxAgentStartingPeriod), int64(0), "max agent starting period must be greater than 0"),
}...)
return errs
}
Expand Down
52 changes: 23 additions & 29 deletions master/internal/provisioner/config_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,10 +18,9 @@ func TestProvisionerConfigMissingFields(t *testing.T) {
err = check.Validate(&config)
assert.ErrorContains(t, err, "must configure aws or gcp cluster")
expected := Config{
MaxIdleAgentPeriod: Duration(5 * time.Minute),
MaxAgentStartingPeriod: Duration(5 * time.Minute),
AgentDockerRuntime: "runc",
AgentDockerNetwork: "default",
MaxIdleAgentPeriod: Duration(5 * time.Minute),
AgentDockerRuntime: "runc",
AgentDockerNetwork: "default",
}
assert.DeepEqual(t, config, expected)
}
Expand All @@ -34,8 +33,7 @@ func TestUnmarshalProvisionerConfigMasterURL(t *testing.T) {
"region": "test.region3",
"image_id": "test.image3",
"ssh_key_name": "test-key3",
"max_idle_agent_period": "30s",
"max_agent_starting_period": "30s"
"max_idle_agent_period": "30s"
}`
config := Config{}
err := json.Unmarshal([]byte(configRaw), &config)
Expand All @@ -49,13 +47,12 @@ func TestUnmarshalProvisionerConfigMasterURL(t *testing.T) {
awsConfig.ImageID = "test.image3"
awsConfig.SSHKeyName = "test-key3"
unmarshaled := Config{
MasterURL: "http://test.master:8080",
AgentDockerImage: "test_image",
AgentDockerRuntime: "runc",
AgentDockerNetwork: "default",
AWS: &awsConfig,
MaxIdleAgentPeriod: Duration(30 * time.Second),
MaxAgentStartingPeriod: Duration(30 * time.Second),
MasterURL: "http://test.master:8080",
AgentDockerImage: "test_image",
AgentDockerRuntime: "runc",
AgentDockerNetwork: "default",
AWS: &awsConfig,
MaxIdleAgentPeriod: Duration(30 * time.Second),
}
assert.DeepEqual(t, config, unmarshaled)
}
Expand Down Expand Up @@ -83,8 +80,7 @@ func TestUnmarshalProvisionerConfigWithAWS(t *testing.T) {
"region": "test.region2",
"image_id": "test.image2",
"ssh_key_name": "test-key2",
"max_idle_agent_period": "30s",
"max_agent_starting_period": "30s"
"max_idle_agent_period": "30s"
}`
config := Config{}
err := json.Unmarshal([]byte(configRaw), &config)
Expand All @@ -98,13 +94,12 @@ func TestUnmarshalProvisionerConfigWithAWS(t *testing.T) {
awsConfig.ImageID = "test.image2"
awsConfig.SSHKeyName = "test-key2"
unmarshaled := Config{
MasterURL: "http://test.master:8080",
AWS: &awsConfig,
AgentDockerImage: "test_image",
AgentDockerRuntime: "runc",
AgentDockerNetwork: "default",
MaxIdleAgentPeriod: Duration(30 * time.Second),
MaxAgentStartingPeriod: Duration(30 * time.Second),
MasterURL: "http://test.master:8080",
AWS: &awsConfig,
AgentDockerImage: "test_image",
AgentDockerRuntime: "runc",
AgentDockerNetwork: "default",
MaxIdleAgentPeriod: Duration(30 * time.Second),
}
assert.DeepEqual(t, config, unmarshaled)
}
Expand All @@ -130,13 +125,12 @@ func TestUnmarshalProvisionerConfigWithGCP(t *testing.T) {
expected.Zone = "test-zone2"
expected.BootDiskSourceImage = "test-source_image2"
unmarshaled := Config{
MasterURL: "http://test.master:8080",
GCP: &expected,
AgentDockerImage: "test_image",
AgentDockerRuntime: "runc",
AgentDockerNetwork: "default",
MaxIdleAgentPeriod: Duration(5 * time.Minute),
MaxAgentStartingPeriod: Duration(5 * time.Minute),
MasterURL: "http://test.master:8080",
GCP: &expected,
AgentDockerImage: "test_image",
AgentDockerRuntime: "runc",
AgentDockerNetwork: "default",
MaxIdleAgentPeriod: Duration(5 * time.Minute),
}
assert.DeepEqual(t, config, unmarshaled)
}
7 changes: 2 additions & 5 deletions master/internal/provisioner/provisioner.go
Original file line number Diff line number Diff line change
Expand Up @@ -62,11 +62,8 @@ func New(config *Config) (*Provisioner, error) {
}

return &Provisioner{
provider: cluster,
scaleDecider: newScaleDecider(
time.Duration(config.MaxIdleAgentPeriod),
time.Duration(config.MaxAgentStartingPeriod),
),
provider: cluster,
scaleDecider: newScaleDecider(time.Duration(config.MaxIdleAgentPeriod)),
}, nil
}

Expand Down
Loading

0 comments on commit 897f2f6

Please sign in to comment.