From c21b4e8a5e03f41f7809ed2afba51b6461cbbd8f Mon Sep 17 00:00:00 2001 From: Sanne Raymaekers Date: Mon, 7 Oct 2024 12:49:25 +0200 Subject: [PATCH] cloud/aws: add a third secure instance fallback across AZs If the on-demand option fails as well, retry one more time across availability zones. This significantly increases the pool of available instances, but also increases network-related costs, as transferring data between AZs is not free. --- internal/cloud/awscloud/secure-instance.go | 9 ++++++++- internal/cloud/awscloud/secure-instance_test.go | 6 +++--- 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/internal/cloud/awscloud/secure-instance.go b/internal/cloud/awscloud/secure-instance.go index e4c9ce9f1e..98749cd022 100644 --- a/internal/cloud/awscloud/secure-instance.go +++ b/internal/cloud/awscloud/secure-instance.go @@ -552,8 +552,15 @@ func (a *AWS) createFleet(input *ec2.CreateFleetInput) (*ec2.CreateFleetOutput, input.SpotOptions = nil createFleetOutput, err = a.ec2.CreateFleet(context.Background(), input) } + + if len(createFleetOutput.Errors) > 0 && *createFleetOutput.Errors[0].ErrorCode == "UnfulfillableCapacity" { + logrus.Warn("Received UnfulfillableCapacity from CreateFleet with OnDemand instance option, retrying across availability zones") + input.LaunchTemplateConfigs[0].Overrides = nil + createFleetOutput, err = a.ec2.CreateFleet(context.Background(), input) + } + if err != nil { - return nil, fmt.Errorf("Unable to create on-demand fleet: %w", err) + return nil, fmt.Errorf("Unable to create fleet, tried on-demand and across AZs: %w", err) } if len(createFleetOutput.Errors) > 0 { diff --git a/internal/cloud/awscloud/secure-instance_test.go b/internal/cloud/awscloud/secure-instance_test.go index 6d8fc6338e..ad27d342b5 100644 --- a/internal/cloud/awscloud/secure-instance_test.go +++ b/internal/cloud/awscloud/secure-instance_test.go @@ -142,12 +142,12 @@ func TestSICreateFleetFailures(t *testing.T) { aws := awscloud.NewForTest(m, 
&ec2imdsmock{t, "instance-id", "region1"}, nil, nil, nil) require.NotNil(t, aws) - // unfillable capacity should call create fleet twice + // unfillable capacity should call create fleet thrice m.failFn["CreateFleet"] = nil si, err := aws.RunSecureInstance("iam-profile", "key-name", "cw-group", "hostname") require.Error(t, err) require.Nil(t, si) - require.Equal(t, 2, m.calledFn["CreateFleet"]) + require.Equal(t, 3, m.calledFn["CreateFleet"]) require.Equal(t, 1, m.calledFn["CreateSecurityGroup"]) require.Equal(t, 1, m.calledFn["CreateLaunchTemplate"]) require.Equal(t, 2, m.calledFn["DeleteSecurityGroup"]) @@ -158,7 +158,7 @@ func TestSICreateFleetFailures(t *testing.T) { si, err = aws.RunSecureInstance("iam-profile", "key-name", "cw-group", "hostname") require.Error(t, err) require.Nil(t, si) - require.Equal(t, 3, m.calledFn["CreateFleet"]) + require.Equal(t, 4, m.calledFn["CreateFleet"]) require.Equal(t, 2, m.calledFn["CreateSecurityGroup"]) require.Equal(t, 2, m.calledFn["CreateLaunchTemplate"]) require.Equal(t, 4, m.calledFn["DeleteSecurityGroup"])