Skip to content

Commit

Permalink
cloud/awscloud: retry CreateFleet regardless of the error code
Browse files Browse the repository at this point in the history
The errors returned by create fleet are not entirely clear. It seems it
also returns `InsufficientInstanceCapacity` in addition to
`UnfulfillableCapacity`. Let's just retry three times regardless of the
create fleet error, that way there's no need to chase error codes which
aren't clearly defined.
  • Loading branch information
croissanne committed Oct 15, 2024
1 parent 7396823 commit 5eb8227
Show file tree
Hide file tree
Showing 2 changed files with 6 additions and 6 deletions.
10 changes: 5 additions & 5 deletions internal/cloud/awscloud/secure-instance.go
Original file line number Diff line number Diff line change
Expand Up @@ -547,14 +547,14 @@ func (a *AWS) createFleet(input *ec2.CreateFleetInput) (*ec2.CreateFleetOutput,
return nil, fmt.Errorf("Unable to create spot fleet: %w", err)
}

if len(createFleetOutput.Errors) > 0 && *createFleetOutput.Errors[0].ErrorCode == "UnfulfillableCapacity" {
logrus.Warn("Received UnfulfillableCapacity from CreateFleet, retrying CreateFleet with OnDemand instance")
if len(createFleetOutput.Errors) > 0 {
logrus.Warnf("Received error %s from CreateFleet, retrying CreateFleet with OnDemand instance", *createFleetOutput.Errors[0].ErrorCode)
input.SpotOptions = nil
createFleetOutput, err = a.ec2.CreateFleet(context.Background(), input)
}

if len(createFleetOutput.Errors) > 0 && *createFleetOutput.Errors[0].ErrorCode == "UnfulfillableCapacity" {
logrus.Warn("Received UnfulfillableCapacity from CreateFleet with OnDemand instance option, retrying across availability zones")
if err == nil && len(createFleetOutput.Errors) > 0 {
logrus.Warnf("Received error %s from CreateFleet with OnDemand instance option, retrying across availability zones", *createFleetOutput.Errors[0].ErrorCode)
input.LaunchTemplateConfigs[0].Overrides = nil
createFleetOutput, err = a.ec2.CreateFleet(context.Background(), input)
}
Expand All @@ -566,7 +566,7 @@ func (a *AWS) createFleet(input *ec2.CreateFleetInput) (*ec2.CreateFleetOutput,
if len(createFleetOutput.Errors) > 0 {
fleetErrs := []string{}
for _, fleetErr := range createFleetOutput.Errors {
fleetErrs = append(fleetErrs, *fleetErr.ErrorMessage)
fleetErrs = append(fleetErrs, fmt.Sprintf("%s: %s", *fleetErr.ErrorCode, *fleetErr.ErrorMessage))
}
return nil, fmt.Errorf("Unable to create fleet: %v", strings.Join(fleetErrs, "; "))
}
Expand Down
2 changes: 1 addition & 1 deletion internal/cloud/awscloud/secure-instance_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -142,7 +142,7 @@ func TestSICreateFleetFailures(t *testing.T) {
aws := awscloud.NewForTest(m, &ec2imdsmock{t, "instance-id", "region1"}, nil, nil, nil)
require.NotNil(t, aws)

// unfillable capacity should call create fleet thrice
// create fleet error should call create fleet thrice
m.failFn["CreateFleet"] = nil
si, err := aws.RunSecureInstance("iam-profile", "key-name", "cw-group", "hostname")
require.Error(t, err)
Expand Down

0 comments on commit 5eb8227

Please sign in to comment.