Skip to content

Commit

Permalink
Inject volumeWaitParameters dependency
Browse files Browse the repository at this point in the history
  • Loading branch information
AndrewSirenko committed May 1, 2024
1 parent 48b2755 commit 1bff4e6
Show file tree
Hide file tree
Showing 3 changed files with 82 additions and 55 deletions.
98 changes: 48 additions & 50 deletions pkg/cloud/cloud.go
Original file line number Diff line number Diff line change
Expand Up @@ -86,20 +86,6 @@ var (
VolumeTypeST1,
VolumeTypeStandard,
}

volumeCreationPollInitialDelay = 1250 * time.Millisecond
volumeCreationPollBackoffDuration = 500 * time.Millisecond
volumeCreationPollBackoffFactor = 1.5
volumeCreationPollBackoffSteps = 25
volumeCreationPollBackoffCap = 3 * time.Second

volumeModificationDuration = 1 * time.Second
volumeModificationWaitFactor = 1.7
volumeModificationWaitSteps = 10

volumeAttachmentStatePollSteps = 13
volumeAttachmentStatePollDelay = 1 * time.Second
volumeAttachmentStatePollFactor = 1.8
)

const (
Expand Down Expand Up @@ -265,6 +251,47 @@ type ec2ListSnapshotsResponse struct {
NextToken *string
}

// volumeWaitParameters dictates how to poll for volume events.
// E.g. how often to check if volume is created after an EC2 CreateVolume call.
type volumeWaitParameters struct {
creationInitialDelay time.Duration
creationBackoff wait.Backoff
modificationBackoff wait.Backoff
attachmentBackoff wait.Backoff
}

var (
vwp = volumeWaitParameters{
// Based on our testing in us-west-2 and ap-south-1, the median/p99 time until volume creation is ~1.5/~4 seconds.
// We have found that the following parameters are optimal for minimizing provisioning time and DescribeVolumes calls
// we queue DescribeVolume calls after [1.25, 0.5, 0.75, 1.125, 1.7, 2.5, 3] seconds.
// In total, we wait for ~60 seconds.
creationInitialDelay: 1250 * time.Millisecond,
creationBackoff: wait.Backoff{
Duration: 500 * time.Millisecond,
Factor: 1.5,
Steps: 25,
Cap: 3 * time.Second,
},

// Most attach/detach operations on AWS finish within 1-4 seconds.
// By using 1 second starting interval with a backoff of 1.8,
// we get [1, 1.8, 3.24, 5.832000000000001, 10.4976].
// In total, we wait for 2601 seconds.
attachmentBackoff: wait.Backoff{
Duration: 1 * time.Second,
Factor: 1.8,
Steps: 13,
},

modificationBackoff: wait.Backoff{
Duration: 1 * time.Second,
Factor: 1.7,
Steps: 10,
},
}
)

// volumeBatcherType is an enum representing the types of volume batchers available.
type volumeBatcherType int

Expand All @@ -287,6 +314,7 @@ type cloud struct {
dm dm.DeviceManager
bm *batcherManager
rm *retryManager
vwp volumeWaitParameters
}

var _ Cloud = &cloud{}
Expand Down Expand Up @@ -343,14 +371,13 @@ func newEC2Cloud(region string, awsSdkDebugLog bool, userAgentExtra string, batc
bm = newBatcherManager(svc)
}

rm := newRetryManager()

return &cloud{
region: region,
dm: dm.NewDeviceManager(),
ec2: svc,
bm: bm,
rm: rm,
rm: newRetryManager(),
vwp: vwp,
}
}

Expand Down Expand Up @@ -967,16 +994,6 @@ func (c *cloud) DetachDisk(ctx context.Context, volumeID, nodeID string) error {

// WaitForAttachmentState polls until the attachment status is the expected value.
func (c *cloud) WaitForAttachmentState(ctx context.Context, volumeID, expectedState string, expectedInstance string, expectedDevice string, alreadyAssigned bool) (*types.VolumeAttachment, error) {
// Most attach/detach operations on AWS finish within 1-4 seconds.
// By using 1 second starting interval with a backoff of 1.8,
// we get [1, 1.8, 3.24, 5.832000000000001, 10.4976].
// In total we wait for 2601 seconds.
backoff := wait.Backoff{
Duration: volumeAttachmentStatePollDelay,
Factor: volumeAttachmentStatePollFactor,
Steps: volumeAttachmentStatePollSteps,
}

var attachment *types.VolumeAttachment

verifyVolumeFunc := func(ctx context.Context) (bool, error) {
Expand Down Expand Up @@ -1063,7 +1080,7 @@ func (c *cloud) WaitForAttachmentState(ctx context.Context, volumeID, expectedSt
return false, nil
}

return attachment, wait.ExponentialBackoffWithContext(ctx, backoff, verifyVolumeFunc)
return attachment, wait.ExponentialBackoffWithContext(ctx, c.vwp.attachmentBackoff, verifyVolumeFunc)
}

func (c *cloud) GetDiskByName(ctx context.Context, name string, capacityBytes int64) (*Disk, error) {
Expand Down Expand Up @@ -1533,26 +1550,13 @@ func (c *cloud) listSnapshots(ctx context.Context, request *ec2.DescribeSnapshot

// waitForVolume waits for volume to be in the "available" state.
func (c *cloud) waitForVolume(ctx context.Context, volumeID string) error {
// Based on our testing in us-west-2 and ap-south-1, the median/p99 time until volume creation is ~1.5/~4 seconds.
// We have found that the following parameters are optimal for minimizing provisioning time and DescribeVolumes calls
// we queue DescribeVolume calls after [1.25, 0.5, 0.75, 1.125, 1.7, 2.5, 3] seconds.
// In total, we wait for ~60 seconds.
var (
backoff = wait.Backoff{
Duration: volumeCreationPollBackoffDuration,
Factor: volumeCreationPollBackoffFactor,
Steps: volumeCreationPollBackoffSteps,
Cap: volumeCreationPollBackoffCap,
}
)

time.Sleep(volumeCreationPollInitialDelay)
time.Sleep(c.vwp.creationInitialDelay)

request := &ec2.DescribeVolumesInput{
VolumeIds: []string{volumeID},
}

err := wait.ExponentialBackoffWithContext(ctx, backoff, func(ctx context.Context) (done bool, err error) {
err := wait.ExponentialBackoffWithContext(ctx, c.vwp.creationBackoff, func(ctx context.Context) (done bool, err error) {
vol, err := c.getVolume(ctx, request)
if err != nil {
return true, err
Expand Down Expand Up @@ -1682,13 +1686,7 @@ func (c *cloud) checkDesiredState(ctx context.Context, volumeID string, desiredS

// waitForVolumeModification waits for a volume modification to finish.
func (c *cloud) waitForVolumeModification(ctx context.Context, volumeID string) error {
backoff := wait.Backoff{
Duration: volumeModificationDuration,
Factor: volumeModificationWaitFactor,
Steps: volumeModificationWaitSteps,
}

waitErr := wait.ExponentialBackoff(backoff, func() (bool, error) {
waitErr := wait.ExponentialBackoff(c.vwp.modificationBackoff, func() (bool, error) {
m, err := c.getLatestVolumeModification(ctx, volumeID, true)
// Consider volumes that have never been modified as done
if err != nil && errors.Is(err, VolumeNotBeingModified) {
Expand Down
23 changes: 18 additions & 5 deletions pkg/cloud/cloud_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ import (
"context"
"errors"
"fmt"
"k8s.io/apimachinery/pkg/util/wait"
"reflect"
"strings"
"sync"
Expand Down Expand Up @@ -931,7 +932,7 @@ func TestCreateDisk(t *testing.T) {
AvailabilityZone: "",
},
expCreateVolumeInput: &ec2.CreateVolumeInput{},
expErr: fmt.Errorf("timed out waiting for volume to create: context deadline exceeded"),
expErr: fmt.Errorf("timed out waiting for volume to create: timed out waiting for the condition"),
},
{
name: "success: normal from snapshot",
Expand Down Expand Up @@ -2387,8 +2388,6 @@ func TestResizeOrModifyDisk(t *testing.T) {
t.Run(tc.name, func(t *testing.T) {
mockCtrl := gomock.NewController(t)
mockEC2 := NewMockEC2API(mockCtrl)
// reduce number of steps to reduce test time
volumeModificationWaitSteps = 3
c := newCloud(mockEC2)

ctx := context.Background()
Expand Down Expand Up @@ -2990,8 +2989,6 @@ func TestWaitForAttachmentState(t *testing.T) {
},
}

volumeAttachmentStatePollSteps = 1

for _, tc := range testCases {
t.Run(tc.name, func(t *testing.T) {
mockCtrl := gomock.NewController(t)
Expand Down Expand Up @@ -3072,12 +3069,28 @@ func TestWaitForAttachmentState(t *testing.T) {
}
}

func testVolumeWaitParameters() volumeWaitParameters {
testBackoff := wait.Backoff{
Duration: 100 * time.Millisecond,
Factor: 1,
Steps: 3,
}

return volumeWaitParameters{
creationInitialDelay: 0,
creationBackoff: testBackoff,
attachmentBackoff: testBackoff,
modificationBackoff: testBackoff,
}
}

func newCloud(mockEC2 EC2API) Cloud {
c := &cloud{
region: "test-region",
dm: dm.NewDeviceManager(),
ec2: mockEC2,
rm: newRetryManager(),
vwp: testVolumeWaitParameters(),
}
return c
}
Expand Down
16 changes: 16 additions & 0 deletions pkg/cloud/retry_manager.go
Original file line number Diff line number Diff line change
@@ -1,3 +1,19 @@
/*
Copyright 2024 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package cloud

import (
Expand Down

0 comments on commit 1bff4e6

Please sign in to comment.