diff --git a/pkg/cluster/deploybaseresources.go b/pkg/cluster/deploybaseresources.go index df77101ce7b..4ffc3389d62 100644 --- a/pkg/cluster/deploybaseresources.go +++ b/pkg/cluster/deploybaseresources.go @@ -10,6 +10,7 @@ import ( "net/http" "regexp" "strings" + "time" mgmtnetwork "github.com/Azure/azure-sdk-for-go/services/network/mgmt/2020-08-01/network" mgmtfeatures "github.com/Azure/azure-sdk-for-go/services/resources/mgmt/2019-07-01/features" @@ -17,6 +18,7 @@ import ( "github.com/Azure/go-autorest/autorest/azure" "github.com/Azure/go-autorest/autorest/to" utilrand "k8s.io/apimachinery/pkg/util/rand" + "k8s.io/apimachinery/pkg/util/wait" "github.com/Azure/ARO-RP/pkg/api" apisubnet "github.com/Azure/ARO-RP/pkg/api/util/subnet" @@ -254,65 +256,103 @@ func (m *manager) subnetsWithServiceEndpoint(ctx context.Context, serviceEndpoin return subnets, nil } -func (m *manager) attachNSGs(ctx context.Context) (bool, error) { +// attachNSGs attaches NSGs to the cluster subnets, if preconfigured NSG is not +// enabled. This method is suitable for use with steps, and has default +// timeout/polls set. +func (m *manager) attachNSGs(ctx context.Context) error { + // Since we need to guard against the case where NSGs are not ready + // immediately after creation, we can have a relatively short retry period + // of 30s and timeout of 3m. These numbers were chosen via a + // highly-non-specific and data-adjacent process (picking them because they + // seemed decent enough). + // + // If we get the NSG not-ready error after 3 minutes, it's unusual enough + // that we should be raising it as an issue rather than tolerating it. + return m._attachNSGs(ctx, 3*time.Minute, 30*time.Second) +} + +// _attachNSGs attaches NSGs to the cluster subnets, if preconfigured NSG is not +// enabled. timeout and pollInterval are provided as arguments for testing +// reasons. +func (m *manager) _attachNSGs(ctx context.Context, timeout time.Duration, pollInterval time.Duration) error { if m.doc.OpenShiftCluster.Properties.NetworkProfile.PreconfiguredNSG == api.PreconfiguredNSGEnabled { - return true, nil + return nil } + var innerErr error + workerProfiles, _ := api.GetEnrichedWorkerProfiles(m.doc.OpenShiftCluster.Properties) workerSubnetId := workerProfiles[0].SubnetID - for _, subnetID := range []string{ - m.doc.OpenShiftCluster.Properties.MasterProfile.SubnetID, - workerSubnetId, - } { - m.log.Printf("attaching network security group to subnet %s", subnetID) - - // TODO: there is probably an undesirable race condition here - check if etags can help. - - s, err := m.subnet.Get(ctx, subnetID) - if err != nil { - return false, err - } + timeoutCtx, cancel := context.WithTimeout(ctx, timeout) + defer cancel() + + // This polling function protects the case below where the NSG might not be + // ready to be referenced. We don't guard against trying to re-attach the + // NSG since the inner loop is tolerant of that, and since we are attaching + // the same NSG the only allowed failure case is when the NSG cannot be + // attached to begin with, so it shouldn't happen in practice. + _ = wait.PollImmediateUntil(pollInterval, func() (bool, error) { + var c bool + c, innerErr = func() (bool, error) { + for _, subnetID := range []string{ + m.doc.OpenShiftCluster.Properties.MasterProfile.SubnetID, + workerSubnetId, + } { + m.log.Printf("attaching network security group to subnet %s", subnetID) + // TODO: there is probably an undesirable race condition here - check if etags can help. + + // We use the outer context, not the timeout context, as we do not want + // to time out the condition function itself, only stop retrying once + // timeoutCtx's timeout has fired. + s, err := m.subnet.Get(ctx, subnetID) + if err != nil { + return false, err + } - if s.SubnetPropertiesFormat == nil { - s.SubnetPropertiesFormat = &mgmtnetwork.SubnetPropertiesFormat{} - } + if s.SubnetPropertiesFormat == nil { + s.SubnetPropertiesFormat = &mgmtnetwork.SubnetPropertiesFormat{} + } - nsgID, err := apisubnet.NetworkSecurityGroupID(m.doc.OpenShiftCluster, subnetID) - if err != nil { - return false, err - } + nsgID, err := apisubnet.NetworkSecurityGroupID(m.doc.OpenShiftCluster, subnetID) + if err != nil { + return false, err + } - // Sometimes we get into the race condition between external services modifying - // subnets and our validation code. We try to catch this early, but - // these errors is propagated to make the user-facing error more clear incase - // modification happened after we ran validation code and we lost the race - if s.SubnetPropertiesFormat.NetworkSecurityGroup != nil { - if strings.EqualFold(*s.SubnetPropertiesFormat.NetworkSecurityGroup.ID, nsgID) { - continue - } + // Sometimes we get into the race condition between external services modifying + // subnets and our validation code. We try to catch this early, but + // these errors is propagated to make the user-facing error more clear incase + // modification happened after we ran validation code and we lost the race + if s.SubnetPropertiesFormat.NetworkSecurityGroup != nil { + if strings.EqualFold(*s.SubnetPropertiesFormat.NetworkSecurityGroup.ID, nsgID) { + continue + } - return false, api.NewCloudError(http.StatusBadRequest, api.CloudErrorCodeInvalidLinkedVNet, "", "The provided subnet '%s' is invalid: must not have a network security group attached.", subnetID) - } + return false, api.NewCloudError(http.StatusBadRequest, api.CloudErrorCodeInvalidLinkedVNet, "", "The provided subnet '%s' is invalid: must not have a network security group attached.", subnetID) + } - s.SubnetPropertiesFormat.NetworkSecurityGroup = &mgmtnetwork.SecurityGroup{ - ID: to.StringPtr(nsgID), - } + s.SubnetPropertiesFormat.NetworkSecurityGroup = &mgmtnetwork.SecurityGroup{ + ID: to.StringPtr(nsgID), + } - // Because we attempt to attach the NSG immediately after the base resource deployment - // finishes, the NSG is sometimes not yet ready to be referenced and used, causing - // an error to occur here. So if this particular error occurs, return nil to retry. - // But if some other type of error occurs, just return that error. - err = m.subnet.CreateOrUpdate(ctx, subnetID, s) - if err != nil { - if nsgNotReadyErrorRegex.MatchString(err.Error()) { - return false, nil + // Because we attempt to attach the NSG immediately after the base resource deployment + // finishes, the NSG is sometimes not yet ready to be referenced and used, causing + // an error to occur here. So if this particular error occurs, return nil to retry. + // But if some other type of error occurs, just return that error. + err = m.subnet.CreateOrUpdate(ctx, subnetID, s) + if err != nil { + if nsgNotReadyErrorRegex.MatchString(err.Error()) { + return false, nil + } + return false, err + } } - return false, err - } - } + return true, nil + }() + + return c, innerErr + }, timeoutCtx.Done()) - return true, nil + return innerErr } func (m *manager) setMasterSubnetPolicies(ctx context.Context) error { diff --git a/pkg/cluster/deploybaseresources_test.go b/pkg/cluster/deploybaseresources_test.go index 48faa427778..74ec15a44a5 100644 --- a/pkg/cluster/deploybaseresources_test.go +++ b/pkg/cluster/deploybaseresources_test.go @@ -12,6 +12,7 @@ import ( "sort" "strings" "testing" + "time" mgmtnetwork "github.com/Azure/azure-sdk-for-go/services/network/mgmt/2020-08-01/network" mgmtfeatures "github.com/Azure/azure-sdk-for-go/services/resources/mgmt/2019-07-01/features" @@ -242,11 +243,10 @@ func TestAttachNSGs(t *testing.T) { ctx := context.Background() for _, tt := range []struct { - name string - oc *api.OpenShiftClusterDocument - mocks func(*mock_subnet.MockManager) - wantResult bool - wantErr string + name string + oc *api.OpenShiftClusterDocument + mocks func(*mock_subnet.MockManager) + wantErr string }{ { name: "Success - NSG attached to both subnets", @@ -287,7 +287,6 @@ func TestAttachNSGs(t *testing.T) { }, }).Return(nil) }, - wantResult: true, }, { name: "Success - preconfigured NSG enabled", @@ -313,8 +312,7 @@ func TestAttachNSGs(t *testing.T) { }, }, }, - mocks: func(subnet *mock_subnet.MockManager) {}, - wantResult: true, + mocks: func(subnet *mock_subnet.MockManager) {}, }, { name: "Failure - unable to get a subnet", @@ -340,8 +338,7 @@ func TestAttachNSGs(t *testing.T) { mocks: func(subnet *mock_subnet.MockManager) { subnet.EXPECT().Get(ctx, "masterSubnetID").Return(&mgmtnetwork.Subnet{}, fmt.Errorf("subnet not found")) }, - wantResult: false, - wantErr: "subnet not found", + wantErr: "subnet not found", }, { name: "Failure - NSG already attached to a subnet", @@ -373,8 +370,7 @@ func TestAttachNSGs(t *testing.T) { }, }, nil) }, - wantResult: false, - wantErr: "400: InvalidLinkedVNet: : The provided subnet 'masterSubnetID' is invalid: must not have a network security group attached.", + wantErr: "400: InvalidLinkedVNet: : The provided subnet 'masterSubnetID' is invalid: must not have a network security group attached.", }, { name: "Failure - failed to CreateOrUpdate subnet because NSG not yet ready for use", @@ -407,7 +403,6 @@ func TestAttachNSGs(t *testing.T) { }, }).Return(fmt.Errorf("Some random stuff followed by the important part that we're trying to match: Resource /subscriptions/00000000-0000-0000-0000-000000000000/resourcegroups/aro-12345678/providers/Microsoft.Network/networkSecurityGroups/infra-nsg referenced by resource masterSubnetID was not found. and here's some more stuff that's at the end past the important part")) }, - wantResult: false, }, { name: "Failure - failed to CreateOrUpdate subnet with arbitrary error", @@ -440,8 +435,7 @@ func TestAttachNSGs(t *testing.T) { }, }).Return(fmt.Errorf("I'm an arbitrary error here to make life harder")) }, - wantResult: false, - wantErr: "I'm an arbitrary error here to make life harder", + wantErr: "I'm an arbitrary error here to make life harder", }, } { controller := gomock.NewController(t) @@ -456,10 +450,7 @@ func TestAttachNSGs(t *testing.T) { subnet: subnet, } - result, err := m.attachNSGs(ctx) - if result != tt.wantResult { - t.Errorf("Got %v, wanted %v", result, tt.wantResult) - } + err := m._attachNSGs(ctx, 1*time.Millisecond, 30*time.Second) utilerror.AssertErrorMessage(t, err, tt.wantErr) } } diff --git a/pkg/cluster/install.go b/pkg/cluster/install.go index b56b2b0aba7..660b3f75ecb 100644 --- a/pkg/cluster/install.go +++ b/pkg/cluster/install.go @@ -285,7 +285,7 @@ func (m *manager) bootstrap() []steps.Step { steps.Action(m.ensureServiceEndpoints), steps.Action(m.setMasterSubnetPolicies), steps.AuthorizationRetryingAction(m.fpAuthorizer, m.deployBaseResourceTemplate), - steps.Condition(m.attachNSGs, 3*time.Minute, true), + steps.AuthorizationRetryingAction(m.fpAuthorizer, m.attachNSGs), steps.Action(m.updateAPIIPEarly), steps.Action(m.createOrUpdateRouterIPEarly), steps.Action(m.ensureGatewayCreate), diff --git a/pkg/util/steps/refreshing.go b/pkg/util/steps/refreshing.go index a370101e4ae..96fbdf71f8a 100644 --- a/pkg/util/steps/refreshing.go +++ b/pkg/util/steps/refreshing.go @@ -83,7 +83,9 @@ func (s *authorizationRefreshingActionStep) run(ctx context.Context, log *logrus err = s.auth.Rebuild() return false, err // retry step } - log.Printf("non-auth error, giving up: %v", err) + if err != nil { + log.Printf("non-auth error, giving up: %v", err) + } return true, err }, timeoutCtx.Done())