Skip to content

Commit

Permalink
Merge pull request #3403 from hawkowl/hawkowl/local-provision-fixes
Browse files Browse the repository at this point in the history
Fix failures provisioning a cluster in dev due to NSG attaching
  • Loading branch information
jaitaiwan authored Feb 21, 2024
2 parents 07135d8 + 527fe77 commit c02c83b
Show file tree
Hide file tree
Showing 4 changed files with 100 additions and 67 deletions.
132 changes: 86 additions & 46 deletions pkg/cluster/deploybaseresources.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,13 +10,15 @@ import (
"net/http"
"regexp"
"strings"
"time"

mgmtnetwork "github.com/Azure/azure-sdk-for-go/services/network/mgmt/2020-08-01/network"
mgmtfeatures "github.com/Azure/azure-sdk-for-go/services/resources/mgmt/2019-07-01/features"
"github.com/Azure/go-autorest/autorest"
"github.com/Azure/go-autorest/autorest/azure"
"github.com/Azure/go-autorest/autorest/to"
utilrand "k8s.io/apimachinery/pkg/util/rand"
"k8s.io/apimachinery/pkg/util/wait"

"github.com/Azure/ARO-RP/pkg/api"
apisubnet "github.com/Azure/ARO-RP/pkg/api/util/subnet"
Expand Down Expand Up @@ -254,65 +256,103 @@ func (m *manager) subnetsWithServiceEndpoint(ctx context.Context, serviceEndpoin
return subnets, nil
}

func (m *manager) attachNSGs(ctx context.Context) (bool, error) {
// attachNSGs attaches NSGs to the cluster subnets, if preconfigured NSG is not
// enabled. This method is suitable for use with steps, and has default
// timeout/polls set.
func (m *manager) attachNSGs(ctx context.Context) error {
// Since we need to guard against the case where NSGs are not ready
// immediately after creation, we can have a relatively short retry period
// of 30s and timeout of 3m. These numbers were chosen via a
// highly-non-specific and data-adjacent process (picking them because they
// seemed decent enough).
//
// If we get the NSG not-ready error after 3 minutes, it's unusual enough
// that we should be raising it as an issue rather than tolerating it.
return m._attachNSGs(ctx, 3*time.Minute, 30*time.Second)
}

// _attachNSGs attaches NSGs to the cluster subnets, if preconfigured NSG is not
// enabled. timeout and pollInterval are provided as arguments for testing
// reasons.
func (m *manager) _attachNSGs(ctx context.Context, timeout time.Duration, pollInterval time.Duration) error {
if m.doc.OpenShiftCluster.Properties.NetworkProfile.PreconfiguredNSG == api.PreconfiguredNSGEnabled {
return true, nil
return nil
}
var innerErr error

workerProfiles, _ := api.GetEnrichedWorkerProfiles(m.doc.OpenShiftCluster.Properties)
workerSubnetId := workerProfiles[0].SubnetID

for _, subnetID := range []string{
m.doc.OpenShiftCluster.Properties.MasterProfile.SubnetID,
workerSubnetId,
} {
m.log.Printf("attaching network security group to subnet %s", subnetID)

// TODO: there is probably an undesirable race condition here - check if etags can help.

s, err := m.subnet.Get(ctx, subnetID)
if err != nil {
return false, err
}
timeoutCtx, cancel := context.WithTimeout(ctx, timeout)
defer cancel()

// This polling function protects the case below where the NSG might not be
// ready to be referenced. We don't guard against trying to re-attach the
// NSG since the inner loop is tolerant of that, and since we are attaching
// the same NSG the only allowed failure case is when the NSG cannot be
// attached to begin with, so it shouldn't happen in practice.
_ = wait.PollImmediateUntil(pollInterval, func() (bool, error) {
var c bool
c, innerErr = func() (bool, error) {
for _, subnetID := range []string{
m.doc.OpenShiftCluster.Properties.MasterProfile.SubnetID,
workerSubnetId,
} {
m.log.Printf("attaching network security group to subnet %s", subnetID)
// TODO: there is probably an undesirable race condition here - check if etags can help.

// We use the outer context, not the timeout context, as we do not want
// to time out the condition function itself, only stop retrying once
// timeoutCtx's timeout has fired.
s, err := m.subnet.Get(ctx, subnetID)
if err != nil {
return false, err
}

if s.SubnetPropertiesFormat == nil {
s.SubnetPropertiesFormat = &mgmtnetwork.SubnetPropertiesFormat{}
}
if s.SubnetPropertiesFormat == nil {
s.SubnetPropertiesFormat = &mgmtnetwork.SubnetPropertiesFormat{}
}

nsgID, err := apisubnet.NetworkSecurityGroupID(m.doc.OpenShiftCluster, subnetID)
if err != nil {
return false, err
}
nsgID, err := apisubnet.NetworkSecurityGroupID(m.doc.OpenShiftCluster, subnetID)
if err != nil {
return false, err
}

// Sometimes we get into the race condition between external services modifying
// subnets and our validation code. We try to catch this early, but
// these errors is propagated to make the user-facing error more clear incase
// modification happened after we ran validation code and we lost the race
if s.SubnetPropertiesFormat.NetworkSecurityGroup != nil {
if strings.EqualFold(*s.SubnetPropertiesFormat.NetworkSecurityGroup.ID, nsgID) {
continue
}
// Sometimes we get into the race condition between external services modifying
// subnets and our validation code. We try to catch this early, but
// these errors is propagated to make the user-facing error more clear incase
// modification happened after we ran validation code and we lost the race
if s.SubnetPropertiesFormat.NetworkSecurityGroup != nil {
if strings.EqualFold(*s.SubnetPropertiesFormat.NetworkSecurityGroup.ID, nsgID) {
continue
}

return false, api.NewCloudError(http.StatusBadRequest, api.CloudErrorCodeInvalidLinkedVNet, "", "The provided subnet '%s' is invalid: must not have a network security group attached.", subnetID)
}
return false, api.NewCloudError(http.StatusBadRequest, api.CloudErrorCodeInvalidLinkedVNet, "", "The provided subnet '%s' is invalid: must not have a network security group attached.", subnetID)
}

s.SubnetPropertiesFormat.NetworkSecurityGroup = &mgmtnetwork.SecurityGroup{
ID: to.StringPtr(nsgID),
}
s.SubnetPropertiesFormat.NetworkSecurityGroup = &mgmtnetwork.SecurityGroup{
ID: to.StringPtr(nsgID),
}

// Because we attempt to attach the NSG immediately after the base resource deployment
// finishes, the NSG is sometimes not yet ready to be referenced and used, causing
// an error to occur here. So if this particular error occurs, return nil to retry.
// But if some other type of error occurs, just return that error.
err = m.subnet.CreateOrUpdate(ctx, subnetID, s)
if err != nil {
if nsgNotReadyErrorRegex.MatchString(err.Error()) {
return false, nil
// Because we attempt to attach the NSG immediately after the base resource deployment
// finishes, the NSG is sometimes not yet ready to be referenced and used, causing
// an error to occur here. So if this particular error occurs, return nil to retry.
// But if some other type of error occurs, just return that error.
err = m.subnet.CreateOrUpdate(ctx, subnetID, s)
if err != nil {
if nsgNotReadyErrorRegex.MatchString(err.Error()) {
return false, nil
}
return false, err
}
}
return false, err
}
}
return true, nil
}()

return c, innerErr
}, timeoutCtx.Done())

return true, nil
return innerErr
}

func (m *manager) setMasterSubnetPolicies(ctx context.Context) error {
Expand Down
29 changes: 10 additions & 19 deletions pkg/cluster/deploybaseresources_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ import (
"sort"
"strings"
"testing"
"time"

mgmtnetwork "github.com/Azure/azure-sdk-for-go/services/network/mgmt/2020-08-01/network"
mgmtfeatures "github.com/Azure/azure-sdk-for-go/services/resources/mgmt/2019-07-01/features"
Expand Down Expand Up @@ -242,11 +243,10 @@ func TestAttachNSGs(t *testing.T) {
ctx := context.Background()

for _, tt := range []struct {
name string
oc *api.OpenShiftClusterDocument
mocks func(*mock_subnet.MockManager)
wantResult bool
wantErr string
name string
oc *api.OpenShiftClusterDocument
mocks func(*mock_subnet.MockManager)
wantErr string
}{
{
name: "Success - NSG attached to both subnets",
Expand Down Expand Up @@ -287,7 +287,6 @@ func TestAttachNSGs(t *testing.T) {
},
}).Return(nil)
},
wantResult: true,
},
{
name: "Success - preconfigured NSG enabled",
Expand All @@ -313,8 +312,7 @@ func TestAttachNSGs(t *testing.T) {
},
},
},
mocks: func(subnet *mock_subnet.MockManager) {},
wantResult: true,
mocks: func(subnet *mock_subnet.MockManager) {},
},
{
name: "Failure - unable to get a subnet",
Expand All @@ -340,8 +338,7 @@ func TestAttachNSGs(t *testing.T) {
mocks: func(subnet *mock_subnet.MockManager) {
subnet.EXPECT().Get(ctx, "masterSubnetID").Return(&mgmtnetwork.Subnet{}, fmt.Errorf("subnet not found"))
},
wantResult: false,
wantErr: "subnet not found",
wantErr: "subnet not found",
},
{
name: "Failure - NSG already attached to a subnet",
Expand Down Expand Up @@ -373,8 +370,7 @@ func TestAttachNSGs(t *testing.T) {
},
}, nil)
},
wantResult: false,
wantErr: "400: InvalidLinkedVNet: : The provided subnet 'masterSubnetID' is invalid: must not have a network security group attached.",
wantErr: "400: InvalidLinkedVNet: : The provided subnet 'masterSubnetID' is invalid: must not have a network security group attached.",
},
{
name: "Failure - failed to CreateOrUpdate subnet because NSG not yet ready for use",
Expand Down Expand Up @@ -407,7 +403,6 @@ func TestAttachNSGs(t *testing.T) {
},
}).Return(fmt.Errorf("Some random stuff followed by the important part that we're trying to match: Resource /subscriptions/00000000-0000-0000-0000-000000000000/resourcegroups/aro-12345678/providers/Microsoft.Network/networkSecurityGroups/infra-nsg referenced by resource masterSubnetID was not found. and here's some more stuff that's at the end past the important part"))
},
wantResult: false,
},
{
name: "Failure - failed to CreateOrUpdate subnet with arbitrary error",
Expand Down Expand Up @@ -440,8 +435,7 @@ func TestAttachNSGs(t *testing.T) {
},
}).Return(fmt.Errorf("I'm an arbitrary error here to make life harder"))
},
wantResult: false,
wantErr: "I'm an arbitrary error here to make life harder",
wantErr: "I'm an arbitrary error here to make life harder",
},
} {
controller := gomock.NewController(t)
Expand All @@ -456,10 +450,7 @@ func TestAttachNSGs(t *testing.T) {
subnet: subnet,
}

result, err := m.attachNSGs(ctx)
if result != tt.wantResult {
t.Errorf("Got %v, wanted %v", result, tt.wantResult)
}
err := m._attachNSGs(ctx, 1*time.Millisecond, 30*time.Second)
utilerror.AssertErrorMessage(t, err, tt.wantErr)
}
}
Expand Down
2 changes: 1 addition & 1 deletion pkg/cluster/install.go
Original file line number Diff line number Diff line change
Expand Up @@ -285,7 +285,7 @@ func (m *manager) bootstrap() []steps.Step {
steps.Action(m.ensureServiceEndpoints),
steps.Action(m.setMasterSubnetPolicies),
steps.AuthorizationRetryingAction(m.fpAuthorizer, m.deployBaseResourceTemplate),
steps.Condition(m.attachNSGs, 3*time.Minute, true),
steps.AuthorizationRetryingAction(m.fpAuthorizer, m.attachNSGs),
steps.Action(m.updateAPIIPEarly),
steps.Action(m.createOrUpdateRouterIPEarly),
steps.Action(m.ensureGatewayCreate),
Expand Down
4 changes: 3 additions & 1 deletion pkg/util/steps/refreshing.go
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,9 @@ func (s *authorizationRefreshingActionStep) run(ctx context.Context, log *logrus
err = s.auth.Rebuild()
return false, err // retry step
}
log.Printf("non-auth error, giving up: %v", err)
if err != nil {
log.Printf("non-auth error, giving up: %v", err)
}
return true, err
}, timeoutCtx.Done())

Expand Down

0 comments on commit c02c83b

Please sign in to comment.