Skip to content

Commit

Permalink
Avoid AdminUpdate panic when Nodes are down (#1972)
Browse files Browse the repository at this point in the history
* Skip ensureAROOperator and aroDeploymentReady when the IngressProfiles data is missing, esp after cluster VM restarts as part of the update call
* Refactor Cluster Manager code to make ensureAROOperator code testable
* Add unit test for ensureAROOperator code

Co-authored-by: Ulrich Schlueter <[email protected]>
  • Loading branch information
UlrichSchlueter and Ulrich Schlueter authored Mar 16, 2022
1 parent 02d429a commit f4f2b72
Show file tree
Hide file tree
Showing 7 changed files with 312 additions and 11 deletions.
3 changes: 3 additions & 0 deletions pkg/cluster/adminupdate_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ func TestAdminUpdateSteps(t *testing.T) {
},
shouldRunSteps: []string{
"[Action initializeKubernetesClients-fm]",
"[Action initializeOperatorDeployer-fm]",
"[Action ensureBillingRecord-fm]",
"[Action ensureDefaults-fm]",
"[Action fixupClusterSPObjectID-fm]",
Expand All @@ -61,6 +62,7 @@ func TestAdminUpdateSteps(t *testing.T) {
},
shouldRunSteps: []string{
"[Action initializeKubernetesClients-fm]",
"[Action initializeOperatorDeployer-fm]",
"[Action ensureBillingRecord-fm]",
"[Action ensureDefaults-fm]",
"[Action fixupClusterSPObjectID-fm]",
Expand Down Expand Up @@ -99,6 +101,7 @@ func TestAdminUpdateSteps(t *testing.T) {
},
shouldRunSteps: []string{
"[Action initializeKubernetesClients-fm]",
"[Action initializeOperatorDeployer-fm]",
"[Action ensureBillingRecord-fm]",
"[Action ensureDefaults-fm]",
"[Action fixupClusterSPObjectID-fm]",
Expand Down
29 changes: 18 additions & 11 deletions pkg/cluster/arooperator.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,27 +5,34 @@ package cluster

import (
"context"

"github.com/Azure/ARO-RP/pkg/operator/deploy"
)

func (m *manager) isIngressProfileAvailable() bool {
// We try to aqcuire the IngressProfiles data at frontend best effort enrichment time only.
// When we start deallocated VMs and wait for the API do become available again, we don't pick
// the information up, even though it would be available.
return len(m.doc.OpenShiftCluster.Properties.IngressProfiles) != 0
}

func (m *manager) ensureAROOperator(ctx context.Context) error {
dep, err := deploy.New(m.log, m.env, m.doc.OpenShiftCluster, m.arocli, m.extensionscli, m.kubernetescli)
if err != nil {
m.log.Errorf("cannot ensureAROOperator.New: %s", err.Error())
return err
//ensure the IngressProfile information is available from the cluster which is not the case when the cluster vms were freshly restarted.
if !m.isIngressProfileAvailable() {
m.log.Error("skip ensureAROOperator")
return nil
}
err = dep.CreateOrUpdate(ctx)

err := m.aroOperatorDeployer.CreateOrUpdate(ctx)
if err != nil {
m.log.Errorf("cannot ensureAROOperator.CreateOrUpdate: %s", err.Error())
}
return err
}

func (m *manager) aroDeploymentReady(ctx context.Context) (bool, error) {
dep, err := deploy.New(m.log, m.env, m.doc.OpenShiftCluster, m.arocli, m.extensionscli, m.kubernetescli)
if err != nil {
return false, err
if !m.isIngressProfileAvailable() {
m.log.Error("skip aroDeploymentReady")
// skip and don't retry
return true, nil
}
return dep.IsReady(ctx)
return m.aroOperatorDeployer.IsReady(ctx)
}
204 changes: 204 additions & 0 deletions pkg/cluster/arooperator_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,204 @@
package cluster

// Copyright (c) Microsoft Corporation.
// Licensed under the Apache License 2.0.

import (
"context"
"errors"
"strings"
"testing"

"github.com/golang/mock/gomock"
"github.com/sirupsen/logrus"

"github.com/Azure/ARO-RP/pkg/api"
mock_deploy "github.com/Azure/ARO-RP/pkg/util/mocks/operator/deploy"
)

func TestEnsureAROOperator(t *testing.T) {
ctx := context.Background()

const (
key = "/subscriptions/00000000-0000-0000-0000-000000000000/resourcegroups/resourceGroup/providers/Microsoft.RedHatOpenShift/openShiftClusters/resourceName1"
)

for _, tt := range []struct {
name string
doc *api.OpenShiftClusterDocument
mocks func(*mock_deploy.MockOperator)
wantErr string
}{
{
name: "create/update success",
doc: &api.OpenShiftClusterDocument{
Key: strings.ToLower(key),
OpenShiftCluster: &api.OpenShiftCluster{
ID: key,
Properties: api.OpenShiftClusterProperties{
IngressProfiles: []api.IngressProfile{
{
Visibility: api.VisibilityPublic,
Name: "default",
},
},
ProvisioningState: api.ProvisioningStateAdminUpdating,
ClusterProfile: api.ClusterProfile{
Version: "4.8.18",
},
NetworkProfile: api.NetworkProfile{
APIServerPrivateEndpointIP: "1.2.3.4",
},
},
},
},
mocks: func(dep *mock_deploy.MockOperator) {
dep.EXPECT().
CreateOrUpdate(gomock.Any()).
Return(nil)
},
},
{
name: "create/update failure",
doc: &api.OpenShiftClusterDocument{
Key: strings.ToLower(key),
OpenShiftCluster: &api.OpenShiftCluster{
ID: key,
Properties: api.OpenShiftClusterProperties{
IngressProfiles: []api.IngressProfile{
{
Visibility: api.VisibilityPublic,
Name: "default",
},
},
ProvisioningState: api.ProvisioningStateAdminUpdating,
ClusterProfile: api.ClusterProfile{
Version: "4.8.18",
},
NetworkProfile: api.NetworkProfile{
APIServerPrivateEndpointIP: "1.2.3.4",
},
},
},
},
mocks: func(dep *mock_deploy.MockOperator) {
dep.EXPECT().
CreateOrUpdate(gomock.Any()).
Return(errors.New("Mock return: CreateFailed"))
},

wantErr: "Mock return: CreateFailed",
},
{
name: "enriched data not available - skip",

doc: &api.OpenShiftClusterDocument{
Key: strings.ToLower(key),
OpenShiftCluster: &api.OpenShiftCluster{
ID: key,
Properties: api.OpenShiftClusterProperties{
IngressProfiles: nil,
ProvisioningState: api.ProvisioningStateAdminUpdating,
},
},
},
},
} {
t.Run(tt.name, func(t *testing.T) {
controller := gomock.NewController(t)
defer controller.Finish()

dep := mock_deploy.NewMockOperator(controller)
if tt.mocks != nil {
tt.mocks(dep)
}

m := &manager{
log: logrus.NewEntry(logrus.StandardLogger()),
doc: tt.doc,

aroOperatorDeployer: dep,
}

err := m.ensureAROOperator(ctx)
if err != nil && err.Error() != tt.wantErr ||
err == nil && tt.wantErr != "" {
t.Error(err)
}
})
}
}

func TestAroDeploymentReady(t *testing.T) {
ctx := context.Background()

const (
key = "/subscriptions/00000000-0000-0000-0000-000000000000/resourcegroups/resourceGroup/providers/Microsoft.RedHatOpenShift/openShiftClusters/resourceName1"
)

for _, tt := range []struct {
name string
doc *api.OpenShiftClusterDocument
mocks func(*mock_deploy.MockOperator)
wantRes bool
}{
{
name: "create/update success",
doc: &api.OpenShiftClusterDocument{
Key: strings.ToLower(key),
OpenShiftCluster: &api.OpenShiftCluster{
ID: key,
Properties: api.OpenShiftClusterProperties{
IngressProfiles: []api.IngressProfile{
{
Visibility: api.VisibilityPublic,
Name: "default",
},
},
},
},
},
mocks: func(dep *mock_deploy.MockOperator) {
dep.EXPECT().
IsReady(gomock.Any()).
Return(true, nil)
},
wantRes: true,
},
{
name: "enriched data not available - skip",
doc: &api.OpenShiftClusterDocument{
Key: strings.ToLower(key),
OpenShiftCluster: &api.OpenShiftCluster{
ID: key,
Properties: api.OpenShiftClusterProperties{
IngressProfiles: nil,
},
},
},
wantRes: true,
},
} {
t.Run(tt.name, func(t *testing.T) {
controller := gomock.NewController(t)
defer controller.Finish()

dep := mock_deploy.NewMockOperator(controller)
if tt.mocks != nil {
tt.mocks(dep)
}

m := &manager{
log: logrus.NewEntry(logrus.StandardLogger()),
doc: tt.doc,
aroOperatorDeployer: dep,
}

ok, err := m.aroDeploymentReady(ctx)
if err != nil || ok != tt.wantRes {
t.Error(err)
}

})
}
}
3 changes: 3 additions & 0 deletions pkg/cluster/cluster.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ import (
"github.com/Azure/ARO-RP/pkg/database"
"github.com/Azure/ARO-RP/pkg/env"
aroclient "github.com/Azure/ARO-RP/pkg/operator/clientset/versioned"
"github.com/Azure/ARO-RP/pkg/operator/deploy"
"github.com/Azure/ARO-RP/pkg/util/azureclient/graphrbac"
"github.com/Azure/ARO-RP/pkg/util/azureclient/mgmt/authorization"
"github.com/Azure/ARO-RP/pkg/util/azureclient/mgmt/compute"
Expand Down Expand Up @@ -90,6 +91,8 @@ type manager struct {
securitycli securityclient.Interface
arocli aroclient.Interface
imageregistrycli imageregistryclient.Interface

aroOperatorDeployer deploy.Operator
}

// New returns a cluster manager
Expand Down
12 changes: 12 additions & 0 deletions pkg/cluster/install.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ import (

"github.com/Azure/ARO-RP/pkg/api"
aroclient "github.com/Azure/ARO-RP/pkg/operator/clientset/versioned"
"github.com/Azure/ARO-RP/pkg/operator/deploy"
"github.com/Azure/ARO-RP/pkg/util/restconfig"
"github.com/Azure/ARO-RP/pkg/util/steps"
"github.com/Azure/ARO-RP/pkg/util/version"
Expand All @@ -42,6 +43,7 @@ func (m *manager) adminUpdate() []steps.Step {
// don't require a running cluster
toRun := []steps.Step{
steps.Action(m.initializeKubernetesClients), // must be first
steps.Action(m.initializeOperatorDeployer), // depends on kube clients
steps.Action(m.ensureBillingRecord), // belt and braces
steps.Action(m.ensureDefaults),
steps.Action(m.fixupClusterSPObjectID),
Expand Down Expand Up @@ -107,6 +109,7 @@ func (m *manager) Update(ctx context.Context) error {
steps := []steps.Step{
steps.AuthorizationRefreshingAction(m.fpAuthorizer, steps.Action(m.validateResources)),
steps.Action(m.initializeKubernetesClients), // All init steps are first
steps.Action(m.initializeOperatorDeployer), // depends on kube clients
steps.Action(m.initializeClusterSPClients),
steps.Action(m.clusterSPObjectID),
// credentials rotation flow steps
Expand Down Expand Up @@ -163,12 +166,14 @@ func (m *manager) Install(ctx context.Context) error {
steps.Action(m.createAPIServerPrivateEndpoint),
steps.Action(m.createCertificates),
steps.Action(m.initializeKubernetesClients),
steps.Action(m.initializeOperatorDeployer), // depends on kube clients
steps.Condition(m.bootstrapConfigMapReady, 30*time.Minute, true),
steps.Action(m.ensureAROOperator),
steps.Action(m.incrInstallPhase),
},
api.InstallPhaseRemoveBootstrap: {
steps.Action(m.initializeKubernetesClients),
steps.Action(m.initializeOperatorDeployer), // depends on kube clients
steps.Action(m.removeBootstrap),
steps.Action(m.removeBootstrapIgnition),
steps.Action(m.configureAPIServerCertificate),
Expand Down Expand Up @@ -296,6 +301,13 @@ func (m *manager) initializeKubernetesClients(ctx context.Context) error {
return err
}

// initializeKubernetesClients initializes clients which are used
// once the cluster is up later on in the install process.
func (m *manager) initializeOperatorDeployer(ctx context.Context) (err error) {
m.aroOperatorDeployer, err = deploy.New(m.log, m.env, m.doc.OpenShiftCluster, m.arocli, m.extensionscli, m.kubernetescli)
return
}

// updateProvisionedBy sets the deploying resource provider version in
// the cluster document for deployment-tracking purposes.
func (m *manager) updateProvisionedBy(ctx context.Context) error {
Expand Down
8 changes: 8 additions & 0 deletions pkg/operator/deploy/generate.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
package deploy

// Copyright (c) Microsoft Corporation.
// Licensed under the Apache License 2.0.

//go:generate rm -rf ../../util/mocks/operator/$GOPACKAGE
//go:generate go run ../../../vendor/github.com/golang/mock/mockgen -destination=../../util/mocks/operator/$GOPACKAGE/$GOPACKAGE.go github.com/Azure/ARO-RP/pkg/operator/$GOPACKAGE Operator
//go:generate go run ../../../vendor/golang.org/x/tools/cmd/goimports -local=github.com/Azure/ARO-RP -e -w ../../util/mocks/operator/$GOPACKAGE/$GOPACKAGE.go
Loading

0 comments on commit f4f2b72

Please sign in to comment.