From 508ce328ef1a6531f676e13023932253ee814768 Mon Sep 17 00:00:00 2001 From: Tanmay Satam Date: Wed, 10 Apr 2024 09:58:50 -0400 Subject: [PATCH 1/3] Do not perform explicit cluster deletion in prod e2e --- pkg/util/cluster/cluster.go | 26 ++++++++++++++++++++------ 1 file changed, 20 insertions(+), 6 deletions(-) diff --git a/pkg/util/cluster/cluster.go b/pkg/util/cluster/cluster.go index fdf483b6349..aa94240a6d7 100644 --- a/pkg/util/cluster/cluster.go +++ b/pkg/util/cluster/cluster.go @@ -390,15 +390,23 @@ func (c *Cluster) Delete(ctx context.Context, vnetResourceGroup, clusterName str if err != nil { errs = append(errs, err) } - - c.log.Print("deleting cluster") - err = c.openshiftclustersv20200430.DeleteAndWait(ctx, vnetResourceGroup, clusterName) - if err != nil { - errs = append(errs, err) - } } if c.ci { + // Only perform explicit cluster deletion when in local development mode, otherwise + // we let the resource group deletion performed below clean up the cluster. + // + // When the cluster resource is known by ARM (e.g. prod e2e), deleting the cluster + // here can result in race conditions when deleting the resource group below, + // as ARM will also attempt to delete the cluster. + if env.IsLocalDevelopmentMode() { + c.log.Print("deleting cluster") + err = c.openshiftclustersv20200430.DeleteAndWait(ctx, vnetResourceGroup, clusterName) + if err != nil { + errs = append(errs, err) + } + } + _, err = c.groups.Get(ctx, vnetResourceGroup) if err == nil { c.log.Print("deleting resource group") @@ -418,6 +426,12 @@ func (c *Cluster) Delete(ctx context.Context, vnetResourceGroup, clusterName str } } } else { + c.log.Print("deleting cluster") + err = c.openshiftclustersv20200430.DeleteAndWait(ctx, vnetResourceGroup, clusterName) + if err != nil { + errs = append(errs, err) + } + // Deleting the deployment does not clean up the associated resources c.log.Info("deleting deployment") err = c.deployments.DeleteAndWait(ctx, vnetResourceGroup, clusterName) From 85a843581cc44b4c2e401d15ce0662c09deaa192 Mon Sep 17 00:00:00 2001 From: Tanmay Satam Date: Wed, 10 Apr 2024 16:24:19 -0400 Subject: [PATCH 2/3] Add more robust logging to deletion process --- pkg/util/cluster/cluster.go | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/pkg/util/cluster/cluster.go b/pkg/util/cluster/cluster.go index aa94240a6d7..683fa1efa7a 100644 --- a/pkg/util/cluster/cluster.go +++ b/pkg/util/cluster/cluster.go @@ -388,6 +388,7 @@ func (c *Cluster) Delete(ctx context.Context, vnetResourceGroup, clusterName str c.log.Print("deleting role assignments") err = c.deleteRoleAssignments(ctx, vnetResourceGroup, *oc.OpenShiftClusterProperties.ServicePrincipalProfile.ClientID) if err != nil { + c.log.Errorf("error when deleting role assignments: %v", err) errs = append(errs, err) } } @@ -403,6 +404,7 @@ func (c *Cluster) Delete(ctx context.Context, vnetResourceGroup, clusterName str c.log.Print("deleting cluster") err = c.openshiftclustersv20200430.DeleteAndWait(ctx, vnetResourceGroup, clusterName) if err != nil { + c.log.Errorf("error when deleting cluster: %v", err) errs = append(errs, err) } } @@ -412,6 +414,7 @@ func (c *Cluster) Delete(ctx context.Context, vnetResourceGroup, clusterName str c.log.Print("deleting resource group") err = c.groups.DeleteAndWait(ctx, vnetResourceGroup) if err != nil { + c.log.Errorf("error when deleting resource group: %v", err) errs = append(errs, err) } } @@ -422,6 +425,7 @@ func (c *Cluster) Delete(ctx context.Context, vnetResourceGroup, clusterName str err = c.ciParentVnetPeerings.DeleteAndWait(ctx, r.ResourceGroup, r.ResourceName, vnetResourceGroup+"-peer") } if err != nil { + c.log.Errorf("error when deleting vnet peerings: %v", err) errs = append(errs, err) } } @@ -429,6 +433,7 @@ func (c *Cluster) Delete(ctx context.Context, vnetResourceGroup, clusterName str c.log.Print("deleting cluster") err = c.openshiftclustersv20200430.DeleteAndWait(ctx, vnetResourceGroup, clusterName) if err != nil { + c.log.Errorf("error when deleting cluster: %v", err) errs = append(errs, err) } @@ -436,33 +441,37 @@ func (c *Cluster) Delete(ctx context.Context, vnetResourceGroup, clusterName str c.log.Info("deleting deployment") err = c.deployments.DeleteAndWait(ctx, vnetResourceGroup, clusterName) if err != nil { + c.log.Errorf("error when deleting deployment: %v", err) errs = append(errs, err) } c.log.Info("deleting master/worker subnets") err = c.subnets.DeleteAndWait(ctx, vnetResourceGroup, "dev-vnet", clusterName+"-master") if err != nil { + c.log.Errorf("error when deleting master subnet: %v", err) errs = append(errs, err) } err = c.subnets.DeleteAndWait(ctx, vnetResourceGroup, "dev-vnet", clusterName+"-worker") if err != nil { + c.log.Errorf("error when deleting worker subnet: %v", err) errs = append(errs, err) } c.log.Info("deleting route table") err = c.routetables.DeleteAndWait(ctx, vnetResourceGroup, clusterName+"-rt") if err != nil { + c.log.Errorf("error when deleting route table: %v", err) errs = append(errs, err) } } - c.log.Info("done") - if errs != nil { + c.log.Error("done, errors encountered") return errs // https://golang.org/doc/faq#nil_error } + c.log.Info("done") return nil } From d69240c34fe572ec82aa5e319204c099f4b0293c Mon Sep 17 00:00:00 2001 From: Tanmay Satam Date: Wed, 17 Apr 2024 17:29:52 -0400 Subject: [PATCH 3/3] Refactor pkg/util/cluster Delete function to declaratively define deletion steps --- pkg/util/cluster/cluster.go | 206 +++++++++++++++++------------------- 1 file changed, 99 insertions(+), 107 deletions(-) diff --git a/pkg/util/cluster/cluster.go b/pkg/util/cluster/cluster.go index 683fa1efa7a..245060d2a88 100644 --- a/pkg/util/cluster/cluster.go +++ b/pkg/util/cluster/cluster.go @@ -8,6 +8,7 @@ import ( "context" "crypto/tls" "encoding/json" + "errors" "fmt" "math/rand" "net/http" @@ -70,19 +71,6 @@ type Cluster struct { vaultsClient keyvaultclient.VaultsClient } -type errors []error - -func (errs errors) Error() string { - var sb strings.Builder - - for _, err := range errs { - sb.WriteString(err.Error()) - sb.WriteByte('\n') - } - - return sb.String() -} - func New(log *logrus.Entry, environment env.Core, ci bool) (*Cluster, error) { if env.IsLocalDevelopmentMode() { if err := env.ValidateVars("AZURE_FP_CLIENT_ID"); err != nil { @@ -381,98 +369,32 @@ func (c *Cluster) generateSubnets() (vnetPrefix string, masterSubnet string, wor } func (c *Cluster) Delete(ctx context.Context, vnetResourceGroup, clusterName string) error { - var errs errors - - oc, err := c.openshiftclustersv20200430.Get(ctx, vnetResourceGroup, clusterName) - if err == nil { - c.log.Print("deleting role assignments") - err = c.deleteRoleAssignments(ctx, vnetResourceGroup, *oc.OpenShiftClusterProperties.ServicePrincipalProfile.ClientID) - if err != nil { - c.log.Errorf("error when deleting role assignments: %v", err) - errs = append(errs, err) - } - } - - if c.ci { - // Only perform explicit cluster deletion when in local development mode, otherwise - // we let the resource group deletion performed below clean up the cluster. - // - // When the cluster resource is known by ARM (e.g. prod e2e), deleting the cluster - // here can result in race conditions when deleting the resource group below, - // as ARM will also attempt to delete the cluster. - if env.IsLocalDevelopmentMode() { - c.log.Print("deleting cluster") - err = c.openshiftclustersv20200430.DeleteAndWait(ctx, vnetResourceGroup, clusterName) - if err != nil { - c.log.Errorf("error when deleting cluster: %v", err) - errs = append(errs, err) - } - } - - _, err = c.groups.Get(ctx, vnetResourceGroup) - if err == nil { - c.log.Print("deleting resource group") - err = c.groups.DeleteAndWait(ctx, vnetResourceGroup) - if err != nil { - c.log.Errorf("error when deleting resource group: %v", err) - errs = append(errs, err) - } - } - // Only delete peering if CI=true and RP_MODE=development - if env.IsLocalDevelopmentMode() { - r, err := azure.ParseResourceID(c.ciParentVnet) - if err == nil { - err = c.ciParentVnetPeerings.DeleteAndWait(ctx, r.ResourceGroup, r.ResourceName, vnetResourceGroup+"-peer") - } - if err != nil { - c.log.Errorf("error when deleting vnet peerings: %v", err) - errs = append(errs, err) - } - } - } else { - c.log.Print("deleting cluster") - err = c.openshiftclustersv20200430.DeleteAndWait(ctx, vnetResourceGroup, clusterName) - if err != nil { - c.log.Errorf("error when deleting cluster: %v", err) - errs = append(errs, err) - } - - // Deleting the deployment does not clean up the associated resources - c.log.Info("deleting deployment") - err = c.deployments.DeleteAndWait(ctx, vnetResourceGroup, clusterName) - if err != nil { - c.log.Errorf("error when deleting deployment: %v", err) - errs = append(errs, err) - } - - c.log.Info("deleting master/worker subnets") - err = c.subnets.DeleteAndWait(ctx, vnetResourceGroup, "dev-vnet", clusterName+"-master") - if err != nil { - c.log.Errorf("error when deleting master subnet: %v", err) - errs = append(errs, err) - } - - err = c.subnets.DeleteAndWait(ctx, vnetResourceGroup, "dev-vnet", clusterName+"-worker") - if err != nil { - c.log.Errorf("error when deleting worker subnet: %v", err) - errs = append(errs, err) - } - - c.log.Info("deleting route table") - err = c.routetables.DeleteAndWait(ctx, vnetResourceGroup, clusterName+"-rt") - if err != nil { - c.log.Errorf("error when deleting route table: %v", err) - errs = append(errs, err) - } - } - - if errs != nil { - c.log.Error("done, errors encountered") - return errs // https://golang.org/doc/faq#nil_error + var errs []error + + switch { + case c.ci && env.IsLocalDevelopmentMode(): // PR E2E + errs = append(errs, + c.deleteRoleAssignments(ctx, vnetResourceGroup, clusterName), + c.deleteCluster(ctx, vnetResourceGroup, clusterName), + c.deleteClusterResourceGroup(ctx, vnetResourceGroup), + c.deleteVnetPeerings(ctx, vnetResourceGroup), + ) + case c.ci: // Prod E2E + errs = append(errs, + c.deleteRoleAssignments(ctx, vnetResourceGroup, clusterName), + c.deleteClusterResourceGroup(ctx, vnetResourceGroup), + ) + default: + errs = append(errs, + c.deleteRoleAssignments(ctx, vnetResourceGroup, clusterName), + c.deleteCluster(ctx, vnetResourceGroup, clusterName), + c.deleteDeployment(ctx, vnetResourceGroup, clusterName), // Deleting the deployment does not clean up the associated resources + c.deleteVnetResources(ctx, vnetResourceGroup, "dev-vnet", clusterName), + ) } c.log.Info("done") - return nil + return errors.Join(errs...) } // createCluster created new clusters, based on where it is running. @@ -670,10 +592,15 @@ func (c *Cluster) fixupNSGs(ctx context.Context, vnetResourceGroup, clusterName return nil } -func (c *Cluster) deleteRoleAssignments(ctx context.Context, vnetResourceGroup, appID string) error { - spObjID, err := utilgraph.GetServicePrincipalIDByAppID(ctx, c.spGraphClient, appID) +func (c *Cluster) deleteRoleAssignments(ctx context.Context, vnetResourceGroup, clusterName string) error { + c.log.Print("deleting role assignments") + oc, err := c.openshiftclustersv20200430.Get(ctx, vnetResourceGroup, clusterName) if err != nil { - return err + return fmt.Errorf("error getting cluster document: %w", err) + } + spObjID, err := utilgraph.GetServicePrincipalIDByAppID(ctx, c.spGraphClient, *oc.OpenShiftClusterProperties.ServicePrincipalProfile.ClientID) + if err != nil { + return fmt.Errorf("error getting service principal for cluster: %w", err) } if spObjID == nil { return nil @@ -681,7 +608,7 @@ func (c *Cluster) deleteRoleAssignments(ctx context.Context, vnetResourceGroup, roleAssignments, err := c.roleassignments.ListForResourceGroup(ctx, vnetResourceGroup, fmt.Sprintf("principalId eq '%s'", *spObjID)) if err != nil { - return err + return fmt.Errorf("error listing role assignments for service principal: %w", err) } for _, roleAssignment := range roleAssignments { @@ -693,7 +620,7 @@ func (c *Cluster) deleteRoleAssignments(ctx context.Context, vnetResourceGroup, c.log.Infof("deleting role assignment %s", *roleAssignment.Name) _, err = c.roleassignments.Delete(ctx, *roleAssignment.Scope, *roleAssignment.Name) if err != nil { - return err + return fmt.Errorf("error deleting role assignment %s: %w", *roleAssignment.Name, err) } } } @@ -701,6 +628,71 @@ func (c *Cluster) deleteRoleAssignments(ctx context.Context, vnetResourceGroup, return nil } +func (c *Cluster) deleteCluster(ctx context.Context, resourceGroup, clusterName string) error { + c.log.Printf("deleting cluster %s", clusterName) + if err := c.openshiftclustersv20200430.DeleteAndWait(ctx, resourceGroup, clusterName); err != nil { + return fmt.Errorf("error deleting cluster %s: %w", clusterName, err) + } + return nil +} + +func (c *Cluster) deleteClusterResourceGroup(ctx context.Context, resourceGroup string) error { + if _, err := c.groups.Get(ctx, resourceGroup); err != nil { + c.log.Printf("error getting resource group %s, skipping deletion: %v", resourceGroup, err) + return nil + } + + c.log.Printf("deleting resource group %s", resourceGroup) + if err := c.groups.DeleteAndWait(ctx, resourceGroup); err != nil { + return fmt.Errorf("error deleting resource group: %w", err) + } + + return nil +} + +func (c *Cluster) deleteVnetPeerings(ctx context.Context, resourceGroup string) error { + r, err := azure.ParseResourceID(c.ciParentVnet) + if err == nil { + err = c.ciParentVnetPeerings.DeleteAndWait(ctx, r.ResourceGroup, r.ResourceName, resourceGroup+"-peer") + } + if err != nil { + return fmt.Errorf("error deleting vnet peerings: %w", err) + } + + return nil +} + +func (c *Cluster) deleteDeployment(ctx context.Context, resourceGroup, clusterName string) error { + c.log.Info("deleting deployment") + if err := c.deployments.DeleteAndWait(ctx, resourceGroup, clusterName); err != nil { + return fmt.Errorf("error deleting deployment: %w", err) + } + return nil +} + +func (c *Cluster) deleteVnetResources(ctx context.Context, resourceGroup, vnetName, clusterName string) error { + var errs []error + + c.log.Info("deleting master/worker subnets") + if err := c.subnets.DeleteAndWait(ctx, resourceGroup, vnetName, clusterName+"-master"); err != nil { + c.log.Errorf("error when deleting master subnet: %v", err) + errs = append(errs, err) + } + + if err := c.subnets.DeleteAndWait(ctx, resourceGroup, vnetName, clusterName+"-worker"); err != nil { + c.log.Errorf("error when deleting worker subnet: %v", err) + errs = append(errs, err) + } + + c.log.Info("deleting route table") + if err := c.routetables.DeleteAndWait(ctx, resourceGroup, clusterName+"-rt"); err != nil { + c.log.Errorf("error when deleting route table: %v", err) + errs = append(errs, err) + } + + return errors.Join(errs...) +} + func (c *Cluster) peerSubnetsToCI(ctx context.Context, vnetResourceGroup, clusterName string) error { cluster := fmt.Sprintf("/subscriptions/%s/resourceGroups/%s/providers/Microsoft.Network/virtualNetworks/dev-vnet", c.env.SubscriptionID(), vnetResourceGroup)