From cbd7d531aa1e67b14291e43590c2a7e02d49e7fe Mon Sep 17 00:00:00 2001 From: Anvesh Reddy Pinnapureddy Date: Mon, 2 Sep 2024 14:20:45 +0530 Subject: [PATCH] rename configMap | add snapshotCount to spec.etcd to make the snapshot-count configurable | use proper url formatting for peer and client urls in etcd config --- api/v1alpha1/etcd.go | 4 ++++ api/v1alpha1/etcd_test.go | 18 +++++++++-------- api/v1alpha1/helper.go | 2 +- api/v1alpha1/helper_test.go | 2 +- api/v1alpha1/zz_generated.deepcopy.go | 5 +++++ .../crd-druid.gardener.cloud_etcds.yaml | 5 +++++ .../bases/crd-druid.gardener.cloud_etcds.yaml | 5 +++++ ...m-permanent-quorum-loss-in-etcd-cluster.md | 4 ++-- .../03-scaling-up-an-etcd-cluster.md | 2 +- .../component/configmap/configmap_test.go | 9 ++++----- internal/component/configmap/etcdconfig.go | 20 ++++++++++++------- test/e2e/etcd_backup_test.go | 4 ++-- test/e2e/utils.go | 6 ++++-- test/utils/etcd.go | 2 ++ 14 files changed, 59 insertions(+), 29 deletions(-) diff --git a/api/v1alpha1/etcd.go b/api/v1alpha1/etcd.go index 5b3a2cfed..6bf150d10 100644 --- a/api/v1alpha1/etcd.go +++ b/api/v1alpha1/etcd.go @@ -194,6 +194,10 @@ type EtcdConfig struct { // Quota defines the etcd DB quota. // +optional Quota *resource.Quantity `json:"quota,omitempty"` + // SnapshotCount defines the number of applied Raft entries to hold in-memory before compaction. + // More info: https://etcd.io/docs/v3.4/op-guide/maintenance/#raft-log-retention + // +optional + SnapshotCount *int64 `json:"snapshotCount,omitempty"` // DefragmentationSchedule defines the cron standard schedule for defragmentation of etcd. // +optional DefragmentationSchedule *string `json:"defragmentationSchedule,omitempty"` diff --git a/api/v1alpha1/etcd_test.go b/api/v1alpha1/etcd_test.go index 3d3651b12..adcb2cd3d 100644 --- a/api/v1alpha1/etcd_test.go +++ b/api/v1alpha1/etcd_test.go @@ -118,10 +118,11 @@ func TestIsReconciliationInProgress(t *testing.T) { func createEtcd(name, namespace string) *Etcd { var ( - clientPort int32 = 2379 - serverPort int32 = 2380 - backupPort int32 = 8080 - metricLevel = Basic + clientPort int32 = 2379 + serverPort int32 = 2380 + backupPort int32 = 8080 + metricLevel = Basic + snapshotCount int64 = 75000 ) garbageCollectionPeriod := metav1.Duration{ @@ -238,10 +239,11 @@ func createEtcd(name, namespace string) *Etcd { "memory": resource.MustParse("1000Mi"), }, }, - ClientPort: &clientPort, - ServerPort: &serverPort, - ClientUrlTLS: clientTlsConfig, - PeerUrlTLS: peerTlsConfig, + ClientPort: &clientPort, + ServerPort: &serverPort, + SnapshotCount: &snapshotCount, + ClientUrlTLS: clientTlsConfig, + PeerUrlTLS: peerTlsConfig, }, }, } diff --git a/api/v1alpha1/helper.go b/api/v1alpha1/helper.go index de3dcf04f..3c8fbc095 100644 --- a/api/v1alpha1/helper.go +++ b/api/v1alpha1/helper.go @@ -31,7 +31,7 @@ func GetServiceAccountName(etcdObjMeta metav1.ObjectMeta) string { // GetConfigMapName returns the name of the configmap for the Etcd. func GetConfigMapName(etcdObjMeta metav1.ObjectMeta) string { - return fmt.Sprintf("etcd-bootstrap-%s", string(etcdObjMeta.UID[:6])) + return fmt.Sprintf("%s-config-%s", etcdObjMeta.Name, (etcdObjMeta.UID[:6])) } // GetCompactionJobName returns the compaction job name for the Etcd. diff --git a/api/v1alpha1/helper_test.go b/api/v1alpha1/helper_test.go index cc9776279..a35fbabe7 100644 --- a/api/v1alpha1/helper_test.go +++ b/api/v1alpha1/helper_test.go @@ -54,7 +54,7 @@ func TestGetConfigMapName(t *testing.T) { uid := uuid.NewUUID() etcdObjMeta := createEtcdObjectMetadata(uid, nil, nil, false) configMapName := GetConfigMapName(etcdObjMeta) - g.Expect(configMapName).To(Equal("etcd-bootstrap-" + string(uid[:6]))) + g.Expect(configMapName).To(Equal(etcdObjMeta.Name + "-config-" + string(uid[:6]))) } func TestGetCompactionJobName(t *testing.T) { diff --git a/api/v1alpha1/zz_generated.deepcopy.go b/api/v1alpha1/zz_generated.deepcopy.go index a94f34e03..72bc071d8 100644 --- a/api/v1alpha1/zz_generated.deepcopy.go +++ b/api/v1alpha1/zz_generated.deepcopy.go @@ -235,6 +235,11 @@ func (in *EtcdConfig) DeepCopyInto(out *EtcdConfig) { x := (*in).DeepCopy() *out = &x } + if in.SnapshotCount != nil { + in, out := &in.SnapshotCount, &out.SnapshotCount + *out = new(int64) + **out = **in + } if in.DefragmentationSchedule != nil { in, out := &in.DefragmentationSchedule, &out.DefragmentationSchedule *out = new(string) diff --git a/charts/druid/charts/crds/templates/crd-druid.gardener.cloud_etcds.yaml b/charts/druid/charts/crds/templates/crd-druid.gardener.cloud_etcds.yaml index 49b507783..166df020b 100644 --- a/charts/druid/charts/crds/templates/crd-druid.gardener.cloud_etcds.yaml +++ b/charts/druid/charts/crds/templates/crd-druid.gardener.cloud_etcds.yaml @@ -597,6 +597,11 @@ spec: serverPort: format: int32 type: integer + snapshotCount: + description: 'SnapshotCount defines the number of applied Raft + entries to hold in-memory before compaction. More info: https://etcd.io/docs/v3.4/op-guide/maintenance/#raft-log-retention' + format: int64 + type: integer type: object labels: additionalProperties: diff --git a/config/crd/bases/crd-druid.gardener.cloud_etcds.yaml b/config/crd/bases/crd-druid.gardener.cloud_etcds.yaml index 49b507783..166df020b 100644 --- a/config/crd/bases/crd-druid.gardener.cloud_etcds.yaml +++ b/config/crd/bases/crd-druid.gardener.cloud_etcds.yaml @@ -597,6 +597,11 @@ spec: serverPort: format: int32 type: integer + snapshotCount: + description: 'SnapshotCount defines the number of applied Raft + entries to hold in-memory before compaction. More info: https://etcd.io/docs/v3.4/op-guide/maintenance/#raft-log-retention' + format: int64 + type: integer type: object labels: additionalProperties: diff --git a/docs/operations/recovery-from-permanent-quorum-loss-in-etcd-cluster.md b/docs/operations/recovery-from-permanent-quorum-loss-in-etcd-cluster.md index 4ccc13285..8d94db546 100644 --- a/docs/operations/recovery-from-permanent-quorum-loss-in-etcd-cluster.md +++ b/docs/operations/recovery-from-permanent-quorum-loss-in-etcd-cluster.md @@ -31,7 +31,7 @@ Target the control plane of affected shoot cluster via `kubectl`. Alternatively, Volumes: etcd-config-file: Type: ConfigMap (a volume populated by a ConfigMap) - Name: etcd-bootstrap-4785b0 + Name: etcd-main-config-4785b0 Optional: false ``` @@ -68,7 +68,7 @@ Target the control plane of affected shoot cluster via `kubectl`. Alternatively, Delete all `etcd-main` member leases. -6. Edit the `etcd-main` cluster's configmap (ex: `etcd-bootstrap-4785b0`) as follows: +6. Edit the `etcd-main` cluster's configmap (ex: `etcd-main-config-4785b0`) as follows: Find the `initial-cluster` field in the configmap. It should look similar to the following: ``` diff --git a/docs/proposals/03-scaling-up-an-etcd-cluster.md b/docs/proposals/03-scaling-up-an-etcd-cluster.md index 998e15108..158245527 100644 --- a/docs/proposals/03-scaling-up-an-etcd-cluster.md +++ b/docs/proposals/03-scaling-up-an-etcd-cluster.md @@ -24,7 +24,7 @@ Now, it is detected whether peer URL was TLS enabled or not for single node etcd - If peer URL was not TLS enabled then etcd-druid has to intervene and make sure peer URL should be TLS enabled first for the single node before marking the cluster for scale-up. ## Action taken by etcd-druid to enable the peerURL TLS -1. Etcd-druid will update the `etcd-bootstrap` config-map with new config like initial-cluster,initial-advertise-peer-urls etc. Backup-restore will detect this change and update the member lease annotation to `member.etcd.gardener.cloud/tls-enabled: "true"`. +1. Etcd-druid will update the `{etcd.Name}-config` config-map with new config like initial-cluster,initial-advertise-peer-urls etc. Backup-restore will detect this change and update the member lease annotation to `member.etcd.gardener.cloud/tls-enabled: "true"`. 2. In case the peer URL TLS has been changed to `enabled`: Etcd-druid will add tasks to the deployment flow: - Check if peer TLS has been enabled for existing StatefulSet pods, by checking the member leases for the annotation `member.etcd.gardener.cloud/tls-enabled`. - If peer TLS enablement is pending for any of the members, then check and patch the StatefulSet with the peer TLS volume mounts, if not already patched. This will cause a rolling update of the existing StatefulSet pods, which allows etcd-backup-restore to update the member peer URL in the etcd cluster. diff --git a/internal/component/configmap/configmap_test.go b/internal/component/configmap/configmap_test.go index f3775c994..1101b8888 100644 --- a/internal/component/configmap/configmap_test.go +++ b/internal/component/configmap/configmap_test.go @@ -7,7 +7,6 @@ package configmap import ( "context" "fmt" - "strconv" "testing" druidv1alpha1 "github.com/gardener/etcd-druid/api/v1alpha1" @@ -344,7 +343,7 @@ func matchConfigMap(g *WithT, etcd *druidv1alpha1.Etcd, actualConfigMap corev1.C "name": Equal(fmt.Sprintf("etcd-%s", etcd.UID[:6])), "data-dir": Equal(fmt.Sprintf("%s/new.etcd", common.VolumeMountPathEtcdData)), "metrics": Equal(string(druidv1alpha1.Basic)), - "snapshot-count": Equal(int64(75000)), + "snapshot-count": Equal(ptr.Deref(etcd.Spec.Etcd.SnapshotCount, defaultSnapshotCount)), "enable-v2": Equal(false), "quota-backend-bytes": Equal(etcd.Spec.Etcd.Quota.Value()), "initial-cluster-token": Equal("etcd-cluster"), @@ -360,7 +359,7 @@ func matchClientTLSRelatedConfiguration(g *WithT, etcd *druidv1alpha1.Etcd, actu if etcd.Spec.Etcd.ClientUrlTLS != nil { g.Expect(actualETCDConfig).To(MatchKeys(IgnoreExtras|IgnoreMissing, Keys{ "listen-client-urls": Equal(fmt.Sprintf("https://0.0.0.0:%d", ptr.Deref(etcd.Spec.Etcd.ClientPort, common.DefaultPortEtcdClient))), - "advertise-client-urls": Equal(fmt.Sprintf("https@%s@%s@%d", druidv1alpha1.GetPeerServiceName(etcd.ObjectMeta), etcd.Namespace, ptr.Deref(etcd.Spec.Etcd.ClientPort, common.DefaultPortEtcdClient))), + "advertise-client-urls": Equal(fmt.Sprintf("https://%s.%s:%d", druidv1alpha1.GetPeerServiceName(etcd.ObjectMeta), etcd.Namespace, ptr.Deref(etcd.Spec.Etcd.ClientPort, common.DefaultPortEtcdClient))), "client-transport-security": MatchKeys(IgnoreExtras, Keys{ "cert-file": Equal("/var/etcd/ssl/server/tls.crt"), "key-file": Equal("/var/etcd/ssl/server/tls.key"), @@ -389,12 +388,12 @@ func matchPeerTLSRelatedConfiguration(g *WithT, etcd *druidv1alpha1.Etcd, actual "auto-tls": Equal(false), }), "listen-peer-urls": Equal(fmt.Sprintf("https://0.0.0.0:%d", ptr.Deref(etcd.Spec.Etcd.ServerPort, common.DefaultPortEtcdPeer))), - "initial-advertise-peer-urls": Equal(fmt.Sprintf("https@%s@%s@%s", peerSvcName, etcd.Namespace, strconv.Itoa(int(ptr.Deref(etcd.Spec.Etcd.ServerPort, common.DefaultPortEtcdPeer))))), + "initial-advertise-peer-urls": Equal(fmt.Sprintf("https://%s.%s:%d", peerSvcName, etcd.Namespace, ptr.Deref(etcd.Spec.Etcd.ServerPort, common.DefaultPortEtcdPeer))), })) } else { g.Expect(actualETCDConfig).To(MatchKeys(IgnoreExtras|IgnoreMissing, Keys{ "listen-peer-urls": Equal(fmt.Sprintf("http://0.0.0.0:%d", ptr.Deref(etcd.Spec.Etcd.ServerPort, common.DefaultPortEtcdPeer))), - "initial-advertise-peer-urls": Equal(fmt.Sprintf("http@%s@%s@%s", peerSvcName, etcd.Namespace, strconv.Itoa(int(ptr.Deref(etcd.Spec.Etcd.ServerPort, common.DefaultPortEtcdPeer))))), + "initial-advertise-peer-urls": Equal(fmt.Sprintf("http://%s.%s:%d", peerSvcName, etcd.Namespace, ptr.Deref(etcd.Spec.Etcd.ServerPort, common.DefaultPortEtcdPeer))), })) g.Expect(actualETCDConfig).ToNot(HaveKey("peer-transport-security")) } diff --git a/internal/component/configmap/etcdconfig.go b/internal/component/configmap/etcdconfig.go index de9c8409a..faeaea108 100644 --- a/internal/component/configmap/etcdconfig.go +++ b/internal/component/configmap/etcdconfig.go @@ -22,9 +22,7 @@ const ( defaultInitialClusterToken = "etcd-cluster" defaultInitialClusterState = "new" // For more information refer to https://etcd.io/docs/v3.4/op-guide/maintenance/#raft-log-retention - // TODO: Ideally this should be made configurable via Etcd resource as this has a direct impact on the memory requirements for etcd container. - // which in turn is influenced by the size of objects that are getting stored in etcd. - defaultSnapshotCount = 75000 + defaultSnapshotCount = int64(75000) ) var ( @@ -42,7 +40,7 @@ type etcdConfig struct { Name string `yaml:"name"` DataDir string `yaml:"data-dir"` Metrics druidv1alpha1.MetricsLevel `yaml:"metrics"` - SnapshotCount int `yaml:"snapshot-count"` + SnapshotCount int64 `yaml:"snapshot-count"` EnableV2 bool `yaml:"enable-v2"` QuotaBackendBytes int64 `yaml:"quota-backend-bytes"` InitialClusterToken string `yaml:"initial-cluster-token"` @@ -74,7 +72,7 @@ func createEtcdConfig(etcd *druidv1alpha1.Etcd) *etcdConfig { Name: fmt.Sprintf("etcd-%s", etcd.UID[:6]), DataDir: defaultDataDir, Metrics: ptr.Deref(etcd.Spec.Etcd.Metrics, druidv1alpha1.Basic), - SnapshotCount: defaultSnapshotCount, + SnapshotCount: getSnapshotCount(etcd), EnableV2: false, QuotaBackendBytes: getDBQuotaBytes(etcd), InitialClusterToken: defaultInitialClusterToken, @@ -84,8 +82,8 @@ func createEtcdConfig(etcd *druidv1alpha1.Etcd) *etcdConfig { AutoCompactionRetention: ptr.Deref(etcd.Spec.Common.AutoCompactionRetention, defaultAutoCompactionRetention), ListenPeerUrls: fmt.Sprintf("%s://0.0.0.0:%d", peerScheme, ptr.Deref(etcd.Spec.Etcd.ServerPort, common.DefaultPortEtcdPeer)), ListenClientUrls: fmt.Sprintf("%s://0.0.0.0:%d", clientScheme, ptr.Deref(etcd.Spec.Etcd.ClientPort, common.DefaultPortEtcdClient)), - AdvertisePeerUrls: fmt.Sprintf("%s@%s@%s@%d", peerScheme, peerSvcName, etcd.Namespace, ptr.Deref(etcd.Spec.Etcd.ServerPort, common.DefaultPortEtcdPeer)), - AdvertiseClientUrls: fmt.Sprintf("%s@%s@%s@%d", clientScheme, peerSvcName, etcd.Namespace, ptr.Deref(etcd.Spec.Etcd.ClientPort, common.DefaultPortEtcdClient)), + AdvertisePeerUrls: fmt.Sprintf("%s://%s.%s:%d", peerScheme, peerSvcName, etcd.Namespace, ptr.Deref(etcd.Spec.Etcd.ServerPort, common.DefaultPortEtcdPeer)), + AdvertiseClientUrls: fmt.Sprintf("%s://%s.%s:%d", clientScheme, peerSvcName, etcd.Namespace, ptr.Deref(etcd.Spec.Etcd.ClientPort, common.DefaultPortEtcdClient)), } if peerSecurityConfig != nil { cfg.PeerSecurity = *peerSecurityConfig @@ -97,6 +95,14 @@ func createEtcdConfig(etcd *druidv1alpha1.Etcd) *etcdConfig { return cfg } +func getSnapshotCount(etcd *druidv1alpha1.Etcd) int64 { + snapshotCount := defaultSnapshotCount + if etcd.Spec.Etcd.SnapshotCount != nil { + snapshotCount = *etcd.Spec.Etcd.SnapshotCount + } + return snapshotCount +} + func getDBQuotaBytes(etcd *druidv1alpha1.Etcd) int64 { dbQuotaBytes := defaultDBQuotaBytes if etcd.Spec.Etcd.Quota != nil { diff --git a/test/e2e/etcd_backup_test.go b/test/e2e/etcd_backup_test.go index a4a2727bf..9dd840c2e 100644 --- a/test/e2e/etcd_backup_test.go +++ b/test/e2e/etcd_backup_test.go @@ -247,7 +247,7 @@ func checkEtcdReady(ctx context.Context, cl client.Client, logger logr.Logger, e logger.Info("Checking configmap") cm := &corev1.ConfigMap{} - ExpectWithOffset(2, cl.Get(ctx, client.ObjectKey{Name: "etcd-bootstrap-" + string(etcd.UID[:6]), Namespace: etcd.Namespace}, cm)).To(Succeed()) + ExpectWithOffset(2, cl.Get(ctx, client.ObjectKey{Name: etcd.Name + "-config-" + string(etcd.UID[:6]), Namespace: etcd.Namespace}, cm)).To(Succeed()) logger.Info("Checking client service") svc := &corev1.Service{} @@ -280,7 +280,7 @@ func deleteAndCheckEtcd(ctx context.Context, cl client.Client, logger logr.Logge ExpectWithOffset(1, cl.Get( ctx, - client.ObjectKey{Name: "etcd-bootstrap-" + string(etcd.UID[:6]), Namespace: etcd.Namespace}, + client.ObjectKey{Name: etcd.Name + "-config-" + string(etcd.UID[:6]), Namespace: etcd.Namespace}, &corev1.ConfigMap{}, ), ).Should(matchers.BeNotFoundError()) diff --git a/test/e2e/utils.go b/test/e2e/utils.go index c7a06315f..ddc7883d7 100644 --- a/test/e2e/utils.go +++ b/test/e2e/utils.go @@ -105,8 +105,9 @@ var ( "memory": resource.MustParse("256Mi"), }, } - etcdClientPort = int32(2379) - etcdServerPort = int32(2380) + etcdClientPort = int32(2379) + etcdServerPort = int32(2380) + etcdSnapshotCount = int64(75000) backupPort = int32(8080) backupFullSnapshotSchedule = "0 */1 * * *" @@ -182,6 +183,7 @@ func getDefaultEtcd(name, namespace, container, prefix string, provider TestProv Resources: &etcdResources, ClientPort: &etcdClientPort, ServerPort: &etcdServerPort, + SnapshotCount: &etcdSnapshotCount, ClientUrlTLS: &etcdTLS, } diff --git a/test/utils/etcd.go b/test/utils/etcd.go index fbfd2a752..b2c2658f6 100644 --- a/test/utils/etcd.go +++ b/test/utils/etcd.go @@ -40,6 +40,7 @@ var ( deltaSnapShotMemLimit = resource.MustParse("100Mi") autoCompactionMode = druidv1alpha1.Periodic autoCompactionRetention = "2m" + snapshotCount = int64(75000) quota = resource.MustParse("8Gi") localProvider = druidv1alpha1.StorageProvider("Local") prefix = "/tmp" @@ -387,6 +388,7 @@ func getDefaultEtcd(name, namespace string) *druidv1alpha1.Etcd { Backup: getBackupSpec(), Etcd: druidv1alpha1.EtcdConfig{ Quota: "a, + SnapshotCount: &snapshotCount, Metrics: &metricsBasic, Image: &imageEtcd, DefragmentationSchedule: &defragSchedule,