From dc3447fdc4a8d4751db7209bce2a489e63a81184 Mon Sep 17 00:00:00 2001 From: yaroslavborbat Date: Mon, 2 Sep 2024 11:14:27 +0300 Subject: [PATCH 1/3] add patch Signed-off-by: yaroslavborbat --- .../patches/014-delete-service-monitor.patch | 569 ++++++++++++++++++ images/cdi-artifact/patches/README.md | 4 + 2 files changed, 573 insertions(+) create mode 100644 images/cdi-artifact/patches/014-delete-service-monitor.patch diff --git a/images/cdi-artifact/patches/014-delete-service-monitor.patch b/images/cdi-artifact/patches/014-delete-service-monitor.patch new file mode 100644 index 000000000..95368e703 --- /dev/null +++ b/images/cdi-artifact/patches/014-delete-service-monitor.patch @@ -0,0 +1,569 @@ +diff --git a/cmd/cdi-operator/operator.go b/cmd/cdi-operator/operator.go +index 211f8cf70..4d166b992 100644 +--- a/cmd/cdi-operator/operator.go ++++ b/cmd/cdi-operator/operator.go +@@ -26,7 +26,6 @@ import ( + ocpconfigv1 "github.com/openshift/api/config/v1" + routev1 "github.com/openshift/api/route/v1" + secv1 "github.com/openshift/api/security/v1" +- promv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1" + "go.uber.org/zap/zapcore" + extv1 "k8s.io/apiextensions-apiserver/pkg/apis/apiextensions/v1" + apiruntime "k8s.io/apimachinery/pkg/runtime" +@@ -38,6 +37,7 @@ import ( + "sigs.k8s.io/controller-runtime/pkg/manager/signals" + + cdiv1 "kubevirt.io/containerized-data-importer-api/pkg/apis/core/v1beta1" ++ + "kubevirt.io/containerized-data-importer/pkg/operator/controller" + "kubevirt.io/containerized-data-importer/pkg/util" + ) +@@ -117,11 +117,6 @@ func main() { + os.Exit(1) + } + +- if err := promv1.AddToScheme(mgr.GetScheme()); err != nil { +- log.Error(err, "") +- os.Exit(1) +- } +- + if err := secv1.Install(mgr.GetScheme()); err != nil { + log.Error(err, "") + os.Exit(1) +diff --git a/pkg/operator/controller/callbacks.go b/pkg/operator/controller/callbacks.go +index fe40da5e4..7d97c3b8d 100644 +--- a/pkg/operator/controller/callbacks.go ++++ b/pkg/operator/controller/callbacks.go +@@ -37,10 +37,11 @@ import ( + sdk "kubevirt.io/controller-lifecycle-operator-sdk/pkg/sdk" + + cdiv1 "kubevirt.io/containerized-data-importer-api/pkg/apis/core/v1beta1" ++ "kubevirt.io/controller-lifecycle-operator-sdk/pkg/sdk/callbacks" ++ + "kubevirt.io/containerized-data-importer/pkg/common" + cdicontroller "kubevirt.io/containerized-data-importer/pkg/controller" + cc "kubevirt.io/containerized-data-importer/pkg/controller/common" +- "kubevirt.io/controller-lifecycle-operator-sdk/pkg/sdk/callbacks" + ) + + func addReconcileCallbacks(r *ReconcileCDI) { +@@ -49,7 +50,6 @@ func addReconcileCallbacks(r *ReconcileCDI) { + r.reconciler.AddCallback(&corev1.ServiceAccount{}, reconcileServiceAccounts) + r.reconciler.AddCallback(&corev1.ServiceAccount{}, reconcileSCC) + r.reconciler.AddCallback(&appsv1.Deployment{}, reconcileCreateRoute) +- r.reconciler.AddCallback(&appsv1.Deployment{}, reconcileCreatePrometheusInfra) + r.reconciler.AddCallback(&appsv1.Deployment{}, reconcileRemainingRelationshipLabels) + r.reconciler.AddCallback(&appsv1.Deployment{}, reconcileDeleteSecrets) + r.reconciler.AddCallback(&appsv1.Deployment{}, reconcileCDICRD) +@@ -147,36 +147,6 @@ func reconcileSCC(args *callbacks.ReconcileCallbackArgs) error { + return nil + } + +-func reconcileCreatePrometheusInfra(args *callbacks.ReconcileCallbackArgs) error { +- if args.State != callbacks.ReconcileStatePostRead { +- return nil +- } +- +- deployment := args.CurrentObject.(*appsv1.Deployment) +- // we don't check sdk.CheckDeploymentReady(deployment) since we want Prometheus to cover NotReady state as well +- if !isControllerDeployment(deployment) { +- return nil +- } +- +- cr := args.Resource.(runtime.Object) +- namespace := deployment.GetNamespace() +- if namespace == "" { +- return fmt.Errorf("cluster scoped owner not supported") +- } +- +- if deployed, err := isPrometheusDeployed(args.Logger, args.Client, namespace); err != nil { +- return err +- } else if !deployed { +- return nil +- } +- if err := ensurePrometheusResourcesExist(context.TODO(), args.Client, args.Scheme, deployment); err != nil { +- args.Recorder.Event(cr, corev1.EventTypeWarning, createResourceFailed, fmt.Sprintf("Failed to ensure prometheus resources exists, %v", err)) +- return err +- } +- +- return nil +-} +- + func deleteWorkerResources(l logr.Logger, c client.Client) error { + listTypes := []client.ObjectList{&corev1.PodList{}, &corev1.ServiceList{}} + +diff --git a/pkg/operator/controller/prometheus.go b/pkg/operator/controller/prometheus.go +deleted file mode 100644 +index 6f41df60d..000000000 +--- a/pkg/operator/controller/prometheus.go ++++ /dev/null +@@ -1,454 +0,0 @@ +-/* +-Copyright 2018 The CDI Authors. +- +-Licensed under the Apache License, Version 2.0 (the "License"); +-you may not use this file except in compliance with the License. +-You may obtain a copy of the License at +- +- http://www.apache.org/licenses/LICENSE-2.0 +- +-Unless required by applicable law or agreed to in writing, software +-distributed under the License is distributed on an "AS IS" BASIS, +-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +-See the License for the specific language governing permissions and +-limitations under the License. +-*/ +- +-package controller +- +-import ( +- "context" +- "errors" +- "fmt" +- "os" +- "reflect" +- "strings" +- +- "github.com/go-logr/logr" +- promv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1" +- rbacv1 "k8s.io/api/rbac/v1" +- k8serrors "k8s.io/apimachinery/pkg/api/errors" +- "k8s.io/apimachinery/pkg/api/meta" +- metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" +- "k8s.io/apimachinery/pkg/runtime" +- "k8s.io/apimachinery/pkg/util/intstr" +- "sigs.k8s.io/controller-runtime/pkg/client" +- "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil" +- "sigs.k8s.io/controller-runtime/pkg/source" +- +- "kubevirt.io/containerized-data-importer/pkg/common" +- cc "kubevirt.io/containerized-data-importer/pkg/controller/common" +- "kubevirt.io/containerized-data-importer/pkg/monitoring" +- cdinamespaced "kubevirt.io/containerized-data-importer/pkg/operator/resources/namespaced" +- "kubevirt.io/containerized-data-importer/pkg/util" +- +- sdk "kubevirt.io/controller-lifecycle-operator-sdk/pkg/sdk" +-) +- +-const ( +- ruleName = "prometheus-cdi-rules" +- rbacName = "cdi-monitoring" +- monitorName = "service-monitor-cdi" +- defaultMonitoringNs = "monitoring" +- defaultRunbookURLTemplate = "https://kubevirt.io/monitoring/runbooks/%s" +- runbookURLTemplateEnv = "RUNBOOK_URL_TEMPLATE" +- severityAlertLabelKey = "severity" +- healthImpactAlertLabelKey = "operator_health_impact" +- partOfAlertLabelKey = "kubernetes_operator_part_of" +- partOfAlertLabelValue = "kubevirt" +- componentAlertLabelKey = "kubernetes_operator_component" +- componentAlertLabelValue = common.CDILabelValue +-) +- +-func ensurePrometheusResourcesExist(ctx context.Context, c client.Client, scheme *runtime.Scheme, owner metav1.Object) error { +- namespace := owner.GetNamespace() +- +- cr, err := cc.GetActiveCDI(ctx, c) +- if err != nil { +- return err +- } +- if cr == nil { +- return fmt.Errorf("no active CDI") +- } +- installerLabels := util.GetRecommendedInstallerLabelsFromCr(cr) +- +- prometheusResources := []client.Object{ +- newPrometheusRule(namespace), +- newPrometheusServiceMonitor(namespace), +- newPrometheusRole(namespace), +- newPrometheusRoleBinding(namespace), +- } +- +- for _, desired := range prometheusResources { +- if err := sdk.SetLastAppliedConfiguration(desired, LastAppliedConfigAnnotation); err != nil { +- return err +- } +- util.SetRecommendedLabels(desired, installerLabels, "cdi-operator") +- if err := controllerutil.SetControllerReference(owner, desired, scheme); err != nil { +- return err +- } +- +- if err := c.Create(ctx, desired); err != nil { +- if k8serrors.IsAlreadyExists(err) { +- current := sdk.NewDefaultInstance(desired) +- nn := client.ObjectKeyFromObject(desired) +- if err := c.Get(ctx, nn, current); err != nil { +- return err +- } +- current, err = sdk.StripStatusFromObject(current) +- if err != nil { +- return err +- } +- currentObjCopy := current.DeepCopyObject() +- sdk.MergeLabelsAndAnnotations(desired, current) +- merged, err := sdk.MergeObject(desired, current, LastAppliedConfigAnnotation) +- if err != nil { +- return err +- } +- if !reflect.DeepEqual(currentObjCopy, merged) { +- if err := c.Update(ctx, merged); err != nil { +- return err +- } +- } +- } else { +- return err +- } +- } +- } +- +- return nil +-} +- +-func isPrometheusDeployed(logger logr.Logger, c client.Client, namespace string) (bool, error) { +- rule := &promv1.PrometheusRule{} +- key := client.ObjectKey{Namespace: namespace, Name: ruleName} +- if err := c.Get(context.TODO(), key, rule); err != nil { +- if meta.IsNoMatchError(err) { +- logger.V(3).Info("No match error for PrometheusRule, must not have prometheus deployed") +- return false, nil +- } else if !k8serrors.IsNotFound(err) { +- return false, err +- } +- } +- +- return true, nil +-} +- +-func getRecordRules(namespace string) []promv1.Rule { +- var recordRules []promv1.Rule +- +- for _, rrd := range monitoring.GetRecordRulesDesc(namespace) { +- recordRules = append(recordRules, generateRecordRule(rrd.Opts.Name, rrd.Expr)) +- } +- +- return recordRules +-} +- +-func getAlertRules(runbookURLTemplate string) []promv1.Rule { +- return []promv1.Rule{ +- generateAlertRule( +- "CDIOperatorDown", +- "kubevirt_cdi_operator_up == 0", +- promv1.Duration("5m"), +- map[string]string{ +- "summary": "CDI operator is down", +- "runbook_url": fmt.Sprintf(runbookURLTemplate, "CDIOperatorDown"), +- }, +- map[string]string{ +- severityAlertLabelKey: "warning", +- healthImpactAlertLabelKey: "critical", +- partOfAlertLabelKey: partOfAlertLabelValue, +- componentAlertLabelKey: componentAlertLabelValue, +- }, +- ), +- generateAlertRule( +- "CDINotReady", +- "kubevirt_cdi_cr_ready == 0", +- promv1.Duration("5m"), +- map[string]string{ +- "summary": "CDI is not available to use", +- "runbook_url": fmt.Sprintf(runbookURLTemplate, "CDINotReady"), +- }, +- map[string]string{ +- severityAlertLabelKey: "warning", +- healthImpactAlertLabelKey: "critical", +- partOfAlertLabelKey: partOfAlertLabelValue, +- componentAlertLabelKey: componentAlertLabelValue, +- }, +- ), +- generateAlertRule( +- "CDIDataVolumeUnusualRestartCount", +- `kubevirt_cdi_import_pods_high_restart > 0 or +- kubevirt_cdi_upload_pods_high_restart > 0 or +- kubevirt_cdi_clone_pods_high_restart > 0`, +- promv1.Duration("5m"), +- map[string]string{ +- "summary": "Some CDI population workloads have an unusual restart count, meaning they are probably failing and need to be investigated", +- "runbook_url": fmt.Sprintf(runbookURLTemplate, "CDIDataVolumeUnusualRestartCount"), +- }, +- map[string]string{ +- severityAlertLabelKey: "warning", +- healthImpactAlertLabelKey: "warning", +- partOfAlertLabelKey: partOfAlertLabelValue, +- componentAlertLabelKey: componentAlertLabelValue, +- }, +- ), +- generateAlertRule( +- "CDIStorageProfilesIncomplete", +- `sum by(storageclass,provisioner) ((kubevirt_cdi_storageprofile_info{complete="false"}>0))`, +- promv1.Duration("5m"), +- map[string]string{ +- "summary": "Incomplete StorageProfile {{ $labels.storageclass }}, accessMode/volumeMode cannot be inferred by CDI for PVC population request", +- "runbook_url": fmt.Sprintf(runbookURLTemplate, "CDIStorageProfilesIncomplete"), +- }, +- map[string]string{ +- severityAlertLabelKey: "info", +- healthImpactAlertLabelKey: "warning", +- partOfAlertLabelKey: partOfAlertLabelValue, +- componentAlertLabelKey: componentAlertLabelValue, +- }, +- ), +- generateAlertRule( +- "CDIDataImportCronOutdated", +- `sum by(ns,cron_name) (kubevirt_cdi_dataimportcron_outdated) > 0`, +- promv1.Duration("15m"), +- map[string]string{ +- "summary": "DataImportCron (recurring polling of VM templates disk image sources, also known as golden images) PVCs are not being updated on the defined schedule", +- "runbook_url": fmt.Sprintf(runbookURLTemplate, "CDIDataImportCronOutdated"), +- }, +- map[string]string{ +- severityAlertLabelKey: "info", +- healthImpactAlertLabelKey: "warning", +- partOfAlertLabelKey: partOfAlertLabelValue, +- componentAlertLabelKey: componentAlertLabelValue, +- }, +- ), +- generateAlertRule( +- "CDINoDefaultStorageClass", +- `sum(kubevirt_cdi_storageprofile_info{default="true"} or on() vector(0)) + +- sum(kubevirt_cdi_storageprofile_info{virtdefault="true"} or on() vector(0)) + +- (count(kubevirt_cdi_datavolume_pending == 0) or on() vector(0)) == 0`, +- promv1.Duration("5m"), +- map[string]string{ +- "summary": "No default StorageClass or virtualization StorageClass, and a DataVolume is pending for one", +- "runbook_url": fmt.Sprintf(runbookURLTemplate, "CDINoDefaultStorageClass"), +- }, +- map[string]string{ +- severityAlertLabelKey: "warning", +- healthImpactAlertLabelKey: "none", +- partOfAlertLabelKey: partOfAlertLabelValue, +- componentAlertLabelKey: componentAlertLabelValue, +- }, +- ), +- generateAlertRule( +- "CDIMultipleDefaultVirtStorageClasses", +- `sum(kubevirt_cdi_storageprofile_info{virtdefault="true"} or on() vector(0)) > 1`, +- promv1.Duration("5m"), +- map[string]string{ +- "summary": "More than one default virtualization StorageClass detected", +- "runbook_url": fmt.Sprintf(runbookURLTemplate, "CDIMultipleDefaultVirtStorageClasses"), +- }, +- map[string]string{ +- severityAlertLabelKey: "warning", +- healthImpactAlertLabelKey: "none", +- partOfAlertLabelKey: partOfAlertLabelValue, +- componentAlertLabelKey: componentAlertLabelValue, +- }, +- ), +- generateAlertRule( +- "CDIDefaultStorageClassDegraded", +- `sum(kubevirt_cdi_storageprofile_info{default="true",rwx="true",smartclone="true"} or on() vector(0)) + +- sum(kubevirt_cdi_storageprofile_info{virtdefault="true",rwx="true",smartclone="true"} or on() vector(0)) == 0`, +- promv1.Duration("5m"), +- map[string]string{ +- "summary": "Default storage class has no smart clone or ReadWriteMany", +- "runbook_url": fmt.Sprintf(runbookURLTemplate, "CDIDefaultStorageClassDegraded"), +- }, +- map[string]string{ +- severityAlertLabelKey: "warning", +- healthImpactAlertLabelKey: "none", +- partOfAlertLabelKey: partOfAlertLabelValue, +- componentAlertLabelKey: componentAlertLabelValue, +- }, +- ), +- } +-} +- +-func newPrometheusRule(namespace string) *promv1.PrometheusRule { +- runbookURLTemplate := getRunbookURLTemplate() +- +- return &promv1.PrometheusRule{ +- ObjectMeta: metav1.ObjectMeta{ +- Name: ruleName, +- Namespace: namespace, +- Labels: map[string]string{ +- common.CDIComponentLabel: "", +- common.PrometheusLabelKey: common.PrometheusLabelValue, +- }, +- }, +- Spec: promv1.PrometheusRuleSpec{ +- Groups: []promv1.RuleGroup{ +- { +- Name: "cdi.rules", +- Rules: append(getRecordRules(namespace), getAlertRules(runbookURLTemplate)...), +- }, +- }, +- }, +- } +-} +- +-func newPrometheusRole(namespace string) *rbacv1.Role { +- return &rbacv1.Role{ +- ObjectMeta: metav1.ObjectMeta{ +- Name: rbacName, +- Namespace: namespace, +- Labels: map[string]string{ +- common.CDIComponentLabel: "", +- common.PrometheusLabelKey: common.PrometheusLabelValue, +- }, +- }, +- Rules: cdinamespaced.GetPrometheusNamespacedRules(), +- } +-} +- +-func newPrometheusRoleBinding(namespace string) *rbacv1.RoleBinding { +- monitoringNamespace := getMonitoringNamespace() +- +- return &rbacv1.RoleBinding{ +- ObjectMeta: metav1.ObjectMeta{ +- Name: rbacName, +- Namespace: namespace, +- Labels: map[string]string{ +- common.CDIComponentLabel: "", +- common.PrometheusLabelKey: common.PrometheusLabelValue, +- }, +- }, +- RoleRef: rbacv1.RoleRef{ +- APIGroup: "rbac.authorization.k8s.io", +- Kind: "Role", +- Name: rbacName, +- }, +- Subjects: []rbacv1.Subject{ +- { +- Kind: "ServiceAccount", +- Namespace: monitoringNamespace, +- Name: "prometheus-k8s", +- }, +- }, +- } +-} +- +-func getMonitoringNamespace() string { +- if ns := os.Getenv("MONITORING_NAMESPACE"); ns != "" { +- return ns +- } +- +- return defaultMonitoringNs +-} +- +-func newPrometheusServiceMonitor(namespace string) *promv1.ServiceMonitor { +- return &promv1.ServiceMonitor{ +- ObjectMeta: metav1.ObjectMeta{ +- Namespace: namespace, +- Name: monitorName, +- Labels: map[string]string{ +- common.CDIComponentLabel: "", +- "openshift.io/cluster-monitoring": "", +- common.PrometheusLabelKey: common.PrometheusLabelValue, +- }, +- }, +- Spec: promv1.ServiceMonitorSpec{ +- Selector: metav1.LabelSelector{ +- MatchLabels: map[string]string{ +- common.PrometheusLabelKey: common.PrometheusLabelValue, +- }, +- }, +- NamespaceSelector: promv1.NamespaceSelector{ +- MatchNames: []string{namespace}, +- }, +- Endpoints: []promv1.Endpoint{ +- { +- Port: "metrics", +- Scheme: "http", +- TLSConfig: &promv1.TLSConfig{ +- SafeTLSConfig: promv1.SafeTLSConfig{ +- InsecureSkipVerify: true, +- }, +- }, +- }, +- }, +- }, +- } +-} +- +-func generateAlertRule(alert, expr string, duration promv1.Duration, annotations, labels map[string]string) promv1.Rule { +- return promv1.Rule{ +- Alert: alert, +- Expr: intstr.FromString(expr), +- For: &duration, +- Annotations: annotations, +- Labels: labels, +- } +-} +- +-func generateRecordRule(record, expr string) promv1.Rule { +- return promv1.Rule{ +- Record: record, +- Expr: intstr.FromString(expr), +- } +-} +- +-func (r *ReconcileCDI) watchPrometheusResources() error { +- listObjs := []client.ObjectList{ +- &promv1.PrometheusRuleList{}, +- &promv1.ServiceMonitorList{}, +- } +- +- objs := []client.Object{ +- &promv1.PrometheusRule{}, +- &promv1.ServiceMonitor{}, +- } +- +- for i, listObj := range listObjs { +- obj := objs[i] +- err := r.uncachedClient.List(context.TODO(), listObj, &client.ListOptions{ +- Namespace: util.GetNamespace(), +- Limit: 1, +- }) +- if err == nil { +- if err := r.controller.Watch(&source.Kind{Type: obj}, enqueueCDI(r.client)); err != nil { +- return err +- } +- } else if meta.IsNoMatchError(err) { +- log.Info("Not watching", "type", fmt.Sprintf("%T", obj)) +- } else { +- return err +- } +- } +- +- objs = []client.Object{ +- &rbacv1.Role{}, +- &rbacv1.RoleBinding{}, +- } +- +- for _, obj := range objs { +- if err := r.controller.Watch(&source.Kind{Type: obj}, enqueueCDI(r.client)); err != nil { +- return err +- } +- } +- +- return nil +-} +- +-func getRunbookURLTemplate() string { +- runbookURLTemplate, exists := os.LookupEnv(runbookURLTemplateEnv) +- if !exists { +- runbookURLTemplate = defaultRunbookURLTemplate +- } +- +- if strings.Count(runbookURLTemplate, "%s") != 1 { +- panic(errors.New("runbook URL template must have exactly 1 %s substring")) +- } +- +- return runbookURLTemplate +-} +diff --git a/pkg/operator/controller/reconciler-hooks.go b/pkg/operator/controller/reconciler-hooks.go +index 6aafe14d6..fc55bff35 100644 +--- a/pkg/operator/controller/reconciler-hooks.go ++++ b/pkg/operator/controller/reconciler-hooks.go +@@ -26,10 +26,6 @@ func (r *ReconcileCDI) watch() error { + return err + } + +- if err := r.watchPrometheusResources(); err != nil { +- return err +- } +- + if err := r.watchRoutes(); err != nil { + return err + } diff --git a/images/cdi-artifact/patches/README.md b/images/cdi-artifact/patches/README.md index 64acd08e3..81c66bd40 100644 --- a/images/cdi-artifact/patches/README.md +++ b/images/cdi-artifact/patches/README.md @@ -34,3 +34,7 @@ Do not manage DataVolume CRD with cdi-operator. Module will install this CRD usi #### `011-change-storage-class-for-scratch-pvc.patch` Set the storage class name for the scratch pvc from the original pvc that will own the scratch pvc, or set it to an empty value if not available. + +#### `014-delete-service-monitor.patch` + +Removed the creation of a service monitor from the cdi-operator. From 250aa10c2af7417f05a93dc3f97104d588b13701 Mon Sep 17 00:00:00 2001 From: yaroslavborbat Date: Wed, 4 Sep 2024 12:16:59 +0300 Subject: [PATCH 2/3] fix Signed-off-by: yaroslavborbat --- .../patches/014-delete-service-monitor.patch | 51 +++---------------- 1 file changed, 8 insertions(+), 43 deletions(-) diff --git a/images/cdi-artifact/patches/014-delete-service-monitor.patch b/images/cdi-artifact/patches/014-delete-service-monitor.patch index 95368e703..86d08c239 100644 --- a/images/cdi-artifact/patches/014-delete-service-monitor.patch +++ b/images/cdi-artifact/patches/014-delete-service-monitor.patch @@ -94,10 +94,10 @@ index fe40da5e4..7d97c3b8d 100644 diff --git a/pkg/operator/controller/prometheus.go b/pkg/operator/controller/prometheus.go deleted file mode 100644 -index 6f41df60d..000000000 +index b5a8be322..000000000 --- a/pkg/operator/controller/prometheus.go +++ /dev/null -@@ -1,454 +0,0 @@ +@@ -1,419 +0,0 @@ -/* -Copyright 2018 The CDI Authors. - @@ -278,9 +278,7 @@ index 6f41df60d..000000000 - ), - generateAlertRule( - "CDIDataVolumeUnusualRestartCount", -- `kubevirt_cdi_import_pods_high_restart > 0 or -- kubevirt_cdi_upload_pods_high_restart > 0 or -- kubevirt_cdi_clone_pods_high_restart > 0`, +- "kubevirt_cdi_import_pods_high_restart > 0 or kubevirt_cdi_upload_pods_high_restart > 0 or kubevirt_cdi_clone_pods_high_restart > 0", - promv1.Duration("5m"), - map[string]string{ - "summary": "Some CDI population workloads have an unusual restart count, meaning they are probably failing and need to be investigated", @@ -295,10 +293,10 @@ index 6f41df60d..000000000 - ), - generateAlertRule( - "CDIStorageProfilesIncomplete", -- `sum by(storageclass,provisioner) ((kubevirt_cdi_storageprofile_info{complete="false"}>0))`, +- "kubevirt_cdi_incomplete_storageprofiles > 0", - promv1.Duration("5m"), - map[string]string{ -- "summary": "Incomplete StorageProfile {{ $labels.storageclass }}, accessMode/volumeMode cannot be inferred by CDI for PVC population request", +- "summary": "Incomplete StorageProfiles exist, accessMode/volumeMode cannot be inferred by CDI for PVC population request", - "runbook_url": fmt.Sprintf(runbookURLTemplate, "CDIStorageProfilesIncomplete"), - }, - map[string]string{ @@ -310,7 +308,7 @@ index 6f41df60d..000000000 - ), - generateAlertRule( - "CDIDataImportCronOutdated", -- `sum by(ns,cron_name) (kubevirt_cdi_dataimportcron_outdated) > 0`, +- "kubevirt_cdi_dataimportcron_outdated_aggregated > 0", - promv1.Duration("15m"), - map[string]string{ - "summary": "DataImportCron (recurring polling of VM templates disk image sources, also known as golden images) PVCs are not being updated on the defined schedule", @@ -324,25 +322,8 @@ index 6f41df60d..000000000 - }, - ), - generateAlertRule( -- "CDINoDefaultStorageClass", -- `sum(kubevirt_cdi_storageprofile_info{default="true"} or on() vector(0)) + -- sum(kubevirt_cdi_storageprofile_info{virtdefault="true"} or on() vector(0)) + -- (count(kubevirt_cdi_datavolume_pending == 0) or on() vector(0)) == 0`, -- promv1.Duration("5m"), -- map[string]string{ -- "summary": "No default StorageClass or virtualization StorageClass, and a DataVolume is pending for one", -- "runbook_url": fmt.Sprintf(runbookURLTemplate, "CDINoDefaultStorageClass"), -- }, -- map[string]string{ -- severityAlertLabelKey: "warning", -- healthImpactAlertLabelKey: "none", -- partOfAlertLabelKey: partOfAlertLabelValue, -- componentAlertLabelKey: componentAlertLabelValue, -- }, -- ), -- generateAlertRule( - "CDIMultipleDefaultVirtStorageClasses", -- `sum(kubevirt_cdi_storageprofile_info{virtdefault="true"} or on() vector(0)) > 1`, +- "kubevirt_cdi_default_virt_storageclasses > 1", - promv1.Duration("5m"), - map[string]string{ - "summary": "More than one default virtualization StorageClass detected", @@ -350,23 +331,7 @@ index 6f41df60d..000000000 - }, - map[string]string{ - severityAlertLabelKey: "warning", -- healthImpactAlertLabelKey: "none", -- partOfAlertLabelKey: partOfAlertLabelValue, -- componentAlertLabelKey: componentAlertLabelValue, -- }, -- ), -- generateAlertRule( -- "CDIDefaultStorageClassDegraded", -- `sum(kubevirt_cdi_storageprofile_info{default="true",rwx="true",smartclone="true"} or on() vector(0)) + -- sum(kubevirt_cdi_storageprofile_info{virtdefault="true",rwx="true",smartclone="true"} or on() vector(0)) == 0`, -- promv1.Duration("5m"), -- map[string]string{ -- "summary": "Default storage class has no smart clone or ReadWriteMany", -- "runbook_url": fmt.Sprintf(runbookURLTemplate, "CDIDefaultStorageClassDegraded"), -- }, -- map[string]string{ -- severityAlertLabelKey: "warning", -- healthImpactAlertLabelKey: "none", +- healthImpactAlertLabelKey: "warning", - partOfAlertLabelKey: partOfAlertLabelValue, - componentAlertLabelKey: componentAlertLabelValue, - }, From 407e46dcbe753edc92ee0cae217de6f1765d8eb7 Mon Sep 17 00:00:00 2001 From: yaroslavborbat Date: Thu, 5 Sep 2024 10:01:36 +0300 Subject: [PATCH 3/3] fix Signed-off-by: yaroslavborbat --- .../patches/014-delete-service-monitor.patch | 425 ------------------ 1 file changed, 425 deletions(-) diff --git a/images/cdi-artifact/patches/014-delete-service-monitor.patch b/images/cdi-artifact/patches/014-delete-service-monitor.patch index 86d08c239..ab193a28e 100644 --- a/images/cdi-artifact/patches/014-delete-service-monitor.patch +++ b/images/cdi-artifact/patches/014-delete-service-monitor.patch @@ -92,431 +92,6 @@ index fe40da5e4..7d97c3b8d 100644 func deleteWorkerResources(l logr.Logger, c client.Client) error { listTypes := []client.ObjectList{&corev1.PodList{}, &corev1.ServiceList{}} -diff --git a/pkg/operator/controller/prometheus.go b/pkg/operator/controller/prometheus.go -deleted file mode 100644 -index b5a8be322..000000000 ---- a/pkg/operator/controller/prometheus.go -+++ /dev/null -@@ -1,419 +0,0 @@ --/* --Copyright 2018 The CDI Authors. -- --Licensed under the Apache License, Version 2.0 (the "License"); --you may not use this file except in compliance with the License. --You may obtain a copy of the License at -- -- http://www.apache.org/licenses/LICENSE-2.0 -- --Unless required by applicable law or agreed to in writing, software --distributed under the License is distributed on an "AS IS" BASIS, --WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. --See the License for the specific language governing permissions and --limitations under the License. --*/ -- --package controller -- --import ( -- "context" -- "errors" -- "fmt" -- "os" -- "reflect" -- "strings" -- -- "github.com/go-logr/logr" -- promv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1" -- rbacv1 "k8s.io/api/rbac/v1" -- k8serrors "k8s.io/apimachinery/pkg/api/errors" -- "k8s.io/apimachinery/pkg/api/meta" -- metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" -- "k8s.io/apimachinery/pkg/runtime" -- "k8s.io/apimachinery/pkg/util/intstr" -- "sigs.k8s.io/controller-runtime/pkg/client" -- "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil" -- "sigs.k8s.io/controller-runtime/pkg/source" -- -- "kubevirt.io/containerized-data-importer/pkg/common" -- cc "kubevirt.io/containerized-data-importer/pkg/controller/common" -- "kubevirt.io/containerized-data-importer/pkg/monitoring" -- cdinamespaced "kubevirt.io/containerized-data-importer/pkg/operator/resources/namespaced" -- "kubevirt.io/containerized-data-importer/pkg/util" -- -- sdk "kubevirt.io/controller-lifecycle-operator-sdk/pkg/sdk" --) -- --const ( -- ruleName = "prometheus-cdi-rules" -- rbacName = "cdi-monitoring" -- monitorName = "service-monitor-cdi" -- defaultMonitoringNs = "monitoring" -- defaultRunbookURLTemplate = "https://kubevirt.io/monitoring/runbooks/%s" -- runbookURLTemplateEnv = "RUNBOOK_URL_TEMPLATE" -- severityAlertLabelKey = "severity" -- healthImpactAlertLabelKey = "operator_health_impact" -- partOfAlertLabelKey = "kubernetes_operator_part_of" -- partOfAlertLabelValue = "kubevirt" -- componentAlertLabelKey = "kubernetes_operator_component" -- componentAlertLabelValue = common.CDILabelValue --) -- --func ensurePrometheusResourcesExist(ctx context.Context, c client.Client, scheme *runtime.Scheme, owner metav1.Object) error { -- namespace := owner.GetNamespace() -- -- cr, err := cc.GetActiveCDI(ctx, c) -- if err != nil { -- return err -- } -- if cr == nil { -- return fmt.Errorf("no active CDI") -- } -- installerLabels := util.GetRecommendedInstallerLabelsFromCr(cr) -- -- prometheusResources := []client.Object{ -- newPrometheusRule(namespace), -- newPrometheusServiceMonitor(namespace), -- newPrometheusRole(namespace), -- newPrometheusRoleBinding(namespace), -- } -- -- for _, desired := range prometheusResources { -- if err := sdk.SetLastAppliedConfiguration(desired, LastAppliedConfigAnnotation); err != nil { -- return err -- } -- util.SetRecommendedLabels(desired, installerLabels, "cdi-operator") -- if err := controllerutil.SetControllerReference(owner, desired, scheme); err != nil { -- return err -- } -- -- if err := c.Create(ctx, desired); err != nil { -- if k8serrors.IsAlreadyExists(err) { -- current := sdk.NewDefaultInstance(desired) -- nn := client.ObjectKeyFromObject(desired) -- if err := c.Get(ctx, nn, current); err != nil { -- return err -- } -- current, err = sdk.StripStatusFromObject(current) -- if err != nil { -- return err -- } -- currentObjCopy := current.DeepCopyObject() -- sdk.MergeLabelsAndAnnotations(desired, current) -- merged, err := sdk.MergeObject(desired, current, LastAppliedConfigAnnotation) -- if err != nil { -- return err -- } -- if !reflect.DeepEqual(currentObjCopy, merged) { -- if err := c.Update(ctx, merged); err != nil { -- return err -- } -- } -- } else { -- return err -- } -- } -- } -- -- return nil --} -- --func isPrometheusDeployed(logger logr.Logger, c client.Client, namespace string) (bool, error) { -- rule := &promv1.PrometheusRule{} -- key := client.ObjectKey{Namespace: namespace, Name: ruleName} -- if err := c.Get(context.TODO(), key, rule); err != nil { -- if meta.IsNoMatchError(err) { -- logger.V(3).Info("No match error for PrometheusRule, must not have prometheus deployed") -- return false, nil -- } else if !k8serrors.IsNotFound(err) { -- return false, err -- } -- } -- -- return true, nil --} -- --func getRecordRules(namespace string) []promv1.Rule { -- var recordRules []promv1.Rule -- -- for _, rrd := range monitoring.GetRecordRulesDesc(namespace) { -- recordRules = append(recordRules, generateRecordRule(rrd.Opts.Name, rrd.Expr)) -- } -- -- return recordRules --} -- --func getAlertRules(runbookURLTemplate string) []promv1.Rule { -- return []promv1.Rule{ -- generateAlertRule( -- "CDIOperatorDown", -- "kubevirt_cdi_operator_up == 0", -- promv1.Duration("5m"), -- map[string]string{ -- "summary": "CDI operator is down", -- "runbook_url": fmt.Sprintf(runbookURLTemplate, "CDIOperatorDown"), -- }, -- map[string]string{ -- severityAlertLabelKey: "warning", -- healthImpactAlertLabelKey: "critical", -- partOfAlertLabelKey: partOfAlertLabelValue, -- componentAlertLabelKey: componentAlertLabelValue, -- }, -- ), -- generateAlertRule( -- "CDINotReady", -- "kubevirt_cdi_cr_ready == 0", -- promv1.Duration("5m"), -- map[string]string{ -- "summary": "CDI is not available to use", -- "runbook_url": fmt.Sprintf(runbookURLTemplate, "CDINotReady"), -- }, -- map[string]string{ -- severityAlertLabelKey: "warning", -- healthImpactAlertLabelKey: "critical", -- partOfAlertLabelKey: partOfAlertLabelValue, -- componentAlertLabelKey: componentAlertLabelValue, -- }, -- ), -- generateAlertRule( -- "CDIDataVolumeUnusualRestartCount", -- "kubevirt_cdi_import_pods_high_restart > 0 or kubevirt_cdi_upload_pods_high_restart > 0 or kubevirt_cdi_clone_pods_high_restart > 0", -- promv1.Duration("5m"), -- map[string]string{ -- "summary": "Some CDI population workloads have an unusual restart count, meaning they are probably failing and need to be investigated", -- "runbook_url": fmt.Sprintf(runbookURLTemplate, "CDIDataVolumeUnusualRestartCount"), -- }, -- map[string]string{ -- severityAlertLabelKey: "warning", -- healthImpactAlertLabelKey: "warning", -- partOfAlertLabelKey: partOfAlertLabelValue, -- componentAlertLabelKey: componentAlertLabelValue, -- }, -- ), -- generateAlertRule( -- "CDIStorageProfilesIncomplete", -- "kubevirt_cdi_incomplete_storageprofiles > 0", -- promv1.Duration("5m"), -- map[string]string{ -- "summary": "Incomplete StorageProfiles exist, accessMode/volumeMode cannot be inferred by CDI for PVC population request", -- "runbook_url": fmt.Sprintf(runbookURLTemplate, "CDIStorageProfilesIncomplete"), -- }, -- map[string]string{ -- severityAlertLabelKey: "info", -- healthImpactAlertLabelKey: "warning", -- partOfAlertLabelKey: partOfAlertLabelValue, -- componentAlertLabelKey: componentAlertLabelValue, -- }, -- ), -- generateAlertRule( -- "CDIDataImportCronOutdated", -- "kubevirt_cdi_dataimportcron_outdated_aggregated > 0", -- promv1.Duration("15m"), -- map[string]string{ -- "summary": "DataImportCron (recurring polling of VM templates disk image sources, also known as golden images) PVCs are not being updated on the defined schedule", -- "runbook_url": fmt.Sprintf(runbookURLTemplate, "CDIDataImportCronOutdated"), -- }, -- map[string]string{ -- severityAlertLabelKey: "info", -- healthImpactAlertLabelKey: "warning", -- partOfAlertLabelKey: partOfAlertLabelValue, -- componentAlertLabelKey: componentAlertLabelValue, -- }, -- ), -- generateAlertRule( -- "CDIMultipleDefaultVirtStorageClasses", -- "kubevirt_cdi_default_virt_storageclasses > 1", -- promv1.Duration("5m"), -- map[string]string{ -- "summary": "More than one default virtualization StorageClass detected", -- "runbook_url": fmt.Sprintf(runbookURLTemplate, "CDIMultipleDefaultVirtStorageClasses"), -- }, -- map[string]string{ -- severityAlertLabelKey: "warning", -- healthImpactAlertLabelKey: "warning", -- partOfAlertLabelKey: partOfAlertLabelValue, -- componentAlertLabelKey: componentAlertLabelValue, -- }, -- ), -- } --} -- --func newPrometheusRule(namespace string) *promv1.PrometheusRule { -- runbookURLTemplate := getRunbookURLTemplate() -- -- return &promv1.PrometheusRule{ -- ObjectMeta: metav1.ObjectMeta{ -- Name: ruleName, -- Namespace: namespace, -- Labels: map[string]string{ -- common.CDIComponentLabel: "", -- common.PrometheusLabelKey: common.PrometheusLabelValue, -- }, -- }, -- Spec: promv1.PrometheusRuleSpec{ -- Groups: []promv1.RuleGroup{ -- { -- Name: "cdi.rules", -- Rules: append(getRecordRules(namespace), getAlertRules(runbookURLTemplate)...), -- }, -- }, -- }, -- } --} -- --func newPrometheusRole(namespace string) *rbacv1.Role { -- return &rbacv1.Role{ -- ObjectMeta: metav1.ObjectMeta{ -- Name: rbacName, -- Namespace: namespace, -- Labels: map[string]string{ -- common.CDIComponentLabel: "", -- common.PrometheusLabelKey: common.PrometheusLabelValue, -- }, -- }, -- Rules: cdinamespaced.GetPrometheusNamespacedRules(), -- } --} -- --func newPrometheusRoleBinding(namespace string) *rbacv1.RoleBinding { -- monitoringNamespace := getMonitoringNamespace() -- -- return &rbacv1.RoleBinding{ -- ObjectMeta: metav1.ObjectMeta{ -- Name: rbacName, -- Namespace: namespace, -- Labels: map[string]string{ -- common.CDIComponentLabel: "", -- common.PrometheusLabelKey: common.PrometheusLabelValue, -- }, -- }, -- RoleRef: rbacv1.RoleRef{ -- APIGroup: "rbac.authorization.k8s.io", -- Kind: "Role", -- Name: rbacName, -- }, -- Subjects: []rbacv1.Subject{ -- { -- Kind: "ServiceAccount", -- Namespace: monitoringNamespace, -- Name: "prometheus-k8s", -- }, -- }, -- } --} -- --func getMonitoringNamespace() string { -- if ns := os.Getenv("MONITORING_NAMESPACE"); ns != "" { -- return ns -- } -- -- return defaultMonitoringNs --} -- --func newPrometheusServiceMonitor(namespace string) *promv1.ServiceMonitor { -- return &promv1.ServiceMonitor{ -- ObjectMeta: metav1.ObjectMeta{ -- Namespace: namespace, -- Name: monitorName, -- Labels: map[string]string{ -- common.CDIComponentLabel: "", -- "openshift.io/cluster-monitoring": "", -- common.PrometheusLabelKey: common.PrometheusLabelValue, -- }, -- }, -- Spec: promv1.ServiceMonitorSpec{ -- Selector: metav1.LabelSelector{ -- MatchLabels: map[string]string{ -- common.PrometheusLabelKey: common.PrometheusLabelValue, -- }, -- }, -- NamespaceSelector: promv1.NamespaceSelector{ -- MatchNames: []string{namespace}, -- }, -- Endpoints: []promv1.Endpoint{ -- { -- Port: "metrics", -- Scheme: "http", -- TLSConfig: &promv1.TLSConfig{ -- SafeTLSConfig: promv1.SafeTLSConfig{ -- InsecureSkipVerify: true, -- }, -- }, -- }, -- }, -- }, -- } --} -- --func generateAlertRule(alert, expr string, duration promv1.Duration, annotations, labels map[string]string) promv1.Rule { -- return promv1.Rule{ -- Alert: alert, -- Expr: intstr.FromString(expr), -- For: &duration, -- Annotations: annotations, -- Labels: labels, -- } --} -- --func generateRecordRule(record, expr string) promv1.Rule { -- return promv1.Rule{ -- Record: record, -- Expr: intstr.FromString(expr), -- } --} -- --func (r *ReconcileCDI) watchPrometheusResources() error { -- listObjs := []client.ObjectList{ -- &promv1.PrometheusRuleList{}, -- &promv1.ServiceMonitorList{}, -- } -- -- objs := []client.Object{ -- &promv1.PrometheusRule{}, -- &promv1.ServiceMonitor{}, -- } -- -- for i, listObj := range listObjs { -- obj := objs[i] -- err := r.uncachedClient.List(context.TODO(), listObj, &client.ListOptions{ -- Namespace: util.GetNamespace(), -- Limit: 1, -- }) -- if err == nil { -- if err := r.controller.Watch(&source.Kind{Type: obj}, enqueueCDI(r.client)); err != nil { -- return err -- } -- } else if meta.IsNoMatchError(err) { -- log.Info("Not watching", "type", fmt.Sprintf("%T", obj)) -- } else { -- return err -- } -- } -- -- objs = []client.Object{ -- &rbacv1.Role{}, -- &rbacv1.RoleBinding{}, -- } -- -- for _, obj := range objs { -- if err := r.controller.Watch(&source.Kind{Type: obj}, enqueueCDI(r.client)); err != nil { -- return err -- } -- } -- -- return nil --} -- --func getRunbookURLTemplate() string { -- runbookURLTemplate, exists := os.LookupEnv(runbookURLTemplateEnv) -- if !exists { -- runbookURLTemplate = defaultRunbookURLTemplate -- } -- -- if strings.Count(runbookURLTemplate, "%s") != 1 { -- panic(errors.New("runbook URL template must have exactly 1 %s substring")) -- } -- -- return runbookURLTemplate --} diff --git a/pkg/operator/controller/reconciler-hooks.go b/pkg/operator/controller/reconciler-hooks.go index 6aafe14d6..fc55bff35 100644 --- a/pkg/operator/controller/reconciler-hooks.go