Add hive_cluster_deployment_provision_underway_install_restarts metric #1275

Merged
3 changes: 2 additions & 1 deletion pkg/controller/metrics/metrics.go
@@ -151,7 +151,8 @@ func Add(mgr manager.Manager) error {
Client: mgr.GetClient(),
Interval: 2 * time.Minute,
}
-	metrics.Registry.MustRegister(newProvisioningUnderwayCollector(mgr.GetClient()))
+	metrics.Registry.MustRegister(newProvisioningUnderwaySecondsCollector(mgr.GetClient(), 1*time.Hour))
+	metrics.Registry.MustRegister(newProvisioningUnderwayInstallRestartsCollector(mgr.GetClient(), 1))
err := mgr.Add(mc)
if err != nil {
return err
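The two registrations above attach the collectors to the controller manager's metrics registry with fixed cutoffs: the seconds metric only reports clusters provisioning for more than an hour, and the restarts metric only reports clusters with at least one install restart. As a minimal sketch of the same wiring outside the manager (the client c, the prometheus/promhttp imports, and the HTTP setup are assumptions for illustration, not part of this PR):

	// Hypothetical standalone registration of the two collectors.
	reg := prometheus.NewRegistry()
	reg.MustRegister(newProvisioningUnderwaySecondsCollector(c, 1*time.Hour)) // c is an existing client.Client
	reg.MustRegister(newProvisioningUnderwayInstallRestartsCollector(c, 1))
	http.Handle("/metrics", promhttp.HandlerFor(reg, promhttp.HandlerOpts{}))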
148 changes: 129 additions & 19 deletions pkg/controller/metrics/provision_underway_collector.go
@@ -30,6 +30,11 @@ var (
type provisioningUnderwayCollector struct {
client client.Client

+	// minDuration, when non-zero, is the minimum duration after which clusters provisioning
+	// will start becoming part of this metric. When set to zero, all clusters provisioning
+	// will be included in the metric.
+	minDuration time.Duration

// metricClusterDeploymentProvisionUnderwaySeconds is a prometheus metric for the number of seconds
// between when a still provisioning cluster was created and now.
metricClusterDeploymentProvisionUnderwaySeconds *prometheus.Desc
@@ -56,29 +61,24 @@ func (cc provisioningUnderwayCollector) Collect(ch chan<- prometheus.Metric) {
}

// Add install failure details for stuck provision
-		condition, reason := "Unknown", "Unknown"
-		for _, delayCondition := range provisioningDelayCondition {
-			if cdCondition := controllerutils.FindClusterDeploymentCondition(cd.Status.Conditions,
-				delayCondition); cdCondition != nil {
-				if cdCondition.Status == corev1.ConditionTrue && cdCondition.Reason != "" {
-					condition = string(delayCondition)
-					reason = cdCondition.Reason
-				}
-				break
-			}
-		}
+		condition, reason := getKnownConditions(cd.Status.Conditions)

platform := cd.Labels[hivev1.HiveClusterPlatformLabel]
imageSet := "none"
if cd.Spec.Provisioning != nil && cd.Spec.Provisioning.ImageSetRef != nil {
imageSet = cd.Spec.Provisioning.ImageSetRef.Name
}

+		elapsedDuration := time.Since(cd.CreationTimestamp.Time)
+		if cc.minDuration.Seconds() > 0 && elapsedDuration < cc.minDuration {
+			continue // skip reporting the metric for clusterdeployment until the elapsed time is at least minDuration
+		}

// For installing clusters we report the seconds since the cluster was created.
ch <- prometheus.MustNewConstMetric(
cc.metricClusterDeploymentProvisionUnderwaySeconds,
prometheus.GaugeValue,
-			time.Since(cd.CreationTimestamp.Time).Seconds(),
+			elapsedDuration.Seconds(),
cd.Name,
cd.Namespace,
GetClusterDeploymentType(&cd),
@@ -96,14 +96,124 @@ func (cc provisioningUnderwayCollector) Describe(ch chan<- *prometheus.Desc) {
prometheus.DescribeByCollect(cc, ch)
}

-func newProvisioningUnderwayCollector(client client.Client) prometheus.Collector {
+var (
+	metricClusterDeploymentProvisionUnderwaySecondsDesc = prometheus.NewDesc(
+		"hive_cluster_deployment_provision_underway_seconds",
+		"Length of time a cluster has been provisioning.",
+		[]string{"cluster_deployment", "namespace", "cluster_type", "condition", "reason", "platform", "image_set"},
+		nil,
+	)
+)
+
+func newProvisioningUnderwaySecondsCollector(client client.Client, minimum time.Duration) prometheus.Collector {
	return provisioningUnderwayCollector{
		client: client,
-		metricClusterDeploymentProvisionUnderwaySeconds: prometheus.NewDesc(
-			"hive_cluster_deployment_provision_underway_seconds",
-			"Length of time a cluster has been provisioning.",
-			[]string{"cluster_deployment", "namespace", "cluster_type", "condition", "reason", "platform", "image_set"},
-			nil,
-		),
+		metricClusterDeploymentProvisionUnderwaySeconds: metricClusterDeploymentProvisionUnderwaySecondsDesc,
+		minDuration: minimum,
}
}
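As a hypothetical test sketch (not part of this PR) for the minDuration cutoff, using controller-runtime's fake client and the prometheus testutil package; the scheme wiring is an assumption, and older controller-runtime releases would use fake.NewFakeClientWithScheme instead of the builder:

	func TestProvisioningUnderwaySecondsMinDuration(t *testing.T) {
		scheme := runtime.NewScheme()
		_ = hivev1.AddToScheme(scheme) // assumes hivev1 exposes AddToScheme

		// A ClusterDeployment created just now and still provisioning.
		cd := &hivev1.ClusterDeployment{
			ObjectMeta: metav1.ObjectMeta{
				Name:              "young-cluster",
				Namespace:         "hive",
				CreationTimestamp: metav1.Now(),
			},
		}
		c := fake.NewClientBuilder().WithScheme(scheme).WithObjects(cd).Build()

		// With a one-hour minimum, the freshly created cluster is not reported yet.
		if n := testutil.CollectAndCount(newProvisioningUnderwaySecondsCollector(c, 1*time.Hour)); n != 0 {
			t.Fatalf("expected 0 samples, got %d", n)
		}
		// With no minimum, the same cluster is reported immediately.
		if n := testutil.CollectAndCount(newProvisioningUnderwaySecondsCollector(c, 0)); n != 1 {
			t.Fatalf("expected 1 sample, got %d", n)
		}
	}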

// provisioning underway install restarts metrics collected through a custom prometheus collector
type provisioningUnderwayInstallRestartsCollector struct {
client client.Client

	// minRestarts, when non-zero, is the minimum number of install restarts after which a
	// provisioning cluster becomes part of the metric. When set to zero, all provisioning
	// clusters that have restarted at least once are included in the metric.
minRestarts int

// metricClusterDeploymentProvisionUnderwayInstallRestarts is a prometheus metric for the number of install
// restarts for a still provisioning cluster.
metricClusterDeploymentProvisionUnderwayInstallRestarts *prometheus.Desc
}

// collects the metrics for provisioningUnderwayInstallRestartsCollector
func (cc provisioningUnderwayInstallRestartsCollector) Collect(ch chan<- prometheus.Metric) {
ccLog := log.WithField("controller", "metrics")
ccLog.Info("calculating provisioning underway install restarts metrics across all ClusterDeployments")

// Load all ClusterDeployments so we can accumulate facts about them.
clusterDeployments := &hivev1.ClusterDeploymentList{}
err := cc.client.List(context.Background(), clusterDeployments)
if err != nil {
log.WithError(err).Error("error listing cluster deployments")
return
}
for _, cd := range clusterDeployments.Items {
if cd.DeletionTimestamp != nil {
continue
}
if cd.Spec.Installed {
continue
}

// Add install failure details for stuck provision
condition, reason := getKnownConditions(cd.Status.Conditions)

platform := cd.Labels[hivev1.HiveClusterPlatformLabel]
imageSet := "none"
if cd.Spec.Provisioning != nil && cd.Spec.Provisioning.ImageSetRef != nil {
imageSet = cd.Spec.Provisioning.ImageSetRef.Name
}

restarts := cd.Status.InstallRestarts
		if restarts == 0 {
			continue // skip reporting the metric for a clusterdeployment that hasn't restarted at all
		}
		if cc.minRestarts > 0 && restarts < cc.minRestarts {
			continue // skip reporting the metric for a clusterdeployment until its InstallRestarts is at least minRestarts
		}

		// For still-provisioning clusters we report the number of install restarts so far.
ch <- prometheus.MustNewConstMetric(
cc.metricClusterDeploymentProvisionUnderwayInstallRestarts,
prometheus.GaugeValue,
float64(restarts),
cd.Name,
cd.Namespace,
GetClusterDeploymentType(&cd),
condition,
reason,
platform,
imageSet,
)

}

}

func (cc provisioningUnderwayInstallRestartsCollector) Describe(ch chan<- *prometheus.Desc) {
prometheus.DescribeByCollect(cc, ch)
}

var (
provisioningUnderwayInstallRestartsCollectorDesc = prometheus.NewDesc(
"hive_cluster_deployment_provision_underway_install_restarts",
"Number install restarts for a cluster that has been provisioning.",
[]string{"cluster_deployment", "namespace", "cluster_type", "condition", "reason", "platform", "image_set"},
nil,
)
)
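For illustration, a sample emitted by this collector would render in the /metrics exposition along these lines (the label values here are hypothetical):

	hive_cluster_deployment_provision_underway_install_restarts{cluster_deployment="cd1",namespace="hive",cluster_type="unspecified",condition="Unknown",reason="Unknown",platform="aws",image_set="none"} 3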

func newProvisioningUnderwayInstallRestartsCollector(client client.Client, minimum int) prometheus.Collector {
return provisioningUnderwayInstallRestartsCollector{
client: client,
metricClusterDeploymentProvisionUnderwayInstallRestarts: provisioningUnderwayInstallRestartsCollectorDesc,
minRestarts: minimum,
}
}
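In the same hypothetical test style as above (same assumed scheme and fake-client setup), the restarts cutoffs can be exercised directly: a cluster that never restarted is always skipped, and minRestarts raises the bar further:

	func TestProvisioningUnderwayInstallRestartsMinimum(t *testing.T) {
		scheme := runtime.NewScheme()
		_ = hivev1.AddToScheme(scheme) // assumed, as above
		cd := &hivev1.ClusterDeployment{
			ObjectMeta: metav1.ObjectMeta{Name: "stuck-cluster", Namespace: "hive"},
			Status:     hivev1.ClusterDeploymentStatus{InstallRestarts: 2},
		}
		c := fake.NewClientBuilder().WithScheme(scheme).WithObjects(cd).Build()

		for _, tc := range []struct{ min, want int }{
			{min: 0, want: 1}, // no cutoff: the two restarts are reported
			{min: 2, want: 1}, // at the cutoff: still reported
			{min: 3, want: 0}, // under the cutoff: skipped
		} {
			if got := testutil.CollectAndCount(newProvisioningUnderwayInstallRestartsCollector(c, tc.min)); got != tc.want {
				t.Errorf("minRestarts=%d: want %d samples, got %d", tc.min, tc.want, got)
			}
		}
	}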

// getKnownConditions returns the condition and reason from the first provisioning
// delay condition present on the ClusterDeployment; both fall back to "Unknown" when
// no condition matches, or when the first match is not True with a non-empty reason.
func getKnownConditions(conditions []hivev1.ClusterDeploymentCondition) (condition, reason string) {
condition, reason = "Unknown", "Unknown"
for _, delayCondition := range provisioningDelayCondition {
if cdCondition := controllerutils.FindClusterDeploymentCondition(conditions,
delayCondition); cdCondition != nil {
if cdCondition.Status == corev1.ConditionTrue && cdCondition.Reason != "" {
condition = string(delayCondition)
reason = cdCondition.Reason
}
break
}
}
return condition, reason
}