Skip to content
This repository has been archived by the owner on Jan 28, 2022. It is now read-only.

Commit

Permalink
Extended databricks operator to report metrics into Prometheus (#104)
Browse files Browse the repository at this point in the history
* Add initial extra metrics to controllers
* Update ServiceMonitor to allow prometheus to connect
  • Loading branch information
storey247 authored and Azadehkhojandi committed Dec 17, 2019
1 parent bc604f4 commit 49d44fa
Show file tree
Hide file tree
Showing 13 changed files with 424 additions and 53 deletions.
3 changes: 2 additions & 1 deletion .devcontainer/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,8 @@ RUN apt-get update \
&& curl -sSL -o /usr/local/bin/kubectl https://storage.googleapis.com/kubernetes-release/release/$(curl -s https://storage.googleapis.com/kubernetes-release/release/stable.txt)/bin/linux/amd64/kubectl \
&& chmod +x /usr/local/bin/kubectl \
# Install Helm
&& curl -s https://raw.githubusercontent.com/helm/helm/master/scripts/get | bash -
&& curl https://raw.githubusercontent.com/helm/helm/master/scripts/get-helm-3 | bash \
&& helm repo add stable https://kubernetes-charts.storage.googleapis.com/

# Enable bash completion
RUN apt-get update && apt install -y bash-completion && echo "source /etc/bash_completion" >> "/root/.bashrc"
Expand Down
10 changes: 10 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
IMG ?= controller:latest
# Produce CRDs that work back to Kubernetes 1.11 (no version conversion)
CRD_OPTIONS ?= "crd:trivialVersions=true"
# Prometheus helm installation name
PROMETHEUS_NAME ?= "prom-azure-databricks-operator"

# Get the currently used golang install path (in GOPATH/bin, unless GOBIN is set)
ifeq (,$(shell go env GOBIN))
Expand Down Expand Up @@ -154,6 +156,8 @@ endif

kubectl cluster-info

make install-prometheus

@echo "deploying controller to cluster"
make deploy-controller

Expand Down Expand Up @@ -191,6 +195,12 @@ else
@echo "kustomize has been installed"
endif

install-prometheus:
@echo "installing prometheus"
# Install the stable/prometheus-operator chart as release ${PROMETHEUS_NAME}; serviceMonitorSelectorNilUsesHelmValues=false is set so Prometheus monitors all namespaces in our kind cluster
helm install ${PROMETHEUS_NAME} stable/prometheus-operator --set prometheus.prometheusSpec.serviceMonitorSelectorNilUsesHelmValues=false
@echo "prometheus has been installed"

install-test-dependency:
go get -u github.com/jstemmer/go-junit-report \
&& go get github.com/axw/gocov/gocov \
Expand Down
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ Few topics are discussed in the [resources.md](https://github.com/microsoft/azur
- Kubernetes on WSL
- Build pipelines
- Dev container
- Operator metrics

## Contributing

Expand Down
4 changes: 4 additions & 0 deletions config/prometheus/monitor.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -11,5 +11,9 @@ spec:
endpoints:
- path: /metrics
port: https
scheme: https
bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
tlsConfig:
insecureSkipVerify: true # Configure certs here if set up for auth_proxy (uses self-signed currently)
selector:
control-plane: controller-manager
34 changes: 31 additions & 3 deletions controllers/dcluster_controller_databricks.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,8 @@ import (
"reflect"

databricksv1alpha1 "github.com/microsoft/azure-databricks-operator/api/v1alpha1"
"github.com/prometheus/client_golang/prometheus"
dbmodels "github.com/xinsnake/databricks-sdk-golang/azure/models"
)

func (r *DclusterReconciler) submit(instance *databricksv1alpha1.Dcluster) error {
Expand All @@ -36,7 +38,7 @@ func (r *DclusterReconciler) submit(instance *databricksv1alpha1.Dcluster) error
}
}

clusterInfo, err := r.APIClient.Clusters().Create(*instance.Spec)
clusterInfo, err := r.createCluster(instance)
if err != nil {
return err
}
Expand All @@ -55,7 +57,7 @@ func (r *DclusterReconciler) refresh(instance *databricksv1alpha1.Dcluster) erro
return nil
}

clusterInfo, err := r.APIClient.Clusters().Get(instance.Status.ClusterInfo.ClusterID)
clusterInfo, err := r.getCluster(instance.Status.ClusterInfo.ClusterID)
if err != nil {
return err
}
Expand All @@ -78,5 +80,31 @@ func (r *DclusterReconciler) delete(instance *databricksv1alpha1.Dcluster) error
return nil
}

return r.APIClient.Clusters().PermanentDelete(instance.Status.ClusterInfo.ClusterID)
return trackExecutionTime(dclusterDeleteDuration, func() error {
err := r.APIClient.Clusters().PermanentDelete(instance.Status.ClusterInfo.ClusterID)
trackSuccessFailure(err, dclusterCounterVec, "delete")
return err
})
}

// getCluster fetches the cluster identified by clusterID through the API
// client. The call's duration is observed on dclusterGetDuration and its
// outcome is counted on dclusterCounterVec under method "get".
func (r *DclusterReconciler) getCluster(clusterID string) (cluster dbmodels.ClusterInfo, err error) {
	// The timer starts now; the observation is recorded when the function returns.
	defer prometheus.NewTimer(dclusterGetDuration).ObserveDuration()

	cluster, err = r.APIClient.Clusters().Get(clusterID)
	trackSuccessFailure(err, dclusterCounterVec, "get")
	return cluster, err
}

// createCluster submits instance.Spec to the cluster create endpoint of the
// API client. The call's duration is observed on dclusterCreateDuration and
// its outcome is counted on dclusterCounterVec under method "create".
func (r *DclusterReconciler) createCluster(instance *databricksv1alpha1.Dcluster) (cluster dbmodels.ClusterInfo, err error) {
	// The timer starts now; the observation is recorded when the function returns.
	defer prometheus.NewTimer(dclusterCreateDuration).ObserveDuration()

	cluster, err = r.APIClient.Clusters().Create(*instance.Spec)
	trackSuccessFailure(err, dclusterCounterVec, "create")
	return cluster, err
}
53 changes: 53 additions & 0 deletions controllers/dcluster_metrics.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
/*
Copyright 2019 microsoft.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package controllers

import (
"github.com/prometheus/client_golang/prometheus"
"sigs.k8s.io/controller-runtime/pkg/metrics"
)

// Prometheus collectors describing the API calls made on behalf of Dcluster
// resources. All metric names share metricPrefix.
var (
	// dclusterCounterVec counts Dcluster API calls. The "status" label is
	// "success" or "failure" and the "method" label names the operation
	// ("create", "get" or "delete").
	dclusterCounterVec = prometheus.NewCounterVec(
		prometheus.CounterOpts{
			Name: metricPrefix + "dcluster_total",
			// The help text names the label values actually emitted
			// ("failure", not "fail") so queries built from it match.
			Help: "Counter related to the dCluster CRD partitioned by status and method invoked. Status = success/failure and method indicates REST endpoint",
		},
		[]string{"status", "method"},
	)

	// dclusterCreateDuration records how long cluster create calls take.
	dclusterCreateDuration = prometheus.NewHistogram(prometheus.HistogramOpts{
		Name: metricPrefix + "dcluster_creation_request_duration_seconds",
		Help: "Duration of DB api dcluster create calls.",
	})

	// dclusterGetDuration records how long cluster get calls take.
	dclusterGetDuration = prometheus.NewHistogram(prometheus.HistogramOpts{
		Name: metricPrefix + "dcluster_get_request_duration_seconds",
		Help: "Duration of DB api dcluster get calls.",
	})

	// dclusterDeleteDuration records how long cluster delete calls take.
	dclusterDeleteDuration = prometheus.NewHistogram(prometheus.HistogramOpts{
		Name: metricPrefix + "dcluster_delete_request_duration_seconds",
		Help: "Duration of DB api dcluster delete calls.",
	})
)

func init() {
// Register custom metrics with the global prometheus registry
metrics.Registry.MustRegister(dclusterCounterVec,
dclusterCreateDuration, dclusterGetDuration, dclusterDeleteDuration)
}
46 changes: 39 additions & 7 deletions controllers/djob_controller_databricks.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,12 +19,15 @@ package controllers
import (
"context"
"fmt"
"reflect"
"strings"

databricksv1alpha1 "github.com/microsoft/azure-databricks-operator/api/v1alpha1"
"github.com/prometheus/client_golang/prometheus"
dbmodels "github.com/xinsnake/databricks-sdk-golang/azure/models"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/types"
"reflect"
"sigs.k8s.io/controller-runtime/pkg/client"
"strings"
)

func (r *DjobReconciler) submit(instance *databricksv1alpha1.Djob) error {
Expand Down Expand Up @@ -68,11 +71,12 @@ func (r *DjobReconciler) submit(instance *databricksv1alpha1.Djob) error {
}
instance.ObjectMeta.SetOwnerReferences(references)
}
jobSettings := databricksv1alpha1.ToDatabricksJobSettings(instance.Spec)
job, err := r.APIClient.Jobs().Create(jobSettings)
job, err := r.createJob(instance)

if err != nil {
return err
}

instance.Spec.Name = instance.GetName()
instance.Status = &databricksv1alpha1.DjobStatus{
JobStatus: &job,
Expand All @@ -85,7 +89,8 @@ func (r *DjobReconciler) refresh(instance *databricksv1alpha1.Djob) error {

jobID := instance.Status.JobStatus.JobID

job, err := r.APIClient.Jobs().Get(jobID)
job, err := r.getJob(jobID)

if err != nil {
return err
}
Expand Down Expand Up @@ -126,12 +131,39 @@ func (r *DjobReconciler) delete(instance *databricksv1alpha1.Djob) error {
jobID := instance.Status.JobStatus.JobID

// Check if the job exists before trying to delete it
if _, err := r.APIClient.Jobs().Get(jobID); err != nil {
if _, err := r.getJob(jobID); err != nil {
if strings.Contains(err.Error(), "does not exist") {
return nil
}
return err
}

return r.APIClient.Jobs().Delete(jobID)
return trackExecutionTime(djobDeleteDuration, func() error {
err := r.APIClient.Jobs().Delete(jobID)
trackSuccessFailure(err, djobCounterVec, "delete")
return err
})
}

// getJob fetches the job identified by jobID through the API client. The
// call's duration is observed on djobGetDuration and its outcome is counted
// on djobCounterVec under method "get".
func (r *DjobReconciler) getJob(jobID int64) (job dbmodels.Job, err error) {
	// The timer starts now; the observation is recorded when the function returns.
	defer prometheus.NewTimer(djobGetDuration).ObserveDuration()

	job, err = r.APIClient.Jobs().Get(jobID)
	trackSuccessFailure(err, djobCounterVec, "get")
	return job, err
}

// createJob converts instance.Spec into job settings and submits them to the
// job create endpoint of the API client. The measured duration (observed on
// djobCreateDuration) covers both the conversion and the API call; the
// outcome is counted on djobCounterVec under method "create".
func (r *DjobReconciler) createJob(instance *databricksv1alpha1.Djob) (job dbmodels.Job, err error) {
	// The timer starts now; the observation is recorded when the function returns.
	defer prometheus.NewTimer(djobCreateDuration).ObserveDuration()

	settings := databricksv1alpha1.ToDatabricksJobSettings(instance.Spec)
	job, err = r.APIClient.Jobs().Create(settings)
	trackSuccessFailure(err, djobCounterVec, "create")
	return job, err
}
53 changes: 53 additions & 0 deletions controllers/djob_metrics.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
/*
Copyright 2019 microsoft.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package controllers

import (
"github.com/prometheus/client_golang/prometheus"
"sigs.k8s.io/controller-runtime/pkg/metrics"
)

// Prometheus collectors describing the API calls made on behalf of Djob
// resources. All metric names share metricPrefix.
var (
	// djobCounterVec counts Djob API calls. The "status" label is "success"
	// or "failure" and the "method" label names the operation ("create",
	// "get" or "delete").
	djobCounterVec = prometheus.NewCounterVec(
		prometheus.CounterOpts{
			Name: metricPrefix + "djob_total",
			// The help text names the label values actually emitted
			// ("failure", not "fail") so queries built from it match.
			Help: "Counter related to the dJob CRD partitioned by status and method invoked. Status = success/failure and method indicates REST endpoint",
		},
		[]string{"status", "method"},
	)

	// djobCreateDuration records how long job create calls take.
	djobCreateDuration = prometheus.NewHistogram(prometheus.HistogramOpts{
		Name: metricPrefix + "djob_creation_request_duration_seconds",
		Help: "Duration of DB api djob create calls.",
	})

	// djobGetDuration records how long job get calls take.
	djobGetDuration = prometheus.NewHistogram(prometheus.HistogramOpts{
		Name: metricPrefix + "djob_get_request_duration_seconds",
		Help: "Duration of DB api djob get calls.",
	})

	// djobDeleteDuration records how long job delete calls take.
	djobDeleteDuration = prometheus.NewHistogram(prometheus.HistogramOpts{
		Name: metricPrefix + "djob_delete_request_duration_seconds",
		Help: "Duration of DB api djob delete calls.",
	})
)

func init() {
// Register custom metrics with the global prometheus registry
metrics.Registry.MustRegister(djobCounterVec,
djobCreateDuration, djobGetDuration, djobDeleteDuration)
}
41 changes: 41 additions & 0 deletions controllers/metrics.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
/*
Copyright 2019 microsoft.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package controllers

import (
"github.com/prometheus/client_golang/prometheus"
)

// Shared naming constants for the operator's custom Prometheus metrics.
const (
// metricPrefix is prepended to the name of every custom metric.
metricPrefix = "databricks_"
// successMetric is the "status" label value recorded when a call returns no error.
successMetric = "success"
// failureMetric is the "status" label value recorded when a call returns an error.
failureMetric = "failure"
)

// trackExecutionTime runs f, observes how long it took on histogram, and
// returns f's error unchanged. The duration is observed whether or not f
// returns an error.
func trackExecutionTime(histogram prometheus.Histogram, f func() error) error {
	// The timer starts here; the deferred call records the elapsed time on return.
	defer prometheus.NewTimer(histogram).ObserveDuration()
	return f()
}

// trackSuccessFailure increments counterVec once for the given method,
// labelling the sample with status successMetric when err is nil and
// failureMetric otherwise.
func trackSuccessFailure(err error, counterVec *prometheus.CounterVec, method string) {
	status := failureMetric
	if err == nil {
		status = successMetric
	}
	counterVec.With(prometheus.Labels{"status": status, "method": method}).Inc()
}
Loading

0 comments on commit 49d44fa

Please sign in to comment.