diff --git a/.gitignore b/.gitignore index 4e999979a2..15bdae29c7 100644 --- a/.gitignore +++ b/.gitignore @@ -3,6 +3,7 @@ /prometheus /thanos /vendor/ +/mixin/vendor/ # Ignore minikube setup working dirs. kube/bin @@ -25,3 +26,4 @@ website/docs-pre-processed/ !website/data tmp/bin +examples/tmp/ diff --git a/Makefile b/Makefile index 29c7995b25..672d84951e 100644 --- a/Makefile +++ b/Makefile @@ -41,6 +41,20 @@ GOLANGCILINT ?= $(GOBIN)/golangci-lint-$(GOLANGCILINT_VERSION) MISSPELL_VERSION ?= c0b55c8239520f6b5aa15a0207ca8b28027ba49e MISSPELL ?= $(GOBIN)/misspell-$(MISSPELL_VERSION) +GOJSONTOYAML_VERSION ?= e8bd32d46b3d764bef60f12b3bada1c132c4be55 +GOJSONTOYAML ?= $(GOBIN)/gojsontoyaml-$(GOJSONTOYAML_VERSION) +# v0.14.0 +JSONNET_VERSION ?= fbde25be2182caa4345b03f1532450911ac7d1f3 +JSONNET ?= $(GOBIN)/jsonnet-$(JSONNET_VERSION) +JSONNET_BUNDLER_VERSION ?= d7829f6c7e632e954c0e5db8b3eece8f111f9461 +JSONNET_BUNDLER ?= $(GOBIN)/jb-$(JSONNET_BUNDLER_VERSION) +# Prometheus v2.14.0 +PROMTOOL_VERSION ?= edeb7a44cbf745f1d8be4ea6f215e79e651bfe19 +PROMTOOL ?= $(GOBIN)/promtool-$(PROMTOOL_VERSION) + +MIXIN_ROOT ?= mixin/thanos +JSONNET_VENDOR_DIR ?= mixin/vendor + WEB_DIR ?= website WEBSITE_BASE_URL ?= https://thanos.io PUBLIC_DIR ?= $(WEB_DIR)/public @@ -282,6 +296,79 @@ web-serve: web-pre-process $(HUGO) @echo ">> serving documentation website" @cd $(WEB_DIR) && $(HUGO) --config hugo.yaml -v server +# Check https://github.com/coreos/prometheus-operator/blob/master/scripts/jsonnet/Dockerfile for the image. 
+JSONNET_CONTAINER_CMD:=docker run --rm \
+	-u="$(shell id -u):$(shell id -g)" \
+	-v "$(shell go env GOCACHE):/.cache/go-build" \
+	-v "$(CURDIR):/go/src/github.com/thanos-io/thanos:Z" \
+	-w "/go/src/github.com/thanos-io/thanos" \
+	-e USER=deadbeef \
+	-e GO111MODULE=on \
+	quay.io/coreos/jsonnet-ci
+
+.PHONY: examples-in-container # vendors jsonnet deps, then runs `examples`, both inside the jsonnet-ci image; plain `make` is intentional because it runs in the container.
+examples-in-container:
+	@echo ">> Compiling and generating thanos-mixin"
+	$(JSONNET_CONTAINER_CMD) make $(MFLAGS) JSONNET_BUNDLER='/go/bin/jb' jsonnet-vendor
+	$(JSONNET_CONTAINER_CMD) make $(MFLAGS) \
+		EMBEDMD='/go/bin/embedmd' \
+		JSONNET='/go/bin/jsonnet' \
+		JSONNET_BUNDLER='/go/bin/jb' \
+		PROMTOOL='/go/bin/promtool' \
+		GOJSONTOYAML='/go/bin/gojsontoyaml' \
+		GOLANGCILINT='/go/bin/golangci-lint' \
+		examples
+
+.PHONY: examples # regenerates alerts, rules and dashboards from the mixin, then re-embeds them into the markdown docs with embedmd.
+examples: $(EMBEDMD) jsonnet-format mixin/thanos/README.md examples/alerts/alerts.md examples/alerts/alerts.yaml examples/alerts/rules.yaml examples/dashboards examples/tmp
+	$(EMBEDMD) -w examples/alerts/alerts.md
+	$(EMBEDMD) -w mixin/thanos/README.md
+
+.PHONY: examples/tmp # always rebuilt from scratch; each JSON file jsonnet emits is converted to <name>.yaml and removed only if the conversion succeeded.
+examples/tmp: $(JSONNET) $(GOJSONTOYAML)
+	rm -rf examples/tmp/
+	mkdir -p examples/tmp/
+	$(JSONNET) -J ${JSONNET_VENDOR_DIR} -m examples/tmp/ ${MIXIN_ROOT}/separated_alerts.jsonnet | xargs -I{} sh -c '$(GOJSONTOYAML) < "$$1" > "$$1.yaml" && rm -f "$$1"' -- {}
+
+.PHONY: examples/dashboards # to keep examples/dashboards/dashboards.md.
+examples/dashboards: $(JSONNET) ${MIXIN_ROOT}/mixin.libsonnet ${MIXIN_ROOT}/defaults.libsonnet ${MIXIN_ROOT}/dashboards/* + -rm -rf examples/dashboards/*.json + $(JSONNET) -J ${JSONNET_VENDOR_DIR} -m examples/dashboards ${MIXIN_ROOT}/dashboards.jsonnet + +examples/alerts/alerts.yaml: $(JSONNET) $(GOJSONTOYAML) ${MIXIN_ROOT}/mixin.libsonnet ${MIXIN_ROOT}/defaults.libsonnet ${MIXIN_ROOT}/alerts/* + $(JSONNET) ${MIXIN_ROOT}/alerts.jsonnet | $(GOJSONTOYAML) > $@ + +examples/alerts/rules.yaml: $(JSONNET) $(GOJSONTOYAML) ${MIXIN_ROOT}/mixin.libsonnet ${MIXIN_ROOT}/defaults.libsonnet ${MIXIN_ROOT}/rules/* + $(JSONNET) ${MIXIN_ROOT}/rules.jsonnet | $(GOJSONTOYAML) > $@ + +.PHONY: jsonnet-vendor +jsonnet-vendor: $(JSONNET_BUNDLER) jsonnetfile.json jsonnetfile.lock.json + rm -rf ${JSONNET_VENDOR_DIR} + $(JSONNET_BUNDLER) install --jsonnetpkg-home="${JSONNET_VENDOR_DIR}" + +JSONNET_FMT := jsonnetfmt -n 2 --max-blank-lines 2 --string-style s --comment-style s + +.PHONY: jsonnet-format +jsonnet-format: + find . 
-name 'vendor' -prune -o -name '*.libsonnet' -print -o -name '*.jsonnet' -print | \ + xargs -n 1 -- $(JSONNET_FMT) -i + +.PHONY: jsonnet-format-in-container +jsonnet-format-in-container: + $(JSONNET_CONTAINER_CMD) make $(MFLAGS) jsonnet-format + +.PHONY: example-rules-lint +example-rules-lint: $(PROMTOOL) examples/alerts/alerts.yaml examples/alerts/rules.yaml + $(PROMTOOL) check rules examples/alerts/alerts.yaml examples/alerts/rules.yaml + $(PROMTOOL) test rules examples/alerts/tests.yaml + +.PHONY: examples-clean +examples-clean: + rm -f examples/alerts/alerts.yaml + rm -f examples/alerts/rules.yaml + rm -f examples/dashboards/*.json + rm -f examples/tmp/*.yaml + # non-phony targets $(EMBEDMD): $(call fetch_go_bin_version,github.com/campoy/embedmd,$(EMBEDMD_VERSION)) @@ -325,3 +412,15 @@ $(PROTOC): @echo ">> installing protoc@${PROTOC_VERSION}" @mv -- "$(TMP_GOPATH)/bin/protoc" "$(GOBIN)/protoc-$(PROTOC_VERSION)" @echo ">> produced $(GOBIN)/protoc-$(PROTOC_VERSION)" + +$(JSONNET): + $(call fetch_go_bin_version,github.com/google/go-jsonnet/cmd/jsonnet,$(JSONNET_VERSION)) + +$(GOJSONTOYAML): + $(call fetch_go_bin_version,github.com/brancz/gojsontoyaml,$(GOJSONTOYAML_VERSION)) + +$(JSONNET_BUNDLER): + $(call fetch_go_bin_version,github.com/jsonnet-bundler/jsonnet-bundler/cmd/jb,$(JSONNET_BUNDLER_VERSION)) + +$(PROMTOOL): + $(call fetch_go_bin_version,github.com/prometheus/prometheus/cmd/promtool,$(PROMTOOL_VERSION)) diff --git a/docs/getting-started.md b/docs/getting-started.md index f263313067..d1a46672e8 100644 --- a/docs/getting-started.md +++ b/docs/getting-started.md @@ -8,10 +8,10 @@ slug: /getting-started.md # Getting started -Thanos provides a global query view, high availability, data backup with historical, cheap data access as its core features in a single binary. +Thanos provides a global query view, high availability, data backup with historical, cheap data access as its core features in a single binary. 
-Those features can be deployed independently of each other. This allows you to have a subset of Thanos features ready -for immediate benefit or testing, while also making it flexible for gradual roll outs in more complex environments. +Those features can be deployed independently of each other. This allows you to have a subset of Thanos features ready +for immediate benefit or testing, while also making it flexible for gradual roll outs in more complex environments. In this quick-start guide, we will explain: @@ -33,7 +33,7 @@ Thanos aims for a simple deployment and maintenance model. The only dependencies You can find the latest Thanos release [here](https://github.com/thanos-io/thanos/releases). -Master should be stable and usable. Every commit to master builds docker image named `master--` in +Master should be stable and usable. Every commit to master builds docker image named `master--` in [quay.io/thanos/thanos](https://quay.io/repository/thanos/thanos) and [thanosio/thanos dockerhub (mirror)](https://hub.docker.com/r/thanosio/thanos) We also perform minor releases every 6 weeks. @@ -44,7 +44,7 @@ See [release process docs](release-process.md) for details. ## Building from source: -Thanos is built purely in [Golang](https://golang.org/), thus allowing to run Thanos on various x64 operating systems. +Thanos is built purely in [Golang](https://golang.org/), thus allowing to run Thanos on various x64 operating systems. If you want to build Thanos from source you would need a working installation of the Go 1.12+ [toolchain](https://github.com/golang/tools) (`GOPATH`, `PATH=${GOPATH}/bin:${PATH}`). @@ -91,8 +91,8 @@ If you want to add yourself to this list, let us know! ## Operating -See up to date [jsonnet mixins](https://github.com/thanos-io/kube-thanos/tree/master/jsonnet/thanos-mixin) -We also have example Grafana dashboards [here](/examples/grafana/monitoring.md) and some [alerts](/examples/alerts/alerts.md) to get you started. 
+See the up-to-date [jsonnet mixins](https://github.com/thanos-io/thanos/tree/master/mixin/thanos) +We also have example Grafana dashboards [here](/examples/dashboards/dashboards.md) and some [alerts](/examples/alerts/alerts.md) to get you started. ## Talks diff --git a/examples/alerts/alerts.md b/examples/alerts/alerts.md index 0a32007ed0..14367defa7 100644 --- a/examples/alerts/alerts.md +++ b/examples/alerts/alerts.md @@ -4,74 +4,71 @@ Here are some example alerts configured for Kubernetes environment. ## Compaction -``` -- alert: ThanosCompactHalted - expr: thanos_compactor_halted{app="thanos-compact"} == 1 +[embedmd]:# (../tmp/thanos-compactor.rules.yaml yaml) +```yaml +name: thanos-compactor.rules +rules: +- alert: ThanosCompactorMultipleCompactsAreRunning + annotations: + message: You should never run more than one Thanos Compact at once. You have {{ + $value }} + expr: sum(up{job=~"thanos-compactor.*"}) > 1 for: 5m labels: - team: TEAM + severity: warning +- alert: ThanosCompactorHalted annotations: - summary: Thanos compaction has failed to run and now is halted - impact: Long term storage queries will be slower - action: Check {{ $labels.kubernetes_pod_name }} pod logs in {{ $labels.kubernetes_namespace}} namespace - dashboard: COMPACTION_URL -- alert: ThanosCompactCompactionsFailed - expr: rate(prometheus_tsdb_compactions_failed_total{app="thanos-compact"}[5m]) > 0 - labels: - team: TEAM - annotations: - summary: Thanos Compact is failing compaction - impact: Long term storage queries will be slower - action: Check {{ $labels.kubernetes_pod_name }} pod logs in {{ $labels.kubernetes_namespace}} namespace - dashboard: COMPACTION_URL -- alert: ThanosCompactBucketOperationsFailed - expr: rate(thanos_objstore_bucket_operation_failures_total{app="thanos-compact"}[5m]) > 0 + message: Thanos Compact {{$labels.job}} has failed to run and now is halted. 
+ expr: thanos_compactor_halted{job=~"thanos-compactor.*"} == 1 + for: 5m labels: - team: TEAM + severity: warning +- alert: ThanosCompactorHighCompactionFailures annotations: - summary: Thanos Compact bucket operations are failing - impact: Long term storage queries will be slower - action: Check {{ $labels.kubernetes_pod_name }} pod logs in {{ $labels.kubernetes_namespace}} namespace - dashboard: COMPACTION_URL -- alert: ThanosCompactNotRunIn24Hours - expr: (time() - max(thanos_objstore_bucket_last_successful_upload_time{app="thanos-compact"}) ) /60/60 > 24 + message: Thanos Compact {{$labels.job}} is failing to execute {{ $value | humanize + }}% of compactions. + expr: | + ( + sum by (job) (rate(thanos_compact_group_compactions_failures_total{job=~"thanos-compactor.*"}[5m])) + / + sum by (job) (rate(thanos_compact_group_compactions_total{job=~"thanos-compactor.*"}[5m])) + * 100 > 5 + ) + for: 15m labels: - team: TEAM + severity: warning +- alert: ThanosCompactorBucketHighOperationFailures annotations: - summary: Thanos Compaction has not been run in 24 hours - impact: Long term storage queries will be slower - action: Check {{ $labels.kubernetes_pod_name }} pod logs in {{ $labels.kubernetes_namespace}} namespace - dashboard: COMPACTION_URL -- alert: ThanosComactionIsNotRunning - expr: up{app="thanos-compact"} == 0 or absent({app="thanos-compact"}) - for: 5m + message: Thanos Compact {{$labels.job}} Bucket is failing to execute {{ $value + | humanize }}% of operations. 
+ expr: | + ( + sum by (job) (rate(thanos_objstore_bucket_operation_failures_total{job=~"thanos-compactor.*"}[5m])) + / + sum by (job) (rate(thanos_objstore_bucket_operations_total{job=~"thanos-compactor.*"}[5m])) + * 100 > 5 + ) + for: 15m labels: - team: TEAM + severity: warning +- alert: ThanosCompactorHasNotRun annotations: - summary: Thanos Compaction is not running - impact: Long term storage queries will be slower - action: Check {{ $labels.kubernetes_pod_name }} pod logs in {{ $labels.kubernetes_namespace}} namespace - dashboard: COMPACTION_URL -- alert: ThanosComactionMultipleCompactionsAreRunning - expr: sum(up{app="thanos-compact"}) > 1 - for: 5m + message: Thanos Compact {{$labels.job}} has not uploaded anything for 24 hours. + expr: (time() - max(thanos_objstore_bucket_last_successful_upload_time{job=~"thanos-compactor.*"})) + / 60 / 60 > 24 labels: - team: TEAM - annotations: - summary: Multiple replicas of Thanos compaction shouldn't be running. - impact: Metrics in long term storage may be corrupted - action: Check {{ $labels.kubernetes_pod_name }} pod logs in {{ $labels.kubernetes_namespace}} namespace - dashboard: COMPACTION_URL - + severity: warning ``` ## Ruler For Thanos ruler we run some alerts in local Prometheus, to make sure that Thanos Rule is working: -``` +[//]: # "TODO(kakkoyun): Generate rule rules using thanos-mixin." 
+ +```yaml - alert: ThanosRuleIsDown - expr: up{app="thanos-rule"} == 0 or absent(up{app="thanos-rule"}) + expr: up{app="thanos-ruler"} == 0 or absent(up{app="thanos-ruler"}) for: 5m labels: team: TEAM @@ -81,7 +78,7 @@ For Thanos ruler we run some alerts in local Prometheus, to make sure that Thano action: 'check {{ $labels.kubernetes_pod_name }} pod in {{ $labels.kubernetes_namespace}} namespace' dashboard: RULE_DASHBOARD - alert: ThanosRuleIsDroppingAlerts - expr: rate(thanos_alert_queue_alerts_dropped_total{app="thanos-rule"}[5m]) > 0 + expr: rate(thanos_alert_queue_alerts_dropped_total{app="thanos-ruler"}[5m]) > 0 for: 5m labels: team: TEAM @@ -91,7 +88,7 @@ For Thanos ruler we run some alerts in local Prometheus, to make sure that Thano action: 'check {{ $labels.kubernetes_pod_name }} pod logs in {{ $labels.kubernetes_namespace}} namespace' dashboard: RULE_DASHBOARD - alert: ThanosRuleGrpcErrorRate - expr: rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable",app="thanos-rule"}[5m]) > 0 + expr: rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable",app="thanos-ruler"}[5m]) > 0 for: 5m labels: team: TEAM @@ -104,32 +101,71 @@ For Thanos ruler we run some alerts in local Prometheus, to make sure that Thano ## Store Gateway -``` +[embedmd]:# (../tmp/thanos-store.rules.yaml yaml) +```yaml +name: thanos-store.rules +rules: - alert: ThanosStoreGrpcErrorRate - expr: rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable",app="thanos-store"}[5m]) > 0 + annotations: + message: Thanos Store {{$labels.job}} is failing to handle {{ $value | humanize + }}% of requests. 
+ expr: | + ( + sum by (job) (rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", job=~"thanos-store.*"}[5m])) + / + sum by (job) (rate(grpc_server_started_total{job=~"thanos-store.*"}[5m])) + * 100 > 5 + ) for: 5m labels: - team: TEAM + severity: warning +- alert: ThanosStoreSeriesGateLatencyHigh annotations: - summary: Thanos Store is returning Internal/Unavailable errors - impact: Long Term Storage Prometheus queries are failing - action: Check {{ $labels.kubernetes_pod_name }} pod logs in {{ $labels.kubernetes_namespace}} namespace - dashboard: GATEWAY_URL -- alert: ThanosStoreBucketOperationsFailed - expr: rate(thanos_objstore_bucket_operation_failures_total{app="thanos-store"}[5m]) > 0 - for: 5m + message: Thanos Store {{$labels.job}} has a 99th percentile latency of {{ $value + }} seconds for store series gate requests. + expr: | + ( + histogram_quantile(0.9, sum by (job, le) (thanos_bucket_store_series_gate_duration_seconds_bucket{job=~"thanos-store.*"})) > 2 + and + sum by (job) (rate(thanos_bucket_store_series_gate_duration_seconds_count{job=~"thanos-store.*"}[5m])) > 0 + ) + for: 10m labels: - team: TEAM + severity: warning +- alert: ThanosStoreBucketHighOperationFailures annotations: - summary: Thanos Store is failing to do bucket operations - impact: Long Term Storage Prometheus queries are failing - action: Check {{ $labels.kubernetes_pod_name }} pod logs in {{ $labels.kubernetes_namespace}} namespace - dashboard: GATEWAY_URL + message: Thanos Store {{$labels.job}} Bucket is failing to execute {{ $value | + humanize }}% of operations. 
+ expr: | + ( + sum by (job) (rate(thanos_objstore_bucket_operation_failures_total{job=~"thanos-store.*"}[5m])) + / + sum by (job) (rate(thanos_objstore_bucket_operations_total{job=~"thanos-store.*"}[5m])) + * 100 > 5 + ) + for: 15m + labels: + severity: warning +- alert: ThanosStoreObjstoreOperationLatencyHigh + annotations: + message: Thanos Store {{$labels.job}} Bucket has a 99th percentile latency of + {{ $value }} seconds for the bucket operations. + expr: | + ( + histogram_quantile(0.9, sum by (job, le) (thanos_objstore_bucket_operation_duration_seconds_bucket{job=~"thanos-store.*"})) > 15 + and + sum by (job) (rate(thanos_objstore_bucket_operation_duration_seconds_count{job=~"thanos-store.*"}[5m])) > 0 + ) + for: 10m + labels: + severity: warning ``` ## Sidecar -``` +[//]: # "TODO(kakkoyun): Generate sidecar rules using thanos-mixin." + +```yaml - alert: ThanosSidecarPrometheusDown expr: thanos_sidecar_prometheus_up{name="prometheus"} == 0 for: 5m @@ -164,15 +200,229 @@ For Thanos ruler we run some alerts in local Prometheus, to make sure that Thano ## Query +[embedmd]:# (../tmp/thanos-querier.rules.yaml yaml) +```yaml +name: thanos-querier.rules +rules: +- alert: ThanosQuerierHttpRequestQueryErrorRateHigh + annotations: + message: Thanos Query {{$labels.job}} is failing to handle {{ $value | humanize + }}% of "query" requests. + expr: | + ( + sum(rate(http_requests_total{code=~"5..", job=~"thanos-querier.*", handler="query"}[5m])) + / + sum(rate(http_requests_total{job=~"thanos-querier.*", handler="query"}[5m])) + ) * 100 > 5 + for: 5m + labels: + severity: critical +- alert: ThanosQuerierHttpRequestQueryRangeErrorRateHigh + annotations: + message: Thanos Query {{$labels.job}} is failing to handle {{ $value | humanize + }}% of "query_range" requests. 
+ expr: | + ( + sum(rate(http_requests_total{code=~"5..", job=~"thanos-querier.*", handler="query_range"}[5m])) + / + sum(rate(http_requests_total{job=~"thanos-querier.*", handler="query_range"}[5m])) + ) * 100 > 5 + for: 5m + labels: + severity: critical +- alert: ThanosQuerierGrpcServerErrorRate + annotations: + message: Thanos Query {{$labels.job}} is failing to handle {{ $value | humanize + }}% of requests. + expr: | + ( + sum by (job) (rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", job=~"thanos-querier.*"}[5m])) + / + sum by (job) (rate(grpc_server_started_total{job=~"thanos-querier.*"}[5m])) + * 100 > 5 + ) + for: 5m + labels: + severity: warning +- alert: ThanosQuerierGrpcClientErrorRate + annotations: + message: Thanos Query {{$labels.job}} is failing to send {{ $value | humanize + }}% of requests. + expr: | + ( + sum by (job) (rate(grpc_client_handled_total{grpc_code!="OK", job=~"thanos-querier.*"}[5m])) + / + sum by (job) (rate(grpc_client_started_total{job=~"thanos-querier.*"}[5m])) + * 100 > 5 + ) + for: 5m + labels: + severity: warning +- alert: ThanosQuerierHighDNSFailures + annotations: + message: Thanos Querys {{$labels.job}} have {{ $value }} of failing DNS queries. + expr: | + ( + sum by (job) (rate(thanos_query_store_apis_dns_failures_total{job=~"thanos-querier.*"}[5m])) + / + sum by (job) (rate(thanos_query_store_apis_dns_lookups_total{job=~"thanos-querier.*"}[5m])) + > 1 + ) + for: 15m + labels: + severity: warning +- alert: ThanosQuerierInstantLatencyHigh + annotations: + message: Thanos Query {{$labels.job}} has a 99th percentile latency of {{ $value + }} seconds for instant queries. 
+ expr: | + ( + histogram_quantile(0.99, sum by (job, le) (http_request_duration_seconds_bucket{job=~"thanos-querier.*", handler="query"})) > 10 + and + sum by (job) (rate(http_request_duration_seconds_bucket{job=~"thanos-querier.*", handler="query"}[5m])) > 0 + ) + for: 10m + labels: + severity: critical +- alert: ThanosQuerierRangeLatencyHigh + annotations: + message: Thanos Query {{$labels.job}} has a 99th percentile latency of {{ $value + }} seconds for instant queries. + expr: | + ( + histogram_quantile(0.99, sum by (job, le) (http_request_duration_seconds_bucket{job=~"thanos-querier.*", handler="query_range"})) > 10 + and + sum by (job) (rate(http_request_duration_seconds_count{job=~"thanos-querier.*", handler="query_range"}[5m])) > 0 + ) + for: 10m + labels: + severity: critical +``` + +## Receive + +[embedmd]:# (../tmp/thanos-receiver.rules.yaml yaml) +```yaml +name: thanos-receiver.rules +rules: +- alert: ThanosReceiverHttpRequestErrorRateHigh + annotations: + message: Thanos Receive {{$labels.job}} is failing to handle {{ $value | humanize + }}% of requests. + expr: | + ( + sum(rate(http_requests_total{code=~"5..", job=~"thanos-receiver.*", handler="receive"}[5m])) + / + sum(rate(http_requests_total{job=~"thanos-receiver.*", handler="receive"}[5m])) + ) * 100 > 5 + for: 5m + labels: + severity: critical +- alert: ThanosReceiverHttpRequestLatencyHigh + annotations: + message: Thanos Receive {{$labels.job}} has a 99th percentile latency of {{ $value + }} seconds for requests. + expr: | + ( + histogram_quantile(0.99, sum by (job, le) (http_request_duration_seconds_bucket{job=~"thanos-receiver.*", handler="receive"})) > 10 + and + sum by (job) (rate(http_request_duration_seconds_count{job=~"thanos-receiver.*", handler="receive"}[5m])) > 0 + ) + for: 10m + labels: + severity: critical +- alert: ThanosReceiverHighForwardRequestFailures + annotations: + message: Thanos Receive {{$labels.job}} is failing to forward {{ $value | humanize + }}% of requests. 
+ expr: | + ( + sum by (job) (rate(thanos_receive_forward_requests_total{result="error", job=~"thanos-receiver.*"}[5m])) + / + sum by (job) (rate(thanos_receive_forward_requests_total{job=~"thanos-receiver.*"}[5m])) + * 100 > 5 + ) + for: 5m + labels: + severity: critical +- alert: ThanosReceiverHighHashringFileRefreshFailures + annotations: + message: Thanos Receive {{$labels.job}} is failing to refresh hashring file, {{ + $value | humanize }} of attempts failed. + expr: | + ( + sum by (job) (rate(thanos_receive_hashrings_file_errors_total{job=~"thanos-receiver.*"}[5m])) + / + sum by (job) (rate(thanos_receive_hashrings_file_refreshes_total{job=~"thanos-receiver.*"}[5m])) + > 0 + ) + for: 15m + labels: + severity: warning +- alert: ThanosReceiverConfigReloadFailure + annotations: + message: Thanos Receive {{$labels.job}} has not been able to reload hashring configurations. + expr: avg(thanos_receive_config_last_reload_successful{job=~"thanos-receiver.*"}) + by (job) != 1 + for: 5m + labels: + severity: warning ``` -- alert: ThanosQueryGrpcErrorRate - expr: rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable",name="prometheus"}[5m]) > 0 + +## Extras + +### Absent Rules + +[embedmd]:# (../tmp/thanos-component-absent.rules.yaml yaml) +```yaml +name: thanos-component-absent.rules +rules: +- alert: ThanosCompactorIsDown + annotations: + message: ThanosCompactor has disappeared from Prometheus target discovery. + expr: | + absent(up{job=~"thanos-compactor.*"} == 1) for: 5m labels: - team: TEAM + severity: critical +- alert: ThanosQuerierIsDown annotations: - summary: Thanos Query is returning Internal/Unavailable errors - impact: Grafana is not showing metrics - action: Check {{ $labels.kubernetes_pod_name }} pod logs in {{ $labels.kubernetes_namespace}} namespace - dashboard: QUERY_URL + message: ThanosQuerier has disappeared from Prometheus target discovery. 
+ expr: | + absent(up{job=~"thanos-querier.*"} == 1) + for: 5m + labels: + severity: critical +- alert: ThanosReceiverIsDown + annotations: + message: ThanosReceiver has disappeared from Prometheus target discovery. + expr: | + absent(up{job=~"thanos-receiver.*"} == 1) + for: 5m + labels: + severity: critical +- alert: ThanosRulerIsDown + annotations: + message: ThanosRuler has disappeared from Prometheus target discovery. + expr: | + absent(up{job=~"thanos-ruler.*"} == 1) + for: 5m + labels: + severity: critical +- alert: ThanosSidecarIsDown + annotations: + message: ThanosSidecar has disappeared from Prometheus target discovery. + expr: | + absent(up{job=~"thanos-sidecar.*"} == 1) + for: 5m + labels: + severity: critical +- alert: ThanosStoreIsDown + annotations: + message: ThanosStore has disappeared from Prometheus target discovery. + expr: | + absent(up{job=~"thanos-store.*"} == 1) + for: 5m + labels: + severity: critical ``` diff --git a/examples/alerts/alerts.yaml b/examples/alerts/alerts.yaml new file mode 100644 index 0000000000..23f10b655f --- /dev/null +++ b/examples/alerts/alerts.yaml @@ -0,0 +1,329 @@ +groups: +- name: thanos-compactor.rules + rules: + - alert: ThanosCompactorMultipleCompactsAreRunning + annotations: + message: You should never run more than one Thanos Compact at once. You have + {{ $value }} + expr: sum(up{job=~"thanos-compactor.*"}) > 1 + for: 5m + labels: + severity: warning + - alert: ThanosCompactorHalted + annotations: + message: Thanos Compact {{$labels.job}} has failed to run and now is halted. + expr: thanos_compactor_halted{job=~"thanos-compactor.*"} == 1 + for: 5m + labels: + severity: warning + - alert: ThanosCompactorHighCompactionFailures + annotations: + message: Thanos Compact {{$labels.job}} is failing to execute {{ $value | humanize + }}% of compactions. 
+ expr: | + ( + sum by (job) (rate(thanos_compact_group_compactions_failures_total{job=~"thanos-compactor.*"}[5m])) + / + sum by (job) (rate(thanos_compact_group_compactions_total{job=~"thanos-compactor.*"}[5m])) + * 100 > 5 + ) + for: 15m + labels: + severity: warning + - alert: ThanosCompactorBucketHighOperationFailures + annotations: + message: Thanos Compact {{$labels.job}} Bucket is failing to execute {{ $value + | humanize }}% of operations. + expr: | + ( + sum by (job) (rate(thanos_objstore_bucket_operation_failures_total{job=~"thanos-compactor.*"}[5m])) + / + sum by (job) (rate(thanos_objstore_bucket_operations_total{job=~"thanos-compactor.*"}[5m])) + * 100 > 5 + ) + for: 15m + labels: + severity: warning + - alert: ThanosCompactorHasNotRun + annotations: + message: Thanos Compact {{$labels.job}} has not uploaded anything for 24 hours. + expr: (time() - max(thanos_objstore_bucket_last_successful_upload_time{job=~"thanos-compactor.*"})) + / 60 / 60 > 24 + labels: + severity: warning +- name: thanos-querier.rules + rules: + - alert: ThanosQuerierHttpRequestQueryErrorRateHigh + annotations: + message: Thanos Query {{$labels.job}} is failing to handle {{ $value | humanize + }}% of "query" requests. + expr: | + ( + sum(rate(http_requests_total{code=~"5..", job=~"thanos-querier.*", handler="query"}[5m])) + / + sum(rate(http_requests_total{job=~"thanos-querier.*", handler="query"}[5m])) + ) * 100 > 5 + for: 5m + labels: + severity: critical + - alert: ThanosQuerierHttpRequestQueryRangeErrorRateHigh + annotations: + message: Thanos Query {{$labels.job}} is failing to handle {{ $value | humanize + }}% of "query_range" requests. 
+ expr: | + ( + sum(rate(http_requests_total{code=~"5..", job=~"thanos-querier.*", handler="query_range"}[5m])) + / + sum(rate(http_requests_total{job=~"thanos-querier.*", handler="query_range"}[5m])) + ) * 100 > 5 + for: 5m + labels: + severity: critical + - alert: ThanosQuerierGrpcServerErrorRate + annotations: + message: Thanos Query {{$labels.job}} is failing to handle {{ $value | humanize + }}% of requests. + expr: | + ( + sum by (job) (rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", job=~"thanos-querier.*"}[5m])) + / + sum by (job) (rate(grpc_server_started_total{job=~"thanos-querier.*"}[5m])) + * 100 > 5 + ) + for: 5m + labels: + severity: warning + - alert: ThanosQuerierGrpcClientErrorRate + annotations: + message: Thanos Query {{$labels.job}} is failing to send {{ $value | humanize + }}% of requests. + expr: | + ( + sum by (job) (rate(grpc_client_handled_total{grpc_code!="OK", job=~"thanos-querier.*"}[5m])) + / + sum by (job) (rate(grpc_client_started_total{job=~"thanos-querier.*"}[5m])) + * 100 > 5 + ) + for: 5m + labels: + severity: warning + - alert: ThanosQuerierHighDNSFailures + annotations: + message: Thanos Querys {{$labels.job}} have {{ $value }} of failing DNS queries. + expr: | + ( + sum by (job) (rate(thanos_query_store_apis_dns_failures_total{job=~"thanos-querier.*"}[5m])) + / + sum by (job) (rate(thanos_query_store_apis_dns_lookups_total{job=~"thanos-querier.*"}[5m])) + > 1 + ) + for: 15m + labels: + severity: warning + - alert: ThanosQuerierInstantLatencyHigh + annotations: + message: Thanos Query {{$labels.job}} has a 99th percentile latency of {{ $value + }} seconds for instant queries. 
+ expr: | + ( + histogram_quantile(0.99, sum by (job, le) (http_request_duration_seconds_bucket{job=~"thanos-querier.*", handler="query"})) > 10 + and + sum by (job) (rate(http_request_duration_seconds_bucket{job=~"thanos-querier.*", handler="query"}[5m])) > 0 + ) + for: 10m + labels: + severity: critical + - alert: ThanosQuerierRangeLatencyHigh + annotations: + message: Thanos Query {{$labels.job}} has a 99th percentile latency of {{ $value + }} seconds for instant queries. + expr: | + ( + histogram_quantile(0.99, sum by (job, le) (http_request_duration_seconds_bucket{job=~"thanos-querier.*", handler="query_range"})) > 10 + and + sum by (job) (rate(http_request_duration_seconds_count{job=~"thanos-querier.*", handler="query_range"}[5m])) > 0 + ) + for: 10m + labels: + severity: critical +- name: thanos-receiver.rules + rules: + - alert: ThanosReceiverHttpRequestErrorRateHigh + annotations: + message: Thanos Receive {{$labels.job}} is failing to handle {{ $value | humanize + }}% of requests. + expr: | + ( + sum(rate(http_requests_total{code=~"5..", job=~"thanos-receiver.*", handler="receive"}[5m])) + / + sum(rate(http_requests_total{job=~"thanos-receiver.*", handler="receive"}[5m])) + ) * 100 > 5 + for: 5m + labels: + severity: critical + - alert: ThanosReceiverHttpRequestLatencyHigh + annotations: + message: Thanos Receive {{$labels.job}} has a 99th percentile latency of {{ + $value }} seconds for requests. + expr: | + ( + histogram_quantile(0.99, sum by (job, le) (http_request_duration_seconds_bucket{job=~"thanos-receiver.*", handler="receive"})) > 10 + and + sum by (job) (rate(http_request_duration_seconds_count{job=~"thanos-receiver.*", handler="receive"}[5m])) > 0 + ) + for: 10m + labels: + severity: critical + - alert: ThanosReceiverHighForwardRequestFailures + annotations: + message: Thanos Receive {{$labels.job}} is failing to forward {{ $value | humanize + }}% of requests. 
+ expr: | + ( + sum by (job) (rate(thanos_receive_forward_requests_total{result="error", job=~"thanos-receiver.*"}[5m])) + / + sum by (job) (rate(thanos_receive_forward_requests_total{job=~"thanos-receiver.*"}[5m])) + * 100 > 5 + ) + for: 5m + labels: + severity: critical + - alert: ThanosReceiverHighHashringFileRefreshFailures + annotations: + message: Thanos Receive {{$labels.job}} is failing to refresh hashring file, + {{ $value | humanize }} of attempts failed. + expr: | + ( + sum by (job) (rate(thanos_receive_hashrings_file_errors_total{job=~"thanos-receiver.*"}[5m])) + / + sum by (job) (rate(thanos_receive_hashrings_file_refreshes_total{job=~"thanos-receiver.*"}[5m])) + > 0 + ) + for: 15m + labels: + severity: warning + - alert: ThanosReceiverConfigReloadFailure + annotations: + message: Thanos Receive {{$labels.job}} has not been able to reload hashring + configurations. + expr: avg(thanos_receive_config_last_reload_successful{job=~"thanos-receiver.*"}) + by (job) != 1 + for: 5m + labels: + severity: warning +- name: thanos-sidecar.rules + rules: + - alert: ThanosSidecarUnhealthy + annotations: + message: Thanos Sidecar {{$labels.job}} {{$labels.pod}} is unhealthy for {{ + $value }} seconds. + expr: | + count(time() - max(thanos_sidecar_last_heartbeat_success_time_seconds{job=~"thanos-sidecar.*"}) by (job, pod) >= 300) > 0 + labels: + severity: critical +- name: thanos-store.rules + rules: + - alert: ThanosStoreGrpcErrorRate + annotations: + message: Thanos Store {{$labels.job}} is failing to handle {{ $value | humanize + }}% of requests. 
+ expr: | + ( + sum by (job) (rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", job=~"thanos-store.*"}[5m])) + / + sum by (job) (rate(grpc_server_started_total{job=~"thanos-store.*"}[5m])) + * 100 > 5 + ) + for: 5m + labels: + severity: warning + - alert: ThanosStoreSeriesGateLatencyHigh + annotations: + message: Thanos Store {{$labels.job}} has a 90th percentile latency of {{ $value + }} seconds for store series gate requests. + expr: | + ( + histogram_quantile(0.9, sum by (job, le) (thanos_bucket_store_series_gate_duration_seconds_bucket{job=~"thanos-store.*"})) > 2 + and + sum by (job) (rate(thanos_bucket_store_series_gate_duration_seconds_count{job=~"thanos-store.*"}[5m])) > 0 + ) + for: 10m + labels: + severity: warning + - alert: ThanosStoreBucketHighOperationFailures + annotations: + message: Thanos Store {{$labels.job}} Bucket is failing to execute {{ $value + | humanize }}% of operations. + expr: | + ( + sum by (job) (rate(thanos_objstore_bucket_operation_failures_total{job=~"thanos-store.*"}[5m])) + / + sum by (job) (rate(thanos_objstore_bucket_operations_total{job=~"thanos-store.*"}[5m])) + * 100 > 5 + ) + for: 15m + labels: + severity: warning + - alert: ThanosStoreObjstoreOperationLatencyHigh + annotations: + message: Thanos Store {{$labels.job}} Bucket has a 90th percentile latency of + {{ $value }} seconds for the bucket operations. + expr: | + ( + histogram_quantile(0.9, sum by (job, le) (thanos_objstore_bucket_operation_duration_seconds_bucket{job=~"thanos-store.*"})) > 15 + and + sum by (job) (rate(thanos_objstore_bucket_operation_duration_seconds_count{job=~"thanos-store.*"}[5m])) > 0 + ) + for: 10m + labels: + severity: warning +- name: thanos-component-absent.rules + rules: + - alert: ThanosCompactorIsDown + annotations: + message: ThanosCompactor has disappeared from Prometheus target discovery.
+ expr: | + absent(up{job=~"thanos-compactor.*"} == 1) + for: 5m + labels: + severity: critical + - alert: ThanosQuerierIsDown + annotations: + message: ThanosQuerier has disappeared from Prometheus target discovery. + expr: | + absent(up{job=~"thanos-querier.*"} == 1) + for: 5m + labels: + severity: critical + - alert: ThanosReceiverIsDown + annotations: + message: ThanosReceiver has disappeared from Prometheus target discovery. + expr: | + absent(up{job=~"thanos-receiver.*"} == 1) + for: 5m + labels: + severity: critical + - alert: ThanosRulerIsDown + annotations: + message: ThanosRuler has disappeared from Prometheus target discovery. + expr: | + absent(up{job=~"thanos-ruler.*"} == 1) + for: 5m + labels: + severity: critical + - alert: ThanosSidecarIsDown + annotations: + message: ThanosSidecar has disappeared from Prometheus target discovery. + expr: | + absent(up{job=~"thanos-sidecar.*"} == 1) + for: 5m + labels: + severity: critical + - alert: ThanosStoreIsDown + annotations: + message: ThanosStore has disappeared from Prometheus target discovery. 
+ expr: | + absent(up{job=~"thanos-store.*"} == 1) + for: 5m + labels: + severity: critical diff --git a/examples/alerts/rules.yaml b/examples/alerts/rules.yaml new file mode 100644 index 0000000000..82190e63c6 --- /dev/null +++ b/examples/alerts/rules.yaml @@ -0,0 +1,123 @@ +groups: +- name: thanos-querier.rules + rules: + - expr: | + ( + sum(rate(grpc_client_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", job=~"thanos-querier.*", grpc_type="unary"}[5m])) + / + sum(rate(grpc_client_started_total{job=~"thanos-querier.*", grpc_type="unary"}[5m])) + ) + labels: {} + record: :grpc_client_failures_per_unary:sum_rate + - expr: | + ( + sum(rate(grpc_client_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", job=~"thanos-querier.*", grpc_type="server_stream"}[5m])) + / + sum(rate(grpc_client_started_total{job=~"thanos-querier.*", grpc_type="server_stream"}[5m])) + ) + labels: {} + record: :grpc_client_failures_per_stream:sum_rate + - expr: | + ( + sum(rate(thanos_querier_store_apis_dns_failures_total{job=~"thanos-querier.*"}[5m])) + / + sum(rate(thanos_querier_store_apis_dns_lookups_total{job=~"thanos-querier.*"}[5m])) + ) + labels: {} + record: :thanos_querier_store_apis_dns_failures_per_lookup:sum_rate + - expr: | + histogram_quantile(0.99, + sum(rate(http_request_duration_seconds_bucket{job=~"thanos-querier.*", handler="query"}[5m])) by (le) + ) + labels: + quantile: "0.99" + record: :query_duration_seconds:histogram_quantile + - expr: | + histogram_quantile(0.99, + sum(rate(http_request_duration_seconds_bucket{job=~"thanos-querier.*", handler="query_range"}[5m])) by (le) + ) + labels: + quantile: "0.99" + record: :api_range_query_duration_seconds:histogram_quantile +- name: thanos-receiver.rules + rules: + - expr: | + sum( + rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", job=~"thanos-receiver.*", 
grpc_type="unary"}[5m]) + / + rate(grpc_server_started_total{job=~"thanos-receiver.*", grpc_type="unary"}[5m]) + ) + labels: {} + record: :grpc_server_failures_per_unary:sum_rate + - expr: | + sum( + rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", job=~"thanos-receiver.*", grpc_type="server_stream"}[5m]) + / + rate(grpc_server_started_total{job=~"thanos-receiver.*", grpc_type="server_stream"}[5m]) + ) + labels: {} + record: :grpc_server_failures_per_stream:sum_rate + - expr: | + sum( + rate(http_requests_total{handler="receive", job=~"thanos-receiver.*", code!~"5.."}[5m]) + / + rate(http_requests_total{handler="receive", job=~"thanos-receiver.*"}[5m]) + ) + labels: {} + record: :http_failure_per_request:sum_rate + - expr: | + histogram_quantile(0.99, + sum(rate(http_request_duration_seconds_bucket{handler="receive", job=~"thanos-receiver.*"}[5m])) by (le) + ) + labels: + quantile: "0.99" + record: :http_request_duration_seconds:histogram_quantile + - expr: | + ( + sum(rate(thanos_receive_forward_requests_total{result="error", job=~"thanos-receiver.*"}[5m])) + / + sum(rate(thanos_receive_forward_requests_total{job=~"thanos-receiver.*"}[5m])) + ) + labels: {} + record: :thanos_receive_forward_failure_per_requests:sum_rate + - expr: | + ( + sum(rate(thanos_receive_hashrings_file_errors_total{job=~"thanos-receiver.*"}[5m])) + / + sum(rate(thanos_receive_hashrings_file_refreshes_total{job=~"thanos-receiver.*"}[5m])) + ) + labels: {} + record: :thanos_receive_hashring_file_failure_per_refresh:sum_rate +- name: thanos-store.rules + rules: + - expr: | + ( + sum(rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", job=~"thanos-store.*", grpc_type="unary"}[5m])) + / + sum(rate(grpc_server_started_total{job=~"thanos-store.*", grpc_type="unary"}[5m])) + ) + labels: {} + record: :grpc_server_failures_per_unary:sum_rate + - expr: | + ( + 
sum(rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", job=~"thanos-store.*", grpc_type="server_stream"}[5m])) + / + sum(rate(grpc_server_started_total{job=~"thanos-store.*", grpc_type="server_stream"}[5m])) + ) + labels: {} + record: :grpc_server_failures_per_stream:sum_rate + - expr: | + ( + sum(rate(thanos_objstore_bucket_operation_failures_total{job=~"thanos-store.*"}[5m])) + / + sum(rate(thanos_objstore_bucket_operations_total{job=~"thanos-store.*"}[5m])) + ) + labels: {} + record: :thanos_objstore_bucket_failures_per_operation:sum_rate + - expr: | + histogram_quantile(0.99, + sum(rate(thanos_objstore_bucket_operation_duration_seconds_bucket{job=~"thanos-store.*"}[5m])) by (le) + ) + labels: + quantile: "0.99" + record: :thanos_objstore_bucket_operation_duration_seconds:histogram_quantile diff --git a/examples/alerts/tests.yaml b/examples/alerts/tests.yaml new file mode 100644 index 0000000000..25df0414e4 --- /dev/null +++ b/examples/alerts/tests.yaml @@ -0,0 +1,94 @@ +rule_files: + - alerts.yaml + - rules.yaml + +evaluation_interval: 1m + +tests: +- interval: 1m + input_series: + - series: 'thanos_sidecar_last_heartbeat_success_time_seconds{namespace="production", job="thanos-sidecar", pod="thanos-sidecar-pod-0"}' + values: '5 10 43 17 11 0 0 0' + - series: 'thanos_sidecar_last_heartbeat_success_time_seconds{namespace="production", job="thanos-sidecar", pod="thanos-sidecar-pod-1"}' + values: '4 9 42 15 10 0 0 0' + promql_expr_test: + - expr: time() + eval_time: 1m + exp_samples: + - labels: '{}' + value: 60 + - expr: time() + eval_time: 2m + exp_samples: + - labels: '{}' + value: 120 + - expr: max(thanos_sidecar_last_heartbeat_success_time_seconds{job="thanos-sidecar"}) by (pod) + eval_time: 2m + exp_samples: + - labels: '{pod="thanos-sidecar-pod-0"}' + value: 43 + - labels: '{pod="thanos-sidecar-pod-1"}' + value: 42 + - expr: 
max(thanos_sidecar_last_heartbeat_success_time_seconds{job="thanos-sidecar"}) by (pod) + eval_time: 5m + exp_samples: + - labels: '{pod="thanos-sidecar-pod-0"}' + value: 0 + - labels: '{pod="thanos-sidecar-pod-1"}' + value: 0 + - expr: max(thanos_sidecar_last_heartbeat_success_time_seconds{job="thanos-sidecar"}) by (pod) + eval_time: 6m + exp_samples: + - labels: '{pod="thanos-sidecar-pod-0"}' + value: 0 + - labels: '{pod="thanos-sidecar-pod-1"}' + value: 0 + - expr: time() - max(thanos_sidecar_last_heartbeat_success_time_seconds{job="thanos-sidecar"}) by (pod) + eval_time: 5m + exp_samples: + - labels: '{pod="thanos-sidecar-pod-0"}' + value: 300 + - labels: '{pod="thanos-sidecar-pod-1"}' + value: 300 + - expr: time() - max(thanos_sidecar_last_heartbeat_success_time_seconds{job="thanos-sidecar"}) by (pod) + eval_time: 6m + exp_samples: + - labels: '{pod="thanos-sidecar-pod-0"}' + value: 360 + - labels: '{pod="thanos-sidecar-pod-1"}' + value: 360 + - expr: time() - max(thanos_sidecar_last_heartbeat_success_time_seconds{job="thanos-sidecar"}) by (pod) >= 300 + eval_time: 12m + exp_samples: + - labels: '{pod="thanos-sidecar-pod-0"}' + value: 720 + - labels: '{pod="thanos-sidecar-pod-1"}' + value: 720 + alert_rule_test: + - eval_time: 1m + alertname: ThanosSidecarUnhealthy + - eval_time: 2m + alertname: ThanosSidecarUnhealthy + - eval_time: 3m + alertname: ThanosSidecarUnhealthy + - eval_time: 5m + alertname: ThanosSidecarUnhealthy + exp_alerts: + - exp_labels: + severity: critical + exp_annotations: + message: 'Thanos Sidecar is unhealthy for 2 seconds.' + - eval_time: 6m + alertname: ThanosSidecarUnhealthy + exp_alerts: + - exp_labels: + severity: critical + exp_annotations: + message: 'Thanos Sidecar is unhealthy for 2 seconds.' + - eval_time: 12m + alertname: ThanosSidecarUnhealthy + exp_alerts: + - exp_labels: + severity: critical + exp_annotations: + message: 'Thanos Sidecar is unhealthy for 2 seconds.' 
diff --git a/examples/dashboards/compactor.json b/examples/dashboards/compactor.json new file mode 100644 index 0000000000..9e2d3d65cc --- /dev/null +++ b/examples/dashboards/compactor.json @@ -0,0 +1,1549 @@ +{ + "annotations": { + "list": [ ] + }, + "editable": true, + "gnetId": null, + "graphTooltip": 0, + "hideControls": false, + "links": [ ], + "refresh": "10s", + "rows": [ + { + "collapse": false, + "height": "250px", + "panels": [ + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Shows rate of execution for compactions against blocks that are stored in the bucket by compaction group.", + "fill": 10, + "id": 1, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 6, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(thanos_compact_group_compactions_total{namespace=\"$namespace\",job=~\"$job\"}[$interval])) by (job, group)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "compaction {{job}} {{group}}", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Rate", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { + "error": "#E24D42" + }, + "bars": false, + 
"dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Shows ratio of errors compared to the total number of executed compactions against blocks that are stored in the bucket.", + "fill": 10, + "id": 2, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 6, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(thanos_compact_group_compactions_failures_total{namespace=\"$namespace\",job=~\"$job\"}[$interval])) / sum(rate(thanos_compact_group_compactions_total{namespace=\"$namespace\",job=~\"$job\"}[$interval]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "error", + "refId": "A", + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Errors", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "percentunit", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "Group Compaction", + "titleSize": "h6" + }, + { + "collapse": false, + "height": "250px", + "panels": [ + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Shows rate of execution for downsampling against blocks that are stored in the bucket by compaction group.", + "fill": 10, + 
"id": 3, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 6, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(thanos_compact_downsample_total{namespace=\"$namespace\",job=~\"$job\"}[$interval])) by (job, group)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "downsample {{job}} {{group}}", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Rate", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { + "error": "#E24D42" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Shows ratio of errors compared to the total number of executed downsampling against blocks that are stored in the bucket.", + "fill": 10, + "id": 4, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 6, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": 
"sum(rate(thanos_compact_downsample_failed_total{namespace=\"$namespace\",job=~\"$job\"}[$interval])) / sum(rate(thanos_compact_downsample_total{namespace=\"$namespace\",job=~\"$job\"}[$interval]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "error", + "refId": "A", + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Errors", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "percentunit", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "Downsample", + "titleSize": "h6" + }, + { + "collapse": false, + "height": "250px", + "panels": [ + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Shows rate of execution for removals of blocks if their data is available as part of a block with a higher compaction level.", + "fill": 10, + "id": 5, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 4, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(thanos_compact_garbage_collection_total{namespace=\"$namespace\",job=~\"$job\"}[$interval])) by (job)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "garbage collection {{job}}", + "legendLink": 
null, + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Rate", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { + "error": "#E24D42" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Shows ratio of errors compared to the total number of executed garbage collections.", + "fill": 10, + "id": 6, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 4, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(thanos_compact_garbage_collection_failures_total{namespace=\"$namespace\",job=~\"$job\"}[$interval])) / sum(rate(thanos_compact_garbage_collection_total{namespace=\"$namespace\",job=~\"$job\"}[$interval]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "error", + "refId": "A", + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Errors", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "percentunit", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + 
"format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Shows how long has it taken to execute garbage collection in quantiles.", + "fill": 1, + "id": 7, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 4, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "histogram_quantile(0.99, sum(rate(thanos_compact_garbage_collection_duration_seconds_bucket{namespace=\"$namespace\",job=~\"$job\"}[$interval])) by (job, le)) * 1", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "P99 {{job}}", + "refId": "A", + "step": 10 + }, + { + "expr": "sum(rate(thanos_compact_garbage_collection_duration_seconds_sum{namespace=\"$namespace\",job=~\"$job\"}[$interval])) by (job) * 1 / sum(rate(thanos_compact_garbage_collection_duration_seconds_count{namespace=\"$namespace\",job=~\"$job\"}[$interval])) by (job)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "mean {{job}}", + "refId": "B", + "step": 10 + }, + { + "expr": "histogram_quantile(0.50, sum(rate(thanos_compact_garbage_collection_duration_seconds_bucket{namespace=\"$namespace\",job=~\"$job\"}[$interval])) by (job, le)) * 1", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "P50 {{job}}", + "refId": "C", + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Duration", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + 
"name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "Garbage Collection", + "titleSize": "h6" + }, + { + "collapse": false, + "height": "250px", + "panels": [ + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Shows rate of execution for all meta files from blocks in the bucket into the memory.", + "fill": 10, + "id": 8, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 4, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(thanos_compact_sync_meta_total{namespace=\"$namespace\",job=~\"$job\"}[$interval])) by (job)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "sync {{job}}", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Rate", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { + "error": "#E24D42" + }, + "bars": false, + 
"dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Shows ratio of errors compared to the total number of executed meta file sync.", + "fill": 10, + "id": 9, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 4, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(thanos_compact_sync_meta_failures_total{namespace=\"$namespace\",job=~\"$job\"}[$interval])) / sum(rate(thanos_compact_sync_meta_total{namespace=\"$namespace\",job=~\"$job\"}[$interval]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "error", + "refId": "A", + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Errors", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "percentunit", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Shows how long has it taken to execute meta file sync, in quantiles.", + "fill": 1, + "id": 10, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + 
"renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 4, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "histogram_quantile(0.99, sum(rate(thanos_compact_sync_meta_duration_seconds_bucket{namespace=\"$namespace\",job=~\"$job\"}[$interval])) by (job, le)) * 1", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "P99 {{job}}", + "refId": "A", + "step": 10 + }, + { + "expr": "sum(rate(thanos_compact_sync_meta_duration_seconds_sum{namespace=\"$namespace\",job=~\"$job\"}[$interval])) by (job) * 1 / sum(rate(thanos_compact_sync_meta_duration_seconds_count{namespace=\"$namespace\",job=~\"$job\"}[$interval])) by (job)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "mean {{job}}", + "refId": "B", + "step": 10 + }, + { + "expr": "histogram_quantile(0.50, sum(rate(thanos_compact_sync_meta_duration_seconds_bucket{namespace=\"$namespace\",job=~\"$job\"}[$interval])) by (job, le)) * 1", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "P50 {{job}}", + "refId": "C", + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Duration", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "Sync Meta", + "titleSize": "h6" + }, + { + "collapse": false, + "height": "250px", + "panels": [ + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Shows rate of execution for operations against the 
bucket.", + "fill": 10, + "id": 11, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 4, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(thanos_objstore_bucket_operations_total{namespace=\"$namespace\",job=~\"$job\"}[$interval])) by (job, operation)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{job}} {{operation}}", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Rate", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { + "error": "#E24D42" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Shows ratio of errors compared to the total number of executed operations against the bucket.", + "fill": 10, + "id": 12, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 4, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": 
"sum(rate(thanos_objstore_bucket_operation_failures_total{namespace=\"$namespace\",job=~\"$job\"}[$interval])) / sum(rate(thanos_objstore_bucket_operations_total{namespace=\"$namespace\",job=~\"$job\"}[$interval]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "error", + "refId": "A", + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Errors", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "percentunit", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Shows how long has it taken to execute operations against the bucket, in quantiles.", + "fill": 1, + "id": 13, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 4, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "histogram_quantile(0.99, sum(rate(thanos_objstore_bucket_operation_duration_seconds_bucket{namespace=\"$namespace\",job=~\"$job\"}[$interval])) by (job, le)) * 1", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "P99 {{job}}", + "refId": "A", + "step": 10 + }, + { + "expr": "sum(rate(thanos_objstore_bucket_operation_duration_seconds_sum{namespace=\"$namespace\",job=~\"$job\"}[$interval])) by (job) * 1 / 
sum(rate(thanos_objstore_bucket_operation_duration_seconds_count{namespace=\"$namespace\",job=~\"$job\"}[$interval])) by (job)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "mean {{job}}", + "refId": "B", + "step": 10 + }, + { + "expr": "histogram_quantile(0.50, sum(rate(thanos_objstore_bucket_operation_duration_seconds_bucket{namespace=\"$namespace\",job=~\"$job\"}[$interval])) by (job, le)) * 1", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "P50 {{job}}", + "refId": "C", + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Duration", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "Object Store Operations", + "titleSize": "h6" + }, + { + "collapse": true, + "height": "250px", + "panels": [ + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "id": 14, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 4, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "go_memstats_alloc_bytes{namespace=\"$namespace\",job=~\"$job\",kubernetes_pod_name=~\"$pod\"}", + "format": "time_series", + 
"intervalFactor": 2, + "legendFormat": "alloc all {{pod}}", + "legendLink": null, + "step": 10 + }, + { + "expr": "go_memstats_heap_alloc_bytes{namespace=\"$namespace\",job=~\"$job\",kubernetes_pod_name=~\"$pod\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "alloc heap {{pod}}", + "legendLink": null, + "step": 10 + }, + { + "expr": "rate(go_memstats_alloc_bytes_total{namespace=\"$namespace\",job=~\"$job\",kubernetes_pod_name=~\"$pod\"}[30s])", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "alloc rate all {{pod}}", + "legendLink": null, + "step": 10 + }, + { + "expr": "rate(go_memstats_heap_alloc_bytes{namespace=\"$namespace\",job=~\"$job\",kubernetes_pod_name=~\"$pod\"}[30s])", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "alloc rate heap {{pod}}", + "legendLink": null, + "step": 10 + }, + { + "expr": "go_memstats_stack_inuse_bytes{namespace=\"$namespace\",job=~\"$job\",kubernetes_pod_name=~\"$pod\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "inuse stack {{pod}}", + "legendLink": null, + "step": 10 + }, + { + "expr": "go_memstats_heap_inuse_bytes{namespace=\"$namespace\",job=~\"$job\",kubernetes_pod_name=~\"$pod\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "inuse heap {{pod}}", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Memory Used", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": 
"$datasource", + "fill": 1, + "id": 15, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 4, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "go_goroutines{namespace=\"$namespace\",job=~\"$job\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{pod}}", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Goroutines", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "id": 16, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 4, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "go_gc_duration_seconds{namespace=\"$namespace\",job=~\"$job\",kubernetes_pod_name=~\"$pod\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{quantile}} {{pod}}", + "legendLink": null, + "step": 10 + } + ], + 
"thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "GC Time Quantiles", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "Resources", + "titleSize": "h6" + } + ], + "schemaVersion": 14, + "style": "dark", + "tags": [ + "thanos-mixin" + ], + "templating": { + "list": [ + { + "current": { + "text": "Prometheus", + "value": "Prometheus" + }, + "hide": 0, + "label": null, + "name": "datasource", + "options": [ ], + "query": "prometheus", + "refresh": 1, + "regex": "", + "type": "datasource" + }, + { + "allValue": null, + "current": { }, + "datasource": "$datasource", + "hide": 0, + "includeAll": false, + "label": "namespace", + "multi": false, + "name": "namespace", + "options": [ ], + "query": "label_values(kube_pod_info{}, namespace)", + "refresh": 1, + "regex": "", + "sort": 2, + "tagValuesQuery": "", + "tags": [ ], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": "thanos-compactor.*", + "current": { + "text": "all", + "value": "$__all" + }, + "datasource": "$datasource", + "hide": 0, + "includeAll": true, + "label": "job", + "multi": false, + "name": "job", + "options": [ ], + "query": "label_values(up{namespace=\"$namespace\",job=~\"thanos-compactor.*\"}, job)", + "refresh": 1, + "regex": "", + "sort": 2, + "tagValuesQuery": "", + "tags": [ ], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": ".*", + "current": { + "text": "all", + "value": "$__all" + }, + "datasource": "$datasource", + "hide": 0, + 
"includeAll": true, + "label": "pod", + "multi": false, + "name": "pod", + "options": [ ], + "query": "label_values(kube_pod_info{namespace=\"$namespace\",created_by_name=~\"thanos-compactor.*\"}, pod)", + "refresh": 1, + "regex": "", + "sort": 2, + "tagValuesQuery": "", + "tags": [ ], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "auto": true, + "auto_count": 300, + "auto_min": "10s", + "current": { + "text": "5m", + "value": "5m" + }, + "hide": 0, + "label": "interval", + "name": "interval", + "query": "5m,10m,30m,1h,6h,12h", + "refresh": 2, + "type": "interval" + } + ] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "", + "title": "Thanos / Compactor", + "uid": "8378cc7f803b0bfae5809a101991ed76", + "version": 0 +} diff --git a/examples/dashboards/dashboards.md b/examples/dashboards/dashboards.md new file mode 100644 index 0000000000..36f13fe703 --- /dev/null +++ b/examples/dashboards/dashboards.md @@ -0,0 +1,18 @@ +# Dashboards + +There are Grafana dashboards for each component (not all of them complete) targeted for environments running Kubernetes: + +- [Thanos Overview](overview.json) +- [Thanos Compactor](thanos-compactor.json) +- [Thanos Querier](thanos-querier.json) +- [Thanos Store](thanos-store.json) +- [Thanos Receiver](thanos-receiver.json) +- [Thanos Sidecar](thanos-sidecar.json) +- [Thanos Ruler](thanos-ruler.json) + +You can import them via `Import -> Paste JSON` in Grafana. +These dashboards require Grafana 5 or above; importing them into older versions is known not to work. + +## Configuration + +All dashboards are generated using [`thanos-mixin`](../../mixin/thanos); check out the [README](../../mixin/thanos/README.md) for further information.
diff --git a/examples/dashboards/overview.json b/examples/dashboards/overview.json new file mode 100644 index 0000000000..f328a3e370 --- /dev/null +++ b/examples/dashboards/overview.json @@ -0,0 +1,2066 @@ +{ + "annotations": { + "list": [ ] + }, + "editable": true, + "gnetId": null, + "graphTooltip": 0, + "hideControls": false, + "links": [ ], + "refresh": "10s", + "rows": [ + { + "collapse": false, + "height": "250px", + "panels": [ + { + "aliasColors": { + "1xx": "#EAB839", + "2xx": "#7EB26D", + "3xx": "#6ED0E0", + "4xx": "#EF843C", + "5xx": "#E24D42", + "error": "#E24D42", + "success": "#7EB26D" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Shows rate of requests against /query for the given time.", + "fill": 10, + "id": 1, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ + { + "dashboard": "Thanos / Querier", + "includeVars": true, + "keepTime": true, + "title": "Thanos / Querier", + "type": "dashboard" + } + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 4, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum(label_replace(rate(http_requests_total{namespace=\"$namespace\",job=~\"thanos-querier.*\",handler=\"query\"}[$interval]),\"status_code\", \"${1}xx\", \"code\", \"([0-9])..\")) by (job, status_code)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{job}} {{status_code}}", + "refId": "A", + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Requests Rate", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + 
"values": [ ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { + "error": "#E24D42" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Shows ratio of errors compared to the total number of handled requests against /query.", + "fill": 10, + "id": 2, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ + { + "dashboard": "Thanos / Querier", + "includeVars": true, + "keepTime": true, + "title": "Thanos / Querier", + "type": "dashboard" + } + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 4, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(http_requests_total{namespace=\"$namespace\",job=~\"thanos-querier.*\",handler=\"query\",code=~\"5..\"}[$interval])) / sum(rate(http_requests_total{namespace=\"$namespace\",job=~\"thanos-querier.*\",handler=\"query\"}[$interval]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "error", + "refId": "A", + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Requests Errors", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "percentunit", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + },
+ { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Shows how long has it taken to handle requests.", + "fill": 1, + "id": 3, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + { + "dashboard": "Thanos / Querier", + "includeVars": true, + "keepTime": true, + "title": "Thanos / Querier", + "type": "dashboard" + } + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 4, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "histogram_quantile(0.99, sum(rate(http_request_duration_seconds_bucket{namespace=\"$namespace\",job=~\"thanos-querier.*\",handler=\"query\"}[$interval])) by (job, le))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{job}} P99", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ + { + "colorMode": "warning", + "fill": true, + "line": true, + "op": "gt", + "value": 0.5, + "yaxis": "left" + }, + { + "colorMode": "critical", + "fill": true, + "line": true, + "op": "gt", + "value": 1, + "yaxis": "left" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Latency 99th Percentile", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "Instant Query", + "titleSize": "h6" + }, + { + 
"collapse": false, + "height": "250px", + "panels": [ + { + "aliasColors": { + "1xx": "#EAB839", + "2xx": "#7EB26D", + "3xx": "#6ED0E0", + "4xx": "#EF843C", + "5xx": "#E24D42", + "error": "#E24D42", + "success": "#7EB26D" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Shows rate of requests against /query_range for the given time range.", + "fill": 10, + "id": 4, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ + { + "dashboard": "Thanos / Querier", + "includeVars": true, + "keepTime": true, + "title": "Thanos / Querier", + "type": "dashboard" + } + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 4, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum(label_replace(rate(http_requests_total{namespace=\"$namespace\",job=~\"thanos-querier.*\",handler=\"query_range\"}[$interval]),\"status_code\", \"${1}xx\", \"code\", \"([0-9])..\")) by (job, status_code)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{job}} {{status_code}}", + "refId": "A", + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Requests Rate", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { + "error": "#E24D42" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + 
"datasource": "$datasource", + "description": "Shows ratio of errors compared to the total number of handled requests against /query_range.", + "fill": 10, + "id": 5, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ + { + "dashboard": "Thanos / Querier", + "includeVars": true, + "keepTime": true, + "title": "Thanos / Querier", + "type": "dashboard" + } + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 4, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(http_requests_total{namespace=\"$namespace\",job=~\"thanos-querier.*\",handler=\"query_range\",code=~\"5..\"}[$interval])) / sum(rate(http_requests_total{namespace=\"$namespace\",job=~\"thanos-querier.*\",handler=\"query_range\"}[$interval]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "error", + "refId": "A", + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Requests Errors", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "percentunit", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Shows how long has it taken to handle requests.", + "fill": 1, + "id": 6, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false +
}, + "lines": true, + "linewidth": 1, + "links": [ + { + "dashboard": "Thanos / Querier", + "includeVars": true, + "keepTime": true, + "title": "Thanos / Querier", + "type": "dashboard" + } + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 4, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "histogram_quantile(0.99, sum(rate(http_request_duration_seconds_bucket{namespace=\"$namespace\",job=~\"thanos-querier.*\",handler=\"query_range\"}[$interval])) by (job, le))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{job}} P99", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ + { + "colorMode": "warning", + "fill": true, + "line": true, + "op": "gt", + "value": 0.5, + "yaxis": "left" + }, + { + "colorMode": "critical", + "fill": true, + "line": true, + "op": "gt", + "value": 1, + "yaxis": "left" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Latency 99th Percentile", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "Range Query", + "titleSize": "h6" + }, + { + "collapse": false, + "height": "250px", + "panels": [ + { + "aliasColors": { + "Aborted": "#EAB839", + "AlreadyExists": "#7EB26D", + "Canceled": "#E24D42", + "DataLoss": "#E24D42", + "DeadlineExceeded": "#E24D42", + "FailedPrecondition": "#6ED0E0", + "Internal": "#E24D42", + "InvalidArgument": "#EF843C", + "NotFound": "#EF843C", + 
"OK": "#7EB26D", + "OutOfRange": "#E24D42", + "PermissionDenied": "#EF843C", + "ResourceExhausted": "#E24D42", + "Unauthenticated": "#EF843C", + "Unavailable": "#E24D42", + "Unimplemented": "#6ED0E0", + "Unknown": "#E24D42", + "error": "#E24D42" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Shows rate of handled Unary gRPC requests from queriers.", + "fill": 10, + "id": 7, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ + { + "dashboard": "Thanos / Store", + "includeVars": true, + "keepTime": true, + "title": "Thanos / Store", + "type": "dashboard" + } + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 4, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(grpc_server_handled_total{namespace=\"$namespace\",job=~\"thanos-store.*\",grpc_type=\"unary\"}[$interval])) by (job, grpc_code)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{job}} {{grpc_code}}", + "refId": "A", + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "gRPC (Unary) Rate", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { + "error": "#E24D42" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Shows ratio of errors
compared to the total number of handled requests from queriers.", + "fill": 10, + "id": 8, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ + { + "dashboard": "Thanos / Store", + "includeVars": true, + "keepTime": true, + "title": "Thanos / Store", + "type": "dashboard" + } + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 4, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(grpc_server_handled_total{grpc_code=~\"Unknown|ResourceExhausted|Internal|Unavailable\",namespace=\"$namespace\",job=~\"thanos-store.*\",grpc_type=\"unary\"}[$interval])) / sum(rate(grpc_server_started_total{namespace=\"$namespace\",job=~\"thanos-store.*\",grpc_type=\"unary\"}[$interval]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "error", + "refId": "A", + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "gRPC (Unary) Errors", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "percentunit", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Shows how long has it taken to handle requests from queriers.", + "fill": 1, + "id": 9, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true,
+ "linewidth": 1, + "links": [ + { + "dashboard": "Thanos / Store", + "includeVars": true, + "keepTime": true, + "title": "Thanos / Store", + "type": "dashboard" + } + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 4, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "histogram_quantile(0.99, sum(rate(grpc_server_handling_seconds_bucket{grpc_type=\"unary\",namespace=\"$namespace\",job=~\"thanos-store.*\"}[$interval])) by (job, le))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{job}} P99", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ + { + "colorMode": "warning", + "fill": true, + "line": true, + "op": "gt", + "value": 0.5, + "yaxis": "left" + }, + { + "colorMode": "critical", + "fill": true, + "line": true, + "op": "gt", + "value": 1, + "yaxis": "left" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "gRPC Latency 99th Percentile", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "Store", + "titleSize": "h6" + }, + { + "collapse": false, + "height": "250px", + "panels": [ + { + "aliasColors": { + "Aborted": "#EAB839", + "AlreadyExists": "#7EB26D", + "Canceled": "#E24D42", + "DataLoss": "#E24D42", + "DeadlineExceeded": "#E24D42", + "FailedPrecondition": "#6ED0E0", + "Internal": "#E24D42", + "InvalidArgument": "#EF843C", + "NotFound": "#EF843C", + "OK": "#7EB26D", + "OutOfRange": 
"#E24D42", + "PermissionDenied": "#EF843C", + "ResourceExhausted": "#E24D42", + "Unauthenticated": "#EF843C", + "Unavailable": "#E24D42", + "Unimplemented": "#6ED0E0", + "Unknown": "#E24D42", + "error": "#E24D42" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Shows rate of handled Unary gRPC requests from queriers.", + "fill": 10, + "id": 10, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ + { + "dashboard": "Thanos / Sidecar", + "includeVars": true, + "keepTime": true, + "title": "Thanos / Sidecar", + "type": "dashboard" + } + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 4, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(grpc_server_handled_total{namespace=\"$namespace\",job=~\"thanos-sidecar.*\",grpc_type=\"unary\"}[$interval])) by (job, grpc_code)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{job}} {{grpc_code}}", + "refId": "A", + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "gRPC (Unary) Rate", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { + "error": "#E24D42" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Shows ratio of errors compared to the total number
of handled requests from queriers.", + "fill": 10, + "id": 11, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ + { + "dashboard": "Thanos / Sidecar", + "includeVars": true, + "keepTime": true, + "title": "Thanos / Sidecar", + "type": "dashboard" + } + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 4, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(grpc_server_handled_total{grpc_code=~\"Unknown|ResourceExhausted|Internal|Unavailable\",namespace=\"$namespace\",job=~\"thanos-sidecar.*\",grpc_type=\"unary\"}[$interval])) / sum(rate(grpc_server_started_total{namespace=\"$namespace\",job=~\"thanos-sidecar.*\",grpc_type=\"unary\"}[$interval]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "error", + "refId": "A", + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "gRPC (Unary) Errors", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "percentunit", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Shows how long has it taken to handle requests from queriers, in quantiles.", + "fill": 1, + "id": 12, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, +
"linewidth": 1, + "links": [ + { + "dashboard": "Thanos / Sidecar", + "includeVars": true, + "keepTime": true, + "title": "Thanos / Sidecar", + "type": "dashboard" + } + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 4, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "histogram_quantile(0.99, sum(rate(grpc_server_handling_seconds_bucket{grpc_type=\"unary\",namespace=\"$namespace\",job=~\"thanos-sidecar.*\"}[$interval])) by (job, le))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{job}} P99", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ + { + "colorMode": "warning", + "fill": true, + "line": true, + "op": "gt", + "value": 0.5, + "yaxis": "left" + }, + { + "colorMode": "critical", + "fill": true, + "line": true, + "op": "gt", + "value": 1, + "yaxis": "left" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "gRPC (Unary) Latency 99th Percentile", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "Sidecar", + "titleSize": "h6" + }, + { + "collapse": false, + "height": "250px", + "panels": [ + { + "aliasColors": { + "1xx": "#EAB839", + "2xx": "#7EB26D", + "3xx": "#6ED0E0", + "4xx": "#EF843C", + "5xx": "#E24D42", + "error": "#E24D42", + "success": "#7EB26D" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Shows rate of incoming
requests.", + "fill": 10, + "id": 13, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ + { + "dashboard": "Thanos / Receiver", + "includeVars": true, + "keepTime": true, + "title": "Thanos / Receiver", + "type": "dashboard" + } + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 4, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum(label_replace(rate(http_requests_total{handler=\"receive\",namespace=\"$namespace\",job=~\"thanos-receiver.*\"}[$interval]),\"status_code\", \"${1}xx\", \"code\", \"([0-9])..\")) by (job, status_code)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{job}} {{status_code}}", + "refId": "A", + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Incoming Requests Rate", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { + "error": "#E24D42" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Shows ratio of errors compared to the total number of handled incoming requests.", + "fill": 10, + "id": 14, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ + { + "dashboard": "Thanos / Receiver", + 
"includeVars": true, + "keepTime": true, + "title": "Thanos / Receiver", + "type": "dashboard" + } + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 4, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(http_requests_total{handler=\"receive\",namespace=\"$namespace\",job=~\"thanos-receiver.*\",code=~\"5..\"}[$interval])) / sum(rate(http_requests_total{handler=\"receive\",namespace=\"$namespace\",job=~\"thanos-receiver.*\"}[$interval]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "error", + "refId": "A", + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Incoming Requests Errors", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "percentunit", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Shows how long has it taken to handle incoming requests.", + "fill": 1, + "id": 15, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + { + "dashboard": "Thanos / Receiver", + "includeVars": true, + "keepTime": true, + "title": "Thanos / Receiver", + "type": "dashboard" + } + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 4, + "stack": 
false, + "steppedLine": false, + "targets": [ + { + "expr": "histogram_quantile(0.99, sum(rate(http_request_duration_seconds_bucket{handler=\"receive\",namespace=\"$namespace\",job=~\"thanos-receiver.*\"}[$interval])) by (job, le))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{job}} P99", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ + { + "colorMode": "warning", + "fill": true, + "line": true, + "op": "gt", + "value": 0.5, + "yaxis": "left" + }, + { + "colorMode": "critical", + "fill": true, + "line": true, + "op": "gt", + "value": 1, + "yaxis": "left" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Incoming Requests Latency 99th Percentile", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "Receive", + "titleSize": "h6" + }, + { + "collapse": true, + "height": "250px", + "panels": [ + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Shows rate of alerts that successfully sent to alert manager.", + "fill": 10, + "id": 16, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ + { + "dashboard": "Thanos / Ruler", + "includeVars": true, + "keepTime": true, + "title": "Thanos / Ruler", + "type": "dashboard" + } + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + 
"renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 4, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(thanos_alert_sender_alerts_sent_total{namespace=\"$namespace\",job=~\"thanos-ruler.*\"}[$interval])) by (job, alertmanager)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{job}} {{alertmanager}}", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Alert Sent Rate", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { + "error": "#E24D42" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Shows ratio of errors compared to the total number of sent alerts.", + "fill": 10, + "id": 17, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ + { + "dashboard": "Thanos / Ruler", + "includeVars": true, + "keepTime": true, + "title": "Thanos / Ruler", + "type": "dashboard" + } + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 4, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(thanos_alert_sender_errors_total{namespace=\"$namespace\",job=~\"thanos-ruler.*\"}[$interval])) / 
sum(rate(thanos_alert_sender_alerts_sent_total{namespace=\"$namespace\",job=~\"thanos-ruler.*\"}[$interval]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "error", + "refId": "A", + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Alert Sent Errors", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "percentunit", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Shows how long has it taken to send alerts to alert manager.", + "fill": 1, + "id": 18, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + { + "dashboard": "Thanos / Ruler", + "includeVars": true, + "keepTime": true, + "title": "Thanos / Ruler", + "type": "dashboard" + } + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 4, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "histogram_quantile(0.99, sum(rate(thanos_alert_sender_latency_seconds_bucket{namespace=\"$namespace\",job=~\"thanos-ruler.*\"}[$interval])) by (job, le))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{job}} P99", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ + { + "colorMode": "warning", + "fill": true, + "line": true, + "op": "gt", + "value": 0.5, + "yaxis": "left" + }, + { + "colorMode": 
"critical", + "fill": true, + "line": true, + "op": "gt", + "value": 1, + "yaxis": "left" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Alert Sent Duration", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "Rule", + "titleSize": "h6" + }, + { + "collapse": true, + "height": "250px", + "panels": [ + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Shows rate of execution for compactions against blocks that are stored in the bucket by compaction group.", + "fill": 10, + "id": 19, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ + { + "dashboard": "Thanos / Compactor", + "includeVars": true, + "keepTime": true, + "title": "Thanos / Compactor", + "type": "dashboard" + } + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 6, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(thanos_compact_group_compactions_total{namespace=\"$namespace\",job=~\"thanos-compactor.*\"}[$interval])) by (job)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "compaction {{job}}", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Compaction 
Rate", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { + "error": "#E24D42" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Shows ratio of errors compared to the total number of executed compactions against blocks that are stored in the bucket.", + "fill": 10, + "id": 20, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ + { + "dashboard": "Thanos / Compactor", + "includeVars": true, + "keepTime": true, + "title": "Thanos / Compactor", + "type": "dashboard" + } + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 6, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(thanos_compact_group_compactions_failures_total{namespace=\"$namespace\",job=~\"thanos-compactor.*\"}[$interval])) / sum(rate(thanos_compact_group_compactions_total{namespace=\"$namespace\",job=~\"thanos-compactor.*\"}[$interval]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "error", + "refId": "A", + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Compaction Errors", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": 
[ + { + "format": "percentunit", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "Compact", + "titleSize": "h6" + } + ], + "schemaVersion": 14, + "style": "dark", + "tags": [ + "thanos-mixin" + ], + "templating": { + "list": [ + { + "current": { + "text": "Prometheus", + "value": "Prometheus" + }, + "hide": 0, + "label": null, + "name": "datasource", + "options": [ ], + "query": "prometheus", + "refresh": 1, + "regex": "", + "type": "datasource" + }, + { + "allValue": null, + "current": { }, + "datasource": "$datasource", + "hide": 0, + "includeAll": false, + "label": "namespace", + "multi": false, + "name": "namespace", + "options": [ ], + "query": "label_values(kube_pod_info{}, namespace)", + "refresh": 1, + "regex": "", + "sort": 2, + "tagValuesQuery": "", + "tags": [ ], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "auto": true, + "auto_count": 300, + "auto_min": "10s", + "current": { + "text": "5m", + "value": "5m" + }, + "hide": 0, + "label": "interval", + "name": "interval", + "query": "5m,10m,30m,1h,6h,12h", + "refresh": 2, + "type": "interval" + } + ] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "", + "title": "Thanos / Overview", + "uid": "0cb8830a6e957978796729870f560cda", + "version": 0 +} diff --git a/examples/dashboards/querier.json b/examples/dashboards/querier.json new file mode 100644 index 0000000000..f69e569ab8 --- /dev/null +++ b/examples/dashboards/querier.json @@ -0,0 +1,2481 @@ +{ + "annotations": { + "list": [ ] + }, 
+ "editable": true, + "gnetId": null, + "graphTooltip": 0, + "hideControls": false, + "links": [ ], + "refresh": "10s", + "rows": [ + { + "collapse": false, + "height": "250px", + "panels": [ + { + "aliasColors": { + "1xx": "#EAB839", + "2xx": "#7EB26D", + "3xx": "#6ED0E0", + "4xx": "#EF843C", + "5xx": "#E24D42", + "error": "#E24D42", + "success": "#7EB26D" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Shows rate of requests against /query for the given time.", + "fill": 10, + "id": 1, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 4, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum(label_replace(rate(http_requests_total{namespace=\"$namespace\",job=~\"$job\",handler=\"query\"}[$interval]),\"status_code\", \"${1}xx\", \"code\", \"([0-9])..\")) by (job, status_code)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{job}} {{status_code}}", + "refId": "A", + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Rate", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { + "error": "#E24D42" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Shows 
ratio of errors compared to the total number of handled requests against /query.", + "fill": 10, + "id": 2, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 4, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(http_requests_total{namespace=\"$namespace\",job=~\"$job\",handler=\"query\",code=~\"5..\"}[$interval])) / sum(rate(http_requests_total{namespace=\"$namespace\",job=~\"$job\",handler=\"query\"}[$interval]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "error", + "refId": "A", + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Errors", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "percentunit", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Shows how long has it taken to handle requests in quantiles.", + "fill": 1, + "id": 3, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 4,
+ "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "histogram_quantile(0.99, sum(rate(http_request_duration_seconds_bucket{namespace=\"$namespace\",job=~\"$job\",handler=\"query\"}[$interval])) by (job, le)) * 1", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "P99 {{job}}", + "refId": "A", + "step": 10 + }, + { + "expr": "sum(rate(http_request_duration_seconds_sum{namespace=\"$namespace\",job=~\"$job\",handler=\"query\"}[$interval])) by (job) * 1 / sum(rate(http_request_duration_seconds_count{namespace=\"$namespace\",job=~\"$job\",handler=\"query\"}[$interval])) by (job)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "mean {{job}}", + "refId": "B", + "step": 10 + }, + { + "expr": "histogram_quantile(0.50, sum(rate(http_request_duration_seconds_bucket{namespace=\"$namespace\",job=~\"$job\",handler=\"query\"}[$interval])) by (job, le)) * 1", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "P50 {{job}}", + "refId": "C", + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Duration", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "Instant Query API", + "titleSize": "h6" + }, + { + "collapse": false, + "height": "250px", + "panels": [ + { + "aliasColors": { + "1xx": "#EAB839", + "2xx": "#7EB26D", + "3xx": "#6ED0E0", + "4xx": "#EF843C", + "5xx": "#E24D42", + "error": "#E24D42", + "success": "#7EB26D" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + 
"datasource": "$datasource", + "description": "Shows rate of requests against /query_range for the given time range.", + "fill": 10, + "id": 4, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 4, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum(label_replace(rate(http_requests_total{namespace=\"$namespace\",job=~\"$job\",handler=\"query_range\"}[$interval]),\"status_code\", \"${1}xx\", \"code\", \"([0-9])..\")) by (job, status_code)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{job}} {{status_code}}", + "refId": "A", + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Rate", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { + "error": "#E24D42" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Shows ratio of errors compared to the total number of handled requests against /query_range.", + "fill": 10, + "id": 5, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false,
+ "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 4, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(http_requests_total{namespace=\"$namespace\",job=~\"$job\",handler=\"query_range\",code=~\"5..\"}[$interval])) / sum(rate(http_requests_total{namespace=\"$namespace\",job=~\"$job\",handler=\"query_range\"}[$interval]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "error", + "refId": "A", + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Errors", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "percentunit", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Shows how long has it taken to handle requests in quantiles.", + "fill": 1, + "id": 6, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 4, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "histogram_quantile(0.99, sum(rate(http_request_duration_seconds_bucket{namespace=\"$namespace\",job=~\"$job\",handler=\"query_range\"}[$interval])) by (job, le)) * 1", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "P99 {{job}}", + "refId": "A", + "step": 10 + }, + { + "expr": 
"sum(rate(http_request_duration_seconds_sum{namespace=\"$namespace\",job=~\"$job\",handler=\"query_range\"}[$interval])) by (job) * 1 / sum(rate(http_request_duration_seconds_count{namespace=\"$namespace\",job=~\"$job\",handler=\"query_range\"}[$interval])) by (job)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "mean {{job}}", + "refId": "B", + "step": 10 + }, + { + "expr": "histogram_quantile(0.50, sum(rate(http_request_duration_seconds_bucket{namespace=\"$namespace\",job=~\"$job\",handler=\"query_range\"}[$interval])) by (job, le)) * 1", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "P50 {{job}}", + "refId": "C", + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Duration", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "Range Query API", + "titleSize": "h6" + }, + { + "collapse": true, + "height": "250px", + "panels": [ + { + "aliasColors": { + "1xx": "#EAB839", + "2xx": "#7EB26D", + "3xx": "#6ED0E0", + "4xx": "#EF843C", + "5xx": "#E24D42", + "error": "#E24D42", + "success": "#7EB26D" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Shows rate of requests against /query for the given time, with handlers and codes.", + "fill": 10, + "id": 7, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ ], + 
"nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 4, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum(label_replace(rate(http_requests_total{namespace=\"$namespace\",job=~\"$job\"}[$interval]),\"status_code\", \"${1}xx\", \"code\", \"([0-9])..\")) by (job, handler, status_code)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{job}} {{handler}} {{status_code}}", + "refId": "A", + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Rate", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Shows ratio of errors compared to the total number of handled requests, in more detail.", + "fill": 10, + "id": 8, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 4, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(http_requests_total{namespace=\"$namespace\",job=~\"$job\",code!~\"2..\"}[$interval])) by (job, handler, code)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{job}} {{handler}}
{{code}}", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Errors", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "percentunit", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Shows how long has it taken to handle requests in quantiles.", + "fill": 1, + "id": 9, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 4, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "histogram_quantile(0.99, sum(rate(http_request_duration_seconds_bucket{namespace=\"$namespace\",job=~\"$job\"}[$interval])) by (job, handler, le)) * 1", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "P99 {{job}} {{handler}}", + "refId": "A", + "step": 10 + }, + { + "expr": "sum(rate(http_request_duration_seconds_sum{namespace=\"$namespace\",job=~\"$job\"}[$interval])) by (job, handler) * 1 / sum(rate(http_request_duration_seconds_count{namespace=\"$namespace\",job=~\"$job\"}[$interval])) by (job, handler)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "mean {{job}} {{handler}}", + "refId": "B", + "step": 10 + }, + { + "expr": "histogram_quantile(0.50, 
sum(rate(http_request_duration_seconds_bucket{namespace=\"$namespace\",job=~\"$job\"}[$interval])) by (job, handler, le)) * 1", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "P50 {{job}} {{handler}}", + "refId": "C", + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Duration", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "Query Detailed", + "titleSize": "h6" + }, + { + "collapse": false, + "height": "250px", + "panels": [ + { + "aliasColors": { + "Aborted": "#EAB839", + "AlreadyExists": "#7EB26D", + "Canceled": "#E24D42", + "DataLoss": "#E24D42", + "DeadlineExceeded": "#E24D42", + "FailedPrecondition": "#6ED0E0", + "Internal": "#E24D42", + "InvalidArgument": "#EF843C", + "NotFound": "#EF843C", + "OK": "#7EB26D", + "OutOfRange": "#E24D42", + "PermissionDenied": "#EF843C", + "ResourceExhausted": "#E24D42", + "Unauthenticated": "#EF843C", + "Unavailable": "#E24D42", + "Unimplemented": "#6ED0E0", + "Unknown": "#E24D42", + "error": "#E24D42" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Shows rate of handled Unary gRPC requests from other queriers.", + "fill": 10, + "id": 10, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, 
+ "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 4, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(grpc_client_handled_total{namespace=\"$namespace\",job=~\"$job\",grpc_type=\"unary\"}[$interval])) by (job, grpc_code)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{job}} {{grpc_code}}", + "refId": "A", + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Rate", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { + "error": "#E24D42" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Shows ratio of errors compared to the total number of handled requests from other queriers.", + "fill": 10, + "id": 11, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 4, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(grpc_client_handled_total{grpc_code=~\"Unknown|ResourceExhausted|Internal|Unavailable\",namespace=\"$namespace\",job=~\"$job\",grpc_type=\"unary\"}[$interval])) / sum(rate(grpc_client_started_total{namespace=\"$namespace\",job=~\"$job\",grpc_type=\"unary\"}[$interval]))", + "format": "time_series", + "intervalFactor": 2, + 
"legendFormat": "error", + "refId": "A", + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Errors", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "percentunit", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Shows how long has it taken to handle requests from other queriers, in quantiles.", + "fill": 1, + "id": 12, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 4, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "histogram_quantile(0.99, sum(rate(grpc_client_handling_seconds_bucket{namespace=\"$namespace\",job=~\"$job\",grpc_type=\"unary\"}[$interval])) by (job, le)) * 1", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "P99 {{job}}", + "legendLink": null, + "step": 10 + }, + { + "expr": "sum(rate(grpc_client_handling_seconds_sum{namespace=\"$namespace\",job=~\"$job\",grpc_type=\"unary\"}[$interval])) by (job) * 1\n/\nsum(rate(grpc_client_handling_seconds_count{namespace=\"$namespace\",job=~\"$job\",grpc_type=\"unary\"}[$interval])) by (job)\n", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "mean {{job}}", + "legendLink": null, + "step": 10 + }, + { + "expr": "histogram_quantile(0.50, 
sum(rate(grpc_client_handling_seconds_bucket{namespace=\"$namespace\",job=~\"$job\",grpc_type=\"unary\"}[$interval])) by (job, le)) * 1", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "P50 {{job}}", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Duration", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "gRPC (Unary)", + "titleSize": "h6" + }, + { + "collapse": true, + "height": "250px", + "panels": [ + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Shows rate of handled Unary gRPC requests, with grpc methods and codes from other queriers.", + "fill": 10, + "id": 13, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 4, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(grpc_client_handled_total{namespace=\"$namespace\",job=~\"$job\",grpc_type=\"unary\"}[$interval])) by (job, grpc_method, grpc_code)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{job}} {{grpc_method}} {{grpc_code}}", + "refId": "A", + "step": 10 + } + ], + "thresholds": [ ], + 
"timeFrom": null, + "timeShift": null, + "title": "Rate", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Shows ratio of errors compared to the total number of handled requests from other queriers.", + "fill": 10, + "id": 14, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 4, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(grpc_client_handled_total{grpc_code!=\"OK\",namespace=\"$namespace\",job=~\"$job\",grpc_type=\"unary\"}[$interval])) by (job, grpc_method, grpc_code)\n", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{job}} {{grpc_method}} {{grpc_code}}", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Errors", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, 
+ "show": false + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Shows how long has it taken to handle requests from other queriers, in quantiles.", + "fill": 1, + "id": 15, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 4, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "histogram_quantile(0.99, sum(rate(grpc_client_handling_seconds_bucket{namespace=\"$namespace\",job=~\"$job\",grpc_type=\"unary\"}[$interval])) by (job, grpc_method, le)) * 1", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "P99 {{job}} {{grpc_method}}", + "legendLink": null, + "step": 10 + }, + { + "expr": "sum(rate(grpc_client_handling_seconds_sum{namespace=\"$namespace\",job=~\"$job\",grpc_type=\"unary\"}[$interval])) by (job) * 1\n/\nsum(rate(grpc_client_handling_seconds_count{namespace=\"$namespace\",job=~\"$job\",grpc_type=\"unary\"}[$interval])) by (job)\n", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "mean {{job}} {{grpc_method}}", + "legendLink": null, + "step": 10 + }, + { + "expr": "histogram_quantile(0.50, sum(rate(grpc_client_handling_seconds_bucket{namespace=\"$namespace\",job=~\"$job\",grpc_type=\"unary\"}[$interval])) by (job, grpc_method, le)) * 1", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "P50 {{job}} {{grpc_method}}", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Duration", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + 
"mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "Detailed", + "titleSize": "h6" + }, + { + "collapse": false, + "height": "250px", + "panels": [ + { + "aliasColors": { + "Aborted": "#EAB839", + "AlreadyExists": "#7EB26D", + "Canceled": "#E24D42", + "DataLoss": "#E24D42", + "DeadlineExceeded": "#E24D42", + "FailedPrecondition": "#6ED0E0", + "Internal": "#E24D42", + "InvalidArgument": "#EF843C", + "NotFound": "#EF843C", + "OK": "#7EB26D", + "OutOfRange": "#E24D42", + "PermissionDenied": "#EF843C", + "ResourceExhausted": "#E24D42", + "Unauthenticated": "#EF843C", + "Unavailable": "#E24D42", + "Unimplemented": "#6ED0E0", + "Unknown": "#E24D42", + "error": "#E24D42" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Shows rate of handled Streamed gRPC requests from other queriers.", + "fill": 10, + "id": 16, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 4, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(grpc_client_handled_total{namespace=\"$namespace\",job=~\"$job\",grpc_type=\"server_stream\"}[$interval])) by (job, grpc_code)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{job}} {{grpc_code}}", + "refId": "A", + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": 
null, + "title": "Rate", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { + "error": "#E24D42" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Shows ratio of errors compared to the total number of handled requests from other queriers.", + "fill": 10, + "id": 17, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 4, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(grpc_client_handled_total{grpc_code=~\"Unknown|ResourceExhausted|Internal|Unavailable\",namespace=\"$namespace\",job=~\"$job\",grpc_type=\"server_stream\"}[$interval])) / sum(rate(grpc_client_started_total{namespace=\"$namespace\",job=~\"$job\",grpc_type=\"server_stream\"}[$interval]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "error", + "refId": "A", + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Errors", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "percentunit", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true 
+ }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Shows how long has it taken to handle requests from other queriers, in quantiles", + "fill": 1, + "id": 18, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 4, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "histogram_quantile(0.99, sum(rate(grpc_client_handling_seconds_bucket{namespace=\"$namespace\",job=~\"$job\",grpc_type=\"server_stream\"}[$interval])) by (job, le)) * 1", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "P99 {{job}}", + "legendLink": null, + "step": 10 + }, + { + "expr": "sum(rate(grpc_client_handling_seconds_sum{namespace=\"$namespace\",job=~\"$job\",grpc_type=\"server_stream\"}[$interval])) by (job) * 1\n/\nsum(rate(grpc_client_handling_seconds_count{namespace=\"$namespace\",job=~\"$job\",grpc_type=\"server_stream\"}[$interval])) by (job)\n", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "mean {{job}}", + "legendLink": null, + "step": 10 + }, + { + "expr": "histogram_quantile(0.50, sum(rate(grpc_client_handling_seconds_bucket{namespace=\"$namespace\",job=~\"$job\",grpc_type=\"server_stream\"}[$interval])) by (job, le)) * 1", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "P50 {{job}}", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Duration", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + 
"type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "gRPC (Stream)", + "titleSize": "h6" + }, + { + "collapse": true, + "height": "250px", + "panels": [ + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Shows rate of handled Streamed gRPC requests, with grpc methods and codes.", + "fill": 10, + "id": 19, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 4, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(grpc_client_handled_total{namespace=\"$namespace\",job=~\"$job\",grpc_type=\"server_stream\"}[$interval])) by (job, grpc_method, grpc_code)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{job}} {{grpc_method}} {{grpc_code}}", + "refId": "A", + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Rate", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": 
null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Shows ratio of errors compared to the total number of handled requests.", + "fill": 10, + "id": 20, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 4, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(grpc_client_handled_total{grpc_code!=\"OK\",namespace=\"$namespace\",job=~\"$job\",grpc_type=\"server_stream\"}[$interval])) by (job, grpc_method, grpc_code)\n", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{job}} {{grpc_method}} {{grpc_code}}", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Errors", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Shows how long has it taken to handle requests in quantiles.", + "fill": 1, + "id": 21, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": 
"null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 4, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "histogram_quantile(0.99, sum(rate(grpc_client_handling_seconds_bucket{namespace=\"$namespace\",job=~\"$job\",grpc_type=\"server_stream\"}[$interval])) by (job, grpc_method, le)) * 1", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "P99 {{job}} {{grpc_method}}", + "legendLink": null, + "step": 10 + }, + { + "expr": "sum(rate(grpc_client_handling_seconds_sum{namespace=\"$namespace\",job=~\"$job\",grpc_type=\"server_stream\"}[$interval])) by (job) * 1\n/\nsum(rate(grpc_client_handling_seconds_count{namespace=\"$namespace\",job=~\"$job\",grpc_type=\"server_stream\"}[$interval])) by (job)\n", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "mean {{job}} {{grpc_method}}", + "legendLink": null, + "step": 10 + }, + { + "expr": "histogram_quantile(0.50, sum(rate(grpc_client_handling_seconds_bucket{namespace=\"$namespace\",job=~\"$job\",grpc_type=\"server_stream\"}[$interval])) by (job, grpc_method, le)) * 1", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "P50 {{job}} {{grpc_method}}", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Duration", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "Detailed", + "titleSize": "h6" + }, + { + 
"collapse": false, + "height": "250px", + "panels": [ + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Shows rate of DNS lookups to discover stores.", + "fill": 1, + "id": 22, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(thanos_querier_store_apis_dns_lookups_total{namespace=\"$namespace\",job=~\"$job\"}[$interval])) by (job)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "lookups {{job}}", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Rate", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { + "error": "#E24D42" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Shows ratio of failures compared to the total number of executed DNS lookups.", + "fill": 10, + "id": 23, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + 
"points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 6, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(thanos_querier_store_apis_dns_failures_total{namespace=\"$namespace\",job=~\"$job\"}[$interval])) / sum(rate(thanos_querier_store_apis_dns_lookups_total{namespace=\"$namespace\",job=~\"$job\"}[$interval]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "error", + "refId": "A", + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Errors", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "percentunit", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "DNS", + "titleSize": "h6" + }, + { + "collapse": true, + "height": "250px", + "panels": [ + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "id": 24, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 4, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "go_memstats_alloc_bytes{namespace=\"$namespace\",job=~\"$job\",kubernetes_pod_name=~\"$pod\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "alloc all {{pod}}", + 
"legendLink": null, + "step": 10 + }, + { + "expr": "go_memstats_heap_alloc_bytes{namespace=\"$namespace\",job=~\"$job\",kubernetes_pod_name=~\"$pod\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "alloc heap {{pod}}", + "legendLink": null, + "step": 10 + }, + { + "expr": "rate(go_memstats_alloc_bytes_total{namespace=\"$namespace\",job=~\"$job\",kubernetes_pod_name=~\"$pod\"}[30s])", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "alloc rate all {{pod}}", + "legendLink": null, + "step": 10 + }, + { + "expr": "rate(go_memstats_heap_alloc_bytes{namespace=\"$namespace\",job=~\"$job\",kubernetes_pod_name=~\"$pod\"}[30s])", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "alloc rate heap {{pod}}", + "legendLink": null, + "step": 10 + }, + { + "expr": "go_memstats_stack_inuse_bytes{namespace=\"$namespace\",job=~\"$job\",kubernetes_pod_name=~\"$pod\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "inuse stack {{pod}}", + "legendLink": null, + "step": 10 + }, + { + "expr": "go_memstats_heap_inuse_bytes{namespace=\"$namespace\",job=~\"$job\",kubernetes_pod_name=~\"$pod\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "inuse heap {{pod}}", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Memory Used", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "id": 25, + "legend": { + "avg": false, 
+ "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 4, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "go_goroutines{namespace=\"$namespace\",job=~\"$job\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{pod}}", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Goroutines", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "id": 26, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 4, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "go_gc_duration_seconds{namespace=\"$namespace\",job=~\"$job\",kubernetes_pod_name=~\"$pod\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{quantile}} {{pod}}", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": 
"GC Time Quantiles", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "Resources", + "titleSize": "h6" + } + ], + "schemaVersion": 14, + "style": "dark", + "tags": [ + "thanos-mixin" + ], + "templating": { + "list": [ + { + "current": { + "text": "Prometheus", + "value": "Prometheus" + }, + "hide": 0, + "label": null, + "name": "datasource", + "options": [ ], + "query": "prometheus", + "refresh": 1, + "regex": "", + "type": "datasource" + }, + { + "allValue": null, + "current": { }, + "datasource": "$datasource", + "hide": 0, + "includeAll": false, + "label": "namespace", + "multi": false, + "name": "namespace", + "options": [ ], + "query": "label_values(kube_pod_info{}, namespace)", + "refresh": 1, + "regex": "", + "sort": 2, + "tagValuesQuery": "", + "tags": [ ], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": "thanos-querier.*", + "current": { + "text": "all", + "value": "$__all" + }, + "datasource": "$datasource", + "hide": 0, + "includeAll": true, + "label": "job", + "multi": false, + "name": "job", + "options": [ ], + "query": "label_values(up{namespace=\"$namespace\",job=~\"thanos-querier.*\"}, job)", + "refresh": 1, + "regex": "", + "sort": 2, + "tagValuesQuery": "", + "tags": [ ], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": ".*", + "current": { + "text": "all", + "value": "$__all" + }, + "datasource": "$datasource", + "hide": 0, + "includeAll": true, + "label": "pod", + "multi": false, + "name": "pod", + 
"options": [ ], + "query": "label_values(kube_pod_info{namespace=\"$namespace\",created_by_name=~\"thanos-querier.*\"}, pod)", + "refresh": 1, + "regex": "", + "sort": 2, + "tagValuesQuery": "", + "tags": [ ], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "auto": true, + "auto_count": 300, + "auto_min": "10s", + "current": { + "text": "5m", + "value": "5m" + }, + "hide": 0, + "label": "interval", + "name": "interval", + "query": "5m,10m,30m,1h,6h,12h", + "refresh": 2, + "type": "interval" + } + ] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "", + "title": "Thanos / Querier", + "uid": "98fde97ddeaf2981041745f1f2ba68c2", + "version": 0 +} diff --git a/examples/dashboards/receiver.json b/examples/dashboards/receiver.json new file mode 100644 index 0000000000..3abba13206 --- /dev/null +++ b/examples/dashboards/receiver.json @@ -0,0 +1,2336 @@ +{ + "annotations": { + "list": [ ] + }, + "editable": true, + "gnetId": null, + "graphTooltip": 0, + "hideControls": false, + "links": [ ], + "refresh": "10s", + "rows": [ + { + "collapse": false, + "height": "250px", + "panels": [ + { + "aliasColors": { + "1xx": "#EAB839", + "2xx": "#7EB26D", + "3xx": "#6ED0E0", + "4xx": "#EF843C", + "5xx": "#E24D42", + "error": "#E24D42", + "success": "#7EB26D" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Shows rate of incoming requests.", + "fill": 10, + "id": 1, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + 
"renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 4, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum(label_replace(rate(http_requests_total{handler=\"receive\",namespace=\"$namespace\",job=~\"$job\"}[$interval]),\"status_code\", \"${1}xx\", \"code\", \"([0-9])..\")) by (job, status_code)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{job}} {{status_code}}", + "refId": "A", + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Rate", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { + "error": "#E24D42" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Shows ratio of errors compared to the total number of handled incoming requests.", + "fill": 10, + "id": 2, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 4, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(http_requests_total{handler=\"receive\",namespace=\"$namespace\",job=~\"$job\",code=~\"5..\"}[$interval])) / sum(rate(http_requests_total{handler=\"receive\",namespace=\"$namespace\",job=~\"$job\"}[$interval]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "error", + 
"refId": "A", + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Errors", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "percentunit", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Shows how long has it taken to handle incoming requests in quantiles.", + "fill": 1, + "id": 3, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 4, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "histogram_quantile(0.99, sum(rate(http_request_duration_seconds_bucket{handler=\"receive\",namespace=\"$namespace\",job=~\"$job\"}[$interval])) by (job, le)) * 1", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "P99 {{job}}", + "refId": "A", + "step": 10 + }, + { + "expr": "sum(rate(http_request_duration_seconds_sum{handler=\"receive\",namespace=\"$namespace\",job=~\"$job\"}[$interval])) by (job) * 1 / sum(rate(http_request_duration_seconds_count{handler=\"receive\",namespace=\"$namespace\",job=~\"$job\"}[$interval])) by (job)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "mean {{job}}", + "refId": "B", + "step": 10 + }, + { + "expr": "histogram_quantile(0.50, 
sum(rate(http_request_duration_seconds_bucket{handler=\"receive\",namespace=\"$namespace\",job=~\"$job\"}[$interval])) by (job, le)) * 1", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "P50 {{job}}", + "refId": "C", + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Duration", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "Incoming Request", + "titleSize": "h6" + }, + { + "collapse": true, + "height": "250px", + "panels": [ + { + "aliasColors": { + "1xx": "#EAB839", + "2xx": "#7EB26D", + "3xx": "#6ED0E0", + "4xx": "#EF843C", + "5xx": "#E24D42", + "error": "#E24D42", + "success": "#7EB26D" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Shows rate of incoming requests.", + "fill": 10, + "id": 4, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 4, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum(label_replace(rate(http_requests_total{handler=\"receive\",namespace=\"$namespace\",job=~\"$job\"}[$interval]),\"status_code\", \"${1}xx\", \"code\", \"([0-9])..\")) by (job, handler, status_code)", + "format": "time_series", + 
"intervalFactor": 2, + "legendFormat": "{{job}} {{handler}} {{status_code}}", + "refId": "A", + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Rate", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Shows ratio of errors compared to the total number of handled incoming requests.", + "fill": 10, + "id": 5, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 4, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(http_requests_total{handler=\"receive\",namespace=\"$namespace\",job=~\"$job\",code!~\"2..\"}[$interval])) by (job, handler, code)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{job}} {{handler}} {{code}}", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Errors", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "percentunit", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": 
true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Shows how long has it taken to handle incoming requests in quantiles.", + "fill": 1, + "id": 6, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 4, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "histogram_quantile(0.99, sum(rate(http_request_duration_seconds_bucket{handler=\"receive\",namespace=\"$namespace\",job=~\"$job\"}[$interval])) by (job, handler, le)) * 1", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "P99 {{job}} {{handler}}", + "refId": "A", + "step": 10 + }, + { + "expr": "sum(rate(http_request_duration_seconds_sum{handler=\"receive\",namespace=\"$namespace\",job=~\"$job\"}[$interval])) by (job, handler) * 1 / sum(rate(http_request_duration_seconds_count{handler=\"receive\",namespace=\"$namespace\",job=~\"$job\"}[$interval])) by (job, handler)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "mean {{job}} {{handler}}", + "refId": "B", + "step": 10 + }, + { + "expr": "histogram_quantile(0.50, sum(rate(http_request_duration_seconds_bucket{handler=\"receive\",namespace=\"$namespace\",job=~\"$job\"}[$interval])) by (job, handler, le)) * 1", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "P50 {{job}} {{handler}}", + "refId": "C", + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Duration", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": 
"individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "Detailed", + "titleSize": "h6" + }, + { + "collapse": false, + "height": "250px", + "panels": [ + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Shows rate of forwarded requests to other receive nodes.", + "fill": 1, + "id": 7, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(thanos_receive_forward_requests_total{namespace=\"$namespace\",job=~\"$job\"}[$interval])) by (job)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "all {{job}}", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Rate", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + 
"aliasColors": { + "error": "#E24D42" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Shows ratio of errors compared to the total number of forwarded requests to other receive nodes.", + "fill": 10, + "id": 8, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 6, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(thanos_receive_forward_requests_total{namespace=\"$namespace\",job=~\"$job\",result=\"error\"}[$interval])) / sum(rate(thanos_receive_forward_requests_total{namespace=\"$namespace\",job=~\"$job\"}[$interval]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "error", + "refId": "A", + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Errors", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "percentunit", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "Forward Request", + "titleSize": "h6" + }, + { + "collapse": false, + "height": "250px", + "panels": [ + { + "aliasColors": { + "Aborted": "#EAB839", + "AlreadyExists": "#7EB26D", + "Canceled": "#E24D42", + "DataLoss": "#E24D42", + "DeadlineExceeded": "#E24D42", + "FailedPrecondition": "#6ED0E0", + "Internal":
"#E24D42", + "InvalidArgument": "#EF843C", + "NotFound": "#EF843C", + "OK": "#7EB26D", + "OutOfRange": "#E24D42", + "PermissionDenied": "#EF843C", + "ResourceExhausted": "#E24D42", + "Unauthenticated": "#EF843C", + "Unavailable": "#E24D42", + "Unimplemented": "#6ED0E0", + "Unknown": "#E24D42", + "error": "#E24D42" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Shows rate of handled Unary gRPC requests from queriers.", + "fill": 10, + "id": 9, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 4, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(grpc_server_handled_total{namespace=\"$namespace\",job=~\"$job\",grpc_type=\"unary\"}[$interval])) by (job, grpc_code)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{job}} {{grpc_code}}", + "refId": "A", + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Rate", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { + "error": "#E24D42" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Shows ratio of errors compared to the total number of handled requests from queriers.", + "fill": 10, + "id": 10, + 
"legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 4, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(grpc_server_handled_total{grpc_code=~\"Unknown|ResourceExhausted|Internal|Unavailable\",namespace=\"$namespace\",job=~\"$job\",grpc_type=\"unary\"}[$interval])) / sum(rate(grpc_server_started_total{namespace=\"$namespace\",job=~\"$job\",grpc_type=\"unary\"}[$interval]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "error", + "refId": "A", + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Errors", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "percentunit", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Shows how long has it taken to handle requests from queriers, in quantiles.", + "fill": 1, + "id": 11, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 4, + "stack": false, + "steppedLine": 
false, + "targets": [ + { + "expr": "histogram_quantile(0.99, sum(rate(grpc_server_handling_seconds_bucket{namespace=\"$namespace\",job=~\"$job\",grpc_type=\"unary\"}[$interval])) by (job, le)) * 1", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "P99 {{job}}", + "legendLink": null, + "step": 10 + }, + { + "expr": "sum(rate(grpc_server_handling_seconds_sum{namespace=\"$namespace\",job=~\"$job\",grpc_type=\"unary\"}[$interval])) by (job) * 1\n/\nsum(rate(grpc_server_handling_seconds_count{namespace=\"$namespace\",job=~\"$job\",grpc_type=\"unary\"}[$interval])) by (job)\n", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "mean {{job}}", + "legendLink": null, + "step": 10 + }, + { + "expr": "histogram_quantile(0.50, sum(rate(grpc_server_handling_seconds_bucket{namespace=\"$namespace\",job=~\"$job\",grpc_type=\"unary\"}[$interval])) by (job, le)) * 1", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "P50 {{job}}", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Duration", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "gRPC (Unary)", + "titleSize": "h6" + }, + { + "collapse": true, + "height": "250px", + "panels": [ + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Shows rate of handled Unary gRPC requests from queriers.", + "fill": 10, + "id": 12, + "legend": { + "avg": false, + 
"current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 4, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(grpc_server_handled_total{namespace=\"$namespace\",job=~\"$job\",grpc_type=\"unary\"}[$interval])) by (job, grpc_method, grpc_code)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{job}} {{grpc_method}} {{grpc_code}}", + "refId": "A", + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Rate", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Shows ratio of errors compared to the total number of handled requests from queriers.", + "fill": 10, + "id": 13, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 4, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": 
"sum(rate(grpc_server_handled_total{grpc_code!=\"OK\",namespace=\"$namespace\",job=~\"$job\",grpc_type=\"unary\"}[$interval])) by (job, grpc_method, grpc_code)\n", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{job}} {{grpc_method}} {{grpc_code}}", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Errors", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Shows how long has it taken to handle requests from queriers, in quantiles.", + "fill": 1, + "id": 14, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 4, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "histogram_quantile(0.99, sum(rate(grpc_server_handling_seconds_bucket{namespace=\"$namespace\",job=~\"$job\",grpc_type=\"unary\"}[$interval])) by (job, grpc_method, le)) * 1", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "P99 {{job}} {{grpc_method}}", + "legendLink": null, + "step": 10 + }, + { + "expr": "sum(rate(grpc_server_handling_seconds_sum{namespace=\"$namespace\",job=~\"$job\",grpc_type=\"unary\"}[$interval])) by (job) * 
1\n/\nsum(rate(grpc_server_handling_seconds_count{namespace=\"$namespace\",job=~\"$job\",grpc_type=\"unary\"}[$interval])) by (job)\n", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "mean {{job}} {{grpc_method}}", + "legendLink": null, + "step": 10 + }, + { + "expr": "histogram_quantile(0.50, sum(rate(grpc_server_handling_seconds_bucket{namespace=\"$namespace\",job=~\"$job\",grpc_type=\"unary\"}[$interval])) by (job, grpc_method, le)) * 1", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "P50 {{job}} {{grpc_method}}", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Duration", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "Detailed", + "titleSize": "h6" + }, + { + "collapse": false, + "height": "250px", + "panels": [ + { + "aliasColors": { + "Aborted": "#EAB839", + "AlreadyExists": "#7EB26D", + "Canceled": "#E24D42", + "DataLoss": "#E24D42", + "DeadlineExceeded": "#E24D42", + "FailedPrecondition": "#6ED0E0", + "Internal": "#E24D42", + "InvalidArgument": "#EF843C", + "NotFound": "#EF843C", + "OK": "#7EB26D", + "OutOfRange": "#E24D42", + "PermissionDenied": "#EF843C", + "ResourceExhausted": "#E24D42", + "Unauthenticated": "#EF843C", + "Unavailable": "#E24D42", + "Unimplemented": "#6ED0E0", + "Unknown": "#E24D42", + "error": "#E24D42" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Shows rate of handled Streamed 
gRPC requests from queriers.", + "fill": 10, + "id": 15, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 4, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(grpc_server_handled_total{namespace=\"$namespace\",job=~\"$job\",grpc_type=\"server_stream\"}[$interval])) by (job, grpc_code)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{job}} {{grpc_code}}", + "refId": "A", + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Rate", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { + "error": "#E24D42" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Shows ratio of errors compared to the total number of handled requests from queriers.", + "fill": 10, + "id": 16, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 4, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": 
"sum(rate(grpc_server_handled_total{grpc_code=~\"Unknown|ResourceExhausted|Internal|Unavailable\",namespace=\"$namespace\",job=~\"$job\",grpc_type=\"server_stream\"}[$interval])) / sum(rate(grpc_server_started_total{namespace=\"$namespace\",job=~\"$job\",grpc_type=\"server_stream\"}[$interval]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "error", + "refId": "A", + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Errors", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "percentunit", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Shows how long has it taken to handle requests from queriers, in quantiles.", + "fill": 1, + "id": 17, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 4, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "histogram_quantile(0.99, sum(rate(grpc_server_handling_seconds_bucket{namespace=\"$namespace\",job=~\"$job\",grpc_type=\"server_stream\"}[$interval])) by (job, le)) * 1", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "P99 {{job}}", + "legendLink": null, + "step": 10 + }, + { + "expr": 
"sum(rate(grpc_server_handling_seconds_sum{namespace=\"$namespace\",job=~\"$job\",grpc_type=\"server_stream\"}[$interval])) by (job) * 1\n/\nsum(rate(grpc_server_handling_seconds_count{namespace=\"$namespace\",job=~\"$job\",grpc_type=\"server_stream\"}[$interval])) by (job)\n", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "mean {{job}}", + "legendLink": null, + "step": 10 + }, + { + "expr": "histogram_quantile(0.50, sum(rate(grpc_server_handling_seconds_bucket{namespace=\"$namespace\",job=~\"$job\",grpc_type=\"server_stream\"}[$interval])) by (job, le)) * 1", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "P50 {{job}}", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Duration", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "gRPC (Stream)", + "titleSize": "h6" + }, + { + "collapse": true, + "height": "250px", + "panels": [ + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Shows rate of handled Streamed gRPC requests from queriers.", + "fill": 10, + "id": 18, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + 
"spaceLength": 10, + "span": 4, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(grpc_server_handled_total{namespace=\"$namespace\",job=~\"$job\",grpc_type=\"server_stream\"}[$interval])) by (job, grpc_method, grpc_code)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{job}} {{grpc_method}} {{grpc_code}}", + "refId": "A", + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Rate", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Shows ratio of errors compared to the total number of handled requests from queriers.", + "fill": 10, + "id": 19, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 4, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(grpc_server_handled_total{grpc_code!=\"OK\",namespace=\"$namespace\",job=~\"$job\",grpc_type=\"server_stream\"}[$interval])) by (job, grpc_method, grpc_code)\n", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{job}} {{grpc_method}} {{grpc_code}}", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + 
"title": "Errors", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Shows how long has it taken to handle requests from queriers, in quantiles.", + "fill": 1, + "id": 20, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 4, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "histogram_quantile(0.99, sum(rate(grpc_server_handling_seconds_bucket{namespace=\"$namespace\",job=~\"$job\",grpc_type=\"server_stream\"}[$interval])) by (job, grpc_method, le)) * 1", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "P99 {{job}} {{grpc_method}}", + "legendLink": null, + "step": 10 + }, + { + "expr": "sum(rate(grpc_server_handling_seconds_sum{namespace=\"$namespace\",job=~\"$job\",grpc_type=\"server_stream\"}[$interval])) by (job) * 1\n/\nsum(rate(grpc_server_handling_seconds_count{namespace=\"$namespace\",job=~\"$job\",grpc_type=\"server_stream\"}[$interval])) by (job)\n", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "mean {{job}} {{grpc_method}}", + "legendLink": null, + "step": 10 + }, + { + "expr": "histogram_quantile(0.50, 
sum(rate(grpc_server_handling_seconds_bucket{namespace=\"$namespace\",job=~\"$job\",grpc_type=\"server_stream\"}[$interval])) by (job, grpc_method, le)) * 1", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "P50 {{job}} {{grpc_method}}", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Duration", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "Detailed", + "titleSize": "h6" + }, + { + "collapse": false, + "height": "250px", + "panels": [ + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Shows the relative time of last successful upload to the object-store bucket.", + "fill": 1, + "id": 21, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 12, + "stack": false, + "steppedLine": false, + "styles": [ + { + "alias": "Time", + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "pattern": "Time", + "type": "hidden" + }, + { + "alias": "Uploaded Ago", + "colorMode": null, + "colors": [ ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": 
"Value", + "thresholds": [ ], + "type": "number", + "unit": "s" + }, + { + "alias": "", + "colorMode": null, + "colors": [ ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "pattern": "/.*/", + "thresholds": [ ], + "type": "string", + "unit": "short" + } + ], + "targets": [ + { + "expr": "time() - max(thanos_objstore_bucket_last_successful_upload_time{namespace=\"$namespace\",job=~\"$job\"}) by (job, bucket)", + "format": "table", + "instant": true, + "intervalFactor": 2, + "legendFormat": "", + "refId": "A", + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Successful Upload", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "transform": "table", + "type": "table", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "Last Updated", + "titleSize": "h6" + }, + { + "collapse": true, + "height": "250px", + "panels": [ + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "id": 22, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 4, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "go_memstats_alloc_bytes{namespace=\"$namespace\",job=~\"$job\",kubernetes_pod_name=~\"$pod\"}", + "format": 
"time_series", + "intervalFactor": 2, + "legendFormat": "alloc all {{pod}}", + "legendLink": null, + "step": 10 + }, + { + "expr": "go_memstats_heap_alloc_bytes{namespace=\"$namespace\",job=~\"$job\",kubernetes_pod_name=~\"$pod\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "alloc heap {{pod}}", + "legendLink": null, + "step": 10 + }, + { + "expr": "rate(go_memstats_alloc_bytes_total{namespace=\"$namespace\",job=~\"$job\",kubernetes_pod_name=~\"$pod\"}[30s])", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "alloc rate all {{pod}}", + "legendLink": null, + "step": 10 + }, + { + "expr": "rate(go_memstats_heap_alloc_bytes{namespace=\"$namespace\",job=~\"$job\",kubernetes_pod_name=~\"$pod\"}[30s])", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "alloc rate heap {{pod}}", + "legendLink": null, + "step": 10 + }, + { + "expr": "go_memstats_stack_inuse_bytes{namespace=\"$namespace\",job=~\"$job\",kubernetes_pod_name=~\"$pod\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "inuse stack {{pod}}", + "legendLink": null, + "step": 10 + }, + { + "expr": "go_memstats_heap_inuse_bytes{namespace=\"$namespace\",job=~\"$job\",kubernetes_pod_name=~\"$pod\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "inuse heap {{pod}}", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Memory Used", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + 
"datasource": "$datasource", + "fill": 1, + "id": 23, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 4, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "go_goroutines{namespace=\"$namespace\",job=~\"$job\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{pod}}", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Goroutines", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "id": 24, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 4, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "go_gc_duration_seconds{namespace=\"$namespace\",job=~\"$job\",kubernetes_pod_name=~\"$pod\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{quantile}} {{pod}}", + "legendLink": null, + "step": 10 
+ } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "GC Time Quantiles", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "Resources", + "titleSize": "h6" + } + ], + "schemaVersion": 14, + "style": "dark", + "tags": [ + "thanos-mixin" + ], + "templating": { + "list": [ + { + "current": { + "text": "Prometheus", + "value": "Prometheus" + }, + "hide": 0, + "label": null, + "name": "datasource", + "options": [ ], + "query": "prometheus", + "refresh": 1, + "regex": "", + "type": "datasource" + }, + { + "allValue": null, + "current": { }, + "datasource": "$datasource", + "hide": 0, + "includeAll": false, + "label": "namespace", + "multi": false, + "name": "namespace", + "options": [ ], + "query": "label_values(kube_pod_info{}, namespace)", + "refresh": 1, + "regex": "", + "sort": 2, + "tagValuesQuery": "", + "tags": [ ], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": "thanos-receiver.*", + "current": { + "text": "all", + "value": "$__all" + }, + "datasource": "$datasource", + "hide": 0, + "includeAll": true, + "label": "job", + "multi": false, + "name": "job", + "options": [ ], + "query": "label_values(up{namespace=\"$namespace\",job=~\"thanos-receiver.*\"}, job)", + "refresh": 1, + "regex": "", + "sort": 2, + "tagValuesQuery": "", + "tags": [ ], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": ".*", + "current": { + "text": "all", + "value": "$__all" + }, + "datasource": "$datasource", + 
"hide": 0, + "includeAll": true, + "label": "pod", + "multi": false, + "name": "pod", + "options": [ ], + "query": "label_values(kube_pod_info{namespace=\"$namespace\",created_by_name=~\"thanos-receiver.*\"}, pod)", + "refresh": 1, + "regex": "", + "sort": 2, + "tagValuesQuery": "", + "tags": [ ], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "auto": true, + "auto_count": 300, + "auto_min": "10s", + "current": { + "text": "5m", + "value": "5m" + }, + "hide": 0, + "label": "interval", + "name": "interval", + "query": "5m,10m,30m,1h,6h,12h", + "refresh": 2, + "type": "interval" + } + ] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "", + "title": "Thanos / Receiver", + "uid": "b5958da86b143e45752506d3c09c4f92", + "version": 0 +} diff --git a/examples/dashboards/ruler.json b/examples/dashboards/ruler.json new file mode 100644 index 0000000000..fa2596da4a --- /dev/null +++ b/examples/dashboards/ruler.json @@ -0,0 +1,1846 @@ +{ + "annotations": { + "list": [ ] + }, + "editable": true, + "gnetId": null, + "graphTooltip": 0, + "hideControls": false, + "links": [ ], + "refresh": "10s", + "rows": [ + { + "collapse": false, + "height": "250px", + "panels": [ + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Shows rate of dropped alerts.", + "fill": 1, + "id": 1, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 
3, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(thanos_alert_sender_alerts_dropped_total{namespace=\"$namespace\",job=~\"$job\"}[$interval])) by (job, alertmanager)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{job}} {{alertmanager}}", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Dropped Rate", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Shows rate of alerts that successfully sent to alert manager.", + "fill": 10, + "id": 2, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 3, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(thanos_alert_sender_alerts_sent_total{namespace=\"$namespace\",job=~\"$job\"}[$interval])) by (job, alertmanager)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{job}} {{alertmanager}}", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Sent Rate", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + 
"buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { + "error": "#E24D42" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Shows ratio of errors compared to the total number of sent alerts.", + "fill": 10, + "id": 3, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 3, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(thanos_alert_sender_errors_total{namespace=\"$namespace\",job=~\"$job\"}[$interval])) / sum(rate(thanos_alert_sender_alerts_sent_total{namespace=\"$namespace\",job=~\"$job\"}[$interval]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "error", + "refId": "A", + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Sent Errors", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "percentunit", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Shows how 
long has it taken to send alerts to alert manager.", + "fill": 1, + "id": 4, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 3, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "histogram_quantile(0.99, sum(rate(thanos_alert_sender_latency_seconds_bucket{namespace=\"$namespace\",job=~\"$job\"}[$interval])) by (job, le)) * 1", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "P99 {{job}}", + "refId": "A", + "step": 10 + }, + { + "expr": "sum(rate(thanos_alert_sender_latency_seconds_sum{namespace=\"$namespace\",job=~\"$job\"}[$interval])) by (job) * 1 / sum(rate(thanos_alert_sender_latency_seconds_count{namespace=\"$namespace\",job=~\"$job\"}[$interval])) by (job)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "mean {{job}}", + "refId": "B", + "step": 10 + }, + { + "expr": "histogram_quantile(0.50, sum(rate(thanos_alert_sender_latency_seconds_bucket{namespace=\"$namespace\",job=~\"$job\"}[$interval])) by (job, le)) * 1", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "P50 {{job}}", + "refId": "C", + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Sent Duration", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + 
"repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "Alert Sent", + "titleSize": "h6" + }, + { + "collapse": false, + "height": "250px", + "panels": [ + { + "aliasColors": { + "Aborted": "#EAB839", + "AlreadyExists": "#7EB26D", + "Canceled": "#E24D42", + "DataLoss": "#E24D42", + "DeadlineExceeded": "#E24D42", + "FailedPrecondition": "#6ED0E0", + "Internal": "#E24D42", + "InvalidArgument": "#EF843C", + "NotFound": "#EF843C", + "OK": "#7EB26D", + "OutOfRange": "#E24D42", + "PermissionDenied": "#EF843C", + "ResourceExhausted": "#E24D42", + "Unauthenticated": "#EF843C", + "Unavailable": "#E24D42", + "Unimplemented": "#6ED0E0", + "Unknown": "#E24D42", + "error": "#E24D42" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Shows rate of handled Unary gRPC requests.", + "fill": 10, + "id": 5, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 4, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(grpc_server_handled_total{namespace=\"$namespace\",job=~\"$job\",grpc_type=\"unary\"}[$interval])) by (job, grpc_code)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{job}} {{grpc_code}}", + "refId": "A", + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Rate", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + 
"format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { + "error": "#E24D42" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Shows ratio of errors compared to the total number of handled requests.", + "fill": 10, + "id": 6, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 4, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(grpc_server_handled_total{grpc_code=~\"Unknown|ResourceExhausted|Internal|Unavailable\",namespace=\"$namespace\",job=~\"$job\",grpc_type=\"unary\"}[$interval])) / sum(rate(grpc_server_started_total{namespace=\"$namespace\",job=~\"$job\",grpc_type=\"unary\"}[$interval]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "error", + "refId": "A", + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Errors", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "percentunit", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Shows how long has it taken to handle requests, in quantiles.", + "fill": 1, + "id": 7, + "legend": { + "avg": false, + "current": false, + 
"max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 4, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "histogram_quantile(0.99, sum(rate(grpc_server_handling_seconds_bucket{namespace=\"$namespace\",job=~\"$job\",grpc_type=\"unary\"}[$interval])) by (job, le)) * 1", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "P99 {{job}}", + "legendLink": null, + "step": 10 + }, + { + "expr": "sum(rate(grpc_server_handling_seconds_sum{namespace=\"$namespace\",job=~\"$job\",grpc_type=\"unary\"}[$interval])) by (job) * 1\n/\nsum(rate(grpc_server_handling_seconds_count{namespace=\"$namespace\",job=~\"$job\",grpc_type=\"unary\"}[$interval])) by (job)\n", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "mean {{job}}", + "legendLink": null, + "step": 10 + }, + { + "expr": "histogram_quantile(0.50, sum(rate(grpc_server_handling_seconds_bucket{namespace=\"$namespace\",job=~\"$job\",grpc_type=\"unary\"}[$interval])) by (job, le)) * 1", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "P50 {{job}}", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Duration", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + 
"title": "gRPC (Unary)", + "titleSize": "h6" + }, + { + "collapse": true, + "height": "250px", + "panels": [ + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Shows rate of handled Unary gRPC requests.", + "fill": 10, + "id": 8, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 4, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(grpc_server_handled_total{namespace=\"$namespace\",job=~\"$job\",grpc_type=\"unary\"}[$interval])) by (job, grpc_method, grpc_code)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{job}} {{grpc_method}} {{grpc_code}}", + "refId": "A", + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Rate", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Shows ratio of errors compared to the total number of handled requests.", + "fill": 10, + "id": 9, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ ], + "nullPointMode": 
"null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 4, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(grpc_server_handled_total{grpc_code!=\"OK\",namespace=\"$namespace\",job=~\"$job\",grpc_type=\"unary\"}[$interval])) by (job, grpc_method, grpc_code)\n", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{job}} {{grpc_method}} {{grpc_code}}", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Errors", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Shows how long has it taken to handle requests, in quantiles.", + "fill": 1, + "id": 10, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 4, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "histogram_quantile(0.99, sum(rate(grpc_server_handling_seconds_bucket{namespace=\"$namespace\",job=~\"$job\",grpc_type=\"unary\"}[$interval])) by (job, grpc_method, le)) * 1", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "P99 {{job}} 
{{grpc_method}}", + "legendLink": null, + "step": 10 + }, + { + "expr": "sum(rate(grpc_server_handling_seconds_sum{namespace=\"$namespace\",job=~\"$job\",grpc_type=\"unary\"}[$interval])) by (job) * 1\n/\nsum(rate(grpc_server_handling_seconds_count{namespace=\"$namespace\",job=~\"$job\",grpc_type=\"unary\"}[$interval])) by (job)\n", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "mean {{job}} {{grpc_method}}", + "legendLink": null, + "step": 10 + }, + { + "expr": "histogram_quantile(0.50, sum(rate(grpc_server_handling_seconds_bucket{namespace=\"$namespace\",job=~\"$job\",grpc_type=\"unary\"}[$interval])) by (job, grpc_method, le)) * 1", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "P50 {{job}} {{grpc_method}}", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Duration", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "Detailed", + "titleSize": "h6" + }, + { + "collapse": false, + "height": "250px", + "panels": [ + { + "aliasColors": { + "Aborted": "#EAB839", + "AlreadyExists": "#7EB26D", + "Canceled": "#E24D42", + "DataLoss": "#E24D42", + "DeadlineExceeded": "#E24D42", + "FailedPrecondition": "#6ED0E0", + "Internal": "#E24D42", + "InvalidArgument": "#EF843C", + "NotFound": "#EF843C", + "OK": "#7EB26D", + "OutOfRange": "#E24D42", + "PermissionDenied": "#EF843C", + "ResourceExhausted": "#E24D42", + "Unauthenticated": "#EF843C", + "Unavailable": "#E24D42", + "Unimplemented": 
"#6ED0E0", + "Unknown": "#E24D42", + "error": "#E24D42" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Shows rate of handled Streamed gRPC requests.", + "fill": 10, + "id": 11, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 4, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(grpc_server_handled_total{namespace=\"$namespace\",job=~\"$job\",grpc_type=\"server_stream\"}[$interval])) by (job, grpc_code)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{job}} {{grpc_code}}", + "refId": "A", + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Rate", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { + "error": "#E24D42" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Shows ratio of errors compared to the total number of handled requests.", + "fill": 10, + "id": 12, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + 
"points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 4, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(grpc_server_handled_total{grpc_code=~\"Unknown|ResourceExhausted|Internal|Unavailable\",namespace=\"$namespace\",job=~\"$job\",grpc_type=\"server_stream\"}[$interval])) / sum(rate(grpc_server_started_total{namespace=\"$namespace\",job=~\"$job\",grpc_type=\"server_stream\"}[$interval]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "error", + "refId": "A", + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Errors", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "percentunit", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Shows how long has it taken to handle requests, in quantiles", + "fill": 1, + "id": 13, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 4, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "histogram_quantile(0.99, sum(rate(grpc_server_handling_seconds_bucket{namespace=\"$namespace\",job=~\"$job\",grpc_type=\"server_stream\"}[$interval])) by (job, le)) * 1", + "format": "time_series", + "intervalFactor": 2, + 
"legendFormat": "P99 {{job}}", + "legendLink": null, + "step": 10 + }, + { + "expr": "sum(rate(grpc_server_handling_seconds_sum{namespace=\"$namespace\",job=~\"$job\",grpc_type=\"server_stream\"}[$interval])) by (job) * 1\n/\nsum(rate(grpc_server_handling_seconds_count{namespace=\"$namespace\",job=~\"$job\",grpc_type=\"server_stream\"}[$interval])) by (job)\n", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "mean {{job}}", + "legendLink": null, + "step": 10 + }, + { + "expr": "histogram_quantile(0.50, sum(rate(grpc_server_handling_seconds_bucket{namespace=\"$namespace\",job=~\"$job\",grpc_type=\"server_stream\"}[$interval])) by (job, le)) * 1", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "P50 {{job}}", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Duration", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "gRPC (Stream)", + "titleSize": "h6" + }, + { + "collapse": true, + "height": "250px", + "panels": [ + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Shows rate of handled Streamed gRPC requests.", + "fill": 10, + "id": 14, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + 
"points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 4, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(grpc_server_handled_total{namespace=\"$namespace\",job=~\"$job\",grpc_type=\"server_stream\"}[$interval])) by (job, grpc_method, grpc_code)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{job}} {{grpc_method}} {{grpc_code}}", + "refId": "A", + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Rate", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Shows ratio of errors compared to the total number of handled requests.", + "fill": 10, + "id": 15, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 4, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(grpc_server_handled_total{grpc_code!=\"OK\",namespace=\"$namespace\",job=~\"$job\",grpc_type=\"server_stream\"}[$interval])) by (job, grpc_method, grpc_code)\n", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{job}} {{grpc_method}} {{grpc_code}}", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ 
], + "timeFrom": null, + "timeShift": null, + "title": "Errors", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Shows how long has it taken to handle requests, in quantiles", + "fill": 1, + "id": 16, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 4, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "histogram_quantile(0.99, sum(rate(grpc_server_handling_seconds_bucket{namespace=\"$namespace\",job=~\"$job\",grpc_type=\"server_stream\"}[$interval])) by (job, grpc_method, le)) * 1", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "P99 {{job}} {{grpc_method}}", + "legendLink": null, + "step": 10 + }, + { + "expr": "sum(rate(grpc_server_handling_seconds_sum{namespace=\"$namespace\",job=~\"$job\",grpc_type=\"server_stream\"}[$interval])) by (job) * 1\n/\nsum(rate(grpc_server_handling_seconds_count{namespace=\"$namespace\",job=~\"$job\",grpc_type=\"server_stream\"}[$interval])) by (job)\n", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "mean {{job}} {{grpc_method}}", + "legendLink": null, + "step": 10 + }, + { + "expr": "histogram_quantile(0.50, 
sum(rate(grpc_server_handling_seconds_bucket{namespace=\"$namespace\",job=~\"$job\",grpc_type=\"server_stream\"}[$interval])) by (job, grpc_method, le)) * 1", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "P50 {{job}} {{grpc_method}}", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Duration", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "Detailed", + "titleSize": "h6" + }, + { + "collapse": true, + "height": "250px", + "panels": [ + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "id": 17, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 4, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "go_memstats_alloc_bytes{namespace=\"$namespace\",job=~\"$job\",kubernetes_pod_name=~\"$pod\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "alloc all {{pod}}", + "legendLink": null, + "step": 10 + }, + { + "expr": "go_memstats_heap_alloc_bytes{namespace=\"$namespace\",job=~\"$job\",kubernetes_pod_name=~\"$pod\"}", + "format": "time_series", + "intervalFactor": 2, + 
"legendFormat": "alloc heap {{pod}}", + "legendLink": null, + "step": 10 + }, + { + "expr": "rate(go_memstats_alloc_bytes_total{namespace=\"$namespace\",job=~\"$job\",kubernetes_pod_name=~\"$pod\"}[30s])", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "alloc rate all {{pod}}", + "legendLink": null, + "step": 10 + }, + { + "expr": "rate(go_memstats_heap_alloc_bytes{namespace=\"$namespace\",job=~\"$job\",kubernetes_pod_name=~\"$pod\"}[30s])", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "alloc rate heap {{pod}}", + "legendLink": null, + "step": 10 + }, + { + "expr": "go_memstats_stack_inuse_bytes{namespace=\"$namespace\",job=~\"$job\",kubernetes_pod_name=~\"$pod\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "inuse stack {{pod}}", + "legendLink": null, + "step": 10 + }, + { + "expr": "go_memstats_heap_inuse_bytes{namespace=\"$namespace\",job=~\"$job\",kubernetes_pod_name=~\"$pod\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "inuse heap {{pod}}", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Memory Used", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "id": 18, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null as zero", + 
"percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 4, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "go_goroutines{namespace=\"$namespace\",job=~\"$job\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{pod}}", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Goroutines", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "id": 19, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 4, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "go_gc_duration_seconds{namespace=\"$namespace\",job=~\"$job\",kubernetes_pod_name=~\"$pod\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{quantile}} {{pod}}", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "GC Time Quantiles", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": 
true, + "values": [ ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "Resources", + "titleSize": "h6" + } + ], + "schemaVersion": 14, + "style": "dark", + "tags": [ + "thanos-mixin" + ], + "templating": { + "list": [ + { + "current": { + "text": "Prometheus", + "value": "Prometheus" + }, + "hide": 0, + "label": null, + "name": "datasource", + "options": [ ], + "query": "prometheus", + "refresh": 1, + "regex": "", + "type": "datasource" + }, + { + "allValue": null, + "current": { }, + "datasource": "$datasource", + "hide": 0, + "includeAll": false, + "label": "namespace", + "multi": false, + "name": "namespace", + "options": [ ], + "query": "label_values(kube_pod_info{}, namespace)", + "refresh": 1, + "regex": "", + "sort": 2, + "tagValuesQuery": "", + "tags": [ ], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": "thanos-ruler.*", + "current": { + "text": "all", + "value": "$__all" + }, + "datasource": "$datasource", + "hide": 0, + "includeAll": true, + "label": "job", + "multi": false, + "name": "job", + "options": [ ], + "query": "label_values(up{namespace=\"$namespace\",job=~\"thanos-ruler.*\"}, job)", + "refresh": 1, + "regex": "", + "sort": 2, + "tagValuesQuery": "", + "tags": [ ], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": ".*", + "current": { + "text": "all", + "value": "$__all" + }, + "datasource": "$datasource", + "hide": 0, + "includeAll": true, + "label": "pod", + "multi": false, + "name": "pod", + "options": [ ], + "query": "label_values(kube_pod_info{namespace=\"$namespace\",created_by_name=~\"thanos-ruler.*\"}, pod)", + "refresh": 1, + "regex": "", + "sort": 2, + "tagValuesQuery": "", + "tags": 
[ ], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "auto": true, + "auto_count": 300, + "auto_min": "10s", + "current": { + "text": "5m", + "value": "5m" + }, + "hide": 0, + "label": "interval", + "name": "interval", + "query": "5m,10m,30m,1h,6h,12h", + "refresh": 2, + "type": "interval" + } + ] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "", + "title": "Thanos / Ruler", + "uid": "ecc62fafbb8ae0213cb9188ec4f3b553", + "version": 0 +} diff --git a/examples/dashboards/sidecar.json b/examples/dashboards/sidecar.json new file mode 100644 index 0000000000..f0d5bbb432 --- /dev/null +++ b/examples/dashboards/sidecar.json @@ -0,0 +1,1889 @@ +{ + "annotations": { + "list": [ ] + }, + "editable": true, + "gnetId": null, + "graphTooltip": 0, + "hideControls": false, + "links": [ ], + "refresh": "10s", + "rows": [ + { + "collapse": false, + "height": "250px", + "panels": [ + { + "aliasColors": { + "Aborted": "#EAB839", + "AlreadyExists": "#7EB26D", + "Canceled": "#E24D42", + "DataLoss": "#E24D42", + "DeadlineExceeded": "#E24D42", + "FailedPrecondition": "#6ED0E0", + "Internal": "#E24D42", + "InvalidArgument": "#EF843C", + "NotFound": "#EF843C", + "OK": "#7EB26D", + "OutOfRange": "#E24D42", + "PermissionDenied": "#EF843C", + "ResourceExhausted": "#E24D42", + "Unauthenticated": "#EF843C", + "Unavailable": "#E24D42", + "Unimplemented": "#6ED0E0", + "Unknown": "#E24D42", + "error": "#E24D42" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Shows rate of handled Unary gRPC requests from queriers.", + "fill": 10, + "id": 1, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + 
"values": false + }, + "lines": true, + "linewidth": 0, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 4, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(grpc_server_handled_total{namespace=\"$namespace\",job=~\"$job\",grpc_type=\"unary\"}[$interval])) by (job, grpc_code)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{job}} {{grpc_code}}", + "refId": "A", + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Rate", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { + "error": "#E24D42" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Shows ratio of errors compared to the total number of handled requests from queriers.", + "fill": 10, + "id": 2, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 4, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(grpc_server_handled_total{grpc_code=~\"Unknown|ResourceExhausted|Internal|Unavailable\",namespace=\"$namespace\",job=~\"$job\",grpc_type=\"unary\"}[$interval])) / 
sum(rate(grpc_server_started_total{namespace=\"$namespace\",job=~\"$job\",grpc_type=\"unary\"}[$interval]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "error", + "refId": "A", + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Errors", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "percentunit", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Shows how long has it taken to handle requests from queriers, in quantiles.", + "fill": 1, + "id": 3, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 4, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "histogram_quantile(0.99, sum(rate(grpc_server_handling_seconds_bucket{namespace=\"$namespace\",job=~\"$job\",grpc_type=\"unary\"}[$interval])) by (job, le)) * 1", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "P99 {{job}}", + "legendLink": null, + "step": 10 + }, + { + "expr": "sum(rate(grpc_server_handling_seconds_sum{namespace=\"$namespace\",job=~\"$job\",grpc_type=\"unary\"}[$interval])) by (job) * 1\n/\nsum(rate(grpc_server_handling_seconds_count{namespace=\"$namespace\",job=~\"$job\",grpc_type=\"unary\"}[$interval])) by (job)\n", + 
"format": "time_series", + "intervalFactor": 2, + "legendFormat": "mean {{job}}", + "legendLink": null, + "step": 10 + }, + { + "expr": "histogram_quantile(0.50, sum(rate(grpc_server_handling_seconds_bucket{namespace=\"$namespace\",job=~\"$job\",grpc_type=\"unary\"}[$interval])) by (job, le)) * 1", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "P50 {{job}}", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Duration", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "gRPC (Unary)", + "titleSize": "h6" + }, + { + "collapse": true, + "height": "250px", + "panels": [ + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Shows rate of handled Unary gRPC requests from queriers.", + "fill": 10, + "id": 4, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 4, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(grpc_server_handled_total{namespace=\"$namespace\",job=~\"$job\",grpc_type=\"unary\"}[$interval])) by (job, grpc_method, grpc_code)", + "format": "time_series", + "intervalFactor": 2, + 
"legendFormat": "{{job}} {{grpc_method}} {{grpc_code}}", + "refId": "A", + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Rate", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Shows ratio of errors compared to the total number of handled requests from queriers.", + "fill": 10, + "id": 5, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 4, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(grpc_server_handled_total{grpc_code!=\"OK\",namespace=\"$namespace\",job=~\"$job\",grpc_type=\"unary\"}[$interval])) by (job, grpc_method, grpc_code)\n", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{job}} {{grpc_method}} {{grpc_code}}", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Errors", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + 
"show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Shows how long has it taken to handle requests from queriers, in quantiles.", + "fill": 1, + "id": 6, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 4, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "histogram_quantile(0.99, sum(rate(grpc_server_handling_seconds_bucket{namespace=\"$namespace\",job=~\"$job\",grpc_type=\"unary\"}[$interval])) by (job, grpc_method, le)) * 1", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "P99 {{job}} {{grpc_method}}", + "legendLink": null, + "step": 10 + }, + { + "expr": "sum(rate(grpc_server_handling_seconds_sum{namespace=\"$namespace\",job=~\"$job\",grpc_type=\"unary\"}[$interval])) by (job) * 1\n/\nsum(rate(grpc_server_handling_seconds_count{namespace=\"$namespace\",job=~\"$job\",grpc_type=\"unary\"}[$interval])) by (job)\n", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "mean {{job}} {{grpc_method}}", + "legendLink": null, + "step": 10 + }, + { + "expr": "histogram_quantile(0.50, sum(rate(grpc_server_handling_seconds_bucket{namespace=\"$namespace\",job=~\"$job\",grpc_type=\"unary\"}[$interval])) by (job, grpc_method, le)) * 1", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "P50 {{job}} {{grpc_method}}", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Duration", + "tooltip": { + "shared": false, + 
"sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "Detailed", + "titleSize": "h6" + }, + { + "collapse": false, + "height": "250px", + "panels": [ + { + "aliasColors": { + "Aborted": "#EAB839", + "AlreadyExists": "#7EB26D", + "Canceled": "#E24D42", + "DataLoss": "#E24D42", + "DeadlineExceeded": "#E24D42", + "FailedPrecondition": "#6ED0E0", + "Internal": "#E24D42", + "InvalidArgument": "#EF843C", + "NotFound": "#EF843C", + "OK": "#7EB26D", + "OutOfRange": "#E24D42", + "PermissionDenied": "#EF843C", + "ResourceExhausted": "#E24D42", + "Unauthenticated": "#EF843C", + "Unavailable": "#E24D42", + "Unimplemented": "#6ED0E0", + "Unknown": "#E24D42", + "error": "#E24D42" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Shows rate of handled Streamed gRPC requests from queriers.", + "fill": 10, + "id": 7, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 4, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(grpc_server_handled_total{namespace=\"$namespace\",job=~\"$job\",grpc_type=\"server_stream\"}[$interval])) by (job, grpc_code)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{job}} {{grpc_code}}", + 
"refId": "A", + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Rate", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { + "error": "#E24D42" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 10, + "id": 8, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 4, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(grpc_server_handled_total{grpc_code=~\"Unknown|ResourceExhausted|Internal|Unavailable\",namespace=\"$namespace\",job=~\"$job\",grpc_type=\"server_stream\"}[$interval])) / sum(rate(grpc_server_started_total{namespace=\"$namespace\",job=~\"$job\",grpc_type=\"server_stream\"}[$interval]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "error", + "refId": "A", + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Errors", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "percentunit", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": 
"short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Shows how long has it taken to handle requests from queriers, in quantiles.", + "fill": 1, + "id": 9, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 4, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "histogram_quantile(0.99, sum(rate(grpc_server_handling_seconds_bucket{namespace=\"$namespace\",job=~\"$job\",grpc_type=\"server_stream\"}[$interval])) by (job, le)) * 1", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "P99 {{job}}", + "legendLink": null, + "step": 10 + }, + { + "expr": "sum(rate(grpc_server_handling_seconds_sum{namespace=\"$namespace\",job=~\"$job\",grpc_type=\"server_stream\"}[$interval])) by (job) * 1\n/\nsum(rate(grpc_server_handling_seconds_count{namespace=\"$namespace\",job=~\"$job\",grpc_type=\"server_stream\"}[$interval])) by (job)\n", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "mean {{job}}", + "legendLink": null, + "step": 10 + }, + { + "expr": "histogram_quantile(0.50, sum(rate(grpc_server_handling_seconds_bucket{namespace=\"$namespace\",job=~\"$job\",grpc_type=\"server_stream\"}[$interval])) by (job, le)) * 1", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "P50 {{job}}", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Duration", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + 
"buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "gRPC (Stream)", + "titleSize": "h6" + }, + { + "collapse": true, + "height": "250px", + "panels": [ + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Shows rate of handled Streamed gRPC requests from queriers.", + "fill": 10, + "id": 10, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 4, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(grpc_client_handled_total{namespace=\"$namespace\",job=~\"$job\",grpc_type=\"server_stream\"}[$interval])) by (job, grpc_method, grpc_code)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{job}} {{grpc_method}} {{grpc_code}}", + "refId": "A", + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Rate", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, 
+ { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Shows ratio of errors compared to the total number of handled requests from queriers.", + "fill": 10, + "id": 11, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 4, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(grpc_client_handled_total{grpc_code!=\"OK\",namespace=\"$namespace\",job=~\"$job\",grpc_type=\"server_stream\"}[$interval])) by (job, grpc_method, grpc_code)\n", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{job}} {{grpc_method}} {{grpc_code}}", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Errors", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Shows how long has it taken to handle requests from queriers, in quantiles.", + "fill": 1, + "id": 12, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null as zero", + 
"percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 4, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "histogram_quantile(0.99, sum(rate(grpc_client_handling_seconds_bucket{namespace=\"$namespace\",job=~\"$job\",grpc_type=\"server_stream\"}[$interval])) by (job, grpc_method, le)) * 1", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "P99 {{job}} {{grpc_method}}", + "legendLink": null, + "step": 10 + }, + { + "expr": "sum(rate(grpc_client_handling_seconds_sum{namespace=\"$namespace\",job=~\"$job\",grpc_type=\"server_stream\"}[$interval])) by (job) * 1\n/\nsum(rate(grpc_client_handling_seconds_count{namespace=\"$namespace\",job=~\"$job\",grpc_type=\"server_stream\"}[$interval])) by (job)\n", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "mean {{job}} {{grpc_method}}", + "legendLink": null, + "step": 10 + }, + { + "expr": "histogram_quantile(0.50, sum(rate(grpc_client_handling_seconds_bucket{namespace=\"$namespace\",job=~\"$job\",grpc_type=\"server_stream\"}[$interval])) by (job, grpc_method, le)) * 1", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "P50 {{job}} {{grpc_method}}", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Duration", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "Detailed", + "titleSize": "h6" + }, + { + "collapse": false, + 
"height": "250px", + "panels": [ + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Shows the relative time of last successful upload to the object-store bucket.", + "fill": 1, + "id": 13, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 12, + "stack": false, + "steppedLine": false, + "styles": [ + { + "alias": "Time", + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "pattern": "Time", + "type": "hidden" + }, + { + "alias": "Uploaded Ago", + "colorMode": null, + "colors": [ ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "Value", + "thresholds": [ ], + "type": "number", + "unit": "s" + }, + { + "alias": "", + "colorMode": null, + "colors": [ ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "pattern": "/.*/", + "thresholds": [ ], + "type": "string", + "unit": "short" + } + ], + "targets": [ + { + "expr": "time() - max(thanos_objstore_bucket_last_successful_upload_time{namespace=\"$namespace\",job=~\"$job\"}) by (job, bucket)", + "format": "table", + "instant": true, + "intervalFactor": 2, + "legendFormat": "", + "refId": "A", + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Successful Upload", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "transform": "table", + "type": "table", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + 
"format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "Last Updated", + "titleSize": "h6" + }, + { + "collapse": false, + "height": "250px", + "panels": [ + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 10, + "id": 14, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 4, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(thanos_objstore_bucket_operations_total{namespace=\"$namespace\",job=~\"$job\"}[$interval])) by (job, operation)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{job}} {{operation}}", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Rate", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { + "error": "#E24D42" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 10, + "id": 15, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + 
"linewidth": 0, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 4, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(thanos_objstore_bucket_operation_failures_total{namespace=\"$namespace\",job=~\"$job\"}[$interval])) / sum(rate(thanos_objstore_bucket_operations_total{namespace=\"$namespace\",job=~\"$job\"}[$interval]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "error", + "refId": "A", + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Errors", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "percentunit", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "id": 16, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 4, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "histogram_quantile(0.99, sum(rate(thanos_objstore_bucket_operation_duration_seconds_bucket{namespace=\"$namespace\",job=~\"$job\"}[$interval])) by (job, le)) * 1", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "P99 {{job}}", + "refId": "A", + "step": 10 + }, 
+ { + "expr": "sum(rate(thanos_objstore_bucket_operation_duration_seconds_sum{namespace=\"$namespace\",job=~\"$job\"}[$interval])) by (job) * 1 / sum(rate(thanos_objstore_bucket_operation_duration_seconds_count{namespace=\"$namespace\",job=~\"$job\"}[$interval])) by (job)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "mean {{job}}", + "refId": "B", + "step": 10 + }, + { + "expr": "histogram_quantile(0.50, sum(rate(thanos_objstore_bucket_operation_duration_seconds_bucket{namespace=\"$namespace\",job=~\"$job\"}[$interval])) by (job, le)) * 1", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "P50 {{job}}", + "refId": "C", + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Duration", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "Bucket Operations", + "titleSize": "h6" + }, + { + "collapse": true, + "height": "250px", + "panels": [ + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "id": 17, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 4, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": 
"go_memstats_alloc_bytes{namespace=\"$namespace\",job=~\"$job\",kubernetes_pod_name=~\"$pod\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "alloc all {{pod}}", + "legendLink": null, + "step": 10 + }, + { + "expr": "go_memstats_heap_alloc_bytes{namespace=\"$namespace\",job=~\"$job\",kubernetes_pod_name=~\"$pod\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "alloc heap {{pod}}", + "legendLink": null, + "step": 10 + }, + { + "expr": "rate(go_memstats_alloc_bytes_total{namespace=\"$namespace\",job=~\"$job\",kubernetes_pod_name=~\"$pod\"}[30s])", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "alloc rate all {{pod}}", + "legendLink": null, + "step": 10 + }, + { + "expr": "rate(go_memstats_heap_alloc_bytes{namespace=\"$namespace\",job=~\"$job\",kubernetes_pod_name=~\"$pod\"}[30s])", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "alloc rate heap {{pod}}", + "legendLink": null, + "step": 10 + }, + { + "expr": "go_memstats_stack_inuse_bytes{namespace=\"$namespace\",job=~\"$job\",kubernetes_pod_name=~\"$pod\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "inuse stack {{pod}}", + "legendLink": null, + "step": 10 + }, + { + "expr": "go_memstats_heap_inuse_bytes{namespace=\"$namespace\",job=~\"$job\",kubernetes_pod_name=~\"$pod\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "inuse heap {{pod}}", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Memory Used", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + 
"show": false + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "id": 18, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 4, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "go_goroutines{namespace=\"$namespace\",job=~\"$job\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{pod}}", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Goroutines", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "id": 19, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 4, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "go_gc_duration_seconds{namespace=\"$namespace\",job=~\"$job\",kubernetes_pod_name=~\"$pod\"}", + "format": 
"time_series", + "intervalFactor": 2, + "legendFormat": "{{quantile}} {{pod}}", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "GC Time Quantiles", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "Resources", + "titleSize": "h6" + } + ], + "schemaVersion": 14, + "style": "dark", + "tags": [ + "thanos-mixin" + ], + "templating": { + "list": [ + { + "current": { + "text": "Prometheus", + "value": "Prometheus" + }, + "hide": 0, + "label": null, + "name": "datasource", + "options": [ ], + "query": "prometheus", + "refresh": 1, + "regex": "", + "type": "datasource" + }, + { + "allValue": null, + "current": { }, + "datasource": "$datasource", + "hide": 0, + "includeAll": false, + "label": "namespace", + "multi": false, + "name": "namespace", + "options": [ ], + "query": "label_values(kube_pod_info{}, namespace)", + "refresh": 1, + "regex": "", + "sort": 2, + "tagValuesQuery": "", + "tags": [ ], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": "thanos-sidecar.*", + "current": { + "text": "all", + "value": "$__all" + }, + "datasource": "$datasource", + "hide": 0, + "includeAll": true, + "label": "job", + "multi": false, + "name": "job", + "options": [ ], + "query": "label_values(up{namespace=\"$namespace\",job=~\"thanos-sidecar.*\"}, job)", + "refresh": 1, + "regex": "", + "sort": 2, + "tagValuesQuery": "", + "tags": [ ], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + 
"allValue": ".*", + "current": { + "text": "all", + "value": "$__all" + }, + "datasource": "$datasource", + "hide": 0, + "includeAll": true, + "label": "pod", + "multi": false, + "name": "pod", + "options": [ ], + "query": "label_values(kube_pod_info{namespace=\"$namespace\",created_by_name=~\"thanos-sidecar.*\"}, pod)", + "refresh": 1, + "regex": "", + "sort": 2, + "tagValuesQuery": "", + "tags": [ ], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "auto": true, + "auto_count": 300, + "auto_min": "10s", + "current": { + "text": "5m", + "value": "5m" + }, + "hide": 0, + "label": "interval", + "name": "interval", + "query": "5m,10m,30m,1h,6h,12h", + "refresh": 2, + "type": "interval" + } + ] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "", + "title": "Thanos / Sidecar", + "uid": "b19644bfbf0ec1e108027cce268d99f7", + "version": 0 +} diff --git a/examples/dashboards/store.json b/examples/dashboards/store.json new file mode 100644 index 0000000000..4b0f0aaba9 --- /dev/null +++ b/examples/dashboards/store.json @@ -0,0 +1,3115 @@ +{ + "annotations": { + "list": [ ] + }, + "editable": true, + "gnetId": null, + "graphTooltip": 0, + "hideControls": false, + "links": [ ], + "refresh": "10s", + "rows": [ + { + "collapse": false, + "height": "250px", + "panels": [ + { + "aliasColors": { + "Aborted": "#EAB839", + "AlreadyExists": "#7EB26D", + "Canceled": "#E24D42", + "DataLoss": "#E24D42", + "DeadlineExceeded": "#E24D42", + "FailedPrecondition": "#6ED0E0", + "Internal": "#E24D42", + "InvalidArgument": "#EF843C", + "NotFound": "#EF843C", + "OK": "#7EB26D", + "OutOfRange": "#E24D42", + "PermissionDenied": "#EF843C", + "ResourceExhausted": "#E24D42", + "Unauthenticated": "#EF843C", + "Unavailable": 
"#E24D42", + "Unimplemented": "#6ED0E0", + "Unknown": "#E24D42", + "error": "#E24D42" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Shows rate of handled Unary gRPC requests from queriers.", + "fill": 10, + "id": 1, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 4, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(grpc_server_handled_total{namespace=\"$namespace\",job=~\"$job\",grpc_type=\"unary\"}[$interval])) by (job, grpc_code)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{job}} {{grpc_code}}", + "refId": "A", + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Rate", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { + "error": "#E24D42" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Shows ratio of errors compared to the total number of handled requests from queriers.", + "fill": 10, + "id": 2, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ ], + "nullPointMode": "null as zero", + 
"percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 4, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(grpc_server_handled_total{grpc_code=~\"Unknown|ResourceExhausted|Internal|Unavailable\",namespace=\"$namespace\",job=~\"$job\",grpc_type=\"unary\"}[$interval])) / sum(rate(grpc_server_started_total{namespace=\"$namespace\",job=~\"$job\",grpc_type=\"unary\"}[$interval]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "error", + "refId": "A", + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Errors", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "percentunit", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Shows how long has it taken to handle requests from queriers, in quantiles.", + "fill": 1, + "id": 3, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 4, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "histogram_quantile(0.99, sum(rate(grpc_server_handling_seconds_bucket{namespace=\"$namespace\",job=~\"$job\",grpc_type=\"unary\"}[$interval])) by (job, le)) * 1", + "format": 
"time_series", + "intervalFactor": 2, + "legendFormat": "P99 {{job}}", + "legendLink": null, + "step": 10 + }, + { + "expr": "sum(rate(grpc_server_handling_seconds_sum{namespace=\"$namespace\",job=~\"$job\",grpc_type=\"unary\"}[$interval])) by (job) * 1\n/\nsum(rate(grpc_server_handling_seconds_count{namespace=\"$namespace\",job=~\"$job\",grpc_type=\"unary\"}[$interval])) by (job)\n", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "mean {{job}}", + "legendLink": null, + "step": 10 + }, + { + "expr": "histogram_quantile(0.50, sum(rate(grpc_server_handling_seconds_bucket{namespace=\"$namespace\",job=~\"$job\",grpc_type=\"unary\"}[$interval])) by (job, le)) * 1", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "P50 {{job}}", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Duration", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "gRPC (Unary)", + "titleSize": "h6" + }, + { + "collapse": true, + "height": "250px", + "panels": [ + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Shows rate of handled Unary gRPC requests from queriers.", + "fill": 10, + "id": 4, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": 
false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 4, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(grpc_server_handled_total{namespace=\"$namespace\",job=~\"$job\",grpc_type=\"unary\"}[$interval])) by (job, grpc_method, grpc_code)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{job}} {{grpc_method}} {{grpc_code}}", + "refId": "A", + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Rate", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Shows ratio of errors compared to the total number of handled requests from queriers.", + "fill": 10, + "id": 5, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 4, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(grpc_server_handled_total{grpc_code!=\"OK\",namespace=\"$namespace\",job=~\"$job\",grpc_type=\"unary\"}[$interval])) by (job, grpc_method, grpc_code)\n", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{job}} {{grpc_method}} {{grpc_code}}", + "legendLink": null, + "step": 10 
+ } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Errors", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Shows how long has it taken to handle requests from queriers, in quantiles.", + "fill": 1, + "id": 6, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 4, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "histogram_quantile(0.99, sum(rate(grpc_server_handling_seconds_bucket{namespace=\"$namespace\",job=~\"$job\",grpc_type=\"unary\"}[$interval])) by (job, grpc_method, le)) * 1", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "P99 {{job}} {{grpc_method}}", + "legendLink": null, + "step": 10 + }, + { + "expr": "sum(rate(grpc_server_handling_seconds_sum{namespace=\"$namespace\",job=~\"$job\",grpc_type=\"unary\"}[$interval])) by (job) * 1\n/\nsum(rate(grpc_server_handling_seconds_count{namespace=\"$namespace\",job=~\"$job\",grpc_type=\"unary\"}[$interval])) by (job)\n", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "mean {{job}} {{grpc_method}}", + "legendLink": null, + "step": 10 + }, + { + "expr": "histogram_quantile(0.50, 
sum(rate(grpc_server_handling_seconds_bucket{namespace=\"$namespace\",job=~\"$job\",grpc_type=\"unary\"}[$interval])) by (job, grpc_method, le)) * 1", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "P50 {{job}} {{grpc_method}}", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Duration", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "Detailed", + "titleSize": "h6" + }, + { + "collapse": false, + "height": "250px", + "panels": [ + { + "aliasColors": { + "Aborted": "#EAB839", + "AlreadyExists": "#7EB26D", + "Canceled": "#E24D42", + "DataLoss": "#E24D42", + "DeadlineExceeded": "#E24D42", + "FailedPrecondition": "#6ED0E0", + "Internal": "#E24D42", + "InvalidArgument": "#EF843C", + "NotFound": "#EF843C", + "OK": "#7EB26D", + "OutOfRange": "#E24D42", + "PermissionDenied": "#EF843C", + "ResourceExhausted": "#E24D42", + "Unauthenticated": "#EF843C", + "Unavailable": "#E24D42", + "Unimplemented": "#6ED0E0", + "Unknown": "#E24D42", + "error": "#E24D42" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Shows rate of handled Streamed gRPC requests from queriers.", + "fill": 10, + "id": 7, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + 
"pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 4, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(grpc_server_handled_total{namespace=\"$namespace\",job=~\"$job\",grpc_type=\"server_stream\"}[$interval])) by (job, grpc_code)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{job}} {{grpc_code}}", + "refId": "A", + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Rate", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { + "error": "#E24D42" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Shows ratio of errors compared to the total number of handled requests from queriers.", + "fill": 10, + "id": 8, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 4, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(grpc_server_handled_total{grpc_code=~\"Unknown|ResourceExhausted|Internal|Unavailable\",namespace=\"$namespace\",job=~\"$job\",grpc_type=\"server_stream\"}[$interval])) / sum(rate(grpc_server_started_total{namespace=\"$namespace\",job=~\"$job\",grpc_type=\"server_stream\"}[$interval]))", + "format": 
"time_series", + "intervalFactor": 2, + "legendFormat": "error", + "refId": "A", + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Errors", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "percentunit", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Shows how long has it taken to handle requests from queriers, in quantiles.", + "fill": 1, + "id": 9, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 4, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "histogram_quantile(0.99, sum(rate(grpc_server_handling_seconds_bucket{namespace=\"$namespace\",job=~\"$job\",grpc_type=\"server_stream\"}[$interval])) by (job, le)) * 1", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "P99 {{job}}", + "legendLink": null, + "step": 10 + }, + { + "expr": "sum(rate(grpc_server_handling_seconds_sum{namespace=\"$namespace\",job=~\"$job\",grpc_type=\"server_stream\"}[$interval])) by (job) * 1\n/\nsum(rate(grpc_server_handling_seconds_count{namespace=\"$namespace\",job=~\"$job\",grpc_type=\"server_stream\"}[$interval])) by (job)\n", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "mean {{job}}", + "legendLink": null, + 
"step": 10 + }, + { + "expr": "histogram_quantile(0.50, sum(rate(grpc_server_handling_seconds_bucket{namespace=\"$namespace\",job=~\"$job\",grpc_type=\"server_stream\"}[$interval])) by (job, le)) * 1", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "P50 {{job}}", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Duration", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "gRPC (Stream)", + "titleSize": "h6" + }, + { + "collapse": true, + "height": "250px", + "panels": [ + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Shows rate of handled Streamed gRPC requests from queriers.", + "fill": 10, + "id": 10, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 4, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(grpc_client_handled_total{namespace=\"$namespace\",job=~\"$job\",grpc_type=\"server_stream\"}[$interval])) by (job, grpc_method, grpc_code)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{job}} {{grpc_method}} {{grpc_code}}", + "refId": "A", + "step": 
10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Rate", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Shows ratio of errors compared to the total number of handled requests from queriers.", + "fill": 10, + "id": 11, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 4, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(grpc_client_handled_total{grpc_code!=\"OK\",namespace=\"$namespace\",job=~\"$job\",grpc_type=\"server_stream\"}[$interval])) by (job, grpc_method, grpc_code)\n", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{job}} {{grpc_method}} {{grpc_code}}", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Errors", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 
1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Shows how long has it taken to handle requests from queriers, in quantiles.", + "fill": 1, + "id": 12, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 4, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "histogram_quantile(0.99, sum(rate(grpc_client_handling_seconds_bucket{namespace=\"$namespace\",job=~\"$job\",grpc_type=\"server_stream\"}[$interval])) by (job, grpc_method, le)) * 1", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "P99 {{job}} {{grpc_method}}", + "legendLink": null, + "step": 10 + }, + { + "expr": "sum(rate(grpc_client_handling_seconds_sum{namespace=\"$namespace\",job=~\"$job\",grpc_type=\"server_stream\"}[$interval])) by (job) * 1\n/\nsum(rate(grpc_client_handling_seconds_count{namespace=\"$namespace\",job=~\"$job\",grpc_type=\"server_stream\"}[$interval])) by (job)\n", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "mean {{job}} {{grpc_method}}", + "legendLink": null, + "step": 10 + }, + { + "expr": "histogram_quantile(0.50, sum(rate(grpc_client_handling_seconds_bucket{namespace=\"$namespace\",job=~\"$job\",grpc_type=\"server_stream\"}[$interval])) by (job, grpc_method, le)) * 1", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "P50 {{job}} {{grpc_method}}", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Duration", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + 
}, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "Detailed", + "titleSize": "h6" + }, + { + "collapse": false, + "height": "250px", + "panels": [ + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Shows rate of execution for operations against the bucket.", + "fill": 10, + "id": 13, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 4, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(thanos_objstore_bucket_operations_total{namespace=\"$namespace\",job=~\"$job\"}[$interval])) by (job, operation)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{job}} {{operation}}", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Rate", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { 
+ "aliasColors": { + "error": "#E24D42" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Shows ratio of errors compared to the total number of executed operations against the bucket.", + "fill": 10, + "id": 14, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 4, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(thanos_objstore_bucket_operation_failures_total{namespace=\"$namespace\",job=~\"$job\"}[$interval])) / sum(rate(thanos_objstore_bucket_operations_total{namespace=\"$namespace\",job=~\"$job\"}[$interval]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "error", + "refId": "A", + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Errors", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "percentunit", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Shows how long has it taken to execute operations against the bucket, in quantiles.", + "fill": 1, + "id": 15, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ 
], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 4, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "histogram_quantile(0.99, sum(rate(thanos_objstore_bucket_operation_duration_seconds_bucket{namespace=\"$namespace\",job=~\"$job\"}[$interval])) by (job, le)) * 1", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "P99 {{job}}", + "refId": "A", + "step": 10 + }, + { + "expr": "sum(rate(thanos_objstore_bucket_operation_duration_seconds_sum{namespace=\"$namespace\",job=~\"$job\"}[$interval])) by (job) * 1 / sum(rate(thanos_objstore_bucket_operation_duration_seconds_count{namespace=\"$namespace\",job=~\"$job\"}[$interval])) by (job)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "mean {{job}}", + "refId": "B", + "step": 10 + }, + { + "expr": "histogram_quantile(0.50, sum(rate(thanos_objstore_bucket_operation_duration_seconds_bucket{namespace=\"$namespace\",job=~\"$job\"}[$interval])) by (job, le)) * 1", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "P50 {{job}}", + "refId": "C", + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Duration", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "Bucket Operations", + "titleSize": "h6" + }, + { + "collapse": false, + "height": "250px", + "panels": [ + { + "aliasColors": { }, + "bars": false, 
+ "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Shows rate of block loads from the bucket.", + "fill": 10, + "id": 16, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 3, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(thanos_bucket_store_block_loads_total{namespace=\"$namespace\",job=~\"$job\"}[$interval]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "block loads", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Block Load Rate", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { + "error": "#E24D42" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Shows ratio of errors compared to the total number of block loads from the bucket.", + "fill": 10, + "id": 17, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 3, + 
"stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(thanos_bucket_store_block_load_failures_total{namespace=\"$namespace\",job=~\"$job\"}[$interval])) / sum(rate(thanos_bucket_store_block_loads_total{namespace=\"$namespace\",job=~\"$job\"}[$interval]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "error", + "refId": "A", + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Block Load Errors", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "percentunit", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Shows rate of block drops.", + "fill": 10, + "id": 18, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 3, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(thanos_bucket_store_block_drops_total{namespace=\"$namespace\",job=~\"$job\"}[$interval])) by (job, operation)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "block drops {{job}}", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Block Drop Rate", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": 
"graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { + "error": "#E24D42" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Shows ratio of errors compared to the total number of block drops.", + "fill": 10, + "id": 19, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 3, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(thanos_bucket_store_block_drop_failures_total{namespace=\"$namespace\",job=~\"$job\"}[$interval])) / sum(rate(thanos_bucket_store_block_drops_total{namespace=\"$namespace\",job=~\"$job\"}[$interval]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "error", + "refId": "A", + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Block Drop Errors", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "percentunit", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + 
"title": "Block Operations", + "titleSize": "h6" + }, + { + "collapse": false, + "height": "250px", + "panels": [ + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Show rate of cache requests.", + "fill": 10, + "id": 20, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 3, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(thanos_store_index_cache_requests_total{namespace=\"$namespace\",job=~\"$job\"}[$interval])) by (job, item_type)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{job}} {{item_type}}", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Requests", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Shows ratio of errors compared to the total number of cache hits.", + "fill": 10, + "id": 21, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, 
+ "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 3, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(thanos_store_index_cache_hits_total{namespace=\"$namespace\",job=~\"$job\"}[$interval])) by (job, item_type)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{job}} {{item_type}}", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Hits", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Show rate of added items to cache.", + "fill": 10, + "id": 22, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 3, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(thanos_store_index_cache_items_added_total{namespace=\"$namespace\",job=~\"$job\"}[$interval])) by (job, item_type)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{job}} {{item_type}}", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Added", + "tooltip": { + "shared": false, + "sort": 0, + 
"value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Show rate of evicted items from cache.", + "fill": 10, + "id": 23, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 3, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(thanos_store_index_cache_items_evicted_total{namespace=\"$namespace\",job=~\"$job\"}[$interval])) by (job, item_type)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{job}} {{item_type}}", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Evicted", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "Cache Operations", + "titleSize": "h6" + }, + { + "collapse": false, + 
"height": "250px", + "panels": [ + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Shows size of chunks that have sent to the bucket, in bytes.", + "fill": 1, + "id": 24, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 12, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "histogram_quantile(0.99, sum(rate(thanos_bucket_store_sent_chunk_size_bytes_bucket{namespace=\"$namespace\",job=~\"$job\"}[$interval])) by (job, le))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "P99", + "legendLink": null, + "step": 10 + }, + { + "expr": "sum(rate(thanos_bucket_store_sent_chunk_size_bytes_sum{namespace=\"$namespace\",job=~\"$job\"}[$interval])) by (job) / sum(rate(thanos_bucket_store_sent_chunk_size_bytes_count{namespace=\"$namespace\",job=~\"$job\"}[$interval])) by (job)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "mean", + "legendLink": null, + "step": 10 + }, + { + "expr": "histogram_quantile(0.99, sum(rate(thanos_bucket_store_sent_chunk_size_bytes_bucket{namespace=\"$namespace\",job=~\"$job\"}[$interval])) by (job, le))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "P50", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Chunk Size", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + 
"min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "Store Sent", + "titleSize": "h6", + "yaxes": [ + { + "format": "decbytes", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "collapse": false, + "height": "250px", + "panels": [ + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "id": 25, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 4, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "thanos_bucket_store_series_blocks_queried{namespace=\"$namespace\",job=~\"$job\",quantile=\"0.99\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "P99", + "legendLink": null, + "step": 10 + }, + { + "expr": "sum(rate(thanos_bucket_store_series_blocks_queried_sum{namespace=\"$namespace\",job=~\"$job\"}[$interval])) by (job) / sum(rate(thanos_bucket_store_series_blocks_queried_count{namespace=\"$namespace\",job=~\"$job\"}[$interval])) by (job)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "mean {{job}}", + "legendLink": null, + "step": 10 + }, + { + "expr": "thanos_bucket_store_series_blocks_queried{namespace=\"$namespace\",job=~\"$job\",quantile=\"0.50\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "P50", + "legendLink": null, + "step": 10 + } + ], + 
"thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Block queried", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "id": 26, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 4, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "thanos_bucket_store_series_data_fetched{namespace=\"$namespace\",job=~\"$job\",quantile=\"0.99\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "P99", + "legendLink": null, + "step": 10 + }, + { + "expr": "sum(rate(thanos_bucket_store_series_data_fetched_sum{namespace=\"$namespace\",job=~\"$job\"}[$interval])) by (job) / sum(rate(thanos_bucket_store_series_data_fetched_count{namespace=\"$namespace\",job=~\"$job\"}[$interval])) by (job)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "mean {{job}}", + "legendLink": null, + "step": 10 + }, + { + "expr": "thanos_bucket_store_series_data_fetched{namespace=\"$namespace\",job=~\"$job\",quantile=\"0.50\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "P50", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + 
"title": "Data Fetched", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "id": 27, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 4, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "thanos_bucket_store_series_result_series{namespace=\"$namespace\",job=~\"$job\",quantile=\"0.99\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "P99", + "legendLink": null, + "step": 10 + }, + { + "expr": "sum(rate(thanos_bucket_store_series_result_series_sum{namespace=\"$namespace\",job=~\"$job\"}[$interval])) by (job) / sum(rate(thanos_bucket_store_series_result_series_count{namespace=\"$namespace\",job=~\"$job\"}[$interval])) by (job)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "mean {{job}}", + "legendLink": null, + "step": 10 + }, + { + "expr": "thanos_bucket_store_series_result_series{namespace=\"$namespace\",job=~\"$job\",quantile=\"0.50\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "P50", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Result series", + "tooltip": { + "shared": false, + 
"sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "Series Operations", + "titleSize": "h6" + }, + { + "collapse": false, + "height": "250px", + "panels": [ + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Shows how long has it taken to get all series.", + "fill": 1, + "id": 28, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 4, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "histogram_quantile(0.99, sum(rate(thanos_bucket_store_series_get_all_duration_seconds_bucket{namespace=\"$namespace\",job=~\"$job\"}[$interval])) by (job, le)) * 1", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "P99 {{job}}", + "refId": "A", + "step": 10 + }, + { + "expr": "sum(rate(thanos_bucket_store_series_get_all_duration_seconds_sum{namespace=\"$namespace\",job=~\"$job\"}[$interval])) by (job) * 1 / sum(rate(thanos_bucket_store_series_get_all_duration_seconds_count{namespace=\"$namespace\",job=~\"$job\"}[$interval])) by (job)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "mean {{job}}", + "refId": "B", + "step": 10 + }, + { + "expr": "histogram_quantile(0.50, 
sum(rate(thanos_bucket_store_series_get_all_duration_seconds_bucket{namespace=\"$namespace\",job=~\"$job\"}[$interval])) by (job, le)) * 1", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "P50 {{job}}", + "refId": "C", + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Get All", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Shows how long has it taken to merge series.", + "fill": 1, + "id": 29, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 4, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "histogram_quantile(0.99, sum(rate(thanos_bucket_store_series_merge_duration_seconds_bucket_bucket{namespace=\"$namespace\",job=~\"$job\"}[$interval])) by (job, le)) * 1", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "P99 {{job}}", + "refId": "A", + "step": 10 + }, + { + "expr": "sum(rate(thanos_bucket_store_series_merge_duration_seconds_bucket_sum{namespace=\"$namespace\",job=~\"$job\"}[$interval])) by (job) * 1 / sum(rate(thanos_bucket_store_series_merge_duration_seconds_bucket_count{namespace=\"$namespace\",job=~\"$job\"}[$interval])) by 
(job)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "mean {{job}}", + "refId": "B", + "step": 10 + }, + { + "expr": "histogram_quantile(0.50, sum(rate(thanos_bucket_store_series_merge_duration_seconds_bucket_bucket{namespace=\"$namespace\",job=~\"$job\"}[$interval])) by (job, le)) * 1", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "P50 {{job}}", + "refId": "C", + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Merge", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Shows how long has it taken for a series to wait at the gate.", + "fill": 1, + "id": 30, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 4, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "histogram_quantile(0.99, sum(rate(thanos_bucket_store_series_gate_duration_seconds_bucket_bucket{namespace=\"$namespace\",job=~\"$job\"}[$interval])) by (job, le)) * 1", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "P99 {{job}}", + "refId": "A", + "step": 10 + }, + { + "expr": 
"sum(rate(thanos_bucket_store_series_gate_duration_seconds_bucket_sum{namespace=\"$namespace\",job=~\"$job\"}[$interval])) by (job) * 1 / sum(rate(thanos_bucket_store_series_gate_duration_seconds_bucket_count{namespace=\"$namespace\",job=~\"$job\"}[$interval])) by (job)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "mean {{job}}", + "refId": "B", + "step": 10 + }, + { + "expr": "histogram_quantile(0.50, sum(rate(thanos_bucket_store_series_gate_duration_seconds_bucket_bucket{namespace=\"$namespace\",job=~\"$job\"}[$interval])) by (job, le)) * 1", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "P50 {{job}}", + "refId": "C", + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Gate", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "Series Operation Durations", + "titleSize": "h6" + }, + { + "collapse": true, + "height": "250px", + "panels": [ + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "id": 31, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 4, + "stack": false, + "steppedLine": false, + "targets": [ + { + 
"expr": "go_memstats_alloc_bytes{namespace=\"$namespace\",job=~\"$job\",kubernetes_pod_name=~\"$pod\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "alloc all {{pod}}", + "legendLink": null, + "step": 10 + }, + { + "expr": "go_memstats_heap_alloc_bytes{namespace=\"$namespace\",job=~\"$job\",kubernetes_pod_name=~\"$pod\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "alloc heap {{pod}}", + "legendLink": null, + "step": 10 + }, + { + "expr": "rate(go_memstats_alloc_bytes_total{namespace=\"$namespace\",job=~\"$job\",kubernetes_pod_name=~\"$pod\"}[30s])", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "alloc rate all {{pod}}", + "legendLink": null, + "step": 10 + }, + { + "expr": "rate(go_memstats_heap_alloc_bytes{namespace=\"$namespace\",job=~\"$job\",kubernetes_pod_name=~\"$pod\"}[30s])", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "alloc rate heap {{pod}}", + "legendLink": null, + "step": 10 + }, + { + "expr": "go_memstats_stack_inuse_bytes{namespace=\"$namespace\",job=~\"$job\",kubernetes_pod_name=~\"$pod\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "inuse stack {{pod}}", + "legendLink": null, + "step": 10 + }, + { + "expr": "go_memstats_heap_inuse_bytes{namespace=\"$namespace\",job=~\"$job\",kubernetes_pod_name=~\"$pod\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "inuse heap {{pod}}", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Memory Used", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, 
+ "show": false + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "id": 32, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 4, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "go_goroutines{namespace=\"$namespace\",job=~\"$job\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{pod}}", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Goroutines", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "id": 33, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 4, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "go_gc_duration_seconds{namespace=\"$namespace\",job=~\"$job\",kubernetes_pod_name=~\"$pod\"}", + "format": 
"time_series", + "intervalFactor": 2, + "legendFormat": "{{quantile}} {{pod}}", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "GC Time Quantiles", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "Resources", + "titleSize": "h6" + } + ], + "schemaVersion": 14, + "style": "dark", + "tags": [ + "thanos-mixin" + ], + "templating": { + "list": [ + { + "current": { + "text": "Prometheus", + "value": "Prometheus" + }, + "hide": 0, + "label": null, + "name": "datasource", + "options": [ ], + "query": "prometheus", + "refresh": 1, + "regex": "", + "type": "datasource" + }, + { + "allValue": null, + "current": { }, + "datasource": "$datasource", + "hide": 0, + "includeAll": false, + "label": "namespace", + "multi": false, + "name": "namespace", + "options": [ ], + "query": "label_values(kube_pod_info{}, namespace)", + "refresh": 1, + "regex": "", + "sort": 2, + "tagValuesQuery": "", + "tags": [ ], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": "thanos-store.*", + "current": { + "text": "all", + "value": "$__all" + }, + "datasource": "$datasource", + "hide": 0, + "includeAll": true, + "label": "job", + "multi": false, + "name": "job", + "options": [ ], + "query": "label_values(up{namespace=\"$namespace\",job=~\"thanos-store.*\"}, job)", + "refresh": 1, + "regex": "", + "sort": 2, + "tagValuesQuery": "", + "tags": [ ], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + 
"allValue": ".*", + "current": { + "text": "all", + "value": "$__all" + }, + "datasource": "$datasource", + "hide": 0, + "includeAll": true, + "label": "pod", + "multi": false, + "name": "pod", + "options": [ ], + "query": "label_values(kube_pod_info{namespace=\"$namespace\",created_by_name=~\"thanos-store.*\"}, pod)", + "refresh": 1, + "regex": "", + "sort": 2, + "tagValuesQuery": "", + "tags": [ ], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "auto": true, + "auto_count": 300, + "auto_min": "10s", + "current": { + "text": "5m", + "value": "5m" + }, + "hide": 0, + "label": "interval", + "name": "interval", + "query": "5m,10m,30m,1h,6h,12h", + "refresh": 2, + "type": "interval" + } + ] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "", + "title": "Thanos / Store", + "uid": "e832e8f26403d95fac0ea1c59837588b", + "version": 0 +} diff --git a/examples/grafana/monitoring.md b/examples/grafana/monitoring.md deleted file mode 100644 index 8127f327e4..0000000000 --- a/examples/grafana/monitoring.md +++ /dev/null @@ -1,42 +0,0 @@ -# Grafana Dashboards - -There are 4 Grafana dashboards targeted for environments running Kubernetes: - -- [Thanos Compact](thanos-compact.json) -- [Thanos Query](thanos-query.json) -- [Thanos Store](thanos-store.json) -- [Thanos Sidecar](thanos-sidecar.json) -- [Thanos Rule](thanos-rule.json) - -You can import them via `Import -> Paste JSON` in Grafana. -These dashboards require Grafana 5, importing them in older versions are known not to work. - -# Configuration - -All dashboards can be configured via `labelselector` and `labelvalue` constants, which are used to pinpoint Thanos components. 
- -Let's say we have a service configured with following annotation: - -``` -apiVersion: v1 -kind: Service -metadata: - annotations: - prometheus.io/path: /metrics - prometheus.io/port: "10902" - prometheus.io/scrape: "true" - labels: - name: prometheus - name: prometheus -spec: - ports: - - name: prometheus - port: 9090 - protocol: TCP - targetPort: 9090 - selector: - app: prometheus -``` - -In this case `labelselector` should be `name` and `labelvalue` should be `prometheus` as metrics will have `name="prometheus"` label associated with them. - diff --git a/examples/grafana/thanos-compact.json b/examples/grafana/thanos-compact.json deleted file mode 100644 index ff54cacac3..0000000000 --- a/examples/grafana/thanos-compact.json +++ /dev/null @@ -1,856 +0,0 @@ -{ - "__inputs": [ - { - "name": "DS_PROMETHEUS", - "label": "Prometheus", - "description": "", - "type": "datasource", - "pluginId": "prometheus", - "pluginName": "Prometheus" - }, - { - "name": "VAR_LABELSELECTOR", - "type": "constant", - "label": "labelselector", - "value": "app", - "description": "" - }, - { - "name": "VAR_LABELVALUE", - "type": "constant", - "label": "labelvalue", - "value": "thanos-compact", - "description": "" - } - ], - "__requires": [ - { - "type": "grafana", - "id": "grafana", - "name": "Grafana", - "version": "5.0.3" - }, - { - "type": "panel", - "id": "graph", - "name": "Graph", - "version": "5.0.0" - }, - { - "type": "datasource", - "id": "prometheus", - "name": "Prometheus", - "version": "5.0.0" - } - ], - "annotations": { - "list": [ - { - "builtIn": 1, - "datasource": "-- Grafana --", - "enable": true, - "hide": true, - "iconColor": "rgba(0, 211, 255, 1)", - "name": "Annotations & Alerts", - "type": "dashboard" - } - ] - }, - "editable": true, - "gnetId": null, - "graphTooltip": 1, - "id": null, - "iteration": 1529906222108, - "links": [], - "panels": [ - { - "collapsed": false, - "gridPos": { - "h": 1, - "w": 24, - "x": 0, - "y": 0 - }, - "id": 22, - "panels": [], - "repeat": 
null, - "title": "Thanos Compact", - "type": "row" - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${DS_PROMETHEUS}", - "editable": true, - "error": false, - "fill": 1, - "gridPos": { - "h": 7, - "w": 12, - "x": 0, - "y": 1 - }, - "id": 5, - "legend": { - "alignAsTable": true, - "avg": true, - "current": true, - "hideEmpty": true, - "hideZero": false, - "max": false, - "min": false, - "rightSide": true, - "show": true, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "links": [], - "nullPointMode": "connected", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "sum(rate(prometheus_tsdb_compactions_total{$labelselector=\"$labelvalue\",kubernetes_namespace=~\"$namespace\"}[$interval])) by (kubernetes_namespace)", - "format": "time_series", - "intervalFactor": 1, - "legendFormat": "compaction {{kubernetes_namespace}}", - "refId": "B" - }, - { - "expr": "sum(rate(thanos_objstore_bucket_operations_total{$labelselector=\"$labelvalue\",kubernetes_namespace=~\"$namespace\"}[$interval])) by (kubernetes_namespace)", - "format": "time_series", - "intervalFactor": 1, - "legendFormat": "bucket ops {{kubernetes_namespace}}", - "refId": "A" - }, - { - "expr": "sum(rate(thanos_compact_garbage_collection_total{$labelselector=\"$labelvalue\",kubernetes_namespace=~\"$namespace\"}[$interval])) by (kubernetes_namespace)", - "format": "time_series", - "intervalFactor": 1, - "legendFormat": "gc ops {{kubernetes_namespace}}", - "refId": "C" - }, - { - "expr": "sum(rate(thanos_compact_group_compactions_total{$labelselector=\"$labelvalue\",kubernetes_namespace=~\"$namespace\"}[$interval])) by (kubernetes_namespace)", - "format": "time_series", - "intervalFactor": 1, - "legendFormat": "group compact 
{{kubernetes_namespace}}", - "refId": "D" - }, - { - "expr": "sum(rate(thanos_compact_sync_meta_total{$labelselector=\"$labelvalue\",kubernetes_namespace=~\"$namespace\"}[$interval])) by (kubernetes_namespace)", - "format": "time_series", - "intervalFactor": 1, - "legendFormat": "sync metas {{kubernetes_namespace}}", - "refId": "E" - } - ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "Operations/s [$interval]", - "tooltip": { - "msResolution": false, - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": "0", - "show": true - } - ] - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${DS_PROMETHEUS}", - "editable": true, - "error": false, - "fill": 1, - "gridPos": { - "h": 7, - "w": 12, - "x": 12, - "y": 1 - }, - "id": 27, - "legend": { - "alignAsTable": true, - "avg": true, - "current": true, - "hideEmpty": true, - "hideZero": false, - "max": false, - "min": false, - "rightSide": true, - "show": true, - "sort": "avg", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "links": [], - "nullPointMode": "connected", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "sum(rate(prometheus_tsdb_compactions_failed_total{$labelselector=\"$labelvalue\",kubernetes_namespace=~\"$namespace\"}[$interval])) by (kubernetes_namespace)", - "format": "time_series", - "intervalFactor": 1, - "legendFormat": "compaction {{kubernetes_namespace}}", - "refId": "B" - }, - { - "expr": 
"sum(rate(thanos_objstore_bucket_operation_failures_total{$labelselector=\"$labelvalue\",kubernetes_namespace=~\"$namespace\"}[$interval])) by (kubernetes_namespace)", - "format": "time_series", - "intervalFactor": 1, - "legendFormat": "bucket ops {{kubernetes_namespace}}", - "refId": "A" - }, - { - "expr": "sum(rate(thanos_compact_garbage_collection_failures_total{$labelselector=\"$labelvalue\",kubernetes_namespace=~\"$namespace\"}[$interval])) by (kubernetes_namespace)", - "format": "time_series", - "intervalFactor": 1, - "legendFormat": "gc ops {{kubernetes_namespace}}", - "refId": "C" - }, - { - "expr": "sum(rate(thanos_compact_group_compactions_failures_total{$labelselector=\"$labelvalue\",kubernetes_namespace=~\"$namespace\"}[$interval])) by (kubernetes_namespace)", - "format": "time_series", - "intervalFactor": 1, - "legendFormat": "group compact {{kubernetes_namespace}}", - "refId": "D" - }, - { - "expr": "sum(rate(thanos_compact_sync_meta_failures_total{$labelselector=\"$labelvalue\",kubernetes_namespace=~\"$namespace\"}[$interval])) by (kubernetes_namespace)", - "format": "time_series", - "intervalFactor": 1, - "legendFormat": "sync metas {{kubernetes_namespace}}", - "refId": "E" - } - ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "Operation Failures/s [$interval]", - "tooltip": { - "msResolution": false, - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": "0", - "show": true - } - ] - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${DS_PROMETHEUS}", - "editable": true, - "error": false, - "fill": 1, - "gridPos": { - "h": 8, - "w": 24, - "x": 
0, - "y": 8 - }, - "id": 6, - "legend": { - "alignAsTable": true, - "avg": true, - "current": true, - "hideEmpty": false, - "hideZero": false, - "max": false, - "min": false, - "rightSide": true, - "show": true, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "links": [], - "nullPointMode": "null as zero", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "histogram_quantile(0.9999, sum(rate(thanos_compact_garbage_collection_duration_seconds_bucket{$labelselector=\"$labelvalue\",kubernetes_namespace=~\"$namespace\"}[$interval])) by (kubernetes_namespace,le))", - "format": "time_series", - "instant": false, - "intervalFactor": 2, - "legendFormat": "99.99 gc {{kubernetes_namespace}}", - "refId": "A", - "step": 2 - }, - { - "expr": "histogram_quantile(0.9999, sum(rate(thanos_compact_sync_meta_duration_seconds_bucket{$labelselector=\"$labelvalue\",kubernetes_namespace=~\"$namespace\"}[$interval])) by (kubernetes_namespace,le))", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "99.99 sync meta {{kubernetes_namespace}}", - "refId": "B" - }, - { - "expr": "histogram_quantile(0.9999, sum(rate(thanos_objstore_bucket_operation_duration_seconds_bucket{$labelselector=\"$labelvalue\",kubernetes_namespace=~\"$namespace\"}[$interval])) by (kubernetes_namespace,le))", - "format": "time_series", - "instant": false, - "intervalFactor": 2, - "legendFormat": "99.99 bucket ops {{kubernetes_namespace}}", - "refId": "C" - }, - { - "expr": "histogram_quantile(0.9999, sum(rate(prometheus_tsdb_compaction_duration_seconds_bucket{$labelselector=\"$labelvalue\",kubernetes_namespace=~\"$namespace\"}[$interval])) by (kubernetes_namespace,le))", - "format": "time_series", - "intervalFactor": 1, - "legendFormat": "99.99 compact {{kubernetes_namespace}}", - 
"refId": "D" - } - ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "Operation Time Quantile [$interval]", - "tooltip": { - "msResolution": false, - "shared": true, - "sort": 1, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "s", - "label": null, - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": "0", - "show": true - } - ] - }, - { - "collapsed": false, - "gridPos": { - "h": 1, - "w": 24, - "x": 0, - "y": 16 - }, - "id": 25, - "panels": [], - "repeat": null, - "title": "Ops", - "type": "row" - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${DS_PROMETHEUS}", - "editable": true, - "error": false, - "fill": 1, - "gridPos": { - "h": 7, - "w": 12, - "x": 0, - "y": 17 - }, - "id": 13, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "links": [], - "nullPointMode": "connected", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "go_memstats_heap_alloc_bytes{$labelselector=\"$labelvalue\",kubernetes_namespace=~\"$namespace\"}", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{kubernetes_namespace}} {{kubernetes_pod_name}}", - "refId": "A", - "step": 2 - } - ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "Memory Used", - "tooltip": { - "msResolution": false, - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - 
"yaxes": [ - { - "format": "bytes", - "label": null, - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${DS_PROMETHEUS}", - "fill": 1, - "gridPos": { - "h": 7, - "w": 12, - "x": 12, - "y": 17 - }, - "id": 19, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "links": [], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "go_goroutines{$labelselector=\"$labelvalue\",kubernetes_namespace=~\"$namespace\"}", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{kubernetes_namespace}} {{kubernetes_pod_name}}", - "refId": "A", - "step": 2 - } - ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "Goroutines", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${DS_PROMETHEUS}", - "editable": true, - "error": false, - "fill": 1, - "gridPos": { - "h": 7, - "w": 24, - "x": 0, - "y": 24 - }, - "id": 18, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - 
}, - "lines": true, - "linewidth": 1, - "links": [], - "nullPointMode": "connected", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "go_gc_duration_seconds{$labelselector=\"$labelvalue\", quantile=\"1\"}", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{kubernetes_namespace}} {{kubernetes_pod_name}} ", - "refId": "A", - "step": 2 - } - ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "GC Time Quantiles", - "tooltip": { - "msResolution": false, - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "s", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] - } - ], - "refresh": "30s", - "revision": null, - "schemaVersion": 16, - "style": "dark", - "tags": [ - "thanos" - ], - "templating": { - "list": [ - { - "auto": false, - "auto_count": 30, - "auto_min": "10s", - "current": { - "text": "1m", - "value": "1m" - }, - "hide": 0, - "label": null, - "name": "interval", - "options": [ - { - "selected": true, - "text": "1m", - "value": "1m" - }, - { - "selected": false, - "text": "10m", - "value": "10m" - }, - { - "selected": false, - "text": "30m", - "value": "30m" - }, - { - "selected": false, - "text": "1h", - "value": "1h" - }, - { - "selected": false, - "text": "6h", - "value": "6h" - }, - { - "selected": false, - "text": "12h", - "value": "12h" - }, - { - "selected": false, - "text": "1d", - "value": "1d" - }, - { - "selected": false, - "text": "7d", - "value": "7d" - }, - { - "selected": false, - "text": "14d", - "value": "14d" - }, - { - "selected": false, - "text": 
"30d", - "value": "30d" - } - ], - "query": "1m,10m,30m,1h,6h,12h,1d,7d,14d,30d", - "refresh": 2, - "type": "interval" - }, - { - "allValue": ".*", - "current": {}, - "datasource": "${DS_PROMETHEUS}", - "hide": 0, - "includeAll": true, - "label": "namespace", - "multi": false, - "name": "namespace", - "options": [], - "query": "label_values(kubernetes_namespace)", - "refresh": 1, - "regex": "", - "sort": 0, - "tagValuesQuery": "", - "tags": [], - "tagsQuery": "", - "type": "query", - "useTags": false - }, - { - "current": { - "value": "${VAR_LABELSELECTOR}", - "text": "${VAR_LABELSELECTOR}" - }, - "hide": 2, - "label": null, - "name": "labelselector", - "options": [ - { - "value": "${VAR_LABELSELECTOR}", - "text": "${VAR_LABELSELECTOR}" - } - ], - "query": "${VAR_LABELSELECTOR}", - "type": "constant" - }, - { - "current": { - "value": "${VAR_LABELVALUE}", - "text": "${VAR_LABELVALUE}" - }, - "hide": 2, - "label": null, - "name": "labelvalue", - "options": [ - { - "value": "${VAR_LABELVALUE}", - "text": "${VAR_LABELVALUE}" - } - ], - "query": "${VAR_LABELVALUE}", - "type": "constant" - } - ] - }, - "time": { - "from": "now-30m", - "to": "now" - }, - "timepicker": { - "refresh_intervals": [ - "5s", - "10s", - "30s", - "1m", - "5m", - "15m", - "30m", - "1h", - "2h", - "1d" - ], - "time_options": [ - "5m", - "15m", - "1h", - "6h", - "12h", - "24h", - "2d", - "7d", - "30d" - ] - }, - "timezone": "browser", - "title": "Thanos Compaction", - "uid": "s48S7j4ik", - "version": 3 -} diff --git a/examples/grafana/thanos-query.json b/examples/grafana/thanos-query.json deleted file mode 100644 index 424437e554..0000000000 --- a/examples/grafana/thanos-query.json +++ /dev/null @@ -1,1038 +0,0 @@ -{ - "__inputs": [ - { - "name": "DS_PROMETHEUS", - "label": "Prometheus", - "description": "", - "type": "datasource", - "pluginId": "prometheus", - "pluginName": "Prometheus" - }, - { - "name": "VAR_LABELVALUE", - "type": "constant", - "label": "labelvalue", - "value": "thanos-query", - 
"description": "" - }, - { - "name": "VAR_LABELSELECTOR", - "type": "constant", - "label": "labelselector", - "value": "app", - "description": "" - } - ], - "__requires": [ - { - "type": "grafana", - "id": "grafana", - "name": "Grafana", - "version": "5.0.3" - }, - { - "type": "panel", - "id": "graph", - "name": "Graph", - "version": "5.0.0" - }, - { - "type": "datasource", - "id": "prometheus", - "name": "Prometheus", - "version": "5.0.0" - }, - { - "type": "panel", - "id": "table", - "name": "Table", - "version": "5.0.0" - } - ], - "annotations": { - "list": [ - { - "builtIn": 1, - "datasource": "-- Grafana --", - "enable": true, - "hide": true, - "iconColor": "rgba(0, 211, 255, 1)", - "name": "Annotations & Alerts", - "type": "dashboard" - } - ] - }, - "editable": true, - "gnetId": null, - "graphTooltip": 1, - "id": null, - "iteration": 1529906270695, - "links": [], - "panels": [ - { - "collapsed": false, - "gridPos": { - "h": 1, - "w": 24, - "x": 0, - "y": 0 - }, - "id": 22, - "panels": [], - "repeat": null, - "title": "Thanos Query", - "type": "row" - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${DS_PROMETHEUS}", - "editable": true, - "error": false, - "fill": 1, - "gridPos": { - "h": 7, - "w": 24, - "x": 0, - "y": 1 - }, - "id": 6, - "legend": { - "alignAsTable": true, - "avg": true, - "current": true, - "hideEmpty": false, - "hideZero": false, - "max": false, - "min": false, - "rightSide": true, - "show": true, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "links": [], - "nullPointMode": "null as zero", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "sum(rate(grpc_client_handled_total{$labelselector=\"$labelvalue\",kubernetes_pod_name=~\"$pod\"}[$interval])) by 
(kubernetes_pod_name, grpc_code, grpc_method)", - "format": "time_series", - "intervalFactor": 1, - "legendFormat": "{{grpc_code}} {{grpc_method}} {{kubernetes_pod_name}}", - "refId": "A" - } - ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "Request RPS", - "tooltip": { - "msResolution": false, - "shared": true, - "sort": 1, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "reqps", - "label": null, - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": "0", - "show": true - } - ] - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${DS_PROMETHEUS}", - "editable": true, - "error": false, - "fill": 1, - "gridPos": { - "h": 7, - "w": 24, - "x": 0, - "y": 8 - }, - "id": 27, - "legend": { - "alignAsTable": true, - "avg": true, - "current": true, - "hideEmpty": true, - "hideZero": false, - "max": false, - "min": false, - "rightSide": true, - "show": true, - "sort": "avg", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "links": [], - "nullPointMode": "connected", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "histogram_quantile(0.9999, sum(rate(grpc_client_handling_seconds_bucket{$labelselector=\"$labelvalue\",kubernetes_pod_name=~\"$pod\"}[$interval])) by (grpc_method,kubernetes_pod_name, le))", - "format": "time_series", - "intervalFactor": 1, - "legendFormat": "99.99 {{grpc_method}} {{kubernetes_pod_name}}", - "refId": "A" - } - ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "Response Time Quantile [$interval]", - "tooltip": { - 
"msResolution": false, - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "s", - "label": null, - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": "0", - "show": true - } - ] - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${DS_PROMETHEUS}", - "editable": true, - "error": false, - "fill": 1, - "gridPos": { - "h": 7, - "w": 24, - "x": 0, - "y": 15 - }, - "id": 32, - "legend": { - "alignAsTable": true, - "avg": true, - "current": true, - "hideEmpty": true, - "hideZero": false, - "max": false, - "min": false, - "rightSide": true, - "show": true, - "sort": "avg", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "links": [], - "nullPointMode": "connected", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "histogram_quantile(0.9999, sum(rate(http_request_duration_seconds_bucket{$labelselector=\"$labelvalue\",kubernetes_pod_name=~\"$pod\",handler=\"query\"}[$interval])) by (kubernetes_pod_name, le))", - "format": "time_series", - "intervalFactor": 1, - "legendFormat": "instant_query {{kubernetes_pod_name}}", - "refId": "A" - }, - { - "expr": "histogram_quantile(0.9999, sum(rate(http_request_duration_seconds_bucket{$labelselector=\"$labelvalue\",kubernetes_pod_name=~\"$pod\",handler=\"query_range\"}[$interval])) by (kubernetes_pod_name, le))", - "format": "time_series", - "intervalFactor": 1, - "legendFormat": "range_query {{kubernetes_pod_name}}", - "refId": "B" - } - ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "Thanos Query 99.99 
Quantile [$interval]", - "tooltip": { - "msResolution": false, - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "s", - "label": null, - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": "0", - "show": true - } - ] - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${DS_PROMETHEUS}", - "fill": 1, - "gridPos": { - "h": 7, - "w": 24, - "x": 0, - "y": 22 - }, - "id": 31, - "legend": { - "alignAsTable": true, - "avg": false, - "current": false, - "max": false, - "min": false, - "rightSide": true, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "links": [], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "prometheus_engine_query_duration_seconds{$labelselector=\"$labelvalue\",kubernetes_pod_name=~\"$pod\",quantile=\"0.99\"}", - "format": "time_series", - "intervalFactor": 1, - "legendFormat": "{{kubernetes_pod_name}} {{slice}}", - "refId": "A" - } - ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "Prometheus Query 99 Quantile", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "s", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - 
"dashes": false, - "datasource": "${DS_PROMETHEUS}", - "fill": 1, - "gridPos": { - "h": 9, - "w": 12, - "x": 0, - "y": 29 - }, - "id": 29, - "legend": { - "alignAsTable": false, - "avg": false, - "current": false, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "links": [], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "prometheus_engine_queries{$labelselector=\"$labelvalue\",kubernetes_pod_name=~\"$pod\"}", - "format": "time_series", - "instant": false, - "intervalFactor": 1, - "legendFormat": "{{kubernetes_pod_name}}", - "refId": "A" - } - ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "Prometheus Queries/s", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] - }, - { - "columns": [], - "datasource": "${DS_PROMETHEUS}", - "fontSize": "100%", - "gridPos": { - "h": 9, - "w": 12, - "x": 12, - "y": 29 - }, - "hideTimeOverride": false, - "id": 34, - "links": [], - "pageSize": null, - "scroll": false, - "showHeader": true, - "sort": { - "col": 1, - "desc": false - }, - "styles": [ - { - "alias": "Peer", - "colorMode": null, - "colors": [ - "rgba(245, 54, 54, 0.9)", - "rgba(237, 129, 40, 0.89)", - "rgba(50, 172, 45, 0.97)" - ], - "decimals": 2, - "pattern": "external_labels", - "thresholds": [], - "type": "number", - "unit": "short" - }, - { - "alias": "", - "colorMode": null, - "colors": [ 
- "rgba(245, 54, 54, 0.9)", - "rgba(237, 129, 40, 0.89)", - "rgba(50, 172, 45, 0.97)" - ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "pattern": "Time", - "thresholds": [], - "type": "hidden", - "unit": "short" - }, - { - "alias": "Replicas", - "colorMode": null, - "colors": [ - "rgba(245, 54, 54, 0.9)", - "rgba(237, 129, 40, 0.89)", - "rgba(50, 172, 45, 0.97)" - ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "pattern": "Value", - "thresholds": [], - "type": "hidden", - "unit": "short" - } - ], - "targets": [ - { - "expr": "min(thanos_store_node_info{$labelselector=\"$labelvalue\"}) by (external_labels)", - "format": "table", - "instant": true, - "interval": "", - "intervalFactor": 1, - "legendFormat": "", - "refId": "A" - } - ], - "timeFrom": null, - "title": "Gossip info", - "transform": "table", - "type": "table" - }, - { - "collapsed": false, - "gridPos": { - "h": 1, - "w": 24, - "x": 0, - "y": 38 - }, - "id": 25, - "panels": [], - "repeat": null, - "title": "Ops", - "type": "row" - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${DS_PROMETHEUS}", - "editable": true, - "error": false, - "fill": 1, - "gridPos": { - "h": 7, - "w": 12, - "x": 0, - "y": 39 - }, - "id": 13, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "links": [], - "nullPointMode": "connected", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "go_memstats_heap_alloc_bytes{$labelselector=\"$labelvalue\",kubernetes_pod_name=~\"$pod\"}", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{kubernetes_pod_name}}", - "refId": "A", - "step": 2 - } - ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": 
"Memory Used", - "tooltip": { - "msResolution": false, - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "bytes", - "label": null, - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${DS_PROMETHEUS}", - "fill": 1, - "gridPos": { - "h": 7, - "w": 12, - "x": 12, - "y": 39 - }, - "id": 19, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "links": [], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "go_goroutines{$labelselector=\"$labelvalue\"}", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{kubernetes_pod_name}}", - "refId": "A", - "step": 2 - } - ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "Goroutines", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${DS_PROMETHEUS}", - "editable": true, - "error": false, - "fill": 1, - "gridPos": { - "h": 7, - 
"w": 24, - "x": 0, - "y": 46 - }, - "id": 18, - "legend": { - "alignAsTable": true, - "avg": false, - "current": false, - "max": false, - "min": false, - "rightSide": true, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "links": [], - "nullPointMode": "connected", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "go_gc_duration_seconds{$labelselector=\"$labelvalue\",kubernetes_pod_name=~\"$pod\"}", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{quantile}} {{kubernetes_pod_name}}", - "refId": "A", - "step": 2 - } - ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "GC Time Quantiles", - "tooltip": { - "msResolution": false, - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "s", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] - } - ], - "refresh": "30s", - "revision": null, - "schemaVersion": 16, - "style": "dark", - "tags": [ - "thanos" - ], - "templating": { - "list": [ - { - "auto": false, - "auto_count": 30, - "auto_min": "10s", - "current": { - "text": "1m", - "value": "1m" - }, - "hide": 0, - "label": null, - "name": "interval", - "options": [ - { - "selected": true, - "text": "1m", - "value": "1m" - }, - { - "selected": false, - "text": "10m", - "value": "10m" - }, - { - "selected": false, - "text": "30m", - "value": "30m" - }, - { - "selected": false, - "text": "1h", - "value": "1h" - }, - { - "selected": false, - "text": "6h", - "value": "6h" - }, - { - "selected": false, - "text": "12h", - "value": "12h" - }, 
- { - "selected": false, - "text": "1d", - "value": "1d" - }, - { - "selected": false, - "text": "7d", - "value": "7d" - }, - { - "selected": false, - "text": "14d", - "value": "14d" - }, - { - "selected": false, - "text": "30d", - "value": "30d" - } - ], - "query": "1m,10m,30m,1h,6h,12h,1d,7d,14d,30d", - "refresh": 2, - "type": "interval" - }, - { - "allValue": ".*", - "current": {}, - "datasource": "${DS_PROMETHEUS}", - "hide": 0, - "includeAll": true, - "label": "pod", - "multi": false, - "name": "pod", - "options": [], - "query": "label_values(thanos_build_info{$labelselector=\"$labelvalue\"}, kubernetes_pod_name)", - "refresh": 1, - "regex": "", - "sort": 0, - "tagValuesQuery": "", - "tags": [], - "tagsQuery": "", - "type": "query", - "useTags": false - }, - { - "current": { - "value": "${VAR_LABELVALUE}", - "text": "${VAR_LABELVALUE}" - }, - "hide": 2, - "label": null, - "name": "labelvalue", - "options": [ - { - "value": "${VAR_LABELVALUE}", - "text": "${VAR_LABELVALUE}" - } - ], - "query": "${VAR_LABELVALUE}", - "type": "constant" - }, - { - "current": { - "value": "${VAR_LABELSELECTOR}", - "text": "${VAR_LABELSELECTOR}" - }, - "hide": 2, - "label": null, - "name": "labelselector", - "options": [ - { - "value": "${VAR_LABELSELECTOR}", - "text": "${VAR_LABELSELECTOR}" - } - ], - "query": "${VAR_LABELSELECTOR}", - "type": "constant" - } - ] - }, - "time": { - "from": "now-30m", - "to": "now" - }, - "timepicker": { - "refresh_intervals": [ - "5s", - "10s", - "30s", - "1m", - "5m", - "15m", - "30m", - "1h", - "2h", - "1d" - ], - "time_options": [ - "5m", - "15m", - "1h", - "6h", - "12h", - "24h", - "2d", - "7d", - "30d" - ] - }, - "timezone": "browser", - "title": "Thanos Query", - "uid": "opwl5gSiz", - "version": 2 -} diff --git a/examples/grafana/thanos-rule.json b/examples/grafana/thanos-rule.json deleted file mode 100644 index 3ac26f0530..0000000000 --- a/examples/grafana/thanos-rule.json +++ /dev/null @@ -1,1205 +0,0 @@ -{ - "__inputs": [ - { - "name": 
"DS_PROMETHEUS", - "label": "Prometheus", - "description": "", - "type": "datasource", - "pluginId": "prometheus", - "pluginName": "Prometheus" - }, - { - "name": "VAR_LABELSELECTOR", - "type": "constant", - "label": "labelselector", - "value": "app", - "description": "" - }, - { - "name": "VAR_LABELVALUE", - "type": "constant", - "label": "labelvalue", - "value": "thanos-rule", - "description": "" - } - ], - "__requires": [ - { - "type": "grafana", - "id": "grafana", - "name": "Grafana", - "version": "5.0.4" - }, - { - "type": "panel", - "id": "graph", - "name": "Graph", - "version": "5.0.0" - }, - { - "type": "datasource", - "id": "prometheus", - "name": "Prometheus", - "version": "5.0.0" - } - ], - "annotations": { - "list": [ - { - "builtIn": 1, - "datasource": "-- Grafana --", - "enable": true, - "hide": true, - "iconColor": "rgba(0, 211, 255, 1)", - "name": "Annotations & Alerts", - "type": "dashboard" - } - ] - }, - "editable": true, - "gnetId": null, - "graphTooltip": 1, - "id": null, - "iteration": 1530195209483, - "links": [], - "panels": [ - { - "collapsed": false, - "gridPos": { - "h": 1, - "w": 24, - "x": 0, - "y": 0 - }, - "id": 22, - "panels": [], - "repeat": null, - "title": "Thanos Rule", - "type": "row" - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${DS_PROMETHEUS}", - "fill": 1, - "gridPos": { - "h": 7, - "w": 12, - "x": 0, - "y": 1 - }, - "id": 29, - "legend": { - "alignAsTable": true, - "avg": true, - "current": true, - "max": false, - "min": false, - "rightSide": true, - "show": true, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "links": [], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": 
"sum(rate(grpc_server_handled_total{kubernetes_namespace=~\"$namespace\",$labelselector=\"$labelvalue\"}[$interval])) by (grpc_code, grpc_method, kubernetes_namespace)", - "format": "time_series", - "intervalFactor": 1, - "legendFormat": "{{grpc_code}} {{kubernetes_namespace}}/{{grpc_method}}", - "refId": "A" - } - ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "RPS [$interval]", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${DS_PROMETHEUS}", - "fill": 1, - "gridPos": { - "h": 7, - "w": 12, - "x": 12, - "y": 1 - }, - "id": 31, - "legend": { - "alignAsTable": true, - "avg": true, - "current": true, - "max": false, - "min": false, - "rightSide": true, - "show": true, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "links": [], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "histogram_quantile(0.9999, sum(rate(grpc_server_handling_seconds_bucket{kubernetes_namespace=~\"$namespace\",$labelselector=\"$labelvalue\"}[$interval])) by (grpc_method, le, kubernetes_namespace))", - "format": "time_series", - "intervalFactor": 1, - "legendFormat": "99.99 {{kubernetes_namespace}}/{{grpc_method}}", - "refId": "A" - } - ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "Query Response Time Quantile 
[$interval]", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] - }, - { - "collapsed": false, - "gridPos": { - "h": 1, - "w": 24, - "x": 0, - "y": 8 - }, - "id": 33, - "panels": [], - "title": "Alert Sender", - "type": "row" - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${DS_PROMETHEUS}", - "fill": 1, - "gridPos": { - "h": 7, - "w": 12, - "x": 0, - "y": 9 - }, - "id": 27, - "legend": { - "alignAsTable": true, - "avg": true, - "current": true, - "max": false, - "min": false, - "rightSide": true, - "show": true, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "links": [], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "rate(thanos_alert_sender_alerts_sent_total{kubernetes_namespace=~\"$namespace\",$labelselector=\"$labelvalue\"}[$interval])", - "format": "time_series", - "intervalFactor": 1, - "legendFormat": "{{alertmanager}}", - "refId": "A" - } - ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "Alerts Sent Rate [$interval]", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": 
"short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${DS_PROMETHEUS}", - "fill": 1, - "gridPos": { - "h": 7, - "w": 12, - "x": 12, - "y": 9 - }, - "id": 37, - "legend": { - "alignAsTable": true, - "avg": true, - "current": true, - "max": false, - "min": false, - "rightSide": true, - "show": true, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "links": [], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "rate(thanos_alert_sender_alerts_dropped_total{kubernetes_namespace=~\"$namespace\",$labelselector=\"$labelvalue\"}[$interval])", - "format": "time_series", - "intervalFactor": 1, - "legendFormat": "{{alertmanager}}", - "refId": "A" - } - ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "Alerts Dropped Rate [$interval]", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${DS_PROMETHEUS}", - "fill": 1, - "gridPos": { - "h": 7, - "w": 24, - "x": 0, - "y": 16 - }, - "id": 35, - "legend": { - "alignAsTable": true, - "avg": true, - "current": true, - "max": false, - "min": false, - "rightSide": true, - "show": true, - "sort": "current", - "sortDesc": true, - "total": false, - 
"values": true - }, - "lines": true, - "linewidth": 1, - "links": [], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "histogram_quantile(0.9999, sum(rate(thanos_alert_sender_latency_seconds_bucket{kubernetes_namespace=~\"$namespace\",$labelselector=\"$labelvalue\"}[$interval])) by (alertmanager, le, kubernetes_namespace))", - "format": "time_series", - "intervalFactor": 1, - "legendFormat": "99.99 {{kubernetes_namespace}}/{{alertmanager}}", - "refId": "A" - } - ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "Alert Sender Latency Quantile [$interval]", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] - }, - { - "collapsed": false, - "gridPos": { - "h": 1, - "w": 24, - "x": 0, - "y": 23 - }, - "id": 41, - "panels": [], - "title": "Compaction", - "type": "row" - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${DS_PROMETHEUS}", - "fill": 1, - "gridPos": { - "h": 7, - "w": 12, - "x": 0, - "y": 24 - }, - "id": 39, - "legend": { - "alignAsTable": true, - "avg": true, - "current": true, - "max": false, - "min": false, - "rightSide": true, - "show": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "links": [], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "stack": false, - "steppedLine": false, 
- "targets": [ - { - "expr": "sum(rate(prometheus_tsdb_compactions_total{$labelselector=\"$labelvalue\",kubernetes_namespace=~\"$namespace\"}[$interval])) by (kubernetes_namespace)", - "format": "time_series", - "intervalFactor": 1, - "legendFormat": "{{kubernetes_namespace}}", - "refId": "A" - } - ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "Compaction Rate [$interval]", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${DS_PROMETHEUS}", - "fill": 1, - "gridPos": { - "h": 7, - "w": 12, - "x": 12, - "y": 24 - }, - "id": 43, - "legend": { - "alignAsTable": true, - "avg": true, - "current": true, - "max": false, - "min": false, - "rightSide": true, - "show": true, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "links": [], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "sum(rate(prometheus_tsdb_compactions_failed_total{$labelselector=\"$labelvalue\",kubernetes_namespace=~\"$namespace\"}[$interval])) by (kubernetes_namespace)", - "format": "time_series", - "intervalFactor": 1, - "legendFormat": "{{kubernetes_namespace}}", - "refId": "A" - } - ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "Compaction Failure Rate [$interval]", - "tooltip": { - "shared": true, - "sort": 0, - 
"value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${DS_PROMETHEUS}", - "fill": 1, - "gridPos": { - "h": 7, - "w": 24, - "x": 0, - "y": 31 - }, - "id": 45, - "legend": { - "alignAsTable": true, - "avg": true, - "current": true, - "max": false, - "min": false, - "rightSide": true, - "show": true, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "links": [], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "histogram_quantile(0.9999, sum(rate(prometheus_tsdb_compaction_duration_seconds_bucket{$labelselector=\"$labelvalue\",kubernetes_namespace=~\"$namespace\"}[$interval])) by (kubernetes_namespace,le))", - "format": "time_series", - "intervalFactor": 1, - "legendFormat": "99.99 {{kubernetes_namespace}}", - "refId": "A" - } - ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "Compaction Duration Quantile [$interval]", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] - }, - { - "collapsed": 
false, - "gridPos": { - "h": 1, - "w": 24, - "x": 0, - "y": 38 - }, - "id": 25, - "panels": [], - "repeat": null, - "title": "Ops", - "type": "row" - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${DS_PROMETHEUS}", - "editable": true, - "error": false, - "fill": 1, - "gridPos": { - "h": 7, - "w": 12, - "x": 0, - "y": 39 - }, - "id": 13, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "links": [], - "nullPointMode": "connected", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "go_memstats_heap_alloc_bytes{$labelselector=\"$labelvalue\",kubernetes_namespace=~\"$namespace\"}", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{kubernetes_namespace}} {{kubernetes_pod_name}}", - "refId": "A", - "step": 2 - } - ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "Memory Used", - "tooltip": { - "msResolution": false, - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "bytes", - "label": null, - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${DS_PROMETHEUS}", - "fill": 1, - "gridPos": { - "h": 7, - "w": 12, - "x": 12, - "y": 39 - }, - "id": 19, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 
1, - "links": [], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "go_goroutines{$labelselector=\"$labelvalue\",kubernetes_namespace=~\"$namespace\"}", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{kubernetes_namespace}} {{kubernetes_pod_name}}", - "refId": "A", - "step": 2 - } - ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "Goroutines", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${DS_PROMETHEUS}", - "editable": true, - "error": false, - "fill": 1, - "gridPos": { - "h": 7, - "w": 24, - "x": 0, - "y": 46 - }, - "id": 18, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "links": [], - "nullPointMode": "connected", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "go_gc_duration_seconds{$labelselector=\"$labelvalue\", quantile=\"1\"}", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{kubernetes_namespace}} {{kubernetes_pod_name}} ", - "refId": "A", - "step": 2 - } - ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "GC Time 
Quantiles", - "tooltip": { - "msResolution": false, - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "s", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] - } - ], - "refresh": "30s", - "revision": null, - "schemaVersion": 16, - "style": "dark", - "tags": [ - "thanos" - ], - "templating": { - "list": [ - { - "auto": false, - "auto_count": 30, - "auto_min": "10s", - "current": { - "text": "1m", - "value": "1m" - }, - "hide": 0, - "label": null, - "name": "interval", - "options": [ - { - "selected": true, - "text": "1m", - "value": "1m" - }, - { - "selected": false, - "text": "10m", - "value": "10m" - }, - { - "selected": false, - "text": "30m", - "value": "30m" - }, - { - "selected": false, - "text": "1h", - "value": "1h" - }, - { - "selected": false, - "text": "6h", - "value": "6h" - }, - { - "selected": false, - "text": "12h", - "value": "12h" - }, - { - "selected": false, - "text": "1d", - "value": "1d" - }, - { - "selected": false, - "text": "7d", - "value": "7d" - }, - { - "selected": false, - "text": "14d", - "value": "14d" - }, - { - "selected": false, - "text": "30d", - "value": "30d" - } - ], - "query": "1m,10m,30m,1h,6h,12h,1d,7d,14d,30d", - "refresh": 2, - "type": "interval" - }, - { - "allValue": ".*", - "current": {}, - "datasource": "${DS_PROMETHEUS}", - "hide": 0, - "includeAll": true, - "label": "namespace", - "multi": false, - "name": "namespace", - "options": [], - "query": "label_values(kubernetes_namespace)", - "refresh": 1, - "regex": "", - "sort": 0, - "tagValuesQuery": "", - "tags": [], - "tagsQuery": "", - "type": "query", - "useTags": false - }, - { - "current": { - "value": "${VAR_LABELSELECTOR}", - "text": "${VAR_LABELSELECTOR}" - }, 
- "hide": 2, - "label": null, - "name": "labelselector", - "options": [ - { - "value": "${VAR_LABELSELECTOR}", - "text": "${VAR_LABELSELECTOR}" - } - ], - "query": "${VAR_LABELSELECTOR}", - "type": "constant" - }, - { - "current": { - "value": "${VAR_LABELVALUE}", - "text": "${VAR_LABELVALUE}" - }, - "hide": 2, - "label": null, - "name": "labelvalue", - "options": [ - { - "value": "${VAR_LABELVALUE}", - "text": "${VAR_LABELVALUE}" - } - ], - "query": "${VAR_LABELVALUE}", - "type": "constant" - } - ] - }, - "time": { - "from": "now-30m", - "to": "now" - }, - "timepicker": { - "refresh_intervals": [ - "5s", - "10s", - "30s", - "1m", - "5m", - "15m", - "30m", - "1h", - "2h", - "1d" - ], - "time_options": [ - "5m", - "15m", - "1h", - "6h", - "12h", - "24h", - "2d", - "7d", - "30d" - ] - }, - "timezone": "browser", - "title": "Thanos Rule", - "uid": "rjUCNfHmz", - "version": 3 -} \ No newline at end of file diff --git a/examples/grafana/thanos-sidecar.json b/examples/grafana/thanos-sidecar.json deleted file mode 100644 index fb91de63b7..0000000000 --- a/examples/grafana/thanos-sidecar.json +++ /dev/null @@ -1,1134 +0,0 @@ -{ - "__inputs": [ - { - "name": "DS_PROMETHEUS", - "label": "Prometheus", - "description": "", - "type": "datasource", - "pluginId": "prometheus", - "pluginName": "Prometheus" - }, - { - "name": "VAR_LABELVALUE", - "type": "constant", - "label": "labelvalue", - "value": "prometheus", - "description": "" - }, - { - "name": "VAR_LABELSELECTOR", - "type": "constant", - "label": "labelselector", - "value": "name", - "description": "" - } - ], - "__requires": [ - { - "type": "grafana", - "id": "grafana", - "name": "Grafana", - "version": "5.0.3" - }, - { - "type": "panel", - "id": "graph", - "name": "Graph", - "version": "5.0.0" - }, - { - "type": "datasource", - "id": "prometheus", - "name": "Prometheus", - "version": "5.0.0" - }, - { - "type": "panel", - "id": "singlestat", - "name": "Singlestat", - "version": "5.0.0" - } - ], - "annotations": { - 
"list": [ - { - "builtIn": 1, - "datasource": "-- Grafana --", - "enable": true, - "hide": true, - "iconColor": "rgba(0, 211, 255, 1)", - "name": "Annotations & Alerts", - "type": "dashboard" - } - ] - }, - "editable": true, - "gnetId": null, - "graphTooltip": 1, - "id": null, - "iteration": 1529906347607, - "links": [], - "panels": [ - { - "collapsed": false, - "gridPos": { - "h": 1, - "w": 24, - "x": 0, - "y": 0 - }, - "id": 22, - "panels": [], - "repeat": null, - "title": "Thanos Sidecar", - "type": "row" - }, - { - "cacheTimeout": null, - "colorBackground": false, - "colorValue": false, - "colors": [ - "#299c46", - "rgba(237, 129, 40, 0.89)", - "#d44a3a" - ], - "datasource": "${DS_PROMETHEUS}", - "format": "dateTimeFromNow", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": false, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "gridPos": { - "h": 3, - "w": 12, - "x": 0, - "y": 1 - }, - "id": 29, - "interval": null, - "links": [], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "minSpan": 2, - "nullPointMode": "connected", - "nullText": null, - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "repeat": null, - "repeatDirection": "h", - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "tableColumn": "", - "targets": [ - { - "expr": "min(thanos_objstore_bucket_last_successful_upload_time{$labelselector=\"$labelvalue\",kubernetes_namespace=~\"$namespace\"})*1000", - "format": "time_series", - "intervalFactor": 1, - "refId": "A" - } - ], - "thresholds": "", - "title": "Last Upload time", - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "N/A", - "value": "null" - } - ], - 
"valueName": "current" - }, - { - "cacheTimeout": null, - "colorBackground": false, - "colorValue": false, - "colors": [ - "#299c46", - "rgba(237, 129, 40, 0.89)", - "#d44a3a" - ], - "datasource": "${DS_PROMETHEUS}", - "format": "dateTimeFromNow", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": false, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "gridPos": { - "h": 3, - "w": 12, - "x": 12, - "y": 1 - }, - "id": 30, - "interval": null, - "links": [], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "minSpan": 2, - "nullPointMode": "connected", - "nullText": null, - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "repeatDirection": "h", - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "tableColumn": "", - "targets": [ - { - "expr": "min(thanos_sidecar_last_heartbeat_success_time_seconds{$labelselector=\"$labelvalue\",kubernetes_namespace=~\"$namespace\"})*1000", - "format": "time_series", - "intervalFactor": 1, - "refId": "A" - } - ], - "thresholds": "", - "title": "Last Heartbeat time", - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "N/A", - "value": "null" - } - ], - "valueName": "current" - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${DS_PROMETHEUS}", - "editable": true, - "error": false, - "fill": 1, - "gridPos": { - "h": 7, - "w": 24, - "x": 0, - "y": 4 - }, - "id": 31, - "legend": { - "alignAsTable": true, - "avg": true, - "current": true, - "hideEmpty": true, - "hideZero": false, - "max": false, - "min": false, - "rightSide": true, - "show": true, - "sort": "current", - "sortDesc": true, - "total": false, - 
"values": true - }, - "lines": true, - "linewidth": 1, - "links": [], - "nullPointMode": "connected", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "sum(rate(grpc_server_handled_total{$labelselector=\"$labelvalue\",kubernetes_namespace=~\"$namespace\"}[$interval])) by (grpc_code, grpc_method, kubernetes_namespace)", - "format": "time_series", - "intervalFactor": 1, - "legendFormat": "{{grpc_code}} {{grpc_method}} {{kubernetes_namespace}}", - "refId": "A" - } - ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "Query RPS [$interval]", - "tooltip": { - "msResolution": false, - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": "0", - "show": true - } - ] - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${DS_PROMETHEUS}", - "editable": true, - "error": false, - "fill": 1, - "gridPos": { - "h": 7, - "w": 24, - "x": 0, - "y": 11 - }, - "id": 32, - "legend": { - "alignAsTable": true, - "avg": true, - "current": true, - "hideEmpty": true, - "hideZero": false, - "max": false, - "min": false, - "rightSide": true, - "show": true, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "links": [], - "nullPointMode": "connected", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "histogram_quantile(0.9999, 
sum(rate(grpc_server_handling_seconds_bucket{$labelselector=\"$labelvalue\"}[$interval])) by (grpc_method, le, kubernetes_namespace))", - "format": "time_series", - "intervalFactor": 1, - "legendFormat": "99.99 {{grpc_method}} {{kubernetes_namespace}}", - "refId": "A" - } - ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "Query Response Time Quantile [$interval]", - "tooltip": { - "msResolution": false, - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "s", - "label": null, - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": "0", - "show": true - } - ] - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${DS_PROMETHEUS}", - "editable": true, - "error": false, - "fill": 1, - "gridPos": { - "h": 7, - "w": 12, - "x": 0, - "y": 18 - }, - "id": 5, - "legend": { - "alignAsTable": true, - "avg": true, - "current": true, - "hideEmpty": true, - "hideZero": false, - "max": false, - "min": false, - "rightSide": true, - "show": true, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "links": [], - "nullPointMode": "connected", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "sum(rate(thanos_objstore_bucket_operations_total{$labelselector=\"$labelvalue\",kubernetes_namespace=~\"$namespace\"}[$interval])) by (kubernetes_namespace, operation)", - "format": "time_series", - "intervalFactor": 1, - "legendFormat": "{{operation}} {{kubernetes_namespace}}", - "refId": "A" - } - ], - "thresholds": [], - "timeFrom": null, - "timeShift": 
null, - "title": "Bucket Operations/s [$interval]", - "tooltip": { - "msResolution": false, - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": "0", - "show": true - } - ] - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${DS_PROMETHEUS}", - "editable": true, - "error": false, - "fill": 1, - "gridPos": { - "h": 7, - "w": 12, - "x": 12, - "y": 18 - }, - "id": 27, - "legend": { - "alignAsTable": true, - "avg": true, - "current": true, - "hideEmpty": false, - "hideZero": false, - "max": false, - "min": false, - "rightSide": true, - "show": true, - "sort": "avg", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "links": [], - "nullPointMode": "null as zero", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "sum(rate(thanos_objstore_bucket_operation_failures_total{$labelselector=\"$labelvalue\",kubernetes_namespace=~\"$namespace\"}[$interval])) by (operation,kubernetes_namespace)", - "format": "time_series", - "intervalFactor": 1, - "legendFormat": "{{operation}} {{kubernetes_namespace}}", - "refId": "A" - } - ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "Bucket Operation Failures/s [$interval]", - "tooltip": { - "msResolution": false, - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "short", - 
"label": null, - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": "0", - "show": true - } - ] - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${DS_PROMETHEUS}", - "editable": true, - "error": false, - "fill": 1, - "gridPos": { - "h": 8, - "w": 24, - "x": 0, - "y": 25 - }, - "id": 6, - "legend": { - "alignAsTable": true, - "avg": true, - "current": true, - "hideEmpty": false, - "hideZero": false, - "max": false, - "min": false, - "rightSide": true, - "show": true, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "links": [], - "nullPointMode": "null as zero", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "histogram_quantile(0.9999, sum(rate(thanos_objstore_bucket_operation_duration_seconds_bucket{$labelselector=\"$labelvalue\",kubernetes_namespace=~\"$namespace\"}[$interval])) by (kubernetes_namespace,le))", - "format": "time_series", - "instant": false, - "intervalFactor": 2, - "legendFormat": "99.99 bucket ops {{kubernetes_namespace}}", - "refId": "C" - } - ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "Bucket Operation Time Quantile [$interval]", - "tooltip": { - "msResolution": false, - "shared": true, - "sort": 1, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "s", - "label": null, - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": "0", - "show": true - } - ] - }, - { - "collapsed": false, - "gridPos": { - "h": 1, - "w": 24, - "x": 0, - 
"y": 33 - }, - "id": 25, - "panels": [], - "repeat": null, - "title": "Ops", - "type": "row" - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${DS_PROMETHEUS}", - "editable": true, - "error": false, - "fill": 1, - "gridPos": { - "h": 7, - "w": 12, - "x": 0, - "y": 34 - }, - "id": 13, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "links": [], - "nullPointMode": "connected", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "go_memstats_heap_alloc_bytes{$labelselector=\"$labelvalue\",kubernetes_namespace=~\"$namespace\"}", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{kubernetes_namespace}} {{kubernetes_pod_name}}", - "refId": "A", - "step": 2 - } - ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "Memory Used", - "tooltip": { - "msResolution": false, - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "bytes", - "label": null, - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${DS_PROMETHEUS}", - "fill": 1, - "gridPos": { - "h": 7, - "w": 12, - "x": 12, - "y": 34 - }, - "id": 19, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "links": [], - "nullPointMode": "null", - 
"percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "go_goroutines{$labelselector=\"$labelvalue\",kubernetes_namespace=~\"$namespace\"}", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{kubernetes_namespace}} {{kubernetes_pod_name}}", - "refId": "A", - "step": 2 - } - ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "Goroutines", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${DS_PROMETHEUS}", - "editable": true, - "error": false, - "fill": 1, - "gridPos": { - "h": 7, - "w": 24, - "x": 0, - "y": 41 - }, - "id": 18, - "legend": { - "alignAsTable": true, - "avg": false, - "current": false, - "max": false, - "min": false, - "rightSide": true, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "links": [], - "nullPointMode": "connected", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "go_gc_duration_seconds{$labelselector=\"$labelvalue\", quantile=\"1\"}", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{quantile}} {{kubernetes_namespace}} {{kubernetes_pod_name}}", - "refId": "A", - "step": 2 - }, - { - "expr": "", - "format": "time_series", - "intervalFactor": 1, - "refId": "B" - 
} - ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "GC Time Quantiles", - "tooltip": { - "msResolution": false, - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "s", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] - } - ], - "refresh": "30s", - "revision": null, - "schemaVersion": 16, - "style": "dark", - "tags": [ - "thanos" - ], - "templating": { - "list": [ - { - "auto": false, - "auto_count": 30, - "auto_min": "10s", - "current": { - "text": "1m", - "value": "1m" - }, - "hide": 0, - "label": null, - "name": "interval", - "options": [ - { - "selected": true, - "text": "1m", - "value": "1m" - }, - { - "selected": false, - "text": "10m", - "value": "10m" - }, - { - "selected": false, - "text": "30m", - "value": "30m" - }, - { - "selected": false, - "text": "1h", - "value": "1h" - }, - { - "selected": false, - "text": "6h", - "value": "6h" - }, - { - "selected": false, - "text": "12h", - "value": "12h" - }, - { - "selected": false, - "text": "1d", - "value": "1d" - }, - { - "selected": false, - "text": "7d", - "value": "7d" - }, - { - "selected": false, - "text": "14d", - "value": "14d" - }, - { - "selected": false, - "text": "30d", - "value": "30d" - } - ], - "query": "1m,10m,30m,1h,6h,12h,1d,7d,14d,30d", - "refresh": 2, - "type": "interval" - }, - { - "current": { - "value": "${VAR_LABELVALUE}", - "text": "${VAR_LABELVALUE}" - }, - "hide": 2, - "label": "", - "name": "labelvalue", - "options": [ - { - "value": "${VAR_LABELVALUE}", - "text": "${VAR_LABELVALUE}" - } - ], - "query": "${VAR_LABELVALUE}", - "type": "constant" - }, - { - "allValue": ".*", - "current": {}, - "datasource": "${DS_PROMETHEUS}", - "hide": 0, - "includeAll": 
true, - "label": "namespace", - "multi": false, - "name": "namespace", - "options": [], - "query": "label_values(kubernetes_namespace)", - "refresh": 1, - "regex": "", - "sort": 0, - "tagValuesQuery": "", - "tags": [], - "tagsQuery": "", - "type": "query", - "useTags": false - }, - { - "current": { - "value": "${VAR_LABELSELECTOR}", - "text": "${VAR_LABELSELECTOR}" - }, - "hide": 2, - "label": null, - "name": "labelselector", - "options": [ - { - "value": "${VAR_LABELSELECTOR}", - "text": "${VAR_LABELSELECTOR}" - } - ], - "query": "${VAR_LABELSELECTOR}", - "type": "constant" - } - ] - }, - "time": { - "from": "now-30m", - "to": "now" - }, - "timepicker": { - "refresh_intervals": [ - "5s", - "10s", - "30s", - "1m", - "5m", - "15m", - "30m", - "1h", - "2h", - "1d" - ], - "time_options": [ - "5m", - "15m", - "1h", - "6h", - "12h", - "24h", - "2d", - "7d", - "30d" - ] - }, - "timezone": "browser", - "title": "Thanos Sidecar", - "uid": "IOteEKHik", - "version": 2 -} \ No newline at end of file diff --git a/examples/grafana/thanos-store.json b/examples/grafana/thanos-store.json deleted file mode 100644 index 4027c2da4d..0000000000 --- a/examples/grafana/thanos-store.json +++ /dev/null @@ -1,1299 +0,0 @@ -{ - "__inputs": [ - { - "name": "DS_PROMETHEUS", - "label": "Prometheus", - "description": "", - "type": "datasource", - "pluginId": "prometheus", - "pluginName": "Prometheus" - }, - { - "name": "VAR_LABELSELECTOR", - "type": "constant", - "label": "labelselector", - "value": "app", - "description": "" - }, - { - "name": "VAR_LABELVALUE", - "type": "constant", - "label": "labelvalue", - "value": "thanos-store", - "description": "" - } - ], - "__requires": [ - { - "type": "grafana", - "id": "grafana", - "name": "Grafana", - "version": "5.0.3" - }, - { - "type": "panel", - "id": "graph", - "name": "Graph", - "version": "5.0.0" - }, - { - "type": "datasource", - "id": "prometheus", - "name": "Prometheus", - "version": "5.0.0" - } - ], - "annotations": { - "list": [ - { - 
"builtIn": 1, - "datasource": "-- Grafana --", - "enable": true, - "hide": true, - "iconColor": "rgba(0, 211, 255, 1)", - "name": "Annotations & Alerts", - "type": "dashboard" - } - ] - }, - "editable": true, - "gnetId": null, - "graphTooltip": 1, - "id": null, - "iteration": 1529906350696, - "links": [], - "panels": [ - { - "collapsed": false, - "gridPos": { - "h": 1, - "w": 24, - "x": 0, - "y": 0 - }, - "id": 22, - "panels": [], - "repeat": null, - "title": "thanos-store", - "type": "row" - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${DS_PROMETHEUS}", - "editable": true, - "error": false, - "fill": 1, - "gridPos": { - "h": 7, - "w": 24, - "x": 0, - "y": 1 - }, - "id": 28, - "legend": { - "alignAsTable": true, - "avg": true, - "current": true, - "hideEmpty": true, - "hideZero": false, - "max": false, - "min": false, - "rightSide": true, - "show": true, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "links": [], - "nullPointMode": "connected", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "sum(rate(grpc_server_handled_total{$labelselector=\"$labelvalue\",kubernetes_namespace=~\"$namespace\"}[$interval])) by (grpc_code, grpc_method, kubernetes_namespace)", - "format": "time_series", - "intervalFactor": 1, - "legendFormat": "{{grpc_code}} {{grpc_method}} {{kubernetes_namespace}}", - "refId": "B" - } - ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "Query RPS [$interval]", - "tooltip": { - "msResolution": false, - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "short", - "label": null, - "logBase": 
1, - "max": null, - "min": null, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": "0", - "show": true - } - ] - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${DS_PROMETHEUS}", - "editable": true, - "error": false, - "fill": 1, - "gridPos": { - "h": 7, - "w": 24, - "x": 0, - "y": 8 - }, - "id": 29, - "legend": { - "alignAsTable": true, - "avg": true, - "current": true, - "hideEmpty": true, - "hideZero": false, - "max": false, - "min": false, - "rightSide": true, - "show": true, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "links": [], - "nullPointMode": "connected", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "histogram_quantile(0.9999, sum(rate(grpc_server_handling_seconds_bucket{$labelselector=\"$labelvalue\"}[$interval])) by (grpc_method, le, kubernetes_namespace))", - "format": "time_series", - "intervalFactor": 1, - "legendFormat": "{{grpc_method}} {{kubernetes_namespace}}", - "refId": "B" - } - ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "Response Time 99.99 Quantile [$interval]", - "tooltip": { - "msResolution": false, - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "s", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": "0", - "show": true - } - ] - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${DS_PROMETHEUS}", - "editable": true, - "error": false, - "fill": 1, - 
"gridPos": { - "h": 7, - "w": 24, - "x": 0, - "y": 15 - }, - "id": 32, - "legend": { - "alignAsTable": true, - "avg": true, - "current": true, - "hideEmpty": false, - "hideZero": false, - "max": false, - "min": false, - "rightSide": true, - "show": true, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "links": [], - "nullPointMode": "null as zero", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "histogram_quantile(0.9999, sum(rate(thanos_bucket_store_sent_chunk_size_bytes_bucket{$labelselector=\"$labelvalue\"}[$interval])) by (le, kubernetes_namespace))", - "format": "time_series", - "intervalFactor": 1, - "legendFormat": "{{kubernetes_namespace}}", - "refId": "B" - } - ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "Response Size 99.99 Quantile [$interval]", - "tooltip": { - "msResolution": false, - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "bytes", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": "0", - "show": true - } - ] - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${DS_PROMETHEUS}", - "editable": true, - "error": false, - "fill": 1, - "gridPos": { - "h": 7, - "w": 12, - "x": 0, - "y": 22 - }, - "id": 5, - "legend": { - "alignAsTable": true, - "avg": true, - "current": true, - "hideEmpty": true, - "hideZero": false, - "max": false, - "min": false, - "rightSide": true, - "show": true, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": 
true, - "linewidth": 1, - "links": [], - "nullPointMode": "connected", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "sum(rate(thanos_objstore_bucket_operations_total{$labelselector=\"$labelvalue\",kubernetes_namespace=~\"$namespace\"}[$interval])) by (kubernetes_namespace, operation)", - "format": "time_series", - "intervalFactor": 1, - "legendFormat": "bucket {{operation}} {{kubernetes_namespace}}", - "refId": "A" - }, - { - "expr": "sum(rate(thanos_bucket_store_block_drops_total{$labelselector=\"$labelvalue\",kubernetes_namespace=~\"$namespace\"}[$interval])) by (kubernetes_namespace)", - "format": "time_series", - "intervalFactor": 1, - "legendFormat": "block drops {{kubernetes_namespace}}", - "refId": "B" - }, - { - "expr": "sum(rate(thanos_bucket_store_block_loads_total{$labelselector=\"$labelvalue\",kubernetes_namespace=~\"$namespace\"}[$interval])) by (kubernetes_namespace)", - "format": "time_series", - "intervalFactor": 1, - "legendFormat": "block loads {{kubernetes_namespace}}", - "refId": "C" - } - ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "Operations/s [$interval]", - "tooltip": { - "msResolution": false, - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": "0", - "show": true - } - ] - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${DS_PROMETHEUS}", - "editable": true, - "error": false, - "fill": 1, - "gridPos": { - "h": 7, - "w": 12, - "x": 12, - "y": 22 - }, - "id": 27, - "legend": 
{ - "alignAsTable": true, - "avg": true, - "current": true, - "hideEmpty": true, - "hideZero": false, - "max": false, - "min": false, - "rightSide": true, - "show": true, - "sort": "avg", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "links": [], - "nullPointMode": "connected", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "sum(rate(thanos_objstore_bucket_operation_failures_total{$labelselector=\"$labelvalue\",kubernetes_namespace=~\"$namespace\"}[$interval])) by (operation,kubernetes_namespace)", - "format": "time_series", - "intervalFactor": 1, - "legendFormat": "bucket {{operation}} {{kubernetes_namespace}}", - "refId": "A" - }, - { - "expr": "sum(rate(thanos_bucket_store_block_drop_failures_total{$labelselector=\"$labelvalue\",kubernetes_namespace=~\"$namespace\"}[$interval])) by (kubernetes_namespace)", - "format": "time_series", - "intervalFactor": 1, - "legendFormat": "block drop {{kubernetes_namespace}}", - "refId": "B" - }, - { - "expr": "sum(rate(thanos_bucket_store_block_load_failures_total{$labelselector=\"$labelvalue\",kubernetes_namespace=~\"$namespace\"}[$interval])) by (kubernetes_namespace)", - "format": "time_series", - "intervalFactor": 1, - "legendFormat": "block load {{kubernetes_namespace}}", - "refId": "C" - } - ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "Operation Failures/s [$interval]", - "tooltip": { - "msResolution": false, - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": 
"0", - "show": true - } - ] - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${DS_PROMETHEUS}", - "editable": true, - "error": false, - "fill": 1, - "gridPos": { - "h": 7, - "w": 12, - "x": 0, - "y": 29 - }, - "id": 6, - "legend": { - "alignAsTable": true, - "avg": true, - "current": true, - "hideEmpty": false, - "hideZero": false, - "max": false, - "min": false, - "rightSide": true, - "show": true, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "links": [], - "nullPointMode": "null as zero", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "histogram_quantile(0.9999, sum(rate(thanos_objstore_bucket_operation_duration_seconds_bucket{$labelselector=\"$labelvalue\",kubernetes_namespace=~\"$namespace\"}[$interval])) by (kubernetes_namespace,le, operation))", - "format": "time_series", - "instant": false, - "intervalFactor": 2, - "legendFormat": "99.99 bucket {{operation}} {{kubernetes_namespace}}", - "refId": "C" - }, - { - "expr": "histogram_quantile(0.9999, sum(rate(thanos_bucket_store_series_get_all_duration_seconds_bucket{$labelselector=\"$labelvalue\",kubernetes_namespace=~\"$namespace\"}[$interval])) by (kubernetes_namespace,le))", - "format": "time_series", - "intervalFactor": 1, - "legendFormat": "99.99 get all {{kubernetes_namespace}}", - "refId": "A" - }, - { - "expr": "histogram_quantile(0.9999, sum(rate(thanos_bucket_store_series_merge_duration_seconds_bucket{$labelselector=\"$labelvalue\",kubernetes_namespace=~\"$namespace\"}[$interval])) by (kubernetes_namespace,le))", - "format": "time_series", - "intervalFactor": 1, - "legendFormat": "99.99 merge {{kubernetes_namespace}}", - "refId": "B" - } - ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "Operation 
Time Quantile [$interval]", - "tooltip": { - "msResolution": false, - "shared": true, - "sort": 1, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "s", - "label": null, - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": "0", - "show": true - } - ] - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${DS_PROMETHEUS}", - "editable": true, - "error": false, - "fill": 1, - "gridPos": { - "h": 7, - "w": 12, - "x": 12, - "y": 29 - }, - "id": 30, - "legend": { - "alignAsTable": true, - "avg": true, - "current": true, - "hideEmpty": true, - "hideZero": false, - "max": false, - "min": false, - "rightSide": true, - "show": true, - "sortDesc": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "links": [], - "nullPointMode": "connected", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "sum(rate(thanos_store_index_cache_items_added_total{$labelselector=\"$labelvalue\",kubernetes_namespace=~\"$namespace\"}[$interval])) by (kubernetes_namespace,item_type)", - "format": "time_series", - "intervalFactor": 1, - "legendFormat": "added {{item_type}} {{kubernetes_namespace}}", - "refId": "D" - }, - { - "expr": "sum(rate(thanos_store_index_cache_items_evicted_total{$labelselector=\"$labelvalue\",kubernetes_namespace=~\"$namespace\"}[$interval])) by (kubernetes_namespace,item_type)", - "format": "time_series", - "intervalFactor": 1, - "legendFormat": "evicted {{item_type}} {{kubernetes_namespace}}", - "refId": "E" - }, - { - "expr": 
"sum(rate(thanos_store_index_cache_requests_total{$labelselector=\"$labelvalue\",kubernetes_namespace=~\"$namespace\"}[$interval])) by (kubernetes_namespace,item_type)", - "format": "time_series", - "intervalFactor": 1, - "legendFormat": "requests {{item_type}} {{kubernetes_namespace}}", - "refId": "F" - }, - { - "expr": "sum(rate(thanos_store_index_cache_hits_total{$labelselector=\"$labelvalue\",kubernetes_namespace=~\"$namespace\"}[$interval])) by (kubernetes_namespace, item_type)", - "format": "time_series", - "intervalFactor": 1, - "legendFormat": "hits {{item_type}} {{kubernetes_namespace}}", - "refId": "A" - } - ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "Cache Ops/s [$interval]", - "tooltip": { - "msResolution": false, - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": "0", - "show": true - } - ] - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${DS_PROMETHEUS}", - "editable": true, - "error": false, - "fill": 1, - "gridPos": { - "h": 7, - "w": 24, - "x": 0, - "y": 36 - }, - "id": 31, - "legend": { - "alignAsTable": true, - "avg": true, - "current": true, - "hideEmpty": false, - "hideZero": false, - "max": false, - "min": false, - "rightSide": true, - "show": true, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "links": [], - "nullPointMode": "null as zero", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": 
"thanos_bucket_store_series_blocks_queried{$labelselector=\"$labelvalue\",quantile=\"0.99\",kubernetes_namespace=~\"$namespace\"}", - "format": "time_series", - "instant": false, - "intervalFactor": 2, - "legendFormat": "blocks queried {{kubernetes_pod_name}} {{kubernetes_namespace}}", - "refId": "C" - }, - { - "expr": "thanos_bucket_store_series_data_fetched{$labelselector=\"$labelvalue\",quantile=\"0.99\",kubernetes_namespace=~\"$namespace\"}", - "format": "time_series", - "intervalFactor": 1, - "legendFormat": "data fetched {{kubernetes_pod_name}} {{kubernetes_namespace}}", - "refId": "A" - }, - { - "expr": "thanos_bucket_store_series_result_series{$labelselector=\"$labelvalue\",quantile=\"0.99\",kubernetes_namespace=~\"$namespace\"}", - "format": "time_series", - "intervalFactor": 1, - "legendFormat": "result series {{kubernetes_pod_name}} {{kubernetes_namespace}}", - "refId": "D" - } - ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "Pod Operation Time 99th Quantile [$interval]", - "tooltip": { - "msResolution": false, - "shared": true, - "sort": 1, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "s", - "label": null, - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": "0", - "show": true - } - ] - }, - { - "collapsed": false, - "gridPos": { - "h": 1, - "w": 24, - "x": 0, - "y": 43 - }, - "id": 25, - "panels": [], - "repeat": null, - "title": "Ops", - "type": "row" - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${DS_PROMETHEUS}", - "editable": true, - "error": false, - "fill": 1, - "gridPos": { - "h": 7, - "w": 12, - "x": 0, - "y": 44 - }, - "id": 13, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - 
"total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "links": [], - "nullPointMode": "connected", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "go_memstats_heap_alloc_bytes{$labelselector=\"$labelvalue\",kubernetes_namespace=~\"$namespace\"}", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{kubernetes_namespace}} {{kubernetes_pod_name}}", - "refId": "A", - "step": 2 - } - ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "Memory Used", - "tooltip": { - "msResolution": false, - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "bytes", - "label": null, - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${DS_PROMETHEUS}", - "fill": 1, - "gridPos": { - "h": 7, - "w": 12, - "x": 12, - "y": 44 - }, - "id": 19, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "links": [], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "go_goroutines{$labelselector=\"$labelvalue\",kubernetes_namespace=~\"$namespace\"}", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{kubernetes_namespace}} {{kubernetes_pod_name}}", - "refId": "A", - "step": 2 - } - ], - 
"thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "Goroutines", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${DS_PROMETHEUS}", - "editable": true, - "error": false, - "fill": 1, - "gridPos": { - "h": 7, - "w": 24, - "x": 0, - "y": 51 - }, - "id": 18, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "links": [], - "nullPointMode": "connected", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "go_gc_duration_seconds{$labelselector=\"$labelvalue\", quantile=\"1\"}", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{kubernetes_namespace}} {{kubernetes_pod_name}} ", - "refId": "A", - "step": 2 - } - ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "GC Time Quantiles", - "tooltip": { - "msResolution": false, - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "s", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] - } - ], - "refresh": "30s", - 
"revision": null, - "schemaVersion": 16, - "style": "dark", - "tags": [ - "thanos" - ], - "templating": { - "list": [ - { - "auto": false, - "auto_count": 30, - "auto_min": "10s", - "current": { - "text": "1m", - "value": "1m" - }, - "hide": 0, - "label": null, - "name": "interval", - "options": [ - { - "selected": true, - "text": "1m", - "value": "1m" - }, - { - "selected": false, - "text": "10m", - "value": "10m" - }, - { - "selected": false, - "text": "30m", - "value": "30m" - }, - { - "selected": false, - "text": "1h", - "value": "1h" - }, - { - "selected": false, - "text": "6h", - "value": "6h" - }, - { - "selected": false, - "text": "12h", - "value": "12h" - }, - { - "selected": false, - "text": "1d", - "value": "1d" - }, - { - "selected": false, - "text": "7d", - "value": "7d" - }, - { - "selected": false, - "text": "14d", - "value": "14d" - }, - { - "selected": false, - "text": "30d", - "value": "30d" - } - ], - "query": "1m,10m,30m,1h,6h,12h,1d,7d,14d,30d", - "refresh": 2, - "type": "interval" - }, - { - "allValue": ".*", - "current": {}, - "datasource": "${DS_PROMETHEUS}", - "hide": 0, - "includeAll": true, - "label": "namespace", - "multi": false, - "name": "namespace", - "options": [], - "query": "label_values(kubernetes_namespace)", - "refresh": 1, - "regex": "", - "sort": 0, - "tagValuesQuery": "", - "tags": [], - "tagsQuery": "", - "type": "query", - "useTags": false - }, - { - "current": { - "value": "${VAR_LABELSELECTOR}", - "text": "${VAR_LABELSELECTOR}" - }, - "hide": 2, - "label": null, - "name": "labelselector", - "options": [ - { - "value": "${VAR_LABELSELECTOR}", - "text": "${VAR_LABELSELECTOR}" - } - ], - "query": "${VAR_LABELSELECTOR}", - "type": "constant" - }, - { - "current": { - "value": "${VAR_LABELVALUE}", - "text": "${VAR_LABELVALUE}" - }, - "hide": 2, - "label": null, - "name": "labelvalue", - "options": [ - { - "value": "${VAR_LABELVALUE}", - "text": "${VAR_LABELVALUE}" - } - ], - "query": "${VAR_LABELVALUE}", - "type": "constant" 
- } - ] - }, - "time": { - "from": "now-30m", - "to": "now" - }, - "timepicker": { - "refresh_intervals": [ - "5s", - "10s", - "30s", - "1m", - "5m", - "15m", - "30m", - "1h", - "2h", - "1d" - ], - "time_options": [ - "5m", - "15m", - "1h", - "6h", - "12h", - "24h", - "2d", - "7d", - "30d" - ] - }, - "timezone": "browser", - "title": "Thanos Store", - "uid": "KqPVKRIiz", - "version": 2 -} \ No newline at end of file diff --git a/jsonnetfile.json b/jsonnetfile.json new file mode 100644 index 0000000000..1bcb73e8a9 --- /dev/null +++ b/jsonnetfile.json @@ -0,0 +1,33 @@ +{ + "dependencies": [ + { + "name": "thanos-mixin", + "source": { + "local": { + "directory": "mixin/thanos" + } + }, + "version": "." + }, + { + "name": "grafonnet", + "source": { + "git": { + "remote": "https://github.com/grafana/grafonnet-lib", + "subdir": "grafonnet" + } + }, + "version": "master" + }, + { + "name": "grafana-builder", + "source": { + "git": { + "remote": "https://github.com/grafana/jsonnet-libs", + "subdir": "grafana-builder" + } + }, + "version": "master" + } + ] +} diff --git a/jsonnetfile.lock.json b/jsonnetfile.lock.json new file mode 100644 index 0000000000..ae23cf5774 --- /dev/null +++ b/jsonnetfile.lock.json @@ -0,0 +1,35 @@ +{ + "dependencies": [ + { + "name": "grafana-builder", + "source": { + "git": { + "remote": "https://github.com/grafana/jsonnet-libs", + "subdir": "grafana-builder" + } + }, + "version": "f4c59f64f80442f871a06c91edf74d014b82acaf", + "sum": "ELsYwK+kGdzX1mee2Yy+/b2mdO4Y503BOCDkFzwmGbE=" + }, + { + "name": "grafonnet", + "source": { + "git": { + "remote": "https://github.com/grafana/grafonnet-lib", + "subdir": "grafonnet" + } + }, + "version": "69bc267211790a1c3f4ea6e6211f3e8ffe22f987", + "sum": "BjHfWzqSAgtAKEVD6ipoYOkb8XT5wSBIboY4ZLwhlOU=" + }, + { + "name": "thanos-mixin", + "source": { + "local": { + "directory": "mixin/thanos" + } + }, + "version": "" + } + ] +} diff --git a/mixin/thanos-grafana-builder/builder.libsonnet 
// Thanos-specific extensions on top of the upstream `grafana-builder` library.
//
// The object below is merged over `grafana-builder/grafana.libsonnet` and then
// combined with the gRPC, HTTP and SLO panel helpers, so every helper can reach
// its siblings through `$`.
local grafanaTemplate = (import 'grafonnet/grafana.libsonnet').template;

// All targets built in this file share format, interval factor and step;
// only the query, the legend and the refId vary.
local timeSeriesTarget(query, legend, id) = {
  refId: id,
  expr: query,
  legendFormat: legend,
  format: 'time_series',
  intervalFactor: 2,
  step: 10,
};

// Wraps a series selector into a summed rate over the dashboard `$interval`.
local summedRate(selector) = 'sum(rate(' + selector + '[$interval]))';

(import 'grafana-builder/grafana.libsonnet') +
{
  // Mixin that marks a row as collapsed.
  collapse: {
    collapse: true,
  },

  // Same as the inherited `panel`, with an optional description that is only
  // emitted when one is given.
  panel(title, description=null)::
    super.panel(title) +
    (if description != null then { description: description } else {}),

  // Appends a dashboard link that keeps the current time range and variables.
  addDashboardLink(name): {
    links+: [
      {
        type: 'dashboard',
        title: name,
        dashboard: name,
        keepTime: true,
        includeVars: true,
      },
    ],
  },

  // Appends a query-backed template variable whose values are the `name`
  // label values of `metricName{selector}`. With `includeAll` the variable
  // also gets an "all" option that expands to `allValues`.
  template(name, metricName, selector='', includeAll=false, allValues='')::
    local query = 'label_values(%s{%s}, %s)' % [metricName, selector, name];
    local variable =
      if !includeAll then
        grafanaTemplate.new(
          name,
          '$datasource',
          query,
          label=name,
          refresh=1,
          sort=2,
        )
      else
        grafanaTemplate.new(
          name,
          '$datasource',
          query,
          label=name,
          refresh=1,
          sort=2,
          current='all',
          allValues=allValues,
          includeAll=true
        );

    { templating+: { list+: [variable] } },

  // Small single-field mixins for panels.
  spanSize(size):: { span: size },

  postfix(postfix):: { postfix: postfix },

  sparkline:: {
    sparkline: {
      show: true,
      lineColor: 'rgb(31, 120, 193)',
      fillColor: 'rgba(31, 118, 189, 0.18)',
    },
  },

  // P99 / mean / P50 of the histogram `metricName`, per job, scaled by
  // `multiplier`.
  latencyPanel(metricName, selector, multiplier='1')::
    local quantileArgs = [metricName, selector, multiplier];
    {
      nullPointMode: 'null as zero',
      yaxes: $.yaxes('s'),
      targets: [
        timeSeriesTarget(
          'histogram_quantile(0.99, sum(rate(%s_bucket{%s}[$interval])) by (job, le)) * %s' % quantileArgs,
          'P99 {{job}}',
          'A'
        ),
        timeSeriesTarget(
          'sum(rate(%s_sum{%s}[$interval])) by (job) * %s / sum(rate(%s_count{%s}[$interval])) by (job)' % [metricName, selector, multiplier, metricName, selector],
          'mean {{job}}',
          'B'
        ),
        timeSeriesTarget(
          'histogram_quantile(0.50, sum(rate(%s_bucket{%s}[$interval])) by (job, le)) * %s' % quantileArgs,
          'P50 {{job}}',
          'C'
        ),
      ],
    },

  // Error ratio: rate of `selectorErr` divided by rate of `selectorTotal`.
  qpsErrTotalPanel(selectorErr, selectorTotal)::
    local failed = summedRate(selectorErr);
    local all = summedRate(selectorTotal);
    {
      aliasColors: {
        'error': '#E24D42',
      },
      yaxes: $.yaxes({ format: 'percentunit' }),
      targets: [
        timeSeriesTarget('%s / %s' % [failed, all], 'error', 'A'),
      ],
    } + $.stack,

  // Error ratio and its complement (success ratio), capped at 100%.
  qpsSuccErrRatePanel(selectorErr, selectorTotal)::
    local failed = summedRate(selectorErr);
    local all = summedRate(selectorTotal);
    {
      aliasColors: {
        success: '#7EB26D',
        'error': '#E24D42',
      },
      yaxes: $.yaxes({ format: 'percentunit', max: 1 }),
      targets: [
        timeSeriesTarget('%s / %s' % [failed, all], 'error', 'A'),
        timeSeriesTarget('(%s - %s) / %s' % [all, failed, all], 'success', 'B'),
      ],
    } + $.stack,

  // Collapsed "Resources" row with Go runtime memory, goroutine and GC panels.
  resourceUtilizationRow()::
    local memoryPanel =
      $.panel('Memory Used') +
      $.queryPanel(
        [
          'go_memstats_alloc_bytes{namespace="$namespace",job=~"$job",kubernetes_pod_name=~"$pod"}',
          'go_memstats_heap_alloc_bytes{namespace="$namespace",job=~"$job",kubernetes_pod_name=~"$pod"}',
          'rate(go_memstats_alloc_bytes_total{namespace="$namespace",job=~"$job",kubernetes_pod_name=~"$pod"}[30s])',
          'rate(go_memstats_heap_alloc_bytes{namespace="$namespace",job=~"$job",kubernetes_pod_name=~"$pod"}[30s])',
          'go_memstats_stack_inuse_bytes{namespace="$namespace",job=~"$job",kubernetes_pod_name=~"$pod"}',
          'go_memstats_heap_inuse_bytes{namespace="$namespace",job=~"$job",kubernetes_pod_name=~"$pod"}',
        ],
        [
          'alloc all {{pod}}',
          'alloc heap {{pod}}',
          'alloc rate all {{pod}}',
          'alloc rate heap {{pod}}',
          'inuse stack {{pod}}',
          'inuse heap {{pod}}',
        ]
      );
    local goroutinesPanel =
      $.panel('Goroutines') +
      $.queryPanel(
        'go_goroutines{namespace="$namespace",job=~"$job"}',
        '{{pod}}'
      );
    local gcPanel =
      $.panel('GC Time Quantiles') +
      $.queryPanel(
        'go_gc_duration_seconds{namespace="$namespace",job=~"$job",kubernetes_pod_name=~"$pod"}',
        '{{quantile}} {{pod}}'
      );

    $.row('Resources')
    .addPanel(memoryPanel)
    .addPanel(goroutinesPanel)
    .addPanel(gcPanel) +
    $.collapse,
} +
(import 'grpc.libsonnet') +
(import 'http.libsonnet') +
(import 'slo.libsonnet')
// gRPC panel helpers, merged into the dashboard builder by builder.libsonnet.
//
// Every helper takes:
//   type     - 'client' selects the `grpc_client_*` metrics; any other value
//              selects `grpc_server_*`.
//   selector - label matchers spliced verbatim between the braces of each query.
//
// `$.qpsErrTotalPanel` is defined in builder.libsonnet. `$.stack`,
// `$.queryPanel` and `$.yaxes` are not defined in builder.libsonnet itself and
// are presumably inherited from grafana-builder (confirm against that library).
{
  // Request rate per job and gRPC status code, stacked.
  grpcQpsPanel(type, selector):: {
    local prefix = if type == 'client' then 'grpc_client' else 'grpc_server',

    aliasColors: {
      Aborted: '#EAB839',
      AlreadyExists: '#7EB26D',
      FailedPrecondition: '#6ED0E0',
      Unimplemented: '#6ED0E0',
      InvalidArgument: '#EF843C',
      NotFound: '#EF843C',
      PermissionDenied: '#EF843C',
      Unauthenticated: '#EF843C',
      Canceled: '#E24D42',
      DataLoss: '#E24D42',
      DeadlineExceeded: '#E24D42',
      Internal: '#E24D42',
      OutOfRange: '#E24D42',
      ResourceExhausted: '#E24D42',
      Unavailable: '#E24D42',
      Unknown: '#E24D42',
      OK: '#7EB26D',
      'error': '#E24D42',
    },
    targets: [
      {
        expr: 'sum(rate(%s_handled_total{%s}[$interval])) by (job, grpc_code)' % [prefix, selector],
        format: 'time_series',
        intervalFactor: 2,
        legendFormat: '{{job}} {{grpc_code}}',
        refId: 'A',
        step: 10,
      },
    ],
  } + $.stack,

  // Request rate per job, gRPC method and status code, stacked.
  grpcQpsPanelDetailed(type, selector):: {
    local prefix = if type == 'client' then 'grpc_client' else 'grpc_server',
    targets: [
      {
        expr: 'sum(rate(%s_handled_total{%s}[$interval])) by (job, grpc_method, grpc_code)' % [prefix, selector],
        format: 'time_series',
        intervalFactor: 2,
        legendFormat: '{{job}} {{grpc_method}} {{grpc_code}}',
        refId: 'A',
        step: 10,
      },
    ],
  } + $.stack,

  // Ratio of requests handled with a server-fault status code to requests
  // started.
  grpcErrorsPanel(type, selector)::
    local prefix = if type == 'client' then 'grpc_client' else 'grpc_server';
    $.qpsErrTotalPanel(
      '%s_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable",%s}' % [prefix, selector],
      '%s_started_total{%s}' % [prefix, selector],
    ),

  // Rate of every non-OK response, per job, method and status code, stacked.
  grpcErrDetailsPanel(type, selector)::
    local prefix = if type == 'client' then 'grpc_client' else 'grpc_server';
    $.queryPanel(
      |||
        sum(rate(%s_handled_total{grpc_code!="OK",%s}[$interval])) by (job, grpc_method, grpc_code)
      ||| % [prefix, selector],
      '{{job}} {{grpc_method}} {{grpc_code}}'
    ) +
    $.stack,

  // P99 / mean / P50 handling latency per job, scaled by `multiplier`.
  grpcLatencyPanel(type, selector, multiplier='1')::
    local prefix = if type == 'client' then 'grpc_client' else 'grpc_server';
    $.queryPanel(
      [
        'histogram_quantile(0.99, sum(rate(%s_handling_seconds_bucket{%s}[$interval])) by (job, le)) * %s' % [prefix, selector, multiplier],
        |||
          sum(rate(%s_handling_seconds_sum{%s}[$interval])) by (job) * %s
          /
          sum(rate(%s_handling_seconds_count{%s}[$interval])) by (job)
        ||| % [prefix, selector, multiplier, prefix, selector],
        'histogram_quantile(0.50, sum(rate(%s_handling_seconds_bucket{%s}[$interval])) by (job, le)) * %s' % [prefix, selector, multiplier],
      ],
      [
        'P99 {{job}}',
        'mean {{job}}',
        'P50 {{job}}',
      ]
    ) +
    { yaxes: $.yaxes('s') },

  // P99 / mean / P50 handling latency per job AND gRPC method, scaled by
  // `multiplier`.
  grpcLatencyPanelDetailed(type, selector, multiplier='1')::
    local prefix = if type == 'client' then 'grpc_client' else 'grpc_server';
    $.queryPanel(
      [
        'histogram_quantile(0.99, sum(rate(%s_handling_seconds_bucket{%s}[$interval])) by (job, grpc_method, le)) * %s' % [prefix, selector, multiplier],
        // The mean must be grouped by grpc_method as well: the legend below
        // prints {{grpc_method}}, and grouping by job alone would collapse all
        // methods into one series with an empty method label.
        |||
          sum(rate(%s_handling_seconds_sum{%s}[$interval])) by (job, grpc_method) * %s
          /
          sum(rate(%s_handling_seconds_count{%s}[$interval])) by (job, grpc_method)
        ||| % [prefix, selector, multiplier, prefix, selector],
        'histogram_quantile(0.50, sum(rate(%s_handling_seconds_bucket{%s}[$interval])) by (job, grpc_method, le)) * %s' % [prefix, selector, multiplier],
      ],
      [
        'P99 {{job}} {{grpc_method}}',
        'mean {{job}} {{grpc_method}}',
        'P50 {{job}} {{grpc_method}}',
      ]
    ) +
    { yaxes: $.yaxes('s') },
}
// HTTP panel helpers, merged into the dashboard builder by builder.libsonnet.
//
// `metricName` is the request metric (a histogram base name for the latency
// panel) and `selector` is spliced verbatim between the braces of each query.

// All targets built in this file share format, interval factor and step;
// only the query, the legend and the refId vary.
local timeSeriesTarget(query, legend, id) = {
  refId: id,
  expr: query,
  legendFormat: legend,
  format: 'time_series',
  intervalFactor: 2,
  step: 10,
};

// Series colours keyed by status class and by the success/error aliases.
local statusColors = {
  '1xx': '#EAB839',
  '2xx': '#7EB26D',
  '3xx': '#6ED0E0',
  '4xx': '#EF843C',
  '5xx': '#E24D42',
  success: '#7EB26D',
  'error': '#E24D42',
};

{
  // Request rate per job and status class (1xx..5xx), stacked.
  httpQpsPanel(metricName, selector)::
    {
      aliasColors: statusColors,
      targets: [
        timeSeriesTarget(
          'sum(label_replace(rate(%s{%s}[$interval]),"status_code", "${1}xx", "code", "([0-9])..")) by (job, status_code)' % [metricName, selector],
          '{{job}} {{status_code}}',
          'A'
        ),
      ],
    } + $.stack,

  // Same panel as httpQpsPanel, with the single target additionally split by
  // handler.
  httpQpsPanelDetailed(metricName, selector)::
    $.httpQpsPanel(metricName, selector) +
    {
      targets: [
        timeSeriesTarget(
          'sum(label_replace(rate(%s{%s}[$interval]),"status_code", "${1}xx", "code", "([0-9])..")) by (job, handler, status_code)' % [metricName, selector],
          '{{job}} {{handler}} {{status_code}}',
          'A'
        ),
      ],
    },

  // Ratio of 5xx responses to all responses.
  httpErrPanel(metricName, selector)::
    local serverErrors = '%s{%s,code=~"5.."}' % [metricName, selector];
    local allRequests = '%s{%s}' % [metricName, selector];
    $.qpsErrTotalPanel(serverErrors, allRequests),

  // Rate of every non-2xx response per job, handler and status code, stacked.
  httpErrDetailsPanel(metricName, selector)::
    local query = 'sum(rate(%s{%s,code!~"2.."}[$interval])) by (job, handler, code)' % [metricName, selector];
    $.queryPanel(query, '{{job}} {{handler}} {{code}}') +
    { yaxes: $.yaxes({ format: 'percentunit' }) } +
    $.stack,

  // P99 / mean / P50 latency per job and handler, scaled by `multiplier`.
  httpLatencyDetailsPanel(metricName, selector, multiplier='1')::
    local quantileArgs = [metricName, selector, multiplier];
    {
      nullPointMode: 'null as zero',
      yaxes: $.yaxes('s'),
      targets: [
        timeSeriesTarget(
          'histogram_quantile(0.99, sum(rate(%s_bucket{%s}[$interval])) by (job, handler, le)) * %s' % quantileArgs,
          'P99 {{job}} {{handler}}',
          'A'
        ),
        timeSeriesTarget(
          'sum(rate(%s_sum{%s}[$interval])) by (job, handler) * %s / sum(rate(%s_count{%s}[$interval])) by (job, handler)' % [metricName, selector, multiplier, metricName, selector],
          'mean {{job}} {{handler}}',
          'B'
        ),
        timeSeriesTarget(
          'histogram_quantile(0.50, sum(rate(%s_bucket{%s}[$interval])) by (job, handler, le)) * %s' % quantileArgs,
          'P50 {{job}} {{handler}}',
          'C'
        ),
      ],
    },
}
// SLO panel helpers, merged into the dashboard builder by builder.libsonnet.

// A filled "greater than" threshold line on the left y-axis.
local thresholdAbove(limit, mode) = {
  value: limit,
  colorMode: mode,
  op: 'gt',
  fill: true,
  line: true,
  yaxis: 'left',
};

{
  // Latency panel for one quantile of a histogram, with warning and critical
  // threshold lines.
  //
  //   selector - full bucket series selector (metric name plus matchers).
  //   quantile - fraction passed to histogram_quantile, e.g. 0.99.
  //   warning / critical - y-axis values at which the threshold lines are drawn.
  sloLatency(title, description, selector, quantile, warning, critical)::
    local query = 'histogram_quantile(%.2f, sum(rate(%s[$interval])) by (job, le))' % [quantile, selector];
    local legend = '{{job}} P' + (quantile * 100);

    $.panel(title, description) +
    $.queryPanel(query, legend) +
    {
      yaxes: $.yaxes('s'),
      thresholds+: [
        thresholdAbove(warning, 'warning'),
        thresholdAbove(critical, 'critical'),
      ],
    },
}
rule and sidecar. Please feel free to contribute. + +This directory contains extensible and customizable monitoring definitions for Thanos: [Grafana](http://grafana.com/) dashboards and [Prometheus rules](https://prometheus.io/docs/prometheus/latest/configuration/recording_rules/), combined with documentation and scripts to provide an easy monitoring experience for Thanos. + +You can find more about monitoring-mixins in [the design document](https://docs.google.com/document/d/1A9xvzwqnFVSOZ5fD3blKODXfsat5fg6ZhnKu9LK3lB4/edit#heading=h.gt9r2h2gklj3), and you could check out other examples like [Prometheus Mixin](https://github.com/prometheus/prometheus/tree/master/documentation/prometheus-mixin). + +The content of this project is written in [jsonnet](http://jsonnet.org/). This project can be described both as a package and as a library. + +## Requirements + +### jsonnet + +The content of this project consists of a set of [jsonnet](http://jsonnet.org/) files making up a library to be consumed. + +We recommend using [go-jsonnet](https://github.com/google/go-jsonnet). It's an implementation of [Jsonnet](http://jsonnet.org/) in pure Go. It is feature complete but is not as heavily exercised as the [Jsonnet C++ implementation](https://github.com/google/jsonnet). + +To install: + +```shell +go get github.com/google/go-jsonnet/cmd/jsonnet +``` + +### jsonnet-bundler + +`thanos-mixin` uses [jsonnet-bundler](https://github.com/jsonnet-bundler/jsonnet-bundler#install) (the jsonnet package manager) to manage its dependencies. + +We also recommend using `jsonnet-bundler` to install or update if you decide to use `thanos-mixin` as a dependency for your custom configurations. + +To install: + +```shell +go get github.com/jsonnet-bundler/jsonnet-bundler/cmd/jb +``` + +> An example of how to install a given version of this library: `jb install github.com/thanos-io/thanos/jsonnet/thanos-mixin@master`.
+ +## Use as a library + +To use the `thanos-mixin` as a dependency, simply use the `jsonnet-bundler` install functionality: +```shell +$ mkdir thanos-mixin; cd thanos-mixin +$ jb init # Creates the initial/empty `jsonnetfile.json` +# Install the thanos-mixin dependency +$ jb install github.com/thanos-io/thanos/jsonnet/thanos-mixin@master # Creates `vendor/` & `jsonnetfile.lock.json`, and fills in `jsonnetfile.json` +``` + +To update the `thanos-mixin` as a dependency, simply use the `jsonnet-bundler` update functionality: +```shell +$ jb update +``` + +#### Configure + +This project is intended to be used as a library. You can extend and customize dashboards and alerting rules by creating your own generators, such as the generators ([alerts.jsonnet](alerts.jsonnet) and [dashboards.jsonnet](dashboards.jsonnet)) that are used to create [examples](examples). Default parameters are collected in [defaults.libsonnet](defaults.libsonnet); feel free to modify them and generate your own definitions. + +[embedmd]:# (defaults.libsonnet) +```libsonnet +{ + querier+:: { + jobPrefix: 'thanos-querier', + selector: 'job=~"%s.*"' % self.jobPrefix, + title: '%(prefix)sQuerier' % $.dashboard.prefix, + }, + store+:: { + jobPrefix: 'thanos-store', + selector: 'job=~"%s.*"' % self.jobPrefix, + title: '%(prefix)sStore' % $.dashboard.prefix, + }, + receiver+:: { + jobPrefix: 'thanos-receiver', + selector: 'job=~"%s.*"' % self.jobPrefix, + title: '%(prefix)sReceiver' % $.dashboard.prefix, + }, + ruler+:: { + jobPrefix: 'thanos-ruler', + selector: 'job=~"%s.*"' % self.jobPrefix, + title: '%(prefix)sRuler' % $.dashboard.prefix, + }, + compactor+:: { + jobPrefix: 'thanos-compactor', + selector: 'job=~"%s.*"' % self.jobPrefix, + title: '%(prefix)sCompactor' % $.dashboard.prefix, + }, + sidecar+:: { + jobPrefix: 'thanos-sidecar', + selector: 'job=~"%s.*"' % self.jobPrefix, + title: '%(prefix)sSidecar' % $.dashboard.prefix, + }, + overview+:: { + title: '%(prefix)sOverview' % $.dashboard.prefix, + }, + 
dashboard+:: { + prefix: 'Thanos / ', + tags: ['thanos-mixin'], + namespaceQuery: 'kube_pod_info', + }, +} +``` + +You can format your code using: +```shell +$ make jsonnet-format +``` + +## Examples + +This project is intended to be used as a library. However, it also provides drop-in examples to monitor Thanos. + +### Requirements + +Although all the required dependencies are handled by `Makefile`, keep in mind that in addition to the dependencies listed above, we have the following dependencies: + +#### gojsontoyaml + +`gojsontoyaml` is used to convert generated `json` definitions to `yaml`. + +To install: +```shell +go get github.com/brancz/gojsontoyaml +``` + +### Generate + +To generate examples after modifying, make sure `jsonnet` dependencies are installed. +```shell +$ make jsonnet-vendor +``` + +and then + +```shell +$ make examples +``` + +The make target runs the jsonnet code, then reads each key of the generated json and uses that as the file name, and writes the value of that key to that file, and converts each json manifest to yaml. + +> Make commands should handle dependencies for you. + +### Test and validate + +You can validate the structural correctness of your Prometheus [alerting rules](https://prometheus.io/docs/prometheus/latest/configuration/alerting_rules/) or [recording rules](https://prometheus.io/docs/prometheus/latest/configuration/recording_rules/) with: + +```shell +$ make example-rules-lint +``` + +Check out [tests.yaml](examples/alerts/tests.yaml) to add/modify tests for the mixin. To learn more about how to write tests for Prometheus, check out the [official documentation](https://www.prometheus.io/docs/prometheus/latest/configuration/unit_testing_rules/). 
+ +You test alerts with: + +```shell +$ make alerts-test +``` + +--- diff --git a/mixin/thanos/alerts.jsonnet b/mixin/thanos/alerts.jsonnet new file mode 100644 index 0000000000..574da7f5a6 --- /dev/null +++ b/mixin/thanos/alerts.jsonnet @@ -0,0 +1,4 @@ +( + (import 'mixin.libsonnet') + + (import 'defaults.libsonnet') +).prometheusAlerts diff --git a/mixin/thanos/alerts/absent.libsonnet b/mixin/thanos/alerts/absent.libsonnet new file mode 100644 index 0000000000..a134fc8f60 --- /dev/null +++ b/mixin/thanos/alerts/absent.libsonnet @@ -0,0 +1,37 @@ +{ + local thanos = self, + + // We build alerts for the presence of all these jobs. + jobs:: { + ThanosQuerier: thanos.querier.selector, + ThanosStore: thanos.store.selector, + ThanosReceiver: thanos.receiver.selector, + ThanosRuler: thanos.ruler.selector, + ThanosCompactor: thanos.compactor.selector, + ThanosSidecar: thanos.sidecar.selector, + }, + + prometheusAlerts+:: { + groups+: [ + { + name: 'thanos-component-absent.rules', + rules: [ + { + alert: '%sIsDown' % name, + expr: ||| + absent(up{%s} == 1) + ||| % thanos.jobs[name], + 'for': '5m', + labels: { + severity: 'critical', + }, + annotations: { + message: '%s has disappeared from Prometheus target discovery.' 
% name, + }, + } + for name in std.objectFields(thanos.jobs) + ], + }, + ], + }, +} diff --git a/mixin/thanos/alerts/alerts.libsonnet b/mixin/thanos/alerts/alerts.libsonnet new file mode 100644 index 0000000000..0eb63dc98d --- /dev/null +++ b/mixin/thanos/alerts/alerts.libsonnet @@ -0,0 +1,6 @@ +(import 'compactor.libsonnet') + +(import 'querier.libsonnet') + +(import 'receiver.libsonnet') + +(import 'sidecar.libsonnet') + +(import 'store.libsonnet') + +(import 'absent.libsonnet') diff --git a/mixin/thanos/alerts/compactor.libsonnet b/mixin/thanos/alerts/compactor.libsonnet new file mode 100644 index 0000000000..6c34259b9b --- /dev/null +++ b/mixin/thanos/alerts/compactor.libsonnet @@ -0,0 +1,84 @@ +{ + local thanos = self, + compactor+:: { + jobPrefix: error 'must provide job prefix for Thanos Compact alerts', + selector: error 'must provide selector for Thanos Compact alerts', + }, + prometheusAlerts+:: { + groups+: [ + { + name: 'thanos-compactor.rules', + rules: [ + { + alert: 'ThanosCompactorMultipleCompactsAreRunning', + annotations: { + message: 'You should never run more than one Thanos Compact at once. 
You have {{ $value }}', + }, + expr: 'sum(up{%(selector)s}) > 1' % thanos.compactor, + 'for': '5m', + labels: { + severity: 'warning', + }, + }, + { + alert: 'ThanosCompactorHalted', + annotations: { + message: 'Thanos Compact {{$labels.job}} has failed to run and now is halted.', + }, + expr: 'thanos_compactor_halted{%(selector)s} == 1' % thanos.compactor, + 'for': '5m', + labels: { + severity: 'warning', + }, + }, + { + alert: 'ThanosCompactorHighCompactionFailures', + annotations: { + message: 'Thanos Compact {{$labels.job}} is failing to execute {{ $value | humanize }}% of compactions.', + }, + expr: ||| + ( + sum by (job) (rate(thanos_compact_group_compactions_failures_total{%(selector)s}[5m])) + / + sum by (job) (rate(thanos_compact_group_compactions_total{%(selector)s}[5m])) + * 100 > 5 + ) + ||| % thanos.compactor, + 'for': '15m', + labels: { + severity: 'warning', + }, + }, + { + alert: 'ThanosCompactorBucketHighOperationFailures', + annotations: { + message: 'Thanos Compact {{$labels.job}} Bucket is failing to execute {{ $value | humanize }}% of operations.', + }, + expr: ||| + ( + sum by (job) (rate(thanos_objstore_bucket_operation_failures_total{%(selector)s}[5m])) + / + sum by (job) (rate(thanos_objstore_bucket_operations_total{%(selector)s}[5m])) + * 100 > 5 + ) + ||| % thanos.compactor, + 'for': '15m', + labels: { + severity: 'warning', + }, + }, + { + alert: 'ThanosCompactorHasNotRun', + annotations: { + message: 'Thanos Compact {{$labels.job}} has not uploaded anything for 24 hours.', + }, + expr: '(time() - max(thanos_objstore_bucket_last_successful_upload_time{%(selector)s})) / 60 / 60 > 24' % thanos.compactor, + labels: { + severity: 'warning', + }, + }, + ], + }, + ], + }, +} diff --git a/mixin/thanos/alerts/querier.libsonnet b/mixin/thanos/alerts/querier.libsonnet new file mode 100644 index 0000000000..38705aacf3 --- /dev/null +++ b/mixin/thanos/alerts/querier.libsonnet @@ -0,0 +1,138 @@ +{ + local thanos = self, + querier+:: { + jobPrefix: 
error 'must provide job prefix for Thanos Query alerts', + selector: error 'must provide selector for Thanos Query alerts', + }, + prometheusAlerts+:: { + groups+: [ + { + name: 'thanos-querier.rules', + rules: [ + { + alert: 'ThanosQuerierHttpRequestQueryErrorRateHigh', + annotations: { + message: 'Thanos Query {{$labels.job}} is failing to handle {{ $value | humanize }}% of "query" requests.', + }, + expr: ||| + ( + sum(rate(http_requests_total{code=~"5..", %(selector)s, handler="query"}[5m])) + / + sum(rate(http_requests_total{%(selector)s, handler="query"}[5m])) + ) * 100 > 5 + ||| % thanos.querier, + 'for': '5m', + labels: { + severity: 'critical', + }, + }, + { + alert: 'ThanosQuerierHttpRequestQueryRangeErrorRateHigh', + annotations: { + message: 'Thanos Query {{$labels.job}} is failing to handle {{ $value | humanize }}% of "query_range" requests.', + }, + expr: ||| + ( + sum(rate(http_requests_total{code=~"5..", %(selector)s, handler="query_range"}[5m])) + / + sum(rate(http_requests_total{%(selector)s, handler="query_range"}[5m])) + ) * 100 > 5 + ||| % thanos.querier, + 'for': '5m', + labels: { + severity: 'critical', + }, + }, + { + alert: 'ThanosQuerierGrpcServerErrorRate', + annotations: { + message: 'Thanos Query {{$labels.job}} is failing to handle {{ $value | humanize }}% of requests.', + }, + expr: ||| + ( + sum by (job) (rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", %(selector)s}[5m])) + / + sum by (job) (rate(grpc_server_started_total{%(selector)s}[5m])) + * 100 > 5 + ) + ||| % thanos.querier, + 'for': '5m', + labels: { + severity: 'warning', + }, + }, + { + alert: 'ThanosQuerierGrpcClientErrorRate', + annotations: { + message: 'Thanos Query {{$labels.job}} is failing to send {{ $value | humanize }}% of requests.', + }, + expr: ||| + ( + sum by (job) (rate(grpc_client_handled_total{grpc_code!="OK", %(selector)s}[5m])) + / + sum by (job) 
(rate(grpc_client_started_total{%(selector)s}[5m])) + * 100 > 5 + ) + ||| % thanos.querier, + 'for': '5m', + labels: { + severity: 'warning', + }, + }, + { + alert: 'ThanosQuerierHighDNSFailures', // Fires when more than 1 percent of store API DNS lookups fail over 5m. + annotations: { + message: 'Thanos Query {{$labels.job}} has {{ $value | humanize }}% of failing DNS queries.', + }, + expr: ||| + ( + sum by (job) (rate(thanos_querier_store_apis_dns_failures_total{%(selector)s}[5m])) + / + sum by (job) (rate(thanos_querier_store_apis_dns_lookups_total{%(selector)s}[5m])) + * 100 > 1 + ) + ||| % thanos.querier, + 'for': '15m', + labels: { + severity: 'warning', + }, + }, + { + alert: 'ThanosQuerierInstantLatencyHigh', // P99 over the last 5m of traffic; the second clause mutes the alert when no requests arrive. + annotations: { + message: 'Thanos Query {{$labels.job}} has a 99th percentile latency of {{ $value }} seconds for instant queries.', + }, + expr: ||| + ( + histogram_quantile(0.99, sum by (job, le) (rate(http_request_duration_seconds_bucket{%(selector)s, handler="query"}[5m]))) > 10 + and + sum by (job) (rate(http_request_duration_seconds_count{%(selector)s, handler="query"}[5m])) > 0 + ) + ||| % thanos.querier, + 'for': '10m', + labels: { + severity: 'critical', + }, + }, + { + alert: 'ThanosQuerierRangeLatencyHigh', // P99 over the last 5m of traffic; the second clause mutes the alert when no requests arrive. + annotations: { + message: 'Thanos Query {{$labels.job}} has a 99th percentile latency of {{ $value }} seconds for range queries.', + }, + expr: ||| + ( + histogram_quantile(0.99, sum by (job, le) (rate(http_request_duration_seconds_bucket{%(selector)s, handler="query_range"}[5m]))) > 10 + and + sum by (job) (rate(http_request_duration_seconds_count{%(selector)s, handler="query_range"}[5m])) > 0 + ) + ||| % thanos.querier, + 'for': '10m', + labels: { + severity: 'critical', + }, + }, + ], + }, + ], + }, +} diff --git a/mixin/thanos/alerts/receiver.libsonnet b/mixin/thanos/alerts/receiver.libsonnet new file mode 100644 index 0000000000..9441bcaf41 --- /dev/null +++ b/mixin/thanos/alerts/receiver.libsonnet @@ -0,0 +1,97 @@ +{ + local thanos = self, + receiver+:: { + jobPrefix: error 'must provide job prefix for Thanos Receive alerts', + 
selector: error 'must provide selector for Thanos Receive alerts', + }, + prometheusAlerts+:: { + groups+: [ + { + name: 'thanos-receiver.rules', + rules: [ + { + alert: 'ThanosReceiverHttpRequestErrorRateHigh', + annotations: { + message: 'Thanos Receive {{$labels.job}} is failing to handle {{ $value | humanize }}% of requests.', + }, + expr: ||| + ( + sum(rate(http_requests_total{code=~"5..", %(selector)s, handler="receive"}[5m])) + / + sum(rate(http_requests_total{%(selector)s, handler="receive"}[5m])) + ) * 100 > 5 + ||| % thanos.receiver, + 'for': '5m', + labels: { + severity: 'critical', + }, + }, + { + alert: 'ThanosReceiverHttpRequestLatencyHigh', // P99 over the last 5m of traffic; the second clause mutes the alert when no requests arrive. + annotations: { + message: 'Thanos Receive {{$labels.job}} has a 99th percentile latency of {{ $value }} seconds for requests.', + }, + expr: ||| + ( + histogram_quantile(0.99, sum by (job, le) (rate(http_request_duration_seconds_bucket{%(selector)s, handler="receive"}[5m]))) > 10 + and + sum by (job) (rate(http_request_duration_seconds_count{%(selector)s, handler="receive"}[5m])) > 0 + ) + ||| % thanos.receiver, + 'for': '10m', + labels: { + severity: 'critical', + }, + }, + { + alert: 'ThanosReceiverHighForwardRequestFailures', + annotations: { + message: 'Thanos Receive {{$labels.job}} is failing to forward {{ $value | humanize }}% of requests.', + }, + expr: ||| + ( + sum by (job) (rate(thanos_receive_forward_requests_total{result="error", %(selector)s}[5m])) + / + sum by (job) (rate(thanos_receive_forward_requests_total{%(selector)s}[5m])) + * 100 > 5 + ) + ||| % thanos.receiver, + 'for': '5m', + labels: { + severity: 'critical', + }, + }, + { + alert: 'ThanosReceiverHighHashringFileRefreshFailures', + annotations: { + message: 'Thanos Receive {{$labels.job}} is failing to refresh hashring file, {{ $value | humanize }} of attempts failed.', + }, + expr: ||| + ( + sum by (job) (rate(thanos_receive_hashrings_file_errors_total{%(selector)s}[5m])) + / + sum by (job) 
(rate(thanos_receive_hashrings_file_refreshes_total{%(selector)s}[5m])) + > 0 + ) + ||| % thanos.receiver, + 'for': '15m', + labels: { + severity: 'warning', + }, + }, + { + alert: 'ThanosReceiverConfigReloadFailure', + annotations: { + message: 'Thanos Receive {{$labels.job}} has not been able to reload hashring configurations.', + }, + expr: 'avg(thanos_receive_config_last_reload_successful{%(selector)s}) by (job) != 1' % thanos.receiver, + 'for': '5m', + labels: { + severity: 'warning', + }, + }, + ], + }, + ], + }, +} diff --git a/mixin/thanos/alerts/sidecar.libsonnet b/mixin/thanos/alerts/sidecar.libsonnet new file mode 100644 index 0000000000..7c80e3af08 --- /dev/null +++ b/mixin/thanos/alerts/sidecar.libsonnet @@ -0,0 +1,28 @@ +{ + local thanos = self, + sidecar+:: { + jobPrefix: error 'must provide job prefix for Thanos Sidecar alerts', + selector: error 'must provide selector for Thanos Sidecar alerts', + }, + prometheusAlerts+:: { + groups+: [ + { + name: 'thanos-sidecar.rules', + rules: [ + { + alert: 'ThanosSidecarUnhealthy', + annotations: { + message: 'Thanos Sidecar {{$labels.job}} {{$labels.pod}} is unhealthy for {{ $value }} seconds.', + }, + expr: ||| + count(time() - max(thanos_sidecar_last_heartbeat_success_time_seconds{%(selector)s}) by (job, pod) >= 300) > 0 + ||| % thanos.sidecar, + labels: { + severity: 'critical', + }, + }, + ], + }, + ], + }, +} diff --git a/mixin/thanos/alerts/store.libsonnet b/mixin/thanos/alerts/store.libsonnet new file mode 100644 index 0000000000..77ff1baedd --- /dev/null +++ b/mixin/thanos/alerts/store.libsonnet @@ -0,0 +1,86 @@ +{ + local thanos = self, + store+:: { + jobPrefix: error 'must provide job prefix for Thanos Store alerts', + selector: error 'must provide selector for Thanos Store alerts', + }, + prometheusAlerts+:: { + groups+: [ + { + name: 'thanos-store.rules', + rules: [ + { + alert: 'ThanosStoreGrpcErrorRate', + annotations: { + message: 'Thanos Store {{$labels.job}} is failing to handle {{ $value | 
humanize }}% of requests.', + }, + expr: ||| + ( + sum by (job) (rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", %(selector)s}[5m])) + / + sum by (job) (rate(grpc_server_started_total{%(selector)s}[5m])) + * 100 > 5 + ) + ||| % thanos.store, + 'for': '5m', + labels: { + severity: 'warning', + }, + }, + { + alert: 'ThanosStoreSeriesGateLatencyHigh', // P90 over the last 5m of traffic; the second clause mutes the alert when no requests arrive. + annotations: { + message: 'Thanos Store {{$labels.job}} has a 90th percentile latency of {{ $value }} seconds for store series gate requests.', + }, + expr: ||| + ( + histogram_quantile(0.9, sum by (job, le) (rate(thanos_bucket_store_series_gate_duration_seconds_bucket{%(selector)s}[5m]))) > 2 + and + sum by (job) (rate(thanos_bucket_store_series_gate_duration_seconds_count{%(selector)s}[5m])) > 0 + ) + ||| % thanos.store, + 'for': '10m', + labels: { + severity: 'warning', + }, + }, + { + alert: 'ThanosStoreBucketHighOperationFailures', + annotations: { + message: 'Thanos Store {{$labels.job}} Bucket is failing to execute {{ $value | humanize }}% of operations.', + }, + expr: ||| + ( + sum by (job) (rate(thanos_objstore_bucket_operation_failures_total{%(selector)s}[5m])) + / + sum by (job) (rate(thanos_objstore_bucket_operations_total{%(selector)s}[5m])) + * 100 > 5 + ) + ||| % thanos.store, + 'for': '15m', + labels: { + severity: 'warning', + }, + }, + { + alert: 'ThanosStoreObjstoreOperationLatencyHigh', // P90 over the last 5m of traffic; the second clause mutes the alert when no operations run. + annotations: { + message: 'Thanos Store {{$labels.job}} Bucket has a 90th percentile latency of {{ $value }} seconds for the bucket operations.', + }, + expr: ||| + ( + histogram_quantile(0.9, sum by (job, le) (rate(thanos_objstore_bucket_operation_duration_seconds_bucket{%(selector)s}[5m]))) > 15 + and + sum by (job) (rate(thanos_objstore_bucket_operation_duration_seconds_count{%(selector)s}[5m])) > 0 + ) + ||| % thanos.store, + 'for': '10m', + labels: { + severity: 'warning', + }, + }, + ], + }, + ], + }, +} diff --git a/mixin/thanos/dashboards.jsonnet 
b/mixin/thanos/dashboards.jsonnet new file mode 100644 index 0000000000..a9cd0bbfcf --- /dev/null +++ b/mixin/thanos/dashboards.jsonnet @@ -0,0 +1,9 @@ +local dashboards = ( + (import 'mixin.libsonnet') + + (import 'defaults.libsonnet') +).grafanaDashboards; + +{ + [name]: dashboards[name] + for name in std.objectFields(dashboards) +} diff --git a/mixin/thanos/dashboards/compactor.libsonnet b/mixin/thanos/dashboards/compactor.libsonnet new file mode 100644 index 0000000000..79f594395a --- /dev/null +++ b/mixin/thanos/dashboards/compactor.libsonnet @@ -0,0 +1,165 @@ +local g = import '../thanos-grafana-builder/builder.libsonnet'; + +{ + local thanos = self, + compactor+:: { + jobPrefix: error 'must provide job prefix for Thanos Compact dashboard', + selector: error 'must provide selector for Thanos Compact dashboard', + title: error 'must provide title for Thanos Compact dashboard', + }, + grafanaDashboards+:: { + 'compactor.json': + g.dashboard(thanos.compactor.title) + .addRow( + g.row('Group Compaction') + .addPanel( + g.panel( + 'Rate', + 'Shows rate of execution for compactions against blocks that are stored in the bucket by compaction group.' + ) + + g.queryPanel( + 'sum(rate(thanos_compact_group_compactions_total{namespace="$namespace",job=~"$job"}[$interval])) by (job, group)', + 'compaction {{job}} {{group}}' + ) + + g.stack + ) + .addPanel( + g.panel( + 'Errors', + 'Shows ratio of errors compared to the total number of executed compactions against blocks that are stored in the bucket.' + ) + + g.qpsErrTotalPanel( + 'thanos_compact_group_compactions_failures_total{namespace="$namespace",job=~"$job"}', + 'thanos_compact_group_compactions_total{namespace="$namespace",job=~"$job"}', + ) + ) + ) + .addRow( + g.row('Downsample') + .addPanel( + g.panel( + 'Rate', + 'Shows rate of execution for downsampling against blocks that are stored in the bucket by compaction group.' 
+ ) + + g.queryPanel( + 'sum(rate(thanos_compact_downsample_total{namespace="$namespace",job=~"$job"}[$interval])) by (job, group)', + 'downsample {{job}} {{group}}' + ) + + g.stack + ) + .addPanel( + g.panel('Errors', 'Shows ratio of errors compared to the total number of executed downsampling against blocks that are stored in the bucket.') + + g.qpsErrTotalPanel( + 'thanos_compact_downsample_failed_total{namespace="$namespace",job=~"$job"}', + 'thanos_compact_downsample_total{namespace="$namespace",job=~"$job"}', + ) + ) + ) + .addRow( + g.row('Garbage Collection') + .addPanel( + g.panel( + 'Rate', + 'Shows rate of execution for removals of blocks if their data is available as part of a block with a higher compaction level.' + ) + + g.queryPanel( + 'sum(rate(thanos_compact_garbage_collection_total{namespace="$namespace",job=~"$job"}[$interval])) by (job)', + 'garbage collection {{job}}' + ) + + g.stack + ) + .addPanel( + g.panel('Errors', 'Shows ratio of errors compared to the total number of executed garbage collections.') + + g.qpsErrTotalPanel( + 'thanos_compact_garbage_collection_failures_total{namespace="$namespace",job=~"$job"}', + 'thanos_compact_garbage_collection_total{namespace="$namespace",job=~"$job"}', + ) + ) + .addPanel( + g.panel('Duration', 'Shows how long has it taken to execute garbage collection in quantiles.') + + g.latencyPanel('thanos_compact_garbage_collection_duration_seconds', 'namespace="$namespace",job=~"$job"') + ) + ) + .addRow( + g.row('Sync Meta') + .addPanel( + g.panel( + 'Rate', + 'Shows rate of execution for all meta files from blocks in the bucket into the memory.' 
+ ) + + g.queryPanel( + 'sum(rate(thanos_compact_sync_meta_total{namespace="$namespace",job=~"$job"}[$interval])) by (job)', + 'sync {{job}}' + ) + + g.stack + ) + .addPanel( + g.panel('Errors', 'Shows ratio of errors compared to the total number of executed meta file sync.') + + g.qpsErrTotalPanel( + 'thanos_compact_sync_meta_failures_total{namespace="$namespace",job=~"$job"}', + 'thanos_compact_sync_meta_total{namespace="$namespace",job=~"$job"}', + ) + ) + .addPanel( + g.panel('Duration', 'Shows how long has it taken to execute meta file sync, in quantiles.') + + g.latencyPanel('thanos_compact_sync_meta_duration_seconds', 'namespace="$namespace",job=~"$job"') + ) + ) + .addRow( + g.row('Object Store Operations') + .addPanel( + g.panel('Rate', 'Shows rate of execution for operations against the bucket.') + + g.queryPanel( + 'sum(rate(thanos_objstore_bucket_operations_total{namespace="$namespace",job=~"$job"}[$interval])) by (job, operation)', + '{{job}} {{operation}}' + ) + + g.stack + ) + .addPanel( + g.panel('Errors', 'Shows ratio of errors compared to the total number of executed operations against the bucket.') + + g.qpsErrTotalPanel( + 'thanos_objstore_bucket_operation_failures_total{namespace="$namespace",job=~"$job"}', + 'thanos_objstore_bucket_operations_total{namespace="$namespace",job=~"$job"}', + ) + ) + .addPanel( + g.panel('Duration', 'Shows how long has it taken to execute operations against the bucket, in quantiles.') + + g.latencyPanel('thanos_objstore_bucket_operation_duration_seconds', 'namespace="$namespace",job=~"$job"') + ) + ) + .addRow( + g.resourceUtilizationRow() + ) + + g.template('namespace', thanos.dashboard.namespaceQuery) + + g.template('job', 'up', 'namespace="$namespace",%(selector)s' % thanos.compactor, true, '%(jobPrefix)s.*' % thanos.compactor) + + g.template('pod', 'kube_pod_info', 'namespace="$namespace",created_by_name=~"%(jobPrefix)s.*"' % thanos.compactor, true, '.*'), + + __overviewRows__+:: [ + g.row('Compact') + 
.addPanel( + g.panel( + 'Compaction Rate', + 'Shows rate of execution for compactions against blocks that are stored in the bucket by compaction group.' + ) + + g.queryPanel( + 'sum(rate(thanos_compact_group_compactions_total{namespace="$namespace",%(selector)s}[$interval])) by (job)' % thanos.compactor, + 'compaction {{job}}' + ) + + g.stack + + g.addDashboardLink(thanos.compactor.title) + ) + .addPanel( + g.panel( + 'Compaction Errors', + 'Shows ratio of errors compared to the total number of executed compactions against blocks that are stored in the bucket.' + ) + + g.qpsErrTotalPanel( + 'thanos_compact_group_compactions_failures_total{namespace="$namespace",%(selector)s}' % thanos.compactor, + 'thanos_compact_group_compactions_total{namespace="$namespace",%(selector)s}' % thanos.compactor, + ) + + g.addDashboardLink(thanos.compactor.title) + ) + + g.collapse, + ], + }, +} diff --git a/mixin/thanos/dashboards/dashboards.libsonnet b/mixin/thanos/dashboards/dashboards.libsonnet new file mode 100644 index 0000000000..f12f527103 --- /dev/null +++ b/mixin/thanos/dashboards/dashboards.libsonnet @@ -0,0 +1,8 @@ +(import 'querier.libsonnet') + +(import 'store.libsonnet') + +(import 'sidecar.libsonnet') + +(import 'receiver.libsonnet') + +(import 'ruler.libsonnet') + +(import 'compactor.libsonnet') + +(import 'overview.libsonnet') + +(import 'defaults.libsonnet') diff --git a/mixin/thanos/dashboards/defaults.libsonnet b/mixin/thanos/dashboards/defaults.libsonnet new file mode 100644 index 0000000000..510baf4ba7 --- /dev/null +++ b/mixin/thanos/dashboards/defaults.libsonnet @@ -0,0 +1,49 @@ +{ + local thanos = self, + local grafanaDashboards = super.grafanaDashboards, + local grafana = import 'grafonnet/grafana.libsonnet', + local template = grafana.template, + + dashboard:: { + prefix: 'Thanos / ', + tags: error 'must provide dashboard tags', + namespaceQuery: error 'must provide a query for namespace variable for dashboard template', + }, + + // Automatically add a uid 
to each dashboard based on the MD5 hash + // of the file name and set the timezone to be 'default'. + grafanaDashboards:: { + [filename]: grafanaDashboards[filename] { + uid: std.md5(filename), + timezone: '', + tags: thanos.dashboard.tags, + + // Modify tooltip to only show a single value + rows: [ + row { + panels: [ + panel { + tooltip+: { + shared: false, + }, + } + for panel in super.panels + ], + } + for row in super.rows + ], + + templating+: { + list+: [ + template.interval( + 'interval', + '5m,10m,30m,1h,6h,12h,auto', + label='interval', + current='5m', + ), + ], + }, + } + for filename in std.objectFields(grafanaDashboards) + }, +} diff --git a/mixin/thanos/dashboards/overview.libsonnet b/mixin/thanos/dashboards/overview.libsonnet new file mode 100644 index 0000000000..ad5328676e --- /dev/null +++ b/mixin/thanos/dashboards/overview.libsonnet @@ -0,0 +1,35 @@ +local g = import '../thanos-grafana-builder/builder.libsonnet'; + +{ + local thanos = self, + overview:: { + title: error 'must provide title for Thanos Overview dashboard', + }, + grafanaDashboards+:: { + 'overview.json': + g.dashboard(thanos.overview.title) + + g.template('namespace', thanos.dashboard.namespaceQuery), + }, +} + +{ + local grafanaDashboards = super.grafanaDashboards, + grafanaDashboards+:: { + 'overview.json'+: { + + __enumeratedRows__+:: std.foldl( + function(acc, row) + local n = std.length(row.panels); + local panelIndex = acc.counter; + local panels = std.makeArray( + n, function(i) + row.panels[i] { id: panelIndex + i } + ); + acc { counter:: acc.counter + n, rows+: [row { panels: panels }] }, + grafanaDashboards.__overviewRows__, + { counter:: 1, rows: [] } + ), + + rows+: self.__enumeratedRows__.rows, + }, + }, +} diff --git a/mixin/thanos/dashboards/querier.libsonnet b/mixin/thanos/dashboards/querier.libsonnet new file mode 100644 index 0000000000..a6fd758b23 --- /dev/null +++ b/mixin/thanos/dashboards/querier.libsonnet @@ -0,0 +1,193 @@ +local g = import 
'../thanos-grafana-builder/builder.libsonnet'; + +{ + local thanos = self, + querier+:: { + jobPrefix: error 'must provide job prefix for Thanos Query dashboard', + selector: error 'must provide selector for Thanos Query dashboard', + title: error 'must provide title for Thanos Query dashboard', + }, + grafanaDashboards+:: { + 'querier.json': + g.dashboard(thanos.querier.title) + .addRow( + g.row('Instant Query API') + .addPanel( + g.panel('Rate', 'Shows rate of requests against /query for the given time.') + + g.httpQpsPanel('http_requests_total', 'namespace="$namespace",job=~"$job",handler="query"') + ) + .addPanel( + g.panel('Errors', 'Shows ratio of errors compared to the the total number of handled requests against /query.') + + g.httpErrPanel('http_requests_total', 'namespace="$namespace",job=~"$job",handler="query"') + ) + .addPanel( + g.panel('Duration', 'Shows how long has it taken to handle requests in quantiles.') + + g.latencyPanel('http_request_duration_seconds', 'namespace="$namespace",job=~"$job",handler="query"') + ) + ) + .addRow( + g.row('Range Query API') + .addPanel( + g.panel('Rate', 'Shows rate of requests against /query_range for the given time range.') + + g.httpQpsPanel('http_requests_total', 'namespace="$namespace",job=~"$job",handler="query_range"') + ) + .addPanel( + g.panel('Errors', 'Shows ratio of errors compared to the the total number of handled requests against /query_range.') + + g.httpErrPanel('http_requests_total', 'namespace="$namespace",job=~"$job",handler="query_range"') + ) + .addPanel( + g.panel('Duration', 'Shows how long has it taken to handle requests in quantiles.') + + g.latencyPanel('http_request_duration_seconds', 'namespace="$namespace",job=~"$job",handler="query_range"') + ) + ) + .addRow( + g.row('Query Detailed') + .addPanel( + g.panel('Rate', 'Shows rate of requests against /query for the given time, with handlers and codes.') + + g.httpQpsPanelDetailed('http_requests_total', 
'namespace="$namespace",job=~"$job"') + ) + .addPanel( + g.panel('Errors', 'Shows ratio of errors compared to the the total number of handled requests, in more detail.') + + g.httpErrDetailsPanel('http_requests_total', 'namespace="$namespace",job=~"$job"') + ) + .addPanel( + g.panel('Duration', 'Shows how long has it taken to handle requests in quantiles.') + + g.httpLatencyDetailsPanel('http_request_duration_seconds', 'namespace="$namespace",job=~"$job"') + ) + + g.collapse + ) + .addRow( + g.row('gRPC (Unary)') + .addPanel( + g.panel('Rate', 'Shows rate of handled Unary gRPC requests from other queriers.') + + g.grpcQpsPanel('client', 'namespace="$namespace",job=~"$job",grpc_type="unary"') + ) + .addPanel( + g.panel('Errors', 'Shows ratio of errors compared to the the total number of handled requests from other queriers.') + + g.grpcErrorsPanel('client', 'namespace="$namespace",job=~"$job",grpc_type="unary"') + ) + .addPanel( + g.panel('Duration', 'Shows how long has it taken to handle requests from other queriers, in quantiles.') + + g.grpcLatencyPanel('client', 'namespace="$namespace",job=~"$job",grpc_type="unary"') + ) + ) + .addRow( + g.row('Detailed') + .addPanel( + g.panel('Rate', 'Shows rate of handled Unary gRPC requests, with grpc methods and codes from other queriers.') + + g.grpcQpsPanelDetailed('client', 'namespace="$namespace",job=~"$job",grpc_type="unary"') + ) + .addPanel( + g.panel('Errors', 'Shows ratio of errors compared to the the total number of handled requests from other queriers.') + + g.grpcErrDetailsPanel('client', 'namespace="$namespace",job=~"$job",grpc_type="unary"') + ) + .addPanel( + g.panel('Duration', 'Shows how long has it taken to handle requests from other queriers, in quantiles.') + + g.grpcLatencyPanelDetailed('client', 'namespace="$namespace",job=~"$job",grpc_type="unary"') + ) + + g.collapse + ) + .addRow( + g.row('gRPC (Stream)') + .addPanel( + g.panel('Rate', 'Shows rate of handled Streamed gRPC requests from other 
queriers.') + + g.grpcQpsPanel('client', 'namespace="$namespace",job=~"$job",grpc_type="server_stream"') + ) + .addPanel( + g.panel('Errors', 'Shows ratio of errors compared to the the total number of handled requests from other queriers.') + + g.grpcErrorsPanel('client', 'namespace="$namespace",job=~"$job",grpc_type="server_stream"') + ) + .addPanel( + g.panel('Duration', 'Shows how long has it taken to handle requests from other queriers, in quantiles') + + g.grpcLatencyPanel('client', 'namespace="$namespace",job=~"$job",grpc_type="server_stream"') + ) + ) + .addRow( + g.row('Detailed') + .addPanel( + g.panel('Rate', 'Shows rate of handled Streamed gRPC requests, with grpc methods and codes.') + + g.grpcQpsPanelDetailed('client', 'namespace="$namespace",job=~"$job",grpc_type="server_stream"') + ) + .addPanel( + g.panel('Errors', 'Shows ratio of errors compared to the the total number of handled requests.') + + g.grpcErrDetailsPanel('client', 'namespace="$namespace",job=~"$job",grpc_type="server_stream"') + ) + .addPanel( + g.panel('Duration', 'Shows how long has it taken to handle requests in quantiles.') + + g.grpcLatencyPanelDetailed('client', 'namespace="$namespace",job=~"$job",grpc_type="server_stream"') + ) + + g.collapse + ) + .addRow( + g.row('DNS') + .addPanel( + g.panel('Rate', 'Shows rate of DNS lookups to discover stores.') + + g.queryPanel( + 'sum(rate(thanos_querier_store_apis_dns_lookups_total{namespace="$namespace",job=~"$job"}[$interval])) by (job)', + 'lookups {{job}}' + ) + ) + .addPanel( + g.panel('Errors', 'Shows ratio of failures compared to the the total number of executed DNS lookups.') + + g.qpsErrTotalPanel( + 'thanos_querier_store_apis_dns_failures_total{namespace="$namespace",job=~"$job"}', + 'thanos_querier_store_apis_dns_lookups_total{namespace="$namespace",job=~"$job"}', + ) + ) + ) + .addRow( + g.resourceUtilizationRow() + ) + + g.template('namespace', thanos.dashboard.namespaceQuery) + + g.template('job', 'up', 
'namespace="$namespace",%(selector)s' % thanos.querier, true, '%(jobPrefix)s.*' % thanos.querier) + + g.template('pod', 'kube_pod_info', 'namespace="$namespace",created_by_name=~"%(jobPrefix)s.*"' % thanos.querier, true, '.*'), + + __overviewRows__+:: [ + g.row('Instant Query') + .addPanel( + g.panel('Requests Rate', 'Shows rate of requests against /query for the given time.') + + g.httpQpsPanel('http_requests_total', 'namespace="$namespace",%(selector)s,handler="query"' % thanos.querier) + + g.addDashboardLink(thanos.querier.title) + ) + .addPanel( + g.panel('Requests Errors', 'Shows ratio of errors compared to the the total number of handled requests against /query.') + + g.httpErrPanel('http_requests_total', 'namespace="$namespace",%(selector)s,handler="query"' % thanos.querier) + + g.addDashboardLink(thanos.querier.title) + ) + .addPanel( + g.sloLatency( + 'Latency 99th Percentile', + 'Shows how long has it taken to handle requests.', + 'http_request_duration_seconds_bucket{namespace="$namespace",%(selector)s,handler="query"}' % thanos.querier, + 0.99, + 0.5, + 1 + ) + + g.addDashboardLink(thanos.querier.title) + ), + + g.row('Range Query') + .addPanel( + g.panel('Requests Rate', 'Shows rate of requests against /query_range for the given time range.') + + g.httpQpsPanel('http_requests_total', 'namespace="$namespace",%(selector)s,handler="query_range"' % thanos.querier) + + g.addDashboardLink(thanos.querier.title) + ) + .addPanel( + g.panel('Requests Errors', 'Shows ratio of errors compared to the the total number of handled requests against /query_range.') + + g.httpErrPanel('http_requests_total', 'namespace="$namespace",%(selector)s,handler="query_range"' % thanos.querier) + + g.addDashboardLink(thanos.querier.title) + ) + .addPanel( + g.sloLatency( + 'Latency 99th Percentile', + 'Shows how long has it taken to handle requests.', + 'http_request_duration_seconds_bucket{namespace="$namespace",%(selector)s,handler="query_range"}' % thanos.querier, + 0.99, + 0.5, 
+ 1 + ) + + g.addDashboardLink(thanos.querier.title) + ), + ], + }, +} diff --git a/mixin/thanos/dashboards/receiver.libsonnet b/mixin/thanos/dashboards/receiver.libsonnet new file mode 100644 index 0000000000..3dddc94e76 --- /dev/null +++ b/mixin/thanos/dashboards/receiver.libsonnet @@ -0,0 +1,171 @@ +local g = import '../thanos-grafana-builder/builder.libsonnet'; + +{ + local thanos = self, + receiver+:: { + jobPrefix: error 'must provide job prefix for Thanos Receive dashboard', + selector: error 'must provide selector for Thanos Receive dashboard', + title: error 'must provide title for Thanos Receive dashboard', + }, + grafanaDashboards+:: { + 'receiver.json': + g.dashboard(thanos.receiver.title) + .addRow( + g.row('Incoming Request') + .addPanel( + g.panel('Rate', 'Shows rate of incoming requests.') + + g.httpQpsPanel('http_requests_total', 'handler="receive",namespace="$namespace",job=~"$job"') + ) + .addPanel( + g.panel('Errors', 'Shows ratio of errors compared to the total number of handled incoming requests.') + + g.httpErrPanel('http_requests_total', 'handler="receive",namespace="$namespace",job=~"$job"') + ) + .addPanel( + g.panel('Duration', 'Shows how long has it taken to handle incoming requests in quantiles.') + + g.latencyPanel('http_request_duration_seconds', 'handler="receive",namespace="$namespace",job=~"$job"') + ) + ) + .addRow( + g.row('Detailed') + .addPanel( + g.panel('Rate', 'Shows rate of incoming requests.') + + g.httpQpsPanelDetailed('http_requests_total', 'handler="receive",namespace="$namespace",job=~"$job"') + ) + .addPanel( + g.panel('Errors', 'Shows ratio of errors compared to the total number of handled incoming requests.') + + g.httpErrDetailsPanel('http_requests_total', 'handler="receive",namespace="$namespace",job=~"$job"') + ) + .addPanel( + g.panel('Duration', 'Shows how long has it taken to handle incoming requests in quantiles.') + + g.httpLatencyDetailsPanel('http_request_duration_seconds', 
'handler="receive",namespace="$namespace",job=~"$job"') + ) + + g.collapse + ) + .addRow( + g.row('Forward Request') /* Errors panel: result="error" forwards vs. all forwards */ + .addPanel( + g.panel('Rate', 'Shows rate of forwarded requests to other receive nodes.') + + g.queryPanel( + 'sum(rate(thanos_receive_forward_requests_total{namespace="$namespace",job=~"$job"}[$interval])) by (job)', + 'all {{job}}', + ) + ) + .addPanel( + g.panel('Errors', 'Shows ratio of errors compared to the total number of forwarded requests to other receive nodes.') + + g.qpsErrTotalPanel( + 'thanos_receive_forward_requests_total{namespace="$namespace",job=~"$job",result="error"}', + 'thanos_receive_forward_requests_total{namespace="$namespace",job=~"$job"}', + ) + ) + ) + .addRow( + g.row('gRPC (Unary)') + .addPanel( + g.panel('Rate', 'Shows rate of handled Unary gRPC requests from queriers.') + + g.grpcQpsPanel('server', 'namespace="$namespace",job=~"$job",grpc_type="unary"') + ) + .addPanel( + g.panel('Errors', 'Shows ratio of errors compared to the total number of handled requests from queriers.') + + g.grpcErrorsPanel('server', 'namespace="$namespace",job=~"$job",grpc_type="unary"') + ) + .addPanel( + g.panel('Duration', 'Shows how long has it taken to handle requests from queriers, in quantiles.') + + g.grpcLatencyPanel('server', 'namespace="$namespace",job=~"$job",grpc_type="unary"') + ) + ) + .addRow( + g.row('Detailed') + .addPanel( + g.panel('Rate', 'Shows rate of handled Unary gRPC requests from queriers.') + + g.grpcQpsPanelDetailed('server', 'namespace="$namespace",job=~"$job",grpc_type="unary"') + ) + .addPanel( + g.panel('Errors', 'Shows ratio of errors compared to the total number of handled requests from queriers.') + + g.grpcErrDetailsPanel('server', 'namespace="$namespace",job=~"$job",grpc_type="unary"') + ) + .addPanel( + g.panel('Duration', 'Shows how long has it taken to handle requests from queriers, in quantiles.') + + g.grpcLatencyPanelDetailed('server', 'namespace="$namespace",job=~"$job",grpc_type="unary"') + ) + + 
g.collapse + ) + .addRow( + g.row('gRPC (Stream)') + .addPanel( + g.panel('Rate', 'Shows rate of handled Streamed gRPC requests from queriers.') + + g.grpcQpsPanel('server', 'namespace="$namespace",job=~"$job",grpc_type="server_stream"') + ) + .addPanel( + g.panel('Errors', 'Shows ratio of errors compared to the total number of handled requests from queriers.') + + g.grpcErrorsPanel('server', 'namespace="$namespace",job=~"$job",grpc_type="server_stream"') + ) + .addPanel( + g.panel('Duration', 'Shows how long has it taken to handle requests from queriers, in quantiles.') + + g.grpcLatencyPanel('server', 'namespace="$namespace",job=~"$job",grpc_type="server_stream"') + ) + ) + .addRow( + g.row('Detailed') + .addPanel( + g.panel('Rate', 'Shows rate of handled Streamed gRPC requests from queriers.') + + g.grpcQpsPanelDetailed('server', 'namespace="$namespace",job=~"$job",grpc_type="server_stream"') + ) + .addPanel( + g.panel('Errors', 'Shows ratio of errors compared to the total number of handled requests from queriers.') + + g.grpcErrDetailsPanel('server', 'namespace="$namespace",job=~"$job",grpc_type="server_stream"') + ) + .addPanel( + g.panel('Duration', 'Shows how long has it taken to handle requests from queriers, in quantiles.') + + g.grpcLatencyPanelDetailed('server', 'namespace="$namespace",job=~"$job",grpc_type="server_stream"') + ) + + g.collapse + ) + .addRow( + g.row('Last Updated') + .addPanel( + g.panel('Successful Upload', 'Shows the relative time of last successful upload to the object-store bucket.') + + g.tablePanel( + ['time() - max(thanos_objstore_bucket_last_successful_upload_time{namespace="$namespace",job=~"$job"}) by (job, bucket)'], + { + Value: { + alias: 'Uploaded Ago', + unit: 's', + type: 'number', + }, + }, + ) + ) + ) + .addRow( + g.resourceUtilizationRow() + ) + + g.template('namespace', thanos.dashboard.namespaceQuery) + + g.template('job', 'up', 'namespace="$namespace",%(selector)s' % thanos.receiver, true, '%(jobPrefix)s.*' % 
thanos.receiver) + + g.template('pod', 'kube_pod_info', 'namespace="$namespace",created_by_name=~"%(jobPrefix)s.*"' % thanos.receiver, true, '.*'), + + __overviewRows__+:: [ + g.row('Receive') + .addPanel( + g.panel('Incoming Requests Rate', 'Shows rate of incoming requests.') + + g.httpQpsPanel('http_requests_total', 'handler="receive",namespace="$namespace",%(selector)s' % thanos.receiver) + + g.addDashboardLink(thanos.receiver.title) + ) + .addPanel( + g.panel('Incoming Requests Errors', 'Shows ratio of errors compared to the total number of handled incoming requests.') + + g.httpErrPanel('http_requests_total', 'handler="receive",namespace="$namespace",%(selector)s' % thanos.receiver) + + g.addDashboardLink(thanos.receiver.title) + ) + .addPanel( + g.sloLatency( + 'Incoming Requests Latency 99th Percentile', + 'Shows how long has it taken to handle incoming requests.', + 'http_request_duration_seconds_bucket{handler="receive",namespace="$namespace",%(selector)s}' % thanos.receiver, + 0.99, + 0.5, + 1 + ) + + g.addDashboardLink(thanos.receiver.title) + ), + ], + }, +} diff --git a/mixin/thanos/dashboards/ruler.libsonnet b/mixin/thanos/dashboards/ruler.libsonnet new file mode 100644 index 0000000000..7c03a88658 --- /dev/null +++ b/mixin/thanos/dashboards/ruler.libsonnet @@ -0,0 +1,144 @@ +local g = import '../thanos-grafana-builder/builder.libsonnet'; + +{ + local thanos = self, + ruler+:: { + jobPrefix: error 'must provide job prefix for Thanos Ruler dashboard', + selector: error 'must provide selector for Thanos Ruler dashboard', + title: error 'must provide title for Thanos Ruler dashboard', + }, + grafanaDashboards+:: { + 'ruler.json': + g.dashboard(thanos.ruler.title) + .addRow( + g.row('Alert Sent') + .addPanel( + g.panel('Dropped Rate', 'Shows rate of dropped alerts.') + + g.queryPanel( + 'sum(rate(thanos_alert_sender_alerts_dropped_total{namespace="$namespace",job=~"$job"}[$interval])) by (job, alertmanager)', + '{{job}} {{alertmanager}}' + ) + ) + 
.addPanel( + g.panel('Sent Rate', 'Shows rate of alerts that successfully sent to alert manager.') + + g.queryPanel( + 'sum(rate(thanos_alert_sender_alerts_sent_total{namespace="$namespace",job=~"$job"}[$interval])) by (job, alertmanager)', + '{{job}} {{alertmanager}}' + ) + + g.stack + ) + .addPanel( + g.panel('Sent Errors', 'Shows ratio of errors compared to the total number of sent alerts.') + + g.qpsErrTotalPanel( + 'thanos_alert_sender_errors_total{namespace="$namespace",job=~"$job"}', + 'thanos_alert_sender_alerts_sent_total{namespace="$namespace",job=~"$job"}', + ) + ) + .addPanel( + g.panel('Sent Duration', 'Shows how long has it taken to send alerts to alert manager.') + + g.latencyPanel('thanos_alert_sender_latency_seconds', 'namespace="$namespace",job=~"$job"'), + ) + ) + .addRow( + g.row('gRPC (Unary)') + .addPanel( + g.panel('Rate', 'Shows rate of handled Unary gRPC requests.') + + g.grpcQpsPanel('server', 'namespace="$namespace",job=~"$job",grpc_type="unary"') + ) + .addPanel( + g.panel('Errors', 'Shows ratio of errors compared to the total number of handled requests.') + + g.grpcErrorsPanel('server', 'namespace="$namespace",job=~"$job",grpc_type="unary"') + ) + .addPanel( + g.panel('Duration', 'Shows how long has it taken to handle requests, in quantiles.') + + g.grpcLatencyPanel('server', 'namespace="$namespace",job=~"$job",grpc_type="unary"') + ) + ) + .addRow( + g.row('Detailed') + .addPanel( + g.panel('Rate', 'Shows rate of handled Unary gRPC requests.') + + g.grpcQpsPanelDetailed('server', 'namespace="$namespace",job=~"$job",grpc_type="unary"') + ) + .addPanel( + g.panel('Errors', 'Shows ratio of errors compared to the total number of handled requests.') + + g.grpcErrDetailsPanel('server', 'namespace="$namespace",job=~"$job",grpc_type="unary"') + ) + .addPanel( + g.panel('Duration', 'Shows how long has it taken to handle requests, in quantiles.') + + g.grpcLatencyPanelDetailed('server', 'namespace="$namespace",job=~"$job",grpc_type="unary"') + ) 
+ + g.collapse + ) + .addRow( + g.row('gRPC (Stream)') + .addPanel( + g.panel('Rate', 'Shows rate of handled Streamed gRPC requests.') + + g.grpcQpsPanel('server', 'namespace="$namespace",job=~"$job",grpc_type="server_stream"') + ) + .addPanel( + g.panel('Errors', 'Shows ratio of errors compared to the total number of handled requests.') + + g.grpcErrorsPanel('server', 'namespace="$namespace",job=~"$job",grpc_type="server_stream"') + ) + .addPanel( + g.panel('Duration', 'Shows how long has it taken to handle requests, in quantiles') + + g.grpcLatencyPanel('server', 'namespace="$namespace",job=~"$job",grpc_type="server_stream"') + ) + ) + .addRow( + g.row('Detailed') + .addPanel( + g.panel('Rate', 'Shows rate of handled Streamed gRPC requests.') + + g.grpcQpsPanelDetailed('server', 'namespace="$namespace",job=~"$job",grpc_type="server_stream"') + ) + .addPanel( + g.panel('Errors', 'Shows ratio of errors compared to the total number of handled requests.') + + g.grpcErrDetailsPanel('server', 'namespace="$namespace",job=~"$job",grpc_type="server_stream"') + ) + .addPanel( + g.panel('Duration', 'Shows how long has it taken to handle requests, in quantiles') + + g.grpcLatencyPanelDetailed('server', 'namespace="$namespace",job=~"$job",grpc_type="server_stream"') + ) + + g.collapse + ) + .addRow( + g.resourceUtilizationRow() + ) + + g.template('namespace', thanos.dashboard.namespaceQuery) + + g.template('job', 'up', 'namespace="$namespace",%(selector)s' % thanos.ruler, true, '%(jobPrefix)s.*' % thanos.ruler) + + g.template('pod', 'kube_pod_info', 'namespace="$namespace",created_by_name=~"%(jobPrefix)s.*"' % thanos.ruler, true, '.*'), + + __overviewRows__+:: [ + g.row('Rule') + .addPanel( + g.panel('Alert Sent Rate', 'Shows rate of alerts that successfully sent to alert manager.') + + g.queryPanel( + 'sum(rate(thanos_alert_sender_alerts_sent_total{namespace="$namespace",%(selector)s}[$interval])) by (job, alertmanager)' % thanos.ruler, + '{{job}} {{alertmanager}}' + ) + + 
g.addDashboardLink(thanos.ruler.title) + + g.stack + ) + .addPanel( + g.panel('Alert Sent Errors', 'Shows ratio of errors compared to the total number of sent alerts.') + + g.qpsErrTotalPanel( + 'thanos_alert_sender_errors_total{namespace="$namespace",%(selector)s}' % thanos.ruler, + 'thanos_alert_sender_alerts_sent_total{namespace="$namespace",%(selector)s}' % thanos.ruler, + ) + + g.addDashboardLink(thanos.ruler.title) + ) + .addPanel( + g.sloLatency( + 'Alert Sent Duration', + 'Shows how long has it taken to send alerts to alert manager.', + 'thanos_alert_sender_latency_seconds_bucket{namespace="$namespace",%(selector)s}' % thanos.ruler, + 0.99, + 0.5, + 1 + ) + + g.addDashboardLink(thanos.ruler.title) + ) + + g.collapse, + ], + }, +} diff --git a/mixin/thanos/dashboards/sidecar.libsonnet b/mixin/thanos/dashboards/sidecar.libsonnet new file mode 100644 index 0000000000..ef6cae0cae --- /dev/null +++ b/mixin/thanos/dashboards/sidecar.libsonnet @@ -0,0 +1,145 @@ +local g = import '../thanos-grafana-builder/builder.libsonnet'; + +{ + local thanos = self, + sidecar+:: { + jobPrefix: error 'must provide job prefix for Thanos Sidecar dashboard', + selector: error 'must provide selector for Thanos Sidecar dashboard', + title: error 'must provide title for Thanos Sidecar dashboard', + }, + grafanaDashboards+:: { + 'sidecar.json': + g.dashboard(thanos.sidecar.title) + .addRow( + g.row('gRPC (Unary)') + .addPanel( + g.panel('Rate', 'Shows rate of handled Unary gRPC requests from queriers.') + + g.grpcQpsPanel('server', 'namespace="$namespace",job=~"$job",grpc_type="unary"') + ) + .addPanel( + g.panel('Errors', 'Shows ratio of errors compared to the total number of handled requests from queriers.') + + g.grpcErrorsPanel('server', 'namespace="$namespace",job=~"$job",grpc_type="unary"') + ) + .addPanel( + g.panel('Duration', 'Shows how long has it taken to handle requests from queriers, in quantiles.') + + g.grpcLatencyPanel('server', 
'namespace="$namespace",job=~"$job",grpc_type="unary"') + ) + ) + .addRow( + g.row('Detailed') + .addPanel( + g.panel('Rate', 'Shows rate of handled Unary gRPC requests from queriers.') + + g.grpcQpsPanelDetailed('server', 'namespace="$namespace",job=~"$job",grpc_type="unary"') + ) + .addPanel( + g.panel('Errors', 'Shows ratio of errors compared to the total number of handled requests from queriers.') + + g.grpcErrDetailsPanel('server', 'namespace="$namespace",job=~"$job",grpc_type="unary"') + ) + .addPanel( + g.panel('Duration', 'Shows how long has it taken to handle requests from queriers, in quantiles.') + + g.grpcLatencyPanelDetailed('server', 'namespace="$namespace",job=~"$job",grpc_type="unary"') + ) + + g.collapse + ) + .addRow( + g.row('gRPC (Stream)') + .addPanel( + g.panel('Rate', 'Shows rate of handled Streamed gRPC requests from queriers.') + + g.grpcQpsPanel('server', 'namespace="$namespace",job=~"$job",grpc_type="server_stream"') + ) + .addPanel( + g.panel('Errors', 'Shows ratio of errors compared to the total number of handled requests from queriers.') + + g.grpcErrorsPanel('server', 'namespace="$namespace",job=~"$job",grpc_type="server_stream"') + ) + .addPanel( + g.panel('Duration', 'Shows how long has it taken to handle requests from queriers, in quantiles.') + + g.grpcLatencyPanel('server', 'namespace="$namespace",job=~"$job",grpc_type="server_stream"') + ) + ) + .addRow( + g.row('Detailed') /* breakdown of the 'gRPC (Stream)' row above, so it reads the same 'server' metrics */ + .addPanel( + g.panel('Rate', 'Shows rate of handled Streamed gRPC requests from queriers.') + + g.grpcQpsPanelDetailed('server', 'namespace="$namespace",job=~"$job",grpc_type="server_stream"') + ) + .addPanel( + g.panel('Errors', 'Shows ratio of errors compared to the total number of handled requests from queriers.') + + g.grpcErrDetailsPanel('server', 'namespace="$namespace",job=~"$job",grpc_type="server_stream"') + ) + .addPanel( + g.panel('Duration', 'Shows how long has it taken to handle requests from queriers, in quantiles.') + + g.grpcLatencyPanelDetailed('server', 'namespace="$namespace",job=~"$job",grpc_type="server_stream"') + ) + + 
g.collapse + ) + .addRow( + g.row('Last Updated') + .addPanel( + g.panel('Successful Upload', 'Shows the relative time of last successful upload to the object-store bucket.') + + g.tablePanel( + ['time() - max(thanos_objstore_bucket_last_successful_upload_time{namespace="$namespace",job=~"$job"}) by (job, bucket)'], + { + Value: { + alias: 'Uploaded Ago', + unit: 's', + type: 'number', + }, + }, + ) + ) + ) + .addRow( + g.row('Bucket Operations') /* same three panels, with the same descriptions, as the store dashboard */ + .addPanel( + g.panel('Rate', 'Shows rate of execution for operations against the bucket.') + + g.queryPanel( + 'sum(rate(thanos_objstore_bucket_operations_total{namespace="$namespace",job=~"$job"}[$interval])) by (job, operation)', + '{{job}} {{operation}}' + ) + + g.stack + ) + .addPanel( + g.panel('Errors', 'Shows ratio of errors compared to the total number of executed operations against the bucket.') + + g.qpsErrTotalPanel( + 'thanos_objstore_bucket_operation_failures_total{namespace="$namespace",job=~"$job"}', + 'thanos_objstore_bucket_operations_total{namespace="$namespace",job=~"$job"}', + ) + ) + .addPanel( + g.panel('Duration', 'Shows how long has it taken to execute operations against the bucket, in quantiles.') + + g.latencyPanel('thanos_objstore_bucket_operation_duration_seconds', 'namespace="$namespace",job=~"$job"') + ) + ) + .addRow( + g.resourceUtilizationRow() + ) + + g.template('namespace', thanos.dashboard.namespaceQuery) + + g.template('job', 'up', 'namespace="$namespace",%(selector)s' % thanos.sidecar, true, '%(jobPrefix)s.*' % thanos.sidecar) + + g.template('pod', 'kube_pod_info', 'namespace="$namespace",created_by_name=~"%(jobPrefix)s.*"' % thanos.sidecar, true, '.*'), + + __overviewRows__+:: [ /* overview panels select by %(selector)s rather than the $job template variable */ + g.row('Sidecar') + .addPanel( + g.panel('gRPC (Unary) Rate', 'Shows rate of handled Unary gRPC requests from queriers.') + + g.grpcQpsPanel('server', 'namespace="$namespace",%(selector)s,grpc_type="unary"' % thanos.sidecar) + + g.addDashboardLink(thanos.sidecar.title) + ) + .addPanel( + g.panel('gRPC (Unary) Errors', 'Shows ratio of errors compared to the total number of handled requests from queriers.') + + g.grpcErrorsPanel('server', 'namespace="$namespace",%(selector)s,grpc_type="unary"' % thanos.sidecar) + + 
g.addDashboardLink(thanos.sidecar.title) + ) + .addPanel( + g.sloLatency( + 'gRPC (Unary) Latency 99th Percentile', + 'Shows how long has it taken to handle requests from queriers, in quantiles.', + 'grpc_server_handling_seconds_bucket{grpc_type="unary",namespace="$namespace",%(selector)s}' % thanos.sidecar, + 0.99, /* quantile matching the '99th Percentile' title */ + 0.5, + 1 + ) + + g.addDashboardLink(thanos.sidecar.title) + ), + ], + }, +} diff --git a/mixin/thanos/dashboards/store.libsonnet b/mixin/thanos/dashboards/store.libsonnet new file mode 100644 index 0000000000..e29ca992e2 --- /dev/null +++ b/mixin/thanos/dashboards/store.libsonnet @@ -0,0 +1,276 @@ +local g = import '../thanos-grafana-builder/builder.libsonnet'; + +{ + local thanos = self, + store+:: { + jobPrefix: error 'must provide job prefix for Thanos Store dashboard', + selector: error 'must provide selector for Thanos Store dashboard', + title: error 'must provide title for Thanos Store dashboard', + }, + grafanaDashboards+:: { + 'store.json': + g.dashboard(thanos.store.title) + .addRow( + g.row('gRPC (Unary)') + .addPanel( + g.panel('Rate', 'Shows rate of handled Unary gRPC requests from queriers.') + + g.grpcQpsPanel('server', 'namespace="$namespace",job=~"$job",grpc_type="unary"') + ) + .addPanel( + g.panel('Errors', 'Shows ratio of errors compared to the total number of handled requests from queriers.') + + g.grpcErrorsPanel('server', 'namespace="$namespace",job=~"$job",grpc_type="unary"') + ) + .addPanel( + g.panel('Duration', 'Shows how long has it taken to handle requests from queriers, in quantiles.') + + g.grpcLatencyPanel('server', 'namespace="$namespace",job=~"$job",grpc_type="unary"') + ) + ) + .addRow( + g.row('Detailed') + .addPanel( + g.panel('Rate', 'Shows rate of handled Unary gRPC requests from queriers.') + + g.grpcQpsPanelDetailed('server', 'namespace="$namespace",job=~"$job",grpc_type="unary"') + ) + .addPanel( + g.panel('Errors', 'Shows ratio of errors compared to the total number of handled requests from queriers.') + + 
g.grpcErrDetailsPanel('server', 'namespace="$namespace",job=~"$job",grpc_type="unary"') + ) + .addPanel( + g.panel('Duration', 'Shows how long has it taken to handle requests from queriers, in quantiles.') + + g.grpcLatencyPanelDetailed('server', 'namespace="$namespace",job=~"$job",grpc_type="unary"') + ) + + g.collapse + ) + .addRow( + g.row('gRPC (Stream)') + .addPanel( + g.panel('Rate', 'Shows rate of handled Streamed gRPC requests from queriers.') + + g.grpcQpsPanel('server', 'namespace="$namespace",job=~"$job",grpc_type="server_stream"') + ) + .addPanel( + g.panel('Errors', 'Shows ratio of errors compared to the total number of handled requests from queriers.') + + g.grpcErrorsPanel('server', 'namespace="$namespace",job=~"$job",grpc_type="server_stream"') + ) + .addPanel( + g.panel('Duration', 'Shows how long has it taken to handle requests from queriers, in quantiles.') + + g.grpcLatencyPanel('server', 'namespace="$namespace",job=~"$job",grpc_type="server_stream"') + ) + ) + .addRow( + g.row('Detailed') /* breakdown of the 'gRPC (Stream)' row above, so it reads the same 'server' metrics */ + .addPanel( + g.panel('Rate', 'Shows rate of handled Streamed gRPC requests from queriers.') + + g.grpcQpsPanelDetailed('server', 'namespace="$namespace",job=~"$job",grpc_type="server_stream"') + ) + .addPanel( + g.panel('Errors', 'Shows ratio of errors compared to the total number of handled requests from queriers.') + + g.grpcErrDetailsPanel('server', 'namespace="$namespace",job=~"$job",grpc_type="server_stream"') + ) + .addPanel( + g.panel('Duration', 'Shows how long has it taken to handle requests from queriers, in quantiles.') + + g.grpcLatencyPanelDetailed('server', 'namespace="$namespace",job=~"$job",grpc_type="server_stream"') + ) + + g.collapse + ) + .addRow( + g.row('Bucket Operations') + .addPanel( + g.panel('Rate', 'Shows rate of execution for operations against the bucket.') + + g.queryPanel( + 'sum(rate(thanos_objstore_bucket_operations_total{namespace="$namespace",job=~"$job"}[$interval])) by (job, operation)', + '{{job}} {{operation}}' + ) + + 
g.stack + ) + .addPanel( + g.panel('Errors', 'Shows ratio of errors compared to the total number of executed operations against the bucket.') + + g.qpsErrTotalPanel( + 'thanos_objstore_bucket_operation_failures_total{namespace="$namespace",job=~"$job"}', + 'thanos_objstore_bucket_operations_total{namespace="$namespace",job=~"$job"}', + ) + ) + .addPanel( + g.panel('Duration', 'Shows how long has it taken to execute operations against the bucket, in quantiles.') + + g.latencyPanel('thanos_objstore_bucket_operation_duration_seconds', 'namespace="$namespace",job=~"$job"') + ) + ) + .addRow( + g.row('Block Operations') + .addPanel( + g.panel('Block Load Rate', 'Shows rate of block loads from the bucket.') + + g.queryPanel( + 'sum(rate(thanos_bucket_store_block_loads_total{namespace="$namespace",job=~"$job"}[$interval]))', + 'block loads' + ) + + g.stack + ) + .addPanel( + g.panel('Block Load Errors', 'Shows ratio of errors compared to the total number of block loads from the bucket.') + + g.qpsErrTotalPanel( + 'thanos_bucket_store_block_load_failures_total{namespace="$namespace",job=~"$job"}', + 'thanos_bucket_store_block_loads_total{namespace="$namespace",job=~"$job"}', + ) + ) + .addPanel( + g.panel('Block Drop Rate', 'Shows rate of block drops.') + + g.queryPanel( + 'sum(rate(thanos_bucket_store_block_drops_total{namespace="$namespace",job=~"$job"}[$interval])) by (job, operation)', + 'block drops {{job}}' + ) + + g.stack + ) + .addPanel( + g.panel('Block Drop Errors', 'Shows ratio of errors compared to the total number of block drops.') + + g.qpsErrTotalPanel( + 'thanos_bucket_store_block_drop_failures_total{namespace="$namespace",job=~"$job"}', + 'thanos_bucket_store_block_drops_total{namespace="$namespace",job=~"$job"}', + ) + ) + ) + .addRow( + g.row('Cache Operations') + .addPanel( + g.panel('Requests', 'Show rate of cache requests.') + + g.queryPanel( + 'sum(rate(thanos_store_index_cache_requests_total{namespace="$namespace",job=~"$job"}[$interval])) by (job, 
item_type)', + '{{job}} {{item_type}}', + ) + + g.stack + ) + .addPanel( + g.panel('Hits', 'Shows rate of cache hits.') + + g.queryPanel( + 'sum(rate(thanos_store_index_cache_hits_total{namespace="$namespace",job=~"$job"}[$interval])) by (job, item_type)', + '{{job}} {{item_type}}', + ) + + g.stack + ) + .addPanel( + g.panel('Added', 'Show rate of added items to cache.') + + g.queryPanel( + 'sum(rate(thanos_store_index_cache_items_added_total{namespace="$namespace",job=~"$job"}[$interval])) by (job, item_type)', + '{{job}} {{item_type}}', + ) + + g.stack + ) + .addPanel( + g.panel('Evicted', 'Show rate of evicted items from cache.') + + g.queryPanel( + 'sum(rate(thanos_store_index_cache_items_evicted_total{namespace="$namespace",job=~"$job"}[$interval])) by (job, item_type)', + '{{job}} {{item_type}}', + ) + + g.stack + ) + ) + .addRow( + g.row('Store Sent') + .addPanel( + g.panel('Chunk Size', 'Shows size of chunks that have sent to the bucket, in bytes.') + + g.queryPanel( + [ /* queries are paired by position with the legends P99, mean, P50 */ + 'histogram_quantile(0.99, sum(rate(thanos_bucket_store_sent_chunk_size_bytes_bucket{namespace="$namespace",job=~"$job"}[$interval])) by (job, le))', + 'sum(rate(thanos_bucket_store_sent_chunk_size_bytes_sum{namespace="$namespace",job=~"$job"}[$interval])) by (job) / sum(rate(thanos_bucket_store_sent_chunk_size_bytes_count{namespace="$namespace",job=~"$job"}[$interval])) by (job)', + 'histogram_quantile(0.50, sum(rate(thanos_bucket_store_sent_chunk_size_bytes_bucket{namespace="$namespace",job=~"$job"}[$interval])) by (job, le))', + ], + [ + 'P99', + 'mean', + 'P50', + ], + ) + + { yaxes: g.yaxes('decbytes') }, /* axis unit override belongs on the panel, not on the row */ + ) + ) + .addRow( + g.row('Series Operations') + .addPanel( + g.panel('Block queried') + + g.queryPanel( + [ + 'thanos_bucket_store_series_blocks_queried{namespace="$namespace",job=~"$job",quantile="0.99"}', + 'sum(rate(thanos_bucket_store_series_blocks_queried_sum{namespace="$namespace",job=~"$job"}[$interval])) by (job) / 
sum(rate(thanos_bucket_store_series_blocks_queried_count{namespace="$namespace",job=~"$job"}[$interval])) by (job)', + 'thanos_bucket_store_series_blocks_queried{namespace="$namespace",job=~"$job",quantile="0.50"}', + ], [ + 'P99', + 'mean {{job}}', + 'P50', + ], + ) + ) + .addPanel( + g.panel('Data Fetched') + + g.queryPanel( + [ + 'thanos_bucket_store_series_data_fetched{namespace="$namespace",job=~"$job",quantile="0.99"}', + 'sum(rate(thanos_bucket_store_series_data_fetched_sum{namespace="$namespace",job=~"$job"}[$interval])) by (job) / sum(rate(thanos_bucket_store_series_data_fetched_count{namespace="$namespace",job=~"$job"}[$interval])) by (job)', + 'thanos_bucket_store_series_data_fetched{namespace="$namespace",job=~"$job",quantile="0.50"}', + ], [ + 'P99', + 'mean {{job}}', + 'P50', + ], + ) + ) + .addPanel( + g.panel('Result series') + + g.queryPanel( + [ + 'thanos_bucket_store_series_result_series{namespace="$namespace",job=~"$job",quantile="0.99"}', + 'sum(rate(thanos_bucket_store_series_result_series_sum{namespace="$namespace",job=~"$job"}[$interval])) by (job) / sum(rate(thanos_bucket_store_series_result_series_count{namespace="$namespace",job=~"$job"}[$interval])) by (job)', + 'thanos_bucket_store_series_result_series{namespace="$namespace",job=~"$job",quantile="0.50"}', + ], [ + 'P99', + 'mean {{job}}', + 'P50', + ], + ) + ) + ) + .addRow( + g.row('Series Operation Durations') /* latencyPanel gets the base histogram name, as in every other call; presumably the builder adds the _bucket suffix itself (TODO confirm in builder.libsonnet) */ + .addPanel( + g.panel('Get All', 'Shows how long has it taken to get all series.') + + g.latencyPanel('thanos_bucket_store_series_get_all_duration_seconds', 'namespace="$namespace",job=~"$job"') + ) + .addPanel( + g.panel('Merge', 'Shows how long has it taken to merge series.') + + g.latencyPanel('thanos_bucket_store_series_merge_duration_seconds', 'namespace="$namespace",job=~"$job"') + ) + .addPanel( + g.panel('Gate', 'Shows how long has it taken for a series to wait at the gate.') + + g.latencyPanel('thanos_bucket_store_series_gate_duration_seconds', 
'namespace="$namespace",job=~"$job"') + ) + ) + .addRow( + g.resourceUtilizationRow() + ) + + g.template('namespace', thanos.dashboard.namespaceQuery) + + g.template('job', 'up', 'namespace="$namespace",%(selector)s' % thanos.store, true, '%(jobPrefix)s.*' % thanos.store) + + g.template('pod', 'kube_pod_info', 'namespace="$namespace",created_by_name=~"%(jobPrefix)s.*"' % thanos.store, true, '.*'), + + __overviewRows__+:: [ /* overview panels select by %(selector)s rather than the $job template variable */ + g.row('Store') + .addPanel( + g.panel('gRPC (Unary) Rate', 'Shows rate of handled Unary gRPC requests from queriers.') + + g.grpcQpsPanel('server', 'namespace="$namespace",%(selector)s,grpc_type="unary"' % thanos.store) + + g.addDashboardLink(thanos.store.title) + ) + .addPanel( + g.panel('gRPC (Unary) Errors', 'Shows ratio of errors compared to the total number of handled requests from queriers.') + + g.grpcErrorsPanel('server', 'namespace="$namespace",%(selector)s,grpc_type="unary"' % thanos.store) + + g.addDashboardLink(thanos.store.title) + ) + .addPanel( + g.sloLatency( + 'gRPC (Unary) Latency 99th Percentile', + 'Shows how long has it taken to handle requests from queriers.', + 'grpc_server_handling_seconds_bucket{grpc_type="unary",namespace="$namespace",%(selector)s}' % thanos.store, + 0.99, + 0.5, + 1 + ) + + g.addDashboardLink(thanos.store.title) + ), + ], + }, +} diff --git a/mixin/thanos/defaults.libsonnet b/mixin/thanos/defaults.libsonnet new file mode 100644 index 0000000000..a6cf5e9cdc --- /dev/null +++ b/mixin/thanos/defaults.libsonnet @@ -0,0 +1,40 @@ +{ + querier+:: { + jobPrefix: 'thanos-querier', + selector: 'job=~"%s.*"' % self.jobPrefix, + title: '%(prefix)sQuerier' % $.dashboard.prefix, + }, + store+:: { + jobPrefix: 'thanos-store', + selector: 'job=~"%s.*"' % self.jobPrefix, + title: '%(prefix)sStore' % $.dashboard.prefix, + }, + receiver+:: { + jobPrefix: 'thanos-receiver', + selector: 'job=~"%s.*"' % self.jobPrefix, + title: '%(prefix)sReceiver' % $.dashboard.prefix, + }, + ruler+:: { + jobPrefix: 'thanos-ruler', + selector: 
'job=~"%s.*"' % self.jobPrefix, + title: '%(prefix)sRuler' % $.dashboard.prefix, + }, + compactor+:: { + jobPrefix: 'thanos-compactor', + selector: 'job=~"%s.*"' % self.jobPrefix, + title: '%(prefix)sCompactor' % $.dashboard.prefix, + }, + sidecar+:: { + jobPrefix: 'thanos-sidecar', + selector: 'job=~"%s.*"' % self.jobPrefix, + title: '%(prefix)sSidecar' % $.dashboard.prefix, + }, + overview+:: { + title: '%(prefix)sOverview' % $.dashboard.prefix, + }, + dashboard+:: { + prefix: 'Thanos / ', + tags: ['thanos-mixin'], + namespaceQuery: 'kube_pod_info', + }, +} diff --git a/mixin/thanos/mixin.libsonnet b/mixin/thanos/mixin.libsonnet new file mode 100644 index 0000000000..6590c396e4 --- /dev/null +++ b/mixin/thanos/mixin.libsonnet @@ -0,0 +1,3 @@ +(import 'dashboards/dashboards.libsonnet') + +(import 'alerts/alerts.libsonnet') + +(import 'rules/rules.libsonnet') diff --git a/mixin/thanos/rules.jsonnet b/mixin/thanos/rules.jsonnet new file mode 100644 index 0000000000..c50930e3b9 --- /dev/null +++ b/mixin/thanos/rules.jsonnet @@ -0,0 +1,4 @@ +( + (import 'mixin.libsonnet') + + (import 'defaults.libsonnet') +).prometheusRules diff --git a/mixin/thanos/rules/querier.libsonnet b/mixin/thanos/rules/querier.libsonnet new file mode 100644 index 0000000000..22706cf0e5 --- /dev/null +++ b/mixin/thanos/rules/querier.libsonnet @@ -0,0 +1,73 @@ +{ + local thanos = self, + querier+:: { + selector: error 'must provide selector for Thanos Query recording rules', + }, + prometheusRules+:: { + groups+: [ + { + name: 'thanos-querier.rules', + rules: [ + { + record: ':grpc_client_failures_per_unary:sum_rate', + expr: ||| + ( + sum(rate(grpc_client_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", %(selector)s, grpc_type="unary"}[5m])) + / + sum(rate(grpc_client_started_total{%(selector)s, grpc_type="unary"}[5m])) + ) + ||| % thanos.querier, + labels: { + }, + }, + { + record: ':grpc_client_failures_per_stream:sum_rate', + expr: ||| + ( 
+                sum(rate(grpc_client_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", %(selector)s, grpc_type="server_stream"}[5m]))
+              /
+                sum(rate(grpc_client_started_total{%(selector)s, grpc_type="server_stream"}[5m]))
+              )
+            ||| % thanos.querier,
+            labels: {
+            },
+          },
+          {
+            record: ':thanos_querier_store_apis_dns_failures_per_lookup:sum_rate',
+            expr: |||
+              (
+                sum(rate(thanos_querier_store_apis_dns_failures_total{%(selector)s}[5m]))
+              /
+                sum(rate(thanos_querier_store_apis_dns_lookups_total{%(selector)s}[5m]))
+              )
+            ||| % thanos.querier,
+            labels: {
+            },
+          },
+          {
+            record: ':query_duration_seconds:histogram_quantile',
+            expr: |||
+              histogram_quantile(0.99,
+                sum(rate(http_request_duration_seconds_bucket{%(selector)s, handler="query"}[5m])) by (le)
+              )
+            ||| % thanos.querier,
+            labels: {
+              quantile: '0.99',
+            },
+          },
+          {
+            record: ':api_range_query_duration_seconds:histogram_quantile',
+            expr: |||
+              histogram_quantile(0.99,
+                sum(rate(http_request_duration_seconds_bucket{%(selector)s, handler="query_range"}[5m])) by (le)
+              )
+            ||| % thanos.querier,
+            labels: {
+              quantile: '0.99',
+            },
+          },
+        ],
+      },
+    ],
+  },
+}
diff --git a/mixin/thanos/rules/receiver.libsonnet b/mixin/thanos/rules/receiver.libsonnet
new file mode 100644
index 0000000000..fc01667e58
--- /dev/null
+++ b/mixin/thanos/rules/receiver.libsonnet
@@ -0,0 +1,100 @@
+// Recording rules for Thanos Receive; the group is appended to prometheusRules.groups.
+//
+// Every failure ratio is sum(rate(failures)) / sum(rate(total)). Aggregating each side
+// before dividing leaves one series per operand, so the division does not depend on the
+// two counters carrying identical label sets (grpc_server_handled_total has a grpc_code
+// label that grpc_server_started_total lacks, which makes a per-series division empty).
+{
+  local thanos = self,
+  receiver+:: {
+    // Label matchers substituted for %(selector)s below; evaluation aborts with this
+    // message unless another layer of the mixin supplies a value.
+    selector: error 'must provide selector for Thanos Receive recording rules',
+  },
+  prometheusRules+:: {
+    groups+: [
+      {
+        name: 'thanos-receiver.rules',
+        rules: [
+          {
+            // Fraction of unary gRPC calls that finished with an error code.
+            record: ':grpc_server_failures_per_unary:sum_rate',
+            expr: |||
+              (
+                sum(rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", %(selector)s, grpc_type="unary"}[5m]))
+              /
+                sum(rate(grpc_server_started_total{%(selector)s, grpc_type="unary"}[5m]))
+              )
+            ||| % thanos.receiver,
+            labels: {
+            },
+          },
+          {
+            // Same ratio for server-streaming gRPC calls.
+            record: ':grpc_server_failures_per_stream:sum_rate',
+            expr: |||
+              (
+                sum(rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", %(selector)s, grpc_type="server_stream"}[5m]))
+              /
+                sum(rate(grpc_server_started_total{%(selector)s, grpc_type="server_stream"}[5m]))
+              )
+            ||| % thanos.receiver,
+            labels: {
+            },
+          },
+          {
+            // Fraction of "receive" HTTP requests answered with a 5xx status code.
+            record: ':http_failure_per_request:sum_rate',
+            expr: |||
+              (
+                sum(rate(http_requests_total{handler="receive", %(selector)s, code=~"5.."}[5m]))
+              /
+                sum(rate(http_requests_total{handler="receive", %(selector)s}[5m]))
+              )
+            ||| % thanos.receiver,
+            labels: {
+            },
+          },
+          {
+            // 99th percentile latency of "receive" HTTP requests over 5m.
+            record: ':http_request_duration_seconds:histogram_quantile',
+            expr: |||
+              histogram_quantile(0.99,
+                sum(rate(http_request_duration_seconds_bucket{handler="receive", %(selector)s}[5m])) by (le)
+              )
+            ||| % thanos.receiver,
+            labels: {
+              quantile: '0.99',
+            },
+          },
+          {
+            // Fraction of forwarded requests whose result label is "error".
+            record: ':thanos_receive_forward_failure_per_requests:sum_rate',
+            expr: |||
+              (
+                sum(rate(thanos_receive_forward_requests_total{result="error", %(selector)s}[5m]))
+              /
+                sum(rate(thanos_receive_forward_requests_total{%(selector)s}[5m]))
+              )
+            ||| % thanos.receiver,
+            labels: {
+            },
+          },
+          {
+            // Hashring file errors per refresh attempt.
+            record: ':thanos_receive_hashring_file_failure_per_refresh:sum_rate',
+            expr: |||
+              (
+                sum(rate(thanos_receive_hashrings_file_errors_total{%(selector)s}[5m]))
+              /
+                sum(rate(thanos_receive_hashrings_file_refreshes_total{%(selector)s}[5m]))
+              )
+            ||| % thanos.receiver,
+            labels: {
+            },
+          },
+        ],
+      },
+    ],
+  },
+}
diff --git a/mixin/thanos/rules/rules.libsonnet b/mixin/thanos/rules/rules.libsonnet
new file mode 100644
index 0000000000..656b267b4f
--- /dev/null
+++ b/mixin/thanos/rules/rules.libsonnet
@@ -0,0 +1,3 @@
+(import 'querier.libsonnet') +
+(import 'receiver.libsonnet') +
+(import 'store.libsonnet')
diff --git a/mixin/thanos/rules/store.libsonnet b/mixin/thanos/rules/store.libsonnet
new file mode 100644
index 0000000000..946df7bb16
--- /dev/null
+++
b/mixin/thanos/rules/store.libsonnet
@@ -0,0 +1,62 @@
+{
+  local thanos = self,  // alias for the outer object, read as thanos.store by the rules below
+  store+:: {
+    selector: error 'must provide selector for Thanos Store recording rules',  // placeholder: evaluating it raises this error
+  },
+  prometheusRules+:: {
+    groups+: [  // appends one group to whatever groups are already present
+      {
+        name: 'thanos-store.rules',
+        rules: [
+          {
+            record: ':grpc_server_failures_per_unary:sum_rate',  // failed / started unary server calls over 5m
+            expr: |||
+              (
+                sum(rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", %(selector)s, grpc_type="unary"}[5m]))
+              /
+                sum(rate(grpc_server_started_total{%(selector)s, grpc_type="unary"}[5m]))
+              )
+            ||| % thanos.store,
+            labels: {
+            },
+          },
+          {
+            record: ':grpc_server_failures_per_stream:sum_rate',  // same ratio for grpc_type="server_stream"
+            expr: |||
+              (
+                sum(rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", %(selector)s, grpc_type="server_stream"}[5m]))
+              /
+                sum(rate(grpc_server_started_total{%(selector)s, grpc_type="server_stream"}[5m]))
+              )
+            ||| % thanos.store,
+            labels: {
+            },
+          },
+          {
+            record: ':thanos_objstore_bucket_failures_per_operation:sum_rate',  // bucket operation failures / bucket operations over 5m
+            expr: |||
+              (
+                sum(rate(thanos_objstore_bucket_operation_failures_total{%(selector)s}[5m]))
+              /
+                sum(rate(thanos_objstore_bucket_operations_total{%(selector)s}[5m]))
+              )
+            ||| % thanos.store,
+            labels: {
+            },
+          },
+          {
+            record: ':thanos_objstore_bucket_operation_duration_seconds:histogram_quantile',  // 0.99 quantile from the duration histogram buckets
+            expr: |||
+              histogram_quantile(0.99,
+                sum(rate(thanos_objstore_bucket_operation_duration_seconds_bucket{%(selector)s}[5m])) by (le)
+              )
+            ||| % thanos.store,
+            labels: {
+              quantile: '0.99',  // static label naming the quantile used in expr
+            },
+          },
+        ],
+      },
+    ],
+  },
+}
diff --git a/mixin/thanos/separated_alerts.jsonnet b/mixin/thanos/separated_alerts.jsonnet
new file mode 100644
index 0000000000..79402a0b3f
--- /dev/null
+++ b/mixin/thanos/separated_alerts.jsonnet
@@ -0,0 +1,7 @@
+{
+  [group.name]: group  // one top-level field per alert group, keyed by the group's name
+  for group in (
+    (import 'mixin.libsonnet') +
+    (import 'defaults.libsonnet')  // merged on the right-hand side, so its fields take precedence
+  ).prometheusAlerts.groups  // the Makefile runs this file with jsonnet -m and converts each output to YAML
+}