From e29882267398e50c656b6f69d78a6fc136360647 Mon Sep 17 00:00:00 2001 From: Douglas Camata <159076+douglascamata@users.noreply.github.com> Date: Mon, 11 Jul 2022 17:54:29 +0200 Subject: [PATCH] [receive] Add per-tenant charts to Receive's example dashboard (#5472) * Start to add tenant charts to Receive Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> * Properly filter HTTP status codes Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> * Fix tenant error rate chart Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> * Refactor to improve readability and consistency Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> * Refactor one more usage of code and tenant labels Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> * Filter tenant metrics to the Receive handler Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> * Format math expression properly Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> * Update CHANGELOG Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> * Add samples charts to series & samples row Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> --- CHANGELOG.md | 1 + examples/dashboards/receive.json | 853 ++++++++++++++++++++++++++++- mixin/dashboards/receive.libsonnet | 141 ++++- 3 files changed, 977 insertions(+), 18 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index ba614260b4..fa61f45647 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -18,6 +18,7 @@ We use *breaking :warning:* to mark changes that are not backward compatible (re - [#5440](https://github.com/thanos-io/thanos/pull/5440) HTTP metrics: export number of in-flight HTTP requests. - [#5424](https://github.com/thanos-io/thanos/pull/5424) Receive: Export metrics regarding size of remote write requests. 
- [#5420](https://github.com/thanos-io/thanos/pull/5420) Receive: Automatically remove stale tenants. +- [#5472](https://github.com/thanos-io/thanos/pull/5472) Receive: Add per-tenant charts to the example dashboard. ### Changed diff --git a/examples/dashboards/receive.json b/examples/dashboards/receive.json index a103ea8d9b..01352c42f9 100644 --- a/examples/dashboards/receive.json +++ b/examples/dashboards/receive.json @@ -314,6 +314,802 @@ "title": "WRITE - Incoming Request", "titleSize": "h6" }, + { + "collapse": false, + "height": "250px", + "panels": [ + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "id": 4, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 4, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum by (tenant, code) (rate(http_requests_total{job=~\"$job\", tenant=~\"$tenant\", handler=\"receive\"}[$interval]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{code}} - {{tenant}}", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Rate of write requests (by tenant and code)", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { }, + "bars": false, + 
"dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "id": 5, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 4, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum by (tenant, code) (rate(http_requests_total{job=~\"$job\", tenant=~\"$tenant\", handler=\"receive\", code!~\"2..\"}[$interval]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{code}} - {{tenant}}", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Number of errors (by tenant and code)", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "id": 6, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 4, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum by (job, tenant) 
(rate(http_request_duration_seconds_sum{job=~\"$job\", tenant=~\"$tenant\", handler=\"receive\"}[$interval])) / sum by (job, tenant) (http_request_duration_seconds_count{job=~\"$job\", tenant=~\"$tenant\", handler=\"receive\"})", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{tenant}}", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Average request duration (by tenant)", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "WRITE - Incoming Request (tenant focus)", + "titleSize": "h6" + }, + { + "collapse": false, + "height": "250px", + "panels": [ + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "id": 7, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 4, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum by (job, tenant) (rate(http_request_size_bytes_sum{job=~\"$job\", tenant=~\"$tenant\", handler=\"receive\", code=~\"2..\"}[$interval])) / sum by (job, tenant) (rate(http_request_size_bytes_count{job=~\"$job\", tenant=~\"$tenant\", handler=\"receive\", 
code=~\"2..\"}[$interval]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{tenant}}", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Average successful HTTP request size (per tenant and code, only 2XX)", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "id": 8, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 4, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum by (job, tenant) (rate(http_request_size_bytes_sum{job=~\"$job\", tenant=~\"$tenant\", handler=\"receive\", code!~\"2..\"}[$interval])) / sum by (job, tenant) (rate(http_request_size_bytes_count{job=~\"$job\", tenant=~\"$tenant\", handler=\"receive\", code!~\"2..\"}[$interval]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{tenant}}", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Average failed HTTP request size (per tenant and code, non 2XX)", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": 
null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "id": 9, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 4, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum by (job, tenant, method) (http_inflight_requests{job=~\"$job\", tenant=~\"$tenant\", handler=\"receive\"})", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{method}} - {{tenant}}", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Inflight requests (per tenant and method)", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "HTTP requests (tenant focus)", + "titleSize": "h6" + }, + { + "collapse": false, + "height": "250px", + "panels": [ + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": 
false, + "datasource": "$datasource", + "fill": 1, + "id": 10, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 3, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(thanos_receive_write_timeseries_bucket{job=~\"$job\", tenant=~\"$tenant\", code=~\"2..\"}[$interval])) by (job, tenant) ", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{tenant}}", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Rate of series received (per tenant, only 2XX)", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "id": 11, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 3, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(thanos_receive_write_timeseries_bucket{job=~\"$job\", tenant=~\"$tenant\", 
code!~\"2..\"}[$interval])) by (tenant, code) ", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{code}} - {{tenant}}", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Rate of series not written (per tenant and code, non 2XX)", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "id": 12, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 3, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(thanos_receive_write_samples_bucket{job=~\"$job\", tenant=~\"$tenant\", code=~\"2..\"}[$interval])) by (job, tenant) ", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{tenant}}", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Rate of samples received (per tenant, only 2XX)", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": 
null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "id": 13, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 3, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(thanos_receive_write_samples_bucket{job=~\"$job\", tenant=~\"$tenant\", code!~\"2..\"}[$interval])) by (tenant, code) ", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{code}} - {{tenant}}", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Rate of samples not written (per tenant and code, non 2XX)", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "Series & Samples (tenant focus)", + "titleSize": "h6" + }, { "collapse": false, "height": "250px", @@ -326,7 +1122,7 @@ "datasource": "$datasource", "description": "Shows rate of replications to other receive nodes.", "fill": 1, - "id": 4, + "id": 14, "legend": { "avg": false, "current": false, @@ -405,7 +1201,7 @@ 
"datasource": "$datasource", "description": "Shows ratio of errors compared to the total number of replications to other receive nodes.", "fill": 10, - "id": 5, + "id": 15, "legend": { "avg": false, "current": false, @@ -493,7 +1289,7 @@ "datasource": "$datasource", "description": "Shows rate of forwarded requests to other receive nodes.", "fill": 1, - "id": 6, + "id": 16, "legend": { "avg": false, "current": false, @@ -572,7 +1368,7 @@ "datasource": "$datasource", "description": "Shows ratio of errors compared to the total number of forwareded requests to other receive nodes.", "fill": 10, - "id": 7, + "id": 17, "legend": { "avg": false, "current": false, @@ -660,7 +1456,7 @@ "datasource": "$datasource", "description": "Shows rate of handled Unary gRPC requests from queriers.", "fill": 10, - "id": 8, + "id": 18, "legend": { "avg": false, "current": false, @@ -811,7 +1607,7 @@ "datasource": "$datasource", "description": "Shows ratio of errors compared to the total number of handled requests from queriers.", "fill": 10, - "id": 9, + "id": 19, "legend": { "avg": false, "current": false, @@ -887,7 +1683,7 @@ "datasource": "$datasource", "description": "Shows how long has it taken to handle requests from queriers, in quantiles.", "fill": 1, - "id": 10, + "id": 20, "legend": { "avg": false, "current": false, @@ -1017,7 +1813,7 @@ "datasource": "$datasource", "description": "Shows rate of handled Unary gRPC requests from queriers.", "fill": 10, - "id": 11, + "id": 21, "legend": { "avg": false, "current": false, @@ -1168,7 +1964,7 @@ "datasource": "$datasource", "description": "Shows ratio of errors compared to the total number of handled requests from queriers.", "fill": 10, - "id": 12, + "id": 22, "legend": { "avg": false, "current": false, @@ -1244,7 +2040,7 @@ "datasource": "$datasource", "description": "Shows how long has it taken to handle requests from queriers, in quantiles.", "fill": 1, - "id": 13, + "id": 23, "legend": { "avg": false, "current": false, @@ 
-1374,7 +2170,7 @@ "datasource": "$datasource", "description": "Shows rate of handled Streamed gRPC requests from queriers.", "fill": 10, - "id": 14, + "id": 24, "legend": { "avg": false, "current": false, @@ -1525,7 +2321,7 @@ "datasource": "$datasource", "description": "Shows ratio of errors compared to the total number of handled requests from queriers.", "fill": 10, - "id": 15, + "id": 25, "legend": { "avg": false, "current": false, @@ -1601,7 +2397,7 @@ "datasource": "$datasource", "description": "Shows how long has it taken to handle requests from queriers, in quantiles.", "fill": 1, - "id": 16, + "id": 26, "legend": { "avg": false, "current": false, @@ -1731,7 +2527,7 @@ "datasource": "$datasource", "description": "Shows the relative time of last successful upload to the object-store bucket.", "fill": 1, - "id": 17, + "id": 27, "legend": { "avg": false, "current": false, @@ -1855,7 +2651,7 @@ "dashes": false, "datasource": "$datasource", "fill": 1, - "id": 18, + "id": 28, "legend": { "avg": false, "current": false, @@ -1971,7 +2767,7 @@ "dashes": false, "datasource": "$datasource", "fill": 1, - "id": 19, + "id": 29, "legend": { "avg": false, "current": false, @@ -2047,7 +2843,7 @@ "dashes": false, "datasource": "$datasource", "fill": 1, - "id": 20, + "id": 30, "legend": { "avg": false, "current": false, @@ -2146,6 +2942,29 @@ "regex": "", "type": "datasource" }, + { + "allValue": null, + "current": { + "text": "all", + "value": "$__all" + }, + "datasource": "$datasource", + "hide": 0, + "includeAll": true, + "label": "tenant", + "multi": false, + "name": "tenant", + "options": [ ], + "query": "label_values(http_requests_total{job=~\"$job\", tenant!=\"\"}, tenant)", + "refresh": 1, + "regex": "", + "sort": 2, + "tagValuesQuery": "", + "tags": [ ], + "tagsQuery": "", + "type": "query", + "useTags": false + }, { "auto": true, "auto_count": 300, diff --git a/mixin/dashboards/receive.libsonnet b/mixin/dashboards/receive.libsonnet index 8e3134fccf..e8bbe8ceda 
100644 --- a/mixin/dashboards/receive.libsonnet +++ b/mixin/dashboards/receive.libsonnet @@ -1,6 +1,7 @@ local g = import '../lib/thanos-grafana-builder/builder.libsonnet'; local utils = import '../lib/utils.libsonnet'; + { local thanos = self, receive+:: { @@ -9,15 +10,41 @@ local utils = import '../lib/utils.libsonnet'; dashboard:: { selector: std.join(', ', thanos.dashboard.selector + ['job=~"$job"']), dimensions: std.join(', ', thanos.dashboard.dimensions + ['job']), + tenantSelector: std.join(', ', thanos.dashboard.selector + ['job=~"$job"', 'tenant=~"$tenant"']), + tenantDimensions: std.join(', ', thanos.dashboard.dimensions + ['job', 'tenant']), }, }, grafanaDashboards+:: { + local grafana = import 'grafonnet/grafana.libsonnet', + local template = grafana.template, [if thanos.receive != null then 'receive.json']: local receiveHandlerSelector = utils.joinLabels([thanos.receive.dashboard.selector, 'handler="receive"']); local grpcUnaryWriteSelector = utils.joinLabels([thanos.receive.dashboard.selector, 'grpc_type="unary"', 'grpc_method="RemoteWrite"']); local grpcUnaryReadSelector = utils.joinLabels([thanos.receive.dashboard.selector, 'grpc_type="unary"', 'grpc_method!="RemoteWrite"']); local grpcServerStreamSelector = utils.joinLabels([thanos.receive.dashboard.selector, 'grpc_type="server_stream"']); - g.dashboard(thanos.receive.title) + + local tenantReceiveHandlerSeclector = utils.joinLabels([thanos.receive.dashboard.tenantSelector, 'handler="receive"']); + local tenantHttpCode2XXSelector = std.join(', ', [tenantReceiveHandlerSeclector, 'code=~"2.."']); + local tenantHttpCodeNot2XXSelector = std.join(', ', [tenantReceiveHandlerSeclector, 'code!~"2.."']); + + local tenantWithHttpCodeDimensions = std.join(', ', ['tenant', 'code']); + g.dashboard(thanos.receive.title) { + templating+: { + list+: [ + template.new( + 'tenant', + '$datasource', + 'label_values(http_requests_total{%s}, %s)' % [std.join(', ', [thanos.receive.dashboard.selector] + ['tenant!=""']), 
'tenant'], + label='tenant', + refresh=1, + sort=2, + current='all', + allValues=null, + includeAll=true + ), + ], + }, + } .addRow( g.row('WRITE - Incoming Request') .addPanel( @@ -33,6 +60,118 @@ local utils = import '../lib/utils.libsonnet'; g.latencyPanel('http_request_duration_seconds', receiveHandlerSelector, thanos.receive.dashboard.dimensions) ) ) + .addRow( + g.row('WRITE - Incoming Request (tenant focus)') + .addPanel( + g.panel('Rate of write requests (by tenant and code)') + + g.queryPanel( + 'sum by (%s) (rate(http_requests_total{%s}[$interval]))' % [tenantWithHttpCodeDimensions, tenantReceiveHandlerSeclector], + '{{code}} - {{tenant}}' + ) + ) + .addPanel( + g.panel('Number of errors (by tenant and code)') + + g.queryPanel( + 'sum by (%s) (rate(http_requests_total{%s}[$interval]))' % [ + tenantWithHttpCodeDimensions, + tenantHttpCodeNot2XXSelector, + ], + '{{code}} - {{tenant}}' + ) + ) + .addPanel( + g.panel('Average request duration (by tenant)') + + g.queryPanel( + 'sum by (%s) (rate(http_request_duration_seconds_sum{%s}[$interval])) / sum by (%s) (http_request_duration_seconds_count{%s})' % [ + thanos.receive.dashboard.tenantDimensions, + tenantReceiveHandlerSeclector, + thanos.receive.dashboard.tenantDimensions, + tenantReceiveHandlerSeclector, + ], + '{{tenant}}' + ) + ) + ) + .addRow( + g.row('HTTP requests (tenant focus)') + .addPanel( + g.panel('Average successful HTTP request size (per tenant and code, only 2XX)') + + g.queryPanel( + 'sum by (%s) (rate(http_request_size_bytes_sum{%s}[$interval])) / sum by (%s) (rate(http_request_size_bytes_count{%s}[$interval]))' % [ + thanos.receive.dashboard.tenantDimensions, + tenantHttpCode2XXSelector, + thanos.receive.dashboard.tenantDimensions, + tenantHttpCode2XXSelector, + ], + '{{tenant}}' + ) + ) + .addPanel( + g.panel('Average failed HTTP request size (per tenant and code, non 2XX)') + + g.queryPanel( + 'sum by (%s) (rate(http_request_size_bytes_sum{%s}[$interval])) / sum by (%s) 
(rate(http_request_size_bytes_count{%s}[$interval]))' % [ + thanos.receive.dashboard.tenantDimensions, + tenantHttpCodeNot2XXSelector, + thanos.receive.dashboard.tenantDimensions, + tenantHttpCodeNot2XXSelector, + ], + '{{tenant}}' + ) + ) + .addPanel( + g.panel('Inflight requests (per tenant and method)') + + g.queryPanel( + 'sum by (%s) (http_inflight_requests{%s})' % [ + std.join(', ', [thanos.receive.dashboard.tenantDimensions, 'method']), + tenantReceiveHandlerSeclector, + ], + '{{method}} - {{tenant}}' + ) + ) + ) + .addRow( + g.row('Series & Samples (tenant focus)') + .addPanel( + g.panel('Rate of series received (per tenant, only 2XX)') + + g.queryPanel( + 'sum(rate(thanos_receive_write_timeseries_bucket{%s}[$interval])) by (%s) ' % [ + utils.joinLabels([thanos.receive.dashboard.tenantSelector, 'code=~"2.."']), + thanos.receive.dashboard.tenantDimensions, + ], + '{{tenant}}' + ) + ) + .addPanel( + g.panel('Rate of series not written (per tenant and code, non 2XX)') + + g.queryPanel( + 'sum(rate(thanos_receive_write_timeseries_bucket{%s}[$interval])) by (%s) ' % [ + utils.joinLabels([thanos.receive.dashboard.tenantSelector, 'code!~"2.."']), + tenantWithHttpCodeDimensions, + ], + '{{code}} - {{tenant}}' + ) + ) + .addPanel( + g.panel('Rate of samples received (per tenant, only 2XX)') + + g.queryPanel( + 'sum(rate(thanos_receive_write_samples_bucket{%s}[$interval])) by (%s) ' % [ + utils.joinLabels([thanos.receive.dashboard.tenantSelector, 'code=~"2.."']), + thanos.receive.dashboard.tenantDimensions, + ], + '{{tenant}}' + ) + ) + .addPanel( + g.panel('Rate of samples not written (per tenant and code, non 2XX)') + + g.queryPanel( + 'sum(rate(thanos_receive_write_samples_bucket{%s}[$interval])) by (%s) ' % [ + utils.joinLabels([thanos.receive.dashboard.tenantSelector, 'code!~"2.."']), + tenantWithHttpCodeDimensions, + ], + '{{code}} - {{tenant}}' + ) + ) + ) .addRow( g.row('WRITE - Replication') .addPanel(