From 929243632032d6889176a6adcb0b0cf4a4492e42 Mon Sep 17 00:00:00 2001 From: Matej Gera Date: Mon, 4 Jul 2022 17:29:55 +0200 Subject: [PATCH 1/6] Refactor endpoint ref clients Signed-off-by: Matej Gera --- pkg/query/endpointset.go | 93 +++++++++-------------------------- pkg/query/endpointset_test.go | 1 - 2 files changed, 23 insertions(+), 71 deletions(-) diff --git a/pkg/query/endpointset.go b/pkg/query/endpointset.go index 048c7c14369..c200eaaf78b 100644 --- a/pkg/query/endpointset.go +++ b/pkg/query/endpointset.go @@ -61,17 +61,17 @@ func (es *GRPCEndpointSpec) Addr() string { // Metadata method for gRPC endpoint tries to call InfoAPI exposed by Thanos components until context timeout. If we are unable to get metadata after // that time, we assume that the host is unhealthy and return error. -func (es *GRPCEndpointSpec) Metadata(ctx context.Context, client *endpointClients) (*endpointMetadata, error) { - if client.info != nil { - resp, err := client.info.Info(ctx, &infopb.InfoRequest{}, grpc.WaitForReady(true)) +func (es *GRPCEndpointSpec) Metadata(ctx context.Context, infoClient infopb.InfoClient, storeClient storepb.StoreClient) (*endpointMetadata, error) { + if infoClient != nil { + resp, err := infoClient.Info(ctx, &infopb.InfoRequest{}, grpc.WaitForReady(true)) if err == nil { return &endpointMetadata{resp}, nil } } // Call Info method of StoreAPI, this way querier will be able to discovery old components not exposing InfoAPI. - if client.store != nil { - metadata, err := es.getMetadataUsingStoreAPI(ctx, client.store) + if storeClient != nil { + metadata, err := es.getMetadataUsingStoreAPI(ctx, storeClient) if err != nil { return nil, errors.Wrapf(err, "fallback fetching info from %s", es.addr) } @@ -368,7 +368,12 @@ func (e *EndpointSet) GetStoreClients() []store.Client { stores := make([]store.Client, 0, len(e.endpoints)) for _, er := range e.endpoints { if er.HasStoreAPI() { - stores = append(stores, er) + // Make a new endpointRef with store client. + stores = append(stores, &endpointRef{ + StoreClient: storepb.NewStoreClient(er.cc), + addr: er.addr, + metadata: er.metadata, + }) } } return stores @@ -382,7 +387,7 @@ func (e *EndpointSet) GetQueryAPIClients() []querypb.QueryClient { stores := make([]querypb.QueryClient, 0, len(e.endpoints)) for _, er := range e.endpoints { if er.HasQueryAPI() { - stores = append(stores, er.clients.query) + stores = append(stores, querypb.NewQueryClient(er.cc)) } } return stores @@ -396,7 +401,7 @@ func (e *EndpointSet) GetRulesClients() []rulespb.RulesClient { rules := make([]rulespb.RulesClient, 0, len(e.endpoints)) for _, er := range e.endpoints { if er.HasRulesAPI() { - rules = append(rules, er.clients.rule) + rules = append(rules, rulespb.NewRulesClient(er.cc)) } } return rules @@ -410,7 +415,7 @@ func (e *EndpointSet) GetTargetsClients() []targetspb.TargetsClient { targets := make([]targetspb.TargetsClient, 0, len(e.endpoints)) for _, er := range e.endpoints { if er.HasTargetsAPI() { - targets = append(targets, er.clients.target) + targets = append(targets, targetspb.NewTargetsClient(er.cc)) } } return targets @@ -424,7 +429,7 @@ func (e *EndpointSet) GetMetricMetadataClients() []metadatapb.MetadataClient { metadataClients := make([]metadatapb.MetadataClient, 0, len(e.endpoints)) for _, er := range e.endpoints { if er.HasMetricMetadataAPI() { - metadataClients = append(metadataClients, er.clients.metricMetadata) + metadataClients = append(metadataClients, metadatapb.NewMetadataClient(er.cc)) } } return metadataClients @@ -439,7 +444,7 @@ func (e *EndpointSet) GetExemplarsStores() []*exemplarspb.ExemplarStore { for _, er := range e.endpoints { if er.HasExemplarsAPI() { exemplarStores = append(exemplarStores, &exemplarspb.ExemplarStore{ - ExemplarsClient: er.clients.exemplar, + ExemplarsClient: exemplarspb.NewExemplarsClient(er.cc), LabelSets: labelpb.ZLabelSetsToPromLabelSets(er.metadata.LabelSets...), }) } @@ -498,14 +503,10 @@ func (e *EndpointSet) getActiveEndpoints(ctx context.Context, endpoints map[stri cc: conn, addr: addr, logger: e.logger, - clients: &endpointClients{ - info: infopb.NewInfoClient(conn), - store: storepb.NewStoreClient(conn), - }, } } - metadata, err := spec.Metadata(ctx, er.clients) + metadata, err := spec.Metadata(ctx, infopb.NewInfoClient(er.cc), storepb.NewStoreClient(er.cc)) if err != nil { if !seenAlready && !spec.IsStrictStatic() { // Close only if new and not a strict static node. @@ -622,8 +623,6 @@ type endpointRef struct { cc *grpc.ClientConn addr string - clients *endpointClients - // Metadata can change during runtime. metadata *endpointMetadata @@ -634,42 +633,6 @@ func (er *endpointRef) Update(metadata *endpointMetadata) { er.mtx.Lock() defer er.mtx.Unlock() - clients := er.clients - - if metadata.Store != nil { - clients.store = storepb.NewStoreClient(er.cc) - er.StoreClient = clients.store - } else { - // When we see the endpoint for the first time we assume the StoreAPI is exposed by that endpoint (which may not be true for some component, e.g. ruler) - // and we create a store API client because as a fallback we might have to call info method of storeAPI. - // In this step, we are setting it to null when we find out that the store API is not exposed. - er.clients.store = nil - er.StoreClient = nil - } - - if metadata.Rules != nil { - clients.rule = rulespb.NewRulesClient(er.cc) - } - - if metadata.Targets != nil { - clients.target = targetspb.NewTargetsClient(er.cc) - } - - if metadata.MetricMetadata != nil { - clients.metricMetadata = metadatapb.NewMetadataClient(er.cc) - } - - if metadata.Exemplars != nil { - // min/max range is also provided by in the response of Info rpc call - // but we are not using this metadata anywhere right now so ignoring. - clients.exemplar = exemplarspb.NewExemplarsClient(er.cc) - } - - if metadata.Query != nil { - clients.query = querypb.NewQueryClient(er.cc) - } - - er.clients = clients er.metadata = metadata } @@ -688,42 +651,42 @@ func (er *endpointRef) HasStoreAPI() bool { er.mtx.RLock() defer er.mtx.RUnlock() - return er.clients != nil && er.clients.store != nil + return er.metadata != nil && er.metadata.Store != nil } func (er *endpointRef) HasQueryAPI() bool { er.mtx.RLock() defer er.mtx.RUnlock() - return er.clients != nil && er.clients.query != nil + return er.metadata != nil && er.metadata.Query != nil } func (er *endpointRef) HasRulesAPI() bool { er.mtx.RLock() defer er.mtx.RUnlock() - return er.clients != nil && er.clients.rule != nil + return er.metadata != nil && er.metadata.Rules != nil } func (er *endpointRef) HasTargetsAPI() bool { er.mtx.RLock() defer er.mtx.RUnlock() - return er.clients != nil && er.clients.target != nil + return er.metadata != nil && er.metadata.Targets != nil } func (er *endpointRef) HasMetricMetadataAPI() bool { er.mtx.RLock() defer er.mtx.RUnlock() - return er.clients != nil && er.clients.metricMetadata != nil + return er.metadata != nil && er.metadata.MetricMetadata != nil } func (er *endpointRef) HasExemplarsAPI() bool { er.mtx.RLock() defer er.mtx.RUnlock() - return er.clients != nil && er.clients.exemplar != nil + return er.metadata != nil && er.metadata.Exemplars != nil } func (er *endpointRef) LabelSets() []labels.Labels { @@ -803,16 +766,6 @@ func (er *endpointRef) apisPresent() []string { return apisPresent } -type endpointClients struct { - store storepb.StoreClient - rule rulespb.RulesClient - metricMetadata metadatapb.MetadataClient - exemplar exemplarspb.ExemplarsClient - target targetspb.TargetsClient - query querypb.QueryClient - info infopb.InfoClient -} - type endpointMetadata struct { *infopb.InfoResponse } diff --git a/pkg/query/endpointset_test.go b/pkg/query/endpointset_test.go index 3f841a657b6..4260ed528dc 100644 --- a/pkg/query/endpointset_test.go +++ b/pkg/query/endpointset_test.go @@ -1193,7 +1193,6 @@ func TestDeadlockLocking(t *testing.T) { metadata: &endpointMetadata{ &infopb.InfoResponse{}, }, - clients: &endpointClients{}, } g := &errgroup.Group{} From 2a0ad56a90bd024274aed910781351beff297aeb Mon Sep 17 00:00:00 2001 From: Matej Gera Date: Mon, 4 Jul 2022 17:30:32 +0200 Subject: [PATCH 2/6] Fix E2E test env name clash Signed-off-by: Matej Gera --- test/e2e/query_test.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/e2e/query_test.go b/test/e2e/query_test.go index 0707685c77b..131f04bd64c 100644 --- a/test/e2e/query_test.go +++ b/test/e2e/query_test.go @@ -1276,7 +1276,7 @@ func TestSidecarAlignmentPushdown(t *testing.T) { func TestGrpcInstantQuery(t *testing.T) { t.Parallel() - e, err := e2e.NewDockerEnvironment("e2e_test_query_grpc_api") + e, err := e2e.NewDockerEnvironment("e2e_test_query_grpc_api_instant") testutil.Ok(t, err) t.Cleanup(e2ethanos.CleanScenario(t, e)) @@ -1382,7 +1382,7 @@ func TestGrpcInstantQuery(t *testing.T) { func TestGrpcQueryRange(t *testing.T) { t.Parallel() - e, err := e2e.NewDockerEnvironment("e2e_test_query_grpc_api") + e, err := e2e.NewDockerEnvironment("e2e_test_query_grpc_api_range") testutil.Ok(t, err) t.Cleanup(e2ethanos.CleanScenario(t, e)) From 3d022ab83efec6f72db4857a7fcbcf6454060ba5 Mon Sep 17 00:00:00 2001 From: Filip Petkovski Date: Fri, 8 Jul 2022 08:32:16 +0200 Subject: [PATCH 3/6] Add fpetkovski to triage list Signed-off-by: Filip Petkovski --- MAINTAINERS.md | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/MAINTAINERS.md b/MAINTAINERS.md index 236b7e10a87..cd4de176560 100644 --- a/MAINTAINERS.md +++ b/MAINTAINERS.md @@ -28,15 +28,16 @@ We also have some nice souls that help triaging issues and PRs. See [here](https Full list of triage persons is displayed below: -| Name | Slack | GitHub | Company | -|---------------|---------------|--------------------------------------------|---------| -| Adrien Fillon | `@Adrien F` | [@adrien-f](https://github.com/adrien-f) | | -| Ian Billett | `@billett` | [@bill3tt](https://github.com/bill3tt) | Red Hat | -| Martin Chodur | `@FUSAKLA` | [@fusakla](https://github.com/fusakla) | | -| Michael Dai | `@jojohappy` | [@jojohappy](https://github.com/jojohappy) | | -| Xiang Dai | `@daixiang0` | [@daixiang0](https://github.com/daixiang0) | | -| Jimmie Han | `@hanjm` | [@hanjm](https://github.com/hanjm) | Tencent | -| Matej Gera | `@Matej Gera` | [@matej-g](https://github.com/matej-g) | Red Hat | +| Name | Slack | GitHub | Company | +|-----------------|--------------------|----------------------------------------------|---------| +| Adrien Fillon | `@Adrien F` | [@adrien-f](https://github.com/adrien-f) | | +| Ian Billett | `@billett` | [@bill3tt](https://github.com/bill3tt) | Red Hat | +| Martin Chodur | `@FUSAKLA` | [@fusakla](https://github.com/fusakla) | | +| Michael Dai | `@jojohappy` | [@jojohappy](https://github.com/jojohappy) | | +| Xiang Dai | `@daixiang0` | [@daixiang0](https://github.com/daixiang0) | | +| Jimmie Han | `@hanjm` | [@hanjm](https://github.com/hanjm) | Tencent | +| Matej Gera | `@Matej Gera` | [@matej-g](https://github.com/matej-g) | Red Hat | +| Filip Petkovski | `@Filip Petkovski` | [@fpetkovski](https://github.com/fpetkovski) | | Please reach any of the maintainer on slack or email if you want to help as well. From b5d0a6167d97c0e179dc9d637fe708091da28f3c Mon Sep 17 00:00:00 2001 From: Nick Pillitteri <56quarters@users.noreply.github.com> Date: Fri, 8 Jul 2022 14:08:50 -0400 Subject: [PATCH 4/6] Use Azure BlobURL.Download instead of in-memory buffer (#5451) Modify the azure.Bucket get methods to use BlobURL.Download for fetching blobs and blob ranges. This avoids the need to allocate a buffer for storing the entire expected size of the object in memory. Instead, use a ReaderCloser view of the body returned by the download method. See grafana/mimir#2229 Signed-off-by: Nick Pillitteri --- CHANGELOG.md | 1 + pkg/objstore/azure/azure.go | 38 ++++++------------------------------- 2 files changed, 7 insertions(+), 32 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 4fab3e1aa3b..ba614260b4b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -22,6 +22,7 @@ We use *breaking :warning:* to mark changes that are not backward compatible (re ### Changed - [#5447](https://github.com/thanos-io/thanos/pull/5447) Promclient: Ignore 405 status codes for Prometheus buildVersion requests +- [#5451](https://github.com/thanos-io/thanos/pull/5451) Azure: Reduce memory usage by not buffering file downloads entirely in memory. ### Removed diff --git a/pkg/objstore/azure/azure.go b/pkg/objstore/azure/azure.go index 44530450c48..0f7cb1ff6b3 100644 --- a/pkg/objstore/azure/azure.go +++ b/pkg/objstore/azure/azure.go @@ -4,10 +4,8 @@ package azure import ( - "bytes" "context" "io" - "io/ioutil" "os" "strings" "testing" @@ -298,39 +296,15 @@ func (b *Bucket) getBlobReader(ctx context.Context, name string, offset, length if err != nil { return nil, errors.Wrapf(err, "cannot get Azure blob URL, address: %s", name) } - var props *blob.BlobGetPropertiesResponse - props, err = blobURL.GetProperties(ctx, blob.BlobAccessConditions{}, blob.ClientProvidedKeyOptions{}) - if err != nil { - return nil, errors.Wrapf(err, "cannot get properties for container: %s", name) - } - - var size int64 - // If a length is specified and it won't go past the end of the file, - // then set it as the size. - if length > 0 && length <= props.ContentLength()-offset { - size = length - level.Debug(b.logger).Log("msg", "set size to length", "size", size, "length", length, "offset", offset, "name", name) - } else { - size = props.ContentLength() - offset - level.Debug(b.logger).Log("msg", "set size to go to EOF", "contentlength", props.ContentLength(), "size", size, "length", length, "offset", offset, "name", name) - } - destBuffer := make([]byte, size) - - if err := blob.DownloadBlobToBuffer(context.Background(), blobURL.BlobURL, offset, size, - destBuffer, blob.DownloadFromBlobOptions{ - BlockSize: blob.BlobDefaultDownloadBlockSize, - Parallelism: uint16(3), - Progress: nil, - RetryReaderOptionsPerBlock: blob.RetryReaderOptions{ - MaxRetryRequests: b.config.ReaderConfig.MaxRetryRequests, - }, - }, - ); err != nil { - return nil, errors.Wrapf(err, "cannot download blob, address: %s", blobURL.BlobURL) + dl, err := blobURL.Download(ctx, offset, length, blob.BlobAccessConditions{}, false, blob.ClientProvidedKeyOptions{}) + if err != nil { + return nil, errors.Wrapf(err, "cannot download Azure blob, address: %s", name) } - return ioutil.NopCloser(bytes.NewReader(destBuffer)), nil + return dl.Body(blob.RetryReaderOptions{ + MaxRetryRequests: b.config.ReaderConfig.MaxRetryRequests, + }), nil } // Get returns a reader for the given object name. From 6f3e4123a9f0ee2a7309ed940b0b4bc0f168977c Mon Sep 17 00:00:00 2001 From: Bishal Das <70086051+bishal7679@users.noreply.github.com> Date: Mon, 11 Jul 2022 13:21:56 +0530 Subject: [PATCH 5/6] Update storage.md (#5486) --- docs/storage.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/storage.md b/docs/storage.md index 23529200df7..0544d5276fa 100644 --- a/docs/storage.md +++ b/docs/storage.md @@ -249,7 +249,7 @@ Details about AWS policies: https://docs.aws.amazon.com/AmazonS3/latest/dev/usin If you want to use IAM credential retrieved from an instance profile, Thanos needs to authenticate through AWS STS. For this purposes you can specify your own STS Endpoint. -By default Thanos will use endpoint: https://sts.amazonaws.com and AWS region coresponding endpoints. +By default Thanos will use endpoint: https://sts.amazonaws.com and AWS region corresponding endpoints. #### GCS @@ -826,7 +826,7 @@ Every series entry first holds its number of labels, followed by tuples of symbo ##### Label Index -A label index section indexes the existing (combined) values for one or more label names. The `#names` field determines the number of indexed label names, followed by the total number of entries in the `#entries` field. The body holds #entries / #names tuples of symbol table references, each tuple being of #names length. The value tuples are sorted in lexicographically increasing order. This is no longer used. +A label index section indexes the existing (combined) values for one or more label names. The `#names` field determines the number of indexed label names, followed by the total number of entries in the `#entries` field. The body holds #entries / #names tuples of symbol table references, each tuple being of `#names` length. The value tuples are sorted in lexicographically increasing order. This is no longer used. ``` ┌───────────────┬────────────────┬────────────────┐ From e29882267398e50c656b6f69d78a6fc136360647 Mon Sep 17 00:00:00 2001 From: Douglas Camata <159076+douglascamata@users.noreply.github.com> Date: Mon, 11 Jul 2022 17:54:29 +0200 Subject: [PATCH 6/6] [receive] Add per-tenant charts to Receive's example dashboard (#5472) * Start to add tenant charts to Receive Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> * Properly filter HTTP status codes Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> * Fix tenant error rate chart Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> * Refactor to improve readability and consistency Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> * Refactor one more usage of code and tenant labels Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> * Filter tenant metrics to the Receive handler Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> * Format math expression properly Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> * Update CHANGELOG Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> * Add samples charts to series & samples row Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> --- CHANGELOG.md | 1 + examples/dashboards/receive.json | 853 ++++++++++++++++++++++++++++- mixin/dashboards/receive.libsonnet | 141 ++++- 3 files changed, 977 insertions(+), 18 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index ba614260b4b..fa61f456475 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -18,6 +18,7 @@ We use *breaking :warning:* to mark changes that are not backward compatible (re - [#5440](https://github.com/thanos-io/thanos/pull/5440) HTTP metrics: export number of in-flight HTTP requests. - [#5424](https://github.com/thanos-io/thanos/pull/5424) Receive: Export metrics regarding size of remote write requests. - [#5420](https://github.com/thanos-io/thanos/pull/5420) Receive: Automatically remove stale tenants. +- [#5472](https://github.com/thanos-io/thanos/pull/5472) Receive: add new tenant metrics to example dashboard. ### Changed diff --git a/examples/dashboards/receive.json b/examples/dashboards/receive.json index a103ea8d9b5..01352c42f92 100644 --- a/examples/dashboards/receive.json +++ b/examples/dashboards/receive.json @@ -314,6 +314,802 @@ "title": "WRITE - Incoming Request", "titleSize": "h6" }, + { + "collapse": false, + "height": "250px", + "panels": [ + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "id": 4, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 4, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum by (tenant, code) (rate(http_requests_total{job=~\"$job\", tenant=~\"$tenant\", handler=\"receive\"}[$interval]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{code}} - {{tenant}}", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Rate of write requests (by tenant and code)", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "id": 5, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 4, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum by (tenant, code) (rate(http_requests_total{job=~\"$job\", tenant=~\"$tenant\", handler=\"receive\", code!~\"2..\"}[$interval]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{code}} - {{tenant}}", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Number of errors (by tenant and code)", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "id": 6, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 4, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum by (job, tenant) (rate(http_request_duration_seconds_sum{job=~\"$job\", tenant=~\"$tenant\", handler=\"receive\"}[$interval])) / sum by (job, tenant) (http_request_duration_seconds_count{job=~\"$job\", tenant=~\"$tenant\", handler=\"receive\"})", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{tenant}}", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Average request duration (by tenant)", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "WRITE - Incoming Request (tenant focus)", + "titleSize": "h6" + }, + { + "collapse": false, + "height": "250px", + "panels": [ + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "id": 7, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 4, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum by (job, tenant) (rate(http_request_size_bytes_sum{job=~\"$job\", tenant=~\"$tenant\", handler=\"receive\", code=~\"2..\"}[$interval])) / sum by (job, tenant) (rate(http_request_size_bytes_count{job=~\"$job\", tenant=~\"$tenant\", handler=\"receive\", code=~\"2..\"}[$interval]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{tenant}}", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Average successful HTTP request size (per tenant and code, only 2XX)", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "id": 8, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 4, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum by (job, tenant) (rate(http_request_size_bytes_sum{job=~\"$job\", tenant=~\"$tenant\", handler=\"receive\", code!~\"2..\"}[$interval])) / sum by (job, tenant) (rate(http_request_size_bytes_count{job=~\"$job\", tenant=~\"$tenant\", handler=\"receive\", code!~\"2..\"}[$interval]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{tenant}}", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Average failed HTTP request size (per tenant and code, non 2XX)", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "id": 9, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 4, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum by (job, tenant, method) (http_inflight_requests{job=~\"$job\", tenant=~\"$tenant\", handler=\"receive\"})", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{method}} - {{tenant}}", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Inflight requests (per tenant and method)", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "HTTP requests (tenant focus)", + "titleSize": "h6" + }, + { + "collapse": false, + "height": "250px", + "panels": [ + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "id": 10, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 3, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(thanos_receive_write_timeseries_bucket{job=~\"$job\", tenant=~\"$tenant\", code=~\"2..\"}[$interval])) by (job, tenant) ", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{tenant}}", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Rate of series received (per tenant, only 2XX)", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "id": 11, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 3, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(thanos_receive_write_timeseries_bucket{job=~\"$job\", tenant=~\"$tenant\", code!~\"2..\"}[$interval])) by (tenant, code) ", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{code}} - {{tenant}}", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Rate of series not written (per tenant and code, non 2XX)", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "id": 12, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 3, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(thanos_receive_write_samples_bucket{job=~\"$job\", tenant=~\"$tenant\", code=~\"2..\"}[$interval])) by (job, tenant) ", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{tenant}}", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Rate of samples received (per tenant, only 2XX)", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "id": 13, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 3, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(thanos_receive_write_samples_bucket{job=~\"$job\", tenant=~\"$tenant\", code!~\"2..\"}[$interval])) by (tenant, code) ", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{code}} - {{tenant}}", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Rate of samples not written (per tenant and code, non 2XX)", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "Series & Samples (tenant focus)", + "titleSize": "h6" + }, { "collapse": false, "height": "250px", @@ -326,7 +1122,7 @@ "datasource": "$datasource", "description": "Shows rate of replications to other receive nodes.", "fill": 1, - "id": 4, + "id": 14, "legend": { "avg": false, "current": false, @@ -405,7 +1201,7 @@ "datasource": "$datasource", "description": "Shows ratio of errors compared to the total number of replications to other receive nodes.", "fill": 10, - "id": 5, + "id": 15, "legend": { "avg": false, "current": false, @@ -493,7 +1289,7 @@ "datasource": "$datasource", "description": "Shows rate of forwarded requests to other receive nodes.", "fill": 1, - "id": 6, + "id": 16, "legend": { "avg": false, "current": false, @@ -572,7 +1368,7 @@ "datasource": "$datasource", "description": "Shows ratio of errors compared to the total number of forwareded requests to other receive nodes.", "fill": 10, - "id": 7, + "id": 17, "legend": { "avg": false, "current": false, @@ -660,7 +1456,7 @@ "datasource": "$datasource", "description": "Shows rate of handled Unary gRPC requests from queriers.", "fill": 10, - "id": 8, + "id": 18, "legend": { "avg": false, "current": false, @@ -811,7 +1607,7 @@ "datasource": "$datasource", "description": "Shows ratio of errors compared to the total number of handled requests from queriers.", "fill": 10, - "id": 9, + "id": 19, "legend": { "avg": false, "current": false, @@ -887,7 +1683,7 @@ "datasource": "$datasource", "description": "Shows how long has it taken to handle requests from queriers, in quantiles.", "fill": 1, - "id": 10, + "id": 20, "legend": { "avg": false, "current": false, @@ -1017,7 +1813,7 @@ "datasource": "$datasource", "description": "Shows rate of handled Unary gRPC requests from queriers.", "fill": 10, - "id": 11, + "id": 21, "legend": { "avg": false, "current": false, @@ -1168,7 +1964,7 @@ "datasource": "$datasource", "description": "Shows ratio of errors compared to the total number of handled requests from queriers.", "fill": 10, - "id": 12, + "id": 22, "legend": { "avg": false, "current": false, @@ -1244,7 +2040,7 @@ "datasource": "$datasource", "description": "Shows how long has it taken to handle requests from queriers, in quantiles.", "fill": 1, - "id": 13, + "id": 23, "legend": { "avg": false, "current": false, @@ -1374,7 +2170,7 @@ "datasource": "$datasource", "description": "Shows rate of handled Streamed gRPC requests from queriers.", "fill": 10, - "id": 14, + "id": 24, "legend": { "avg": false, "current": false, @@ -1525,7 +2321,7 @@ "datasource": "$datasource", "description": "Shows ratio of errors compared to the total number of handled requests from queriers.", "fill": 10, - "id": 15, + "id": 25, "legend": { "avg": false, "current": false, @@ -1601,7 +2397,7 @@ "datasource": "$datasource", "description": "Shows how long has it taken to handle requests from queriers, in quantiles.", "fill": 1, - "id": 16, + "id": 26, "legend": { "avg": false, "current": false, @@ -1731,7 +2527,7 @@ "datasource": "$datasource", "description": "Shows the relative time of last successful upload to the object-store bucket.", "fill": 1, - "id": 17, + "id": 27, "legend": { "avg": false, "current": false, @@ -1855,7 +2651,7 @@ "dashes": false, "datasource": "$datasource", "fill": 1, - "id": 18, + "id": 28, "legend": { "avg": false, "current": false, @@ -1971,7 +2767,7 @@ "dashes": false, "datasource": "$datasource", "fill": 1, - "id": 19, + "id": 29, "legend": { "avg": false, "current": false, @@ -2047,7 +2843,7 @@ "dashes": false, "datasource": "$datasource", "fill": 1, - "id": 20, + "id": 30, "legend": { "avg": false, "current": false, @@ -2146,6 +2942,29 @@ "regex": "", "type": "datasource" }, + { + "allValue": null, + "current": { + "text": "all", + "value": "$__all" + }, + "datasource": "$datasource", + "hide": 0, + "includeAll": true, + "label": "tenant", + "multi": false, + "name": "tenant", + "options": [ ], + "query": "label_values(http_requests_total{job=~\"$job\", tenant!=\"\"}, tenant)", + "refresh": 1, + "regex": "", + "sort": 2, + "tagValuesQuery": "", + "tags": [ ], + "tagsQuery": "", + "type": "query", + "useTags": false + }, { "auto": true, "auto_count": 300, diff --git a/mixin/dashboards/receive.libsonnet b/mixin/dashboards/receive.libsonnet index 8e3134fccf6..e8bbe8cedaf 100644 --- a/mixin/dashboards/receive.libsonnet +++ b/mixin/dashboards/receive.libsonnet @@ -1,6 +1,7 @@ local g = import '../lib/thanos-grafana-builder/builder.libsonnet'; local utils = import '../lib/utils.libsonnet'; + { local thanos = self, receive+:: { @@ -9,15 +10,41 @@ local utils = import '../lib/utils.libsonnet'; dashboard:: { selector: std.join(', ', thanos.dashboard.selector + ['job=~"$job"']), dimensions: std.join(', ', thanos.dashboard.dimensions + ['job']), + tenantSelector: std.join(', ', thanos.dashboard.selector + ['job=~"$job"', 'tenant=~"$tenant"']), + tenantDimensions: std.join(', ', thanos.dashboard.dimensions + ['job', 'tenant']), }, }, grafanaDashboards+:: { + local grafana = import 'grafonnet/grafana.libsonnet', + local template = grafana.template, [if thanos.receive != null then 'receive.json']: local receiveHandlerSelector = utils.joinLabels([thanos.receive.dashboard.selector, 'handler="receive"']); local grpcUnaryWriteSelector = utils.joinLabels([thanos.receive.dashboard.selector, 'grpc_type="unary"', 'grpc_method="RemoteWrite"']); local grpcUnaryReadSelector = utils.joinLabels([thanos.receive.dashboard.selector, 'grpc_type="unary"', 'grpc_method!="RemoteWrite"']); local grpcServerStreamSelector = utils.joinLabels([thanos.receive.dashboard.selector, 'grpc_type="server_stream"']); - g.dashboard(thanos.receive.title) + + local tenantReceiveHandlerSeclector = utils.joinLabels([thanos.receive.dashboard.tenantSelector, 'handler="receive"']); + local tenantHttpCode2XXSelector = std.join(', ', [tenantReceiveHandlerSeclector, 'code=~"2.."']); + local tenantHttpCodeNot2XXSelector = std.join(', ', [tenantReceiveHandlerSeclector, 'code!~"2.."']); + + local tenantWithHttpCodeDimensions = std.join(', ', ['tenant', 'code']); + g.dashboard(thanos.receive.title) { + templating+: { + list+: [ + template.new( + 'tenant', + '$datasource', + 'label_values(http_requests_total{%s}, %s)' % [std.join(', ', [thanos.receive.dashboard.selector] + ['tenant!=""']), 'tenant'], + label='tenant', + refresh=1, + sort=2, + current='all', + allValues=null, + includeAll=true + ), + ], + }, + } .addRow( g.row('WRITE - Incoming Request') .addPanel( @@ -33,6 +60,118 @@ local utils = import '../lib/utils.libsonnet'; g.latencyPanel('http_request_duration_seconds', receiveHandlerSelector, thanos.receive.dashboard.dimensions) ) ) + .addRow( + g.row('WRITE - Incoming Request (tenant focus)') + .addPanel( + g.panel('Rate of write requests (by tenant and code)') + + g.queryPanel( + 'sum by (%s) (rate(http_requests_total{%s}[$interval]))' % [tenantWithHttpCodeDimensions, tenantReceiveHandlerSeclector], + '{{code}} - {{tenant}}' + ) + ) + .addPanel( + g.panel('Number of errors (by tenant and code)') + + g.queryPanel( + 'sum by (%s) (rate(http_requests_total{%s}[$interval]))' % [ + tenantWithHttpCodeDimensions, + tenantHttpCodeNot2XXSelector, + ], + '{{code}} - {{tenant}}' + ) + ) + .addPanel( + g.panel('Average request duration (by tenant)') + + g.queryPanel( + 'sum by (%s) (rate(http_request_duration_seconds_sum{%s}[$interval])) / sum by (%s) (http_request_duration_seconds_count{%s})' % [ + thanos.receive.dashboard.tenantDimensions, + tenantReceiveHandlerSeclector, + thanos.receive.dashboard.tenantDimensions, + tenantReceiveHandlerSeclector, + ], + '{{tenant}}' + ) + ) + ) + .addRow( + g.row('HTTP requests (tenant focus)') + .addPanel( + g.panel('Average successful HTTP request size (per tenant and code, only 2XX)') + + g.queryPanel( + 'sum by (%s) (rate(http_request_size_bytes_sum{%s}[$interval])) / sum by (%s) (rate(http_request_size_bytes_count{%s}[$interval]))' % [ + thanos.receive.dashboard.tenantDimensions, + tenantHttpCode2XXSelector, + thanos.receive.dashboard.tenantDimensions, + tenantHttpCode2XXSelector, + ], + '{{tenant}}' + ) + ) + .addPanel( + g.panel('Average failed HTTP request size (per tenant and code, non 2XX)') + + g.queryPanel( + 'sum by (%s) (rate(http_request_size_bytes_sum{%s}[$interval])) / sum by (%s) (rate(http_request_size_bytes_count{%s}[$interval]))' % [ + thanos.receive.dashboard.tenantDimensions, + tenantHttpCodeNot2XXSelector, + thanos.receive.dashboard.tenantDimensions, + tenantHttpCodeNot2XXSelector, + ], + '{{tenant}}' + ) + ) + .addPanel( + g.panel('Inflight requests (per tenant and method)') + + g.queryPanel( + 'sum by (%s) (http_inflight_requests{%s})' % [ + std.join(', ', [thanos.receive.dashboard.tenantDimensions, 'method']), + tenantReceiveHandlerSeclector, + ], + '{{method}} - {{tenant}}' + ) + ) + ) + .addRow( + g.row('Series & Samples (tenant focus)') + .addPanel( + g.panel('Rate of series received (per tenant, only 2XX)') + + g.queryPanel( + 'sum(rate(thanos_receive_write_timeseries_bucket{%s}[$interval])) by (%s) ' % [ + utils.joinLabels([thanos.receive.dashboard.tenantSelector, 'code=~"2.."']), + thanos.receive.dashboard.tenantDimensions, + ], + '{{tenant}}' + ) + ) + .addPanel( + g.panel('Rate of series not written (per tenant and code, non 2XX)') + + g.queryPanel( + 'sum(rate(thanos_receive_write_timeseries_bucket{%s}[$interval])) by (%s) ' % [ + utils.joinLabels([thanos.receive.dashboard.tenantSelector, 'code!~"2.."']), + tenantWithHttpCodeDimensions, + ], + '{{code}} - {{tenant}}' + ) + ) + .addPanel( + g.panel('Rate of samples received (per tenant, only 2XX)') + + g.queryPanel( + 'sum(rate(thanos_receive_write_samples_bucket{%s}[$interval])) by (%s) ' % [ + utils.joinLabels([thanos.receive.dashboard.tenantSelector, 'code=~"2.."']), + thanos.receive.dashboard.tenantDimensions, + ], + '{{tenant}}' + ) + ) + .addPanel( + g.panel('Rate of samples not written (per tenant and code, non 2XX)') + + g.queryPanel( + 'sum(rate(thanos_receive_write_samples_bucket{%s}[$interval])) by (%s) ' % [ + utils.joinLabels([thanos.receive.dashboard.tenantSelector, 'code!~"2.."']), + tenantWithHttpCodeDimensions, + ], + '{{code}} - {{tenant}}' + ) + ) + ) .addRow( g.row('WRITE - Replication') .addPanel(