From 989efe5b2d68c21de00a212e74cb096e16464575 Mon Sep 17 00:00:00 2001 From: Uwakmfon Utuk <41128987+utukJ@users.noreply.github.com> Date: Thu, 27 Oct 2022 08:03:54 +0100 Subject: [PATCH] Query: Trim very long external labels and add cmd flag to optionally specify metric labels to collect (#5785) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * added unit tests for long labels and no external labels Signed-off-by: utukj * trimmed too long external labels Signed-off-by: utukj * added optional label selection Signed-off-by: utukj * added cmd flag for choosing metric labels Signed-off-by: utukj * updated docs Signed-off-by: utukj * Update pkg/query/endpointset.go full sentence fix Co-authored-by: Bartlomiej Plotka Signed-off-by: Uwakmfon Utuk <41128987+utukJ@users.noreply.github.com> * Update pkg/query/endpointset.go Co-authored-by: Bartlomiej Plotka Signed-off-by: Uwakmfon Utuk <41128987+utukJ@users.noreply.github.com> * minor fixes from code review Signed-off-by: utukj * fixed code comments Signed-off-by: utukj * used enum for labels Signed-off-by: utukj * updated query docs Signed-off-by: utukj * cleaned up tests Signed-off-by: utukj * Updates busybox SHA (#5793) Signed-off-by: GitHub Signed-off-by: GitHub Co-authored-by: yeya24 Signed-off-by: utukj * Receive: Reload tenant limit configuration on file change (#5673) * Create a PathOrContent reloader Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> * Add docs to staticPathContent.Rewrite Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> * Run goimports Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> * Properly cancel the context in the test Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> * Watch parent directory of file This helps handling deletes and other situations. Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> * Remove useless ctx.Done() Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> * Add a debounce timer to config reload It helps managing situations where a create event is followed by a write or when a big file write is sent by the fsnotify backend as many write events. Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> * Fix event.Op bitmask check Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> * Update lastReload Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> * Fix debouncer for path content reloader Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> * Improve documentation of the PathContentRealoder Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> * Dain reload timer before resetting Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> * Run tests in parallel Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> * Simplify debouncing logic Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> * Add more tests to file reloader Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> * Simplify condition for triggering reload Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> * Use absolute path to config file Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> * Get rid of parallel test Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> * Put back 2s wait between fs operations Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> * Remove useless sleep Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> * Stop reloadTimer when context cancelled Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> * Remove unused fucntion Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> * Add missing copyright to test file Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> * Auto-reload tenant limit config on file changes Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> * Wrap error when reloading config Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> * Move limiter config reloader and update logs Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> * Get rid of useless types and allocations Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> * Remove errorChan from config reload starter Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> * Retrigger CI Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> * Use UnRegisterer in the Limiter To ensure that limit reloads will be able to re-register their metrics. Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> * Better guard against nil registerer in the limiter Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> * Remove wrong nil guard Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> * Retrigger CI Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> Signed-off-by: utukj * Query: add query metrics to calls going through the Store API (#5741) * Implement granular query performance metrics for Thanos Query These are grabbed from the data returned by multiple Store APIs after execution of a query. Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> * Fix some linter warnings Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> * Remove useless logs Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> * Refactor query tests Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> * Fix long function definition (newQuerier) Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> * Remove TODO comment Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> * Fix query tests Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> * Reformat query docs Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> * Remove useless return Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> * Put back old query docs Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> * Update query docs again Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> * Fix e2e env name Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> * Retrigger CI Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> * Add missing copyright notice. Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> * Retrigger CI Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> * Retrigger CI Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> * Bump wait time to twice scrape interval Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> * Retrigger CI Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> * Attempt to fix randomly failing test Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> * Checking more metrics to ensure the store is ready Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> * Clean up test Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> * Do not record store api metrics when didn't touch series or samples Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> * Retrigger CI Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> * Also skip store api metrics on zero chunks touched Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> * Update changelog Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> * Fix broken changelog after merge Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> * Remove extra empty line Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> * Refactor names and (un)exported types and fields Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> * Start listing metrics exported by Thanos Query Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> * Rename pkg/store/metrics -> pkg/store/telemetry Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> * Get rid of the pkg/store/telemetry package Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> Signed-off-by: Matej Gera <38492574+matej-g@users.noreply.github.com> Co-authored-by: Matej Gera <38492574+matej-g@users.noreply.github.com> Signed-off-by: utukj * docs: mark me as shepherd for next release (#5797) Let's release the RC on Friday. Signed-off-by: Giedrius Statkevičius Signed-off-by: Giedrius Statkevičius Signed-off-by: utukj * Revert "docs: mark me as shepherd for next release (#5797)" This reverts commit ea646a6010e4fc11997880d696ec7b2718b7ecf2. Signed-off-by: utukj * Revert "Query: add query metrics to calls going through the Store API (#5741)" This reverts commit eec4fd0053b841ee5fe5284d0fe25e6cbe14a738. Signed-off-by: utukj * Revert "Receive: Reload tenant limit configuration on file change (#5673)" This reverts commit 24e1cc0faf219049174020955f8e3c8251106d87. Signed-off-by: utukj * Revert "Updates busybox SHA (#5793)" This reverts commit 9474c00fa6a1a7b0148287ee4296944e50f093b6. Signed-off-by: utukj * Updates busybox SHA (#5793) Signed-off-by: GitHub Signed-off-by: GitHub Co-authored-by: yeya24 Signed-off-by: utukj * Receive: Reload tenant limit configuration on file change (#5673) * Create a PathOrContent reloader Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> * Add docs to staticPathContent.Rewrite Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> * Run goimports Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> * Properly cancel the context in the test Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> * Watch parent directory of file This helps handling deletes and other situations. Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> * Remove useless ctx.Done() Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> * Add a debounce timer to config reload It helps managing situations where a create event is followed by a write or when a big file write is sent by the fsnotify backend as many write events. Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> * Fix event.Op bitmask check Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> * Update lastReload Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> * Fix debouncer for path content reloader Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> * Improve documentation of the PathContentRealoder Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> * Dain reload timer before resetting Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> * Run tests in parallel Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> * Simplify debouncing logic Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> * Add more tests to file reloader Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> * Simplify condition for triggering reload Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> * Use absolute path to config file Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> * Get rid of parallel test Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> * Put back 2s wait between fs operations Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> * Remove useless sleep Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> * Stop reloadTimer when context cancelled Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> * Remove unused fucntion Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> * Add missing copyright to test file Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> * Auto-reload tenant limit config on file changes Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> * Wrap error when reloading config Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> * Move limiter config reloader and update logs Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> * Get rid of useless types and allocations Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> * Remove errorChan from config reload starter Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> * Retrigger CI Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> * Use UnRegisterer in the Limiter To ensure that limit reloads will be able to re-register their metrics. Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> * Better guard against nil registerer in the limiter Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> * Remove wrong nil guard Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> * Retrigger CI Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> Signed-off-by: utukj * Query: add query metrics to calls going through the Store API (#5741) * Implement granular query performance metrics for Thanos Query These are grabbed from the data returned by multiple Store APIs after execution of a query. Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> * Fix some linter warnings Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> * Remove useless logs Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> * Refactor query tests Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> * Fix long function definition (newQuerier) Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> * Remove TODO comment Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> * Fix query tests Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> * Reformat query docs Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> * Remove useless return Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> * Put back old query docs Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> * Update query docs again Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> * Fix e2e env name Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> * Retrigger CI Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> * Add missing copyright notice. Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> * Retrigger CI Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> * Retrigger CI Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> * Bump wait time to twice scrape interval Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> * Retrigger CI Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> * Attempt to fix randomly failing test Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> * Checking more metrics to ensure the store is ready Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> * Clean up test Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> * Do not record store api metrics when didn't touch series or samples Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> * Retrigger CI Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> * Also skip store api metrics on zero chunks touched Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> * Update changelog Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> * Fix broken changelog after merge Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> * Remove extra empty line Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> * Refactor names and (un)exported types and fields Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> * Start listing metrics exported by Thanos Query Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> * Rename pkg/store/metrics -> pkg/store/telemetry Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> * Get rid of the pkg/store/telemetry package Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> Signed-off-by: Matej Gera <38492574+matej-g@users.noreply.github.com> Co-authored-by: Matej Gera <38492574+matej-g@users.noreply.github.com> Signed-off-by: utukj * docs: mark me as shepherd for next release (#5797) Let's release the RC on Friday. Signed-off-by: Giedrius Statkevičius Signed-off-by: Giedrius Statkevičius Signed-off-by: utukj * Revert "docs: mark me as shepherd for next release (#5797)" This reverts commit c509c0e9f22deb3f75b707f88285548876eb7ffb. Signed-off-by: utukj * Revert "Updates busybox SHA (#5793)" This reverts commit ad11a03ebeb6ae10ed7ad7a6336365f33a07538b. Signed-off-by: utukj * Revert "Query: add query metrics to calls going through the Store API (#5741)" This reverts commit 7a77769258a138394be68a018be5f0c871afee7a. Signed-off-by: utukj * Revert "Receive: Reload tenant limit configuration on file change (#5673)" This reverts commit 32ca3279bb995861b7f3b7ba5b9cb4cbeeddf68c. Signed-off-by: utukj * fixed lint issue Signed-off-by: utukj * added unit test for truncate and clean up Signed-off-by: utukj * fixed truncate label func and added more tests Signed-off-by: utukj * removed name from truncate test Signed-off-by: utukj * reorganized test cases and removed redundant comments Signed-off-by: utukj * Update pkg/query/endpointset_test.go Co-authored-by: Bartlomiej Plotka Signed-off-by: Uwakmfon Utuk <41128987+utukJ@users.noreply.github.com> * Update pkg/query/endpointset.go Co-authored-by: Bartlomiej Plotka Signed-off-by: Uwakmfon Utuk <41128987+utukJ@users.noreply.github.com> * fixed failing checks Signed-off-by: utukj * e2e: Adding test for querier with two stores loadbalancing across them. Signed-off-by: bwplotka * Update pkg/query/endpointset_test.go Co-authored-by: Bartlomiej Plotka Signed-off-by: Uwakmfon Utuk <41128987+utukJ@users.noreply.github.com> * dumped long expected output in unittest Signed-off-by: utukj * Revert "e2e: Adding test for querier with two stores loadbalancing across them." This reverts commit 96b1545048e01003dcbf90029d70137a93d0d196. Signed-off-by: utukj * Update pkg/query/endpointset_test.go Co-authored-by: Bartlomiej Plotka Signed-off-by: Uwakmfon Utuk <41128987+utukJ@users.noreply.github.com> * Update pkg/query/endpointset_test.go Co-authored-by: Bartlomiej Plotka Signed-off-by: Uwakmfon Utuk <41128987+utukJ@users.noreply.github.com> * moved label definition to endpointset Signed-off-by: utukj Signed-off-by: utukj Signed-off-by: Uwakmfon Utuk <41128987+utukJ@users.noreply.github.com> Signed-off-by: GitHub Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> Signed-off-by: Matej Gera <38492574+matej-g@users.noreply.github.com> Signed-off-by: Giedrius Statkevičius Signed-off-by: bwplotka Co-authored-by: Bartlomiej Plotka Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> Co-authored-by: yeya24 Co-authored-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> Co-authored-by: Matej Gera <38492574+matej-g@users.noreply.github.com> Co-authored-by: Giedrius Statkevičius --- cmd/thanos/query.go | 7 +++ docs/components/query.md | 3 + pkg/query/endpointset.go | 51 ++++++++++++++--- pkg/query/endpointset_test.go | 105 ++++++++++++++++++++++++++++++---- 4 files changed, 147 insertions(+), 19 deletions(-) diff --git a/cmd/thanos/query.go b/cmd/thanos/query.go index 38df8d8a4f..4862440af1 100644 --- a/cmd/thanos/query.go +++ b/cmd/thanos/query.go @@ -109,6 +109,10 @@ func registerQuery(app *extkingpin.App) { maxConcurrentSelects := cmd.Flag("query.max-concurrent-select", "Maximum number of select requests made concurrently per a query."). Default("4").Int() + queryConnMetricLabels := cmd.Flag("query.conn-metric.label", "Optional selection of query connection metric labels to be collected from endpoint set"). + Default(string(query.ExternalLabels), string(query.StoreType)). + Enums(string(query.ExternalLabels), string(query.StoreType)) + queryReplicaLabels := cmd.Flag("query.replica-label", "Labels to treat as a replica indicator along which data is deduplicated. Still you will be able to query without deduplication using 'dedup=false' parameter. Data includes time series, recording rules, and alerting rules."). Strings() @@ -280,6 +284,7 @@ func registerQuery(app *extkingpin.App) { *dynamicLookbackDelta, time.Duration(*defaultEvaluationInterval), time.Duration(*storeResponseTimeout), + *queryConnMetricLabels, *queryReplicaLabels, selectorLset, getFlagsMap(cmd.Flags()), @@ -355,6 +360,7 @@ func runQuery( dynamicLookbackDelta bool, defaultEvaluationInterval time.Duration, storeResponseTimeout time.Duration, + queryConnMetricLabels []string, queryReplicaLabels []string, selectorLset labels.Labels, flagsMap map[string]string, @@ -497,6 +503,7 @@ func runQuery( dialOpts, unhealthyStoreTimeout, endpointInfoTimeout, + queryConnMetricLabels..., ) proxy = store.NewProxyStore(logger, reg, endpoints.GetStoreClients, component.Query, selectorLset, storeResponseTimeout, store.RetrievalStrategy(grpcProxyStrategy)) rulesProxy = rules.NewProxy(logger, endpoints.GetRulesClients) diff --git a/docs/components/query.md b/docs/components/query.md index e1c3ac2112..c3690ca05a 100644 --- a/docs/components/query.md +++ b/docs/components/query.md @@ -335,6 +335,9 @@ Flags: --query.auto-downsampling Enable automatic adjustment (step / 5) to what source of data should be used in store gateways if no max_source_resolution param is specified. + --query.conn-metric.label=external_labels... ... + Optional selection of query connection metric + labels to be collected from endpoint set --query.default-evaluation-interval=1m Set default evaluation interval for sub queries. diff --git a/pkg/query/endpointset.go b/pkg/query/endpointset.go index 1660f85853..d3ce73b9e7 100644 --- a/pkg/query/endpointset.go +++ b/pkg/query/endpointset.go @@ -38,11 +38,20 @@ const ( noMetadataEndpointMessage = "cannot obtain metadata: neither info nor store client found" ) +type queryConnMetricLabel string + +const ( + ExternalLabels queryConnMetricLabel = "external_labels" + StoreType queryConnMetricLabel = "store_type" +) + type GRPCEndpointSpec struct { addr string isStrictStatic bool } +const externalLabelLimit = 1000 + // NewGRPCEndpointSpec creates gRPC endpoint spec. // It uses InfoAPI to get Metadata. func NewGRPCEndpointSpec(addr string, isStrictStatic bool) *GRPCEndpointSpec { @@ -183,28 +192,41 @@ type endpointSetNodeCollector struct { storePerExtLset map[string]int connectionsDesc *prometheus.Desc + labels []string } -func newEndpointSetNodeCollector() *endpointSetNodeCollector { +func newEndpointSetNodeCollector(labels ...string) *endpointSetNodeCollector { + if len(labels) == 0 { + labels = []string{string(ExternalLabels), string(StoreType)} + } return &endpointSetNodeCollector{ storeNodes: map[component.Component]map[string]int{}, connectionsDesc: prometheus.NewDesc( "thanos_store_nodes_grpc_connections", "Number of gRPC connection to Store APIs. Opened connection means healthy store APIs available for Querier.", - []string{"external_labels", "store_type"}, nil, + labels, nil, ), + labels: labels, } } +// truncateExtLabels truncates the stringify external labels with the format of {labels..}. +func truncateExtLabels(s string, threshold int) string { + if len(s) > threshold { + return fmt.Sprintf("%s}", s[:threshold-1]) + } + return s +} func (c *endpointSetNodeCollector) Update(nodes map[component.Component]map[string]int) { storeNodes := make(map[component.Component]map[string]int, len(nodes)) storePerExtLset := map[string]int{} - for k, v := range nodes { - storeNodes[k] = make(map[string]int, len(v)) - for kk, vv := range v { - storePerExtLset[kk] += vv - storeNodes[k][kk] = vv + for storeType, occurrencesPerExtLset := range nodes { + storeNodes[storeType] = make(map[string]int, len(occurrencesPerExtLset)) + for externalLabels, occurrences := range occurrencesPerExtLset { + externalLabels = truncateExtLabels(externalLabels, externalLabelLimit) + storePerExtLset[externalLabels] += occurrences + storeNodes[storeType][externalLabels] = occurrences } } @@ -228,7 +250,17 @@ func (c *endpointSetNodeCollector) Collect(ch chan<- prometheus.Metric) { if storeType != nil { storeTypeStr = storeType.String() } - ch <- prometheus.MustNewConstMetric(c.connectionsDesc, prometheus.GaugeValue, float64(occurrences), externalLabels, storeTypeStr) + // Select only required labels. + lbls := []string{} + for _, lbl := range c.labels { + switch lbl { + case string(ExternalLabels): + lbls = append(lbls, externalLabels) + case string(StoreType): + lbls = append(lbls, storeTypeStr) + } + } + ch <- prometheus.MustNewConstMetric(c.connectionsDesc, prometheus.GaugeValue, float64(occurrences), lbls...) } } } @@ -268,8 +300,9 @@ func NewEndpointSet( dialOpts []grpc.DialOption, unhealthyEndpointTimeout time.Duration, endpointInfoTimeout time.Duration, + endpointMetricLabels ...string, ) *EndpointSet { - endpointsMetric := newEndpointSetNodeCollector() + endpointsMetric := newEndpointSetNodeCollector(endpointMetricLabels...) if reg != nil { reg.MustRegister(endpointsMetric) } diff --git a/pkg/query/endpointset_test.go b/pkg/query/endpointset_test.go index 1274648b93..7e84890ea3 100644 --- a/pkg/query/endpointset_test.go +++ b/pkg/query/endpointset_test.go @@ -9,6 +9,7 @@ import ( "fmt" "math" "net" + "strings" "sync" "testing" "time" @@ -23,6 +24,7 @@ import ( "google.golang.org/grpc/credentials/insecure" "github.com/pkg/errors" + promtestutil "github.com/prometheus/client_golang/prometheus/testutil" "github.com/thanos-io/thanos/pkg/component" "github.com/thanos-io/thanos/pkg/info/infopb" "github.com/thanos-io/thanos/pkg/store/labelpb" @@ -269,13 +271,55 @@ func (e *testEndpoints) CloseOne(addr string) { delete(e.srvs, addr) } +func TestTruncateExtLabels(t *testing.T) { + const testLength = 5 + + for _, tc := range []struct { + labelToTruncate string + expectedOutput string + }{ + { + labelToTruncate: "{abc}", + expectedOutput: "{abc}", + }, + { + labelToTruncate: "{abcd}", + expectedOutput: "{abc}", + }, + { + labelToTruncate: "{abcde}", + expectedOutput: "{abc}", + }, + { + labelToTruncate: "{abcdef}", + expectedOutput: "{abc}", + }, + { + labelToTruncate: "{abcdefghij}", + expectedOutput: "{abc}", + }, + } { + t.Run(tc.labelToTruncate, func(t *testing.T) { + got := truncateExtLabels(tc.labelToTruncate, testLength) + testutil.Equals(t, tc.expectedOutput, got) + testutil.Assert(t, len(got) <= testLength) + }) + } +} + func TestEndpointSetUpdate(t *testing.T) { + const metricsMeta = ` + # HELP thanos_store_nodes_grpc_connections Number of gRPC connection to Store APIs. Opened connection means healthy store APIs available for Querier. + # TYPE thanos_store_nodes_grpc_connections gauge + ` testCases := []struct { - name string - endpoints []testEndpointMeta - strict bool + name string + endpoints []testEndpointMeta + strict bool + connLabels []string - expectedEndpoints int + expectedEndpoints int + expectedConnMetrics string }{ { name: "available endpoint", @@ -289,7 +333,13 @@ func TestEndpointSetUpdate(t *testing.T) { }, }, }, + connLabels: []string{"store_type"}, + expectedEndpoints: 1, + expectedConnMetrics: metricsMeta + + ` + thanos_store_nodes_grpc_connections{store_type="sidecar"} 1 + `, }, { name: "unavailable endpoint", @@ -304,7 +354,9 @@ func TestEndpointSetUpdate(t *testing.T) { }, }, }, - expectedEndpoints: 0, + + expectedEndpoints: 0, + expectedConnMetrics: "", }, { name: "slow endpoint", @@ -319,7 +371,9 @@ func TestEndpointSetUpdate(t *testing.T) { }, }, }, - expectedEndpoints: 0, + + expectedEndpoints: 0, + expectedConnMetrics: "", }, { name: "strict endpoint", @@ -334,7 +388,35 @@ func TestEndpointSetUpdate(t *testing.T) { }, }, strict: true, + connLabels: []string{"store_type"}, + expectedEndpoints: 1, + expectedConnMetrics: metricsMeta + + ` + thanos_store_nodes_grpc_connections{store_type="sidecar"} 1 + `, + }, + { + name: "long external labels", + endpoints: []testEndpointMeta{ + { + InfoResponse: sidecarInfo, + // Simulate very long external labels. + extlsetFn: func(addr string) []labelpb.ZLabelSet { + sLabel := []string{} + for i := 0; i < 1000; i++ { + sLabel = append(sLabel, "lbl") + sLabel = append(sLabel, "val") + } + return labelpb.ZLabelSetsFromPromLabels( + labels.FromStrings(sLabel...), + ) + }, + }, + }, expectedEndpoints: 1, + expectedConnMetrics: metricsMeta + ` + thanos_store_nodes_grpc_connections{external_labels="{lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val\", lbl=\"val}",store_type="sidecar"} 1 + `, }, } @@ -345,12 +427,15 @@ func TestEndpointSetUpdate(t *testing.T) { defer endpoints.Close() discoveredEndpointAddr := endpoints.EndpointAddresses() - endpointSet := makeEndpointSet(discoveredEndpointAddr, tc.strict, time.Now) + // Specify only "store_type" to exclude "external_labels". + endpointSet := makeEndpointSet(discoveredEndpointAddr, tc.strict, time.Now, tc.connLabels...) defer endpointSet.Close() endpointSet.Update(context.Background()) testutil.Equals(t, tc.expectedEndpoints, len(endpointSet.GetEndpointStatus())) testutil.Equals(t, tc.expectedEndpoints, len(endpointSet.GetStoreClients())) + + testutil.Ok(t, promtestutil.CollectAndCompare(endpointSet.endpointsMetric, strings.NewReader(tc.expectedConnMetrics))) }) } } @@ -576,7 +661,7 @@ func TestEndpointSetUpdate_AtomicEndpointAdditions(t *testing.T) { wg.Wait() } -func TestEndpointSet_Update(t *testing.T) { +func TestEndpointSetUpdate_AvailabilityScenarios(t *testing.T) { endpoints, err := startTestEndpoints([]testEndpointMeta{ { InfoResponse: sidecarInfo, @@ -1493,7 +1578,7 @@ func TestUpdateEndpointStateForgetsPreviousErrors(t *testing.T) { testutil.Equals(t, `null`, string(b)) } -func makeEndpointSet(discoveredEndpointAddr []string, strict bool, now nowFunc) *EndpointSet { +func makeEndpointSet(discoveredEndpointAddr []string, strict bool, now nowFunc, metricLabels ...string) *EndpointSet { endpointSet := NewEndpointSet(now, nil, nil, func() (specs []*GRPCEndpointSpec) { for _, addr := range discoveredEndpointAddr { @@ -1501,7 +1586,7 @@ func makeEndpointSet(discoveredEndpointAddr []string, strict bool, now nowFunc) } return specs }, - testGRPCOpts, time.Minute, time.Second) + testGRPCOpts, time.Minute, time.Second, metricLabels...) return endpointSet }