grafana · joe-elliott · Mar 29, 2022 · Mar 22, 2022 · Mar 22, 2022 · Mar 23, 2022
@@ -19,13 +19,29 @@
 * [ENHANCEMENT] Partially persist traces that exceed `max_bytes_per_trace` during compaction [#1317](https://github.com/grafana/tempo/pull/1317) (@joe-elliott)
 * [ENHANCEMENT] Make search respect per tenant `max_bytes_per_trace` and added `skippedTraces` to returned search metrics. [#1318](https://github.com/grafana/tempo/pull/1318) (@joe-elliott)
 * [ENHANCEMENT] Improve serverless consistency by forcing a GC before returning. [#1324](https://github.com/grafana/tempo/pull/1324) (@joe-elliott)
-* [ENHANCEMENT] Add hedging to sharded search queries created by the frontend. [#1334](https://github.com/grafana/tempo/pull/1334) (@joe-elliott)
+* [ENHANCEMENT] Add hedging to queries to external endpoints. [#1350](https://github.com/grafana/tempo/pull/1350) (@joe-elliott)
   New config options and defaults:
   ```
-  query_frontend:
+  querier:
+    search:
+      external_hedge_requests_at: 5s
+      external_hedge_requests_up_to: 3
+  ```
+  ** BREAKING CHANGE **
+  Querier options related to search have moved under a `search` block:
+  ```
+  querier:
+   search_query_timeout: 30s
+   search_external_endpoints: []
+   search_prefer_self: 2
+  ```
+  becomes
+  ```
+  querier:
     search:
-      hedge_requests_at: 5s
-      hedge_requests_up_to: 3
+      query_timeout: 30s
+      prefer_self: 2
+      external_endpoints: []
   ```
 * [BUGFIX]: Enable compaction and retention in Tanka single-binary [#1352](https://github.com/grafana/tempo/issues/1352)
 * [BUGFIX]: Remove unnecessary PersistentVolumeClaim [#1245](https://github.com/grafana/tempo/issues/1245)

@@ -231,15 +231,6 @@ query_frontend:
 
         # (default: 1h)
         [query_ingesters_until: <duration>]
-
-        # If set to a non-zero value a second request will be issued at the provided duration. Recommended to
-        # be set to p99 of search requests to reduce long tail latency.
-        # (default: 5s)
-        [hedge_requests_at: <duration>]
-
-        # The maximum number of requests to execute when hedging. Requires hedge_requests_at to be set.
-        # (default: 3)
-        [hedge_requests_up_to: <int>]
 ```
 
 ## Querier
@@ -254,28 +245,38 @@ querier:
     # Timeout for trace lookup requests
     [query_timeout: <duration> | default = 10s]
 
-    # Timeout for search requests    
-    [search_query_timeout: <duration> | default = 30s]
-
-    # A list of external endpoints that the querier will use to offload backend search requests. They must  
-    # take and return the same value as /api/search endpoint on the querier. This is intended to be
-    # used with serverless technologies for massive parrallelization of the search path.
-    # The default value of "" disables this feature.
-    [search_external_endpoints: <list of strings> | default = <empty list>]
-
-    # If search_external_endpoints is set then the querier will primarily act as a proxy for whatever serverless backend
-    # you have configured. This setting allows the operator to have the querier prefer itself for a configurable
-    # number of subqueries. In the default case of 2 the querier will process up to 2 search requests subqueries before starting
-    # to reach out to search_external_endpoints. 
-    # Setting this to 0 will disable this feature and the querier will proxy all search subqueries to search_external_endpoints.
-    [search_prefer_self: <int> | default = 2 ]
-
     # The query frontend turns both trace by id (/api/traces/<id>) and search (/api/search?<params>) requests
     # into subqueries that are then pulled and serviced by the queriers.
     # This value controls the overall number of simultaneous subqueries that the querier will service at once. It does
     # not distinguish between the types of queries.
     [max_concurrent_queries: <int> | default = 5]
 
+    search:
+        # Timeout for search requests    
+        [query_timeout: <duration> | default = 30s]
+
+        # A list of external endpoints that the querier will use to offload backend search requests. They must  
+        # take and return the same value as /api/search endpoint on the querier. This is intended to be
+        # used with serverless technologies for massive parrallelization of the search path.
+        # The default value of "" disables this feature.
+        [external_endpoints: <list of strings> | default = <empty list>]
+
+        # If search_external_endpoints is set then the querier will primarily act as a proxy for whatever serverless backend
+        # you have configured. This setting allows the operator to have the querier prefer itself for a configurable
+        # number of subqueries. In the default case of 2 the querier will process up to 2 search requests subqueries before starting
+        # to reach out to search_external_endpoints. 
+        # Setting this to 0 will disable this feature and the querier will proxy all search subqueries to search_external_endpoints.
+        [prefer_self: <int> | default = 2 ]
+
+        # If set to a non-zero value a second request will be issued at the provided duration. Recommended to
+        # be set to p99 of external search requests to reduce long tail latency.
+        # (default: 4s)
+        [external_hedge_requests_at: <duration>]
+
+        # The maximum number of requests to execute when hedging. Requires hedge_requests_at to be set.
+        # (default: 3)
+        [external_hedge_requests_up_to: <int>]
+
     # config of the worker that connects to the query frontend
     frontend_worker:
 

@@ -16,7 +16,7 @@ go run ./cmd/tempo --storage.trace.backend=local --storage.trace.local.path=/tmp
 
 ## Complete Configuration
 
-> **Note**: This manifest was generated on 2022-03-09.
+> **Note**: This manifest was generated on 2022-03-23.
 
 ```yaml
 target: all
@@ -151,10 +151,13 @@ metrics_generator_client:
     tls_server_name: ""
     tls_insecure_skip_verify: false
 querier:
+  search:
+    query_timeout: 30s
+    prefer_self: 2
+    external_endpoints: []
+    external_hedge_requests_at: 4s
+    external_hedge_requests_up_to: 3
   query_timeout: 10s
-  search_query_timeout: 30s
-  search_external_endpoints: []
-  search_prefer_self: 2
   max_concurrent_queries: 5
   frontend_worker:
     frontend_address: 127.0.0.1:9095
@@ -220,8 +223,6 @@ query_frontend:
     max_duration: 1h1m0s
     query_backend_after: 15m0s
     query_ingesters_until: 1h0m0s
-    hedge_requests_at: 5s
-    hedge_requests_up_to: 3
 compactor:
   ring:
     kvstore:
@@ -273,6 +274,7 @@ compactor:
     retention_concurrency: 10
     iterator_buffer_size: 1000
     max_time_per_tenant: 5m0s
+    compaction_cycle: 30s
   override_ring_key: compactor
 ingester:
   lifecycler:

@@ -45,10 +45,11 @@ querier:
   # Increase this greatly to permit needed throughput.
   max_concurrent_queries: 100
 
-  # A list of endpoints to query. Load will be spread evenly across
-  # these multiple serverless functions.
-  search_external_endpoints:
-  - https://<serverless endpoint>
+  search:
+    # A list of endpoints to query. Load will be spread evenly across
+    # these multiple serverless functions.
+    external_endpoints:
+    - https://<serverless endpoint>
 ```
 
 ### Query frontend
@@ -94,18 +95,28 @@ settings to configure Tempo to use the serverless:
 
 ```
 querier:
+  search:
     # A list of external endpoints that the querier will use to offload backend search requests. They must  
     # take and return the same value as /api/search endpoint on the querier. This is intended to be
     # used with serverless technologies for massive parrallelization of the search path.
     # The default value of "" disables this feature.
-    [search_external_endpoints: <list of strings> | default = <empty list>]
+    [external_endpoints: <list of strings> | default = <empty list>]
 
-    # If search_external_endpoints is set then the querier will primarily act as a proxy for whatever serverless backend
+    # If external_endpoints is set then the querier will primarily act as a proxy for whatever serverless backend
     # you have configured. This setting allows the operator to have the querier prefer itself for a configurable
     # number of subqueries. In the default case of 2 the querier will process up to 2 search requests subqueries before starting
-    # to reach out to search_external_endpoints. 
-    # Setting this to 0 will disable this feature and the querier will proxy all search subqueries to search_external_endpoints.
-    [search_prefer_self: <int> | default = 2 ]
+    # to reach out to external_endpoints. 
+    # Setting this to 0 will disable this feature and the querier will proxy all search subqueries to external_endpoints.
+    [prefer_self: <int> | default = 2 ]
+
+    # If set to a non-zero value a second request will be issued at the provided duration. Recommended to
+    # be set to p99 of external search requests to reduce long tail latency.
+    # (default: 4s)
+    [external_hedge_requests_at: <duration>]
+
+    # The maximum number of requests to execute when hedging. Requires hedge_requests_at to be set.
+    # (default: 3)
+    [external_hedge_requests_up_to: <int>]
 ```
 
 See here for cloud-specific details:

@@ -76,6 +76,7 @@ For more guidance on configuration options for full backend search [check here](
 
     ```
     querier:
-      search_external_endpoints:
-      - http://<alb dns hostname>
+      search:
+        external_endpoints:
+        - http://<alb dns hostname>
     ```
@@ -72,6 +72,7 @@ The endpoint can be retrieved from the trigger tab in Google Cloud Functions:
 
     ```
     querier:
-      search_external_endpoints:
-      - <trigger url from console>
+      search:
+        external_endpoints:
+        - <trigger url from console>
     ```
@@ -51,7 +51,8 @@ memberlist:
   - tempo_e2e-querier:7946
 
 querier:
-  search_external_endpoints: 
-  - http://tempo_e2e-serverless:8080/
+  search:
+    external_endpoints: 
+    - http://tempo_e2e-serverless:8080/
   frontend_worker:
     frontend_address: tempo_e2e-query-frontend:9095
@@ -25,9 +25,7 @@ type Config struct {
 }
 
 type SearchConfig struct {
-	Sharder           SearchSharderConfig `yaml:",inline"`
-	HedgeRequestsAt   time.Duration       `yaml:"hedge_requests_at"`
-	HedgeRequestsUpTo int                 `yaml:"hedge_requests_up_to"`
+	Sharder SearchSharderConfig `yaml:",inline"` // jpe - what did this look like before
 }
 
 func (cfg *Config) RegisterFlagsAndApplyDefaults(prefix string, f *flag.FlagSet) {
@@ -38,8 +36,6 @@ func (cfg *Config) RegisterFlagsAndApplyDefaults(prefix string, f *flag.FlagSet)
 	cfg.QueryShards = 20
 	cfg.TolerateFailedBlocks = 0
 	cfg.Search = SearchConfig{
-		HedgeRequestsAt:   5 * time.Second,
-		HedgeRequestsUpTo: 3,
 		Sharder: SearchSharderConfig{
 			QueryBackendAfter:     15 * time.Minute,
 			QueryIngestersUntil:   time.Hour,

@@ -67,16 +67,7 @@ func New(cfg Config, next http.RoundTripper, store storage.Store, logger log.Log
 
 	// tracebyid middleware
 	traceByIDMiddleware := MergeMiddlewares(newTraceByIDMiddleware(cfg, logger), retryWare)
-
-	// search pipeline and middleware
-	searchPipeline := []Middleware{
-		newSearchMiddleware(cfg, store, logger),
-	}
-	if cfg.Search.HedgeRequestsAt > 0 {
-		searchPipeline = append(searchPipeline, newHedgedRequestWare(cfg.Search.HedgeRequestsAt, cfg.Search.HedgeRequestsUpTo))
-	}
-	searchPipeline = append(searchPipeline, retryWare)
-	searchMiddleware := MergeMiddlewares(searchPipeline...)
+	searchMiddleware := MergeMiddlewares(newSearchMiddleware(cfg, store, logger), retryWare)
 
 	traceByIDCounter := queriesPerTenant.MustCurryWith(prometheus.Labels{
 		"op": traceByIDOp,

@@ -11,22 +11,31 @@ import (
 
 // Config for a querier.
 type Config struct {
+	Search SearchConfig `yaml:"search"`
+
 	TraceLookupQueryTimeout time.Duration `yaml:"query_timeout"`
-	SearchQueryTimeout      time.Duration `yaml:"search_query_timeout"`
-	SearchExternalEndpoints []string      `yaml:"search_external_endpoints"`
-	SearchPreferSelf        int           `yaml:"search_prefer_self"`
 	ExtraQueryDelay         time.Duration `yaml:"extra_query_delay,omitempty"`
 	MaxConcurrentQueries    int           `yaml:"max_concurrent_queries"`
 	Worker                  worker.Config `yaml:"frontend_worker"`
 }
 
+type SearchConfig struct {
+	QueryTimeout      time.Duration `yaml:"query_timeout"`
+	PreferSelf        int           `yaml:"prefer_self"`
+	ExternalEndpoints []string      `yaml:"external_endpoints"`
+	HedgeRequestsAt   time.Duration `yaml:"external_hedge_requests_at"`
+	HedgeRequestsUpTo int           `yaml:"external_hedge_requests_up_to"`
+}
+
 // RegisterFlagsAndApplyDefaults register flags.
 func (cfg *Config) RegisterFlagsAndApplyDefaults(prefix string, f *flag.FlagSet) {
 	cfg.TraceLookupQueryTimeout = 10 * time.Second
-	cfg.SearchQueryTimeout = 30 * time.Second
 	cfg.ExtraQueryDelay = 0
 	cfg.MaxConcurrentQueries = 5
-	cfg.SearchPreferSelf = 2
+	cfg.Search.PreferSelf = 2
+	cfg.Search.HedgeRequestsAt = 4 * time.Second
+	cfg.Search.HedgeRequestsUpTo = 3
+	cfg.Search.QueryTimeout = 30 * time.Second
 	cfg.Worker = worker.Config{
 		MatchMaxConcurrency:   true,
 		MaxConcurrentRequests: cfg.MaxConcurrentQueries,

@@ -149,7 +149,7 @@ func (q *Querier) SearchHandler(w http.ResponseWriter, r *http.Request) {
 	isSearchBlock := api.IsSearchBlock(r)
 
 	// Enforce the query timeout while querying backends
-	ctx, cancel := context.WithDeadline(r.Context(), time.Now().Add(q.cfg.SearchQueryTimeout))
+	ctx, cancel := context.WithDeadline(r.Context(), time.Now().Add(q.cfg.Search.QueryTimeout))
 	defer cancel()
 
 	span, ctx := opentracing.StartSpanFromContext(ctx, "Querier.SearchHandler")
@@ -200,7 +200,7 @@ func (q *Querier) SearchHandler(w http.ResponseWriter, r *http.Request) {
 
 func (q *Querier) SearchTagsHandler(w http.ResponseWriter, r *http.Request) {
 	// Enforce the query timeout while querying backends
-	ctx, cancel := context.WithDeadline(r.Context(), time.Now().Add(q.cfg.SearchQueryTimeout))
+	ctx, cancel := context.WithDeadline(r.Context(), time.Now().Add(q.cfg.Search.QueryTimeout))
 	defer cancel()
 
 	span, ctx := opentracing.StartSpanFromContext(ctx, "Querier.SearchTagsHandler")
@@ -225,7 +225,7 @@ func (q *Querier) SearchTagsHandler(w http.ResponseWriter, r *http.Request) {
 
 func (q *Querier) SearchTagValuesHandler(w http.ResponseWriter, r *http.Request) {
 	// Enforce the query timeout while querying backends
-	ctx, cancel := context.WithDeadline(r.Context(), time.Now().Add(q.cfg.SearchQueryTimeout))
+	ctx, cancel := context.WithDeadline(r.Context(), time.Now().Add(q.cfg.Search.QueryTimeout))
 	defer cancel()
 
 	span, ctx := opentracing.StartSpanFromContext(ctx, "Querier.SearchTagValuesHandler")