diff --git a/CHANGELOG.md b/CHANGELOG.md index c9fac6460a..619d68a35d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -39,6 +39,7 @@ We use *breaking :warning:* to mark changes that are not backward compatible (re - [#7123](https://github.com/thanos-io/thanos/pull/7123) Rule: Change default Alertmanager API version to v2. - [#7223](https://github.com/thanos-io/thanos/pull/7223) Automatic detection of memory limits and configure GOMEMLIMIT to match. +- [#7283](https://github.com/thanos-io/thanos/pull/7283) Compact: *breaking :warning:* Replace the `group` label with a `resolution` label in compact downsample metrics to avoid cardinality explosion with large numbers of groups. ### Removed diff --git a/cmd/thanos/compact.go b/cmd/thanos/compact.go index 8923bd376e..f1437efc64 100644 --- a/cmd/thanos/compact.go +++ b/cmd/thanos/compact.go @@ -456,9 +456,9 @@ func runCompact( } for _, meta := range filteredMetas { - groupKey := meta.Thanos.GroupKey() - downsampleMetrics.downsamples.WithLabelValues(groupKey) - downsampleMetrics.downsampleFailures.WithLabelValues(groupKey) + resolutionLabel := meta.Thanos.ResolutionString() + downsampleMetrics.downsamples.WithLabelValues(resolutionLabel) + downsampleMetrics.downsampleFailures.WithLabelValues(resolutionLabel) } if err := downsampleBucket( diff --git a/cmd/thanos/downsample.go b/cmd/thanos/downsample.go index ec84fc3d35..a5cd5c38ed 100644 --- a/cmd/thanos/downsample.go +++ b/cmd/thanos/downsample.go @@ -50,16 +50,16 @@ func newDownsampleMetrics(reg *prometheus.Registry) *DownsampleMetrics { m.downsamples = promauto.With(reg).NewCounterVec(prometheus.CounterOpts{ Name: "thanos_compact_downsample_total", Help: "Total number of downsampling attempts.", - }, []string{"group"}) + }, []string{"resolution"}) m.downsampleFailures = promauto.With(reg).NewCounterVec(prometheus.CounterOpts{ Name: "thanos_compact_downsample_failures_total", Help: "Total number of failed downsampling attempts.", - }, []string{"group"}) + }, []string{"resolution"}) 
m.downsampleDuration = promauto.With(reg).NewHistogramVec(prometheus.HistogramOpts{ Name: "thanos_compact_downsample_duration_seconds", Help: "Duration of downsample runs", Buckets: []float64{60, 300, 900, 1800, 3600, 7200, 14400}, // 1m, 5m, 15m, 30m, 60m, 120m, 240m - }, []string{"group"}) + }, []string{"resolution"}) return m } @@ -130,9 +130,9 @@ func RunDownsample( } for _, meta := range metas { - groupKey := meta.Thanos.GroupKey() - metrics.downsamples.WithLabelValues(groupKey) - metrics.downsampleFailures.WithLabelValues(groupKey) + resolutionLabel := meta.Thanos.ResolutionString() + metrics.downsamples.WithLabelValues(resolutionLabel) + metrics.downsampleFailures.WithLabelValues(resolutionLabel) } if err := downsampleBucket(ctx, logger, metrics, insBkt, metas, dataDir, downsampleConcurrency, blockFilesConcurrency, hashFunc, false); err != nil { return errors.Wrap(err, "downsampling failed") @@ -263,11 +263,11 @@ func downsampleBucket( errMsg = "downsampling to 60 min" } if err := processDownsampling(workerCtx, logger, bkt, m, dir, resolution, hashFunc, metrics, acceptMalformedIndex, blockFilesConcurrency); err != nil { - metrics.downsampleFailures.WithLabelValues(m.Thanos.GroupKey()).Inc() + metrics.downsampleFailures.WithLabelValues(m.Thanos.ResolutionString()).Inc() errCh <- errors.Wrap(err, errMsg) } - metrics.downsamples.WithLabelValues(m.Thanos.GroupKey()).Inc() + metrics.downsamples.WithLabelValues(m.Thanos.ResolutionString()).Inc() } }() } @@ -391,7 +391,7 @@ func processDownsampling( downsampleDuration := time.Since(begin) level.Info(logger).Log("msg", "downsampled block", "from", m.ULID, "to", id, "duration", downsampleDuration, "duration_ms", downsampleDuration.Milliseconds()) - metrics.downsampleDuration.WithLabelValues(m.Thanos.GroupKey()).Observe(downsampleDuration.Seconds()) + metrics.downsampleDuration.WithLabelValues(m.Thanos.ResolutionString()).Observe(downsampleDuration.Seconds()) stats, err := block.GatherIndexHealthStats(ctx, logger, 
filepath.Join(resdir, block.IndexFilename), m.MinTime, m.MaxTime) if err == nil { diff --git a/cmd/thanos/main_test.go b/cmd/thanos/main_test.go index 1ced04637b..fcc394a614 100644 --- a/cmd/thanos/main_test.go +++ b/cmd/thanos/main_test.go @@ -157,7 +157,7 @@ func TestRegression4960_Deadlock(t *testing.T) { testutil.Ok(t, err) metrics := newDownsampleMetrics(prometheus.NewRegistry()) - testutil.Equals(t, 0.0, promtest.ToFloat64(metrics.downsamples.WithLabelValues(meta.Thanos.GroupKey()))) + testutil.Equals(t, 0.0, promtest.ToFloat64(metrics.downsamples.WithLabelValues(meta.Thanos.ResolutionString()))) baseBlockIDsFetcher := block.NewConcurrentLister(logger, bkt) metaFetcher, err := block.NewMetaFetcher(nil, block.FetcherConcurrency, bkt, baseBlockIDsFetcher, "", nil, nil) testutil.Ok(t, err) @@ -197,7 +197,7 @@ func TestCleanupDownsampleCacheFolder(t *testing.T) { testutil.Ok(t, err) metrics := newDownsampleMetrics(prometheus.NewRegistry()) - testutil.Equals(t, 0.0, promtest.ToFloat64(metrics.downsamples.WithLabelValues(meta.Thanos.GroupKey()))) + testutil.Equals(t, 0.0, promtest.ToFloat64(metrics.downsamples.WithLabelValues(meta.Thanos.ResolutionString()))) baseBlockIDsFetcher := block.NewConcurrentLister(logger, bkt) metaFetcher, err := block.NewMetaFetcher(nil, block.FetcherConcurrency, bkt, baseBlockIDsFetcher, "", nil, nil) testutil.Ok(t, err) @@ -205,7 +205,7 @@ func TestCleanupDownsampleCacheFolder(t *testing.T) { metas, _, err := metaFetcher.Fetch(ctx) testutil.Ok(t, err) testutil.Ok(t, downsampleBucket(ctx, logger, metrics, bkt, metas, dir, 1, 1, metadata.NoneFunc, false)) - testutil.Equals(t, 1.0, promtest.ToFloat64(metrics.downsamples.WithLabelValues(meta.Thanos.GroupKey()))) + testutil.Equals(t, 1.0, promtest.ToFloat64(metrics.downsamples.WithLabelValues(meta.Thanos.ResolutionString()))) _, err = os.Stat(dir) testutil.Assert(t, os.IsNotExist(err), "index cache dir should not exist at the end of execution")