bugfix: lazy posting optimization with wrong cardinality for estimation #7122

Merged · 2 commits · Feb 26, 2024
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -16,6 +16,7 @@ We use *breaking :warning:* to mark changes that are not backward compatible (relates only to v0.y.z releases.)
- [#7080](https://github.com/thanos-io/thanos/pull/7080) Receive: race condition in handler Close() when stopped early
- [#7132](https://github.com/thanos-io/thanos/pull/7132) Documentation: fix broken helm installation instruction
- [#7134](https://github.com/thanos-io/thanos/pull/7134) Store, Compact: Revert the recursive block listing mechanism introduced in https://github.com/thanos-io/thanos/pull/6474 and use the same strategy as in 0.31. Introduce a `--block-discovery-strategy` flag to control the listing strategy so that a recursive lister can still be used if the tradeoff of slower but cheaper discovery is preferred.
- [#7122](https://github.com/thanos-io/thanos/pull/7122) Store Gateway: Fix lazy expanded postings' base cardinality estimation when posting groups have remove keys.

### Added
- [#7105](https://github.com/thanos-io/thanos/pull/7105) Rule: add flag `--query.enable-x-functions` to allow usage of extended promql functions (xrate, xincrease, xdelta) in loaded rules
58 changes: 47 additions & 11 deletions pkg/store/lazy_postings.go
@@ -81,7 +81,7 @@ func optimizePostingsFetchByDownloadedBytes(r *bucketIndexReader, postingGroups
Sort posting groups by cardinality, so we can iterate from posting group with the smallest posting size.
The algorithm focuses on fetching less data, including postings and series.

-We need to fetch at least 1 posting group in order to fetch series. So if we only fetch the first posting group,
+We need to fetch at least 1 posting group with add keys in order to fetch series. So if we only fetch the first posting group with add keys,
the data bytes we need to download is formula F1: P1 * 4 + P1 * S where P1 is the number of postings in group 1
and S is the size per series. 4 is the byte size per posting.

@@ -117,20 +117,56 @@ func optimizePostingsFetchByDownloadedBytes(r *bucketIndexReader, postingGroups
Based on formula Pn * 4 < P1 * S * R^(n - 2) * (1 - R), left hand side is always increasing while iterating to larger
posting groups while right hand side value is always decreasing as R < 1.
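An illustrative worked example (values are not from the source): with P1 = 1000 postings,
S = 500 bytes per series and R = 0.5, formula F1 gives 1000 * 4 + 1000 * 500 = 504,000 bytes,
and the rule above keeps fetching group n only while Pn * 4 < 1000 * 500 * 0.5^(n-2) * 0.5,
i.e. P2 < 62,500, P3 < 31,250, and so on.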
*/
-	seriesBytesToFetch := postingGroups[0].cardinality * seriesMaxSize
-	p := float64(1)
-	i := 1 // Start from index 1 as we always need to fetch the smallest posting group.
-	hasAdd := !postingGroups[0].addAll
+	var (
+		negativeCardinalities int64
+		i                     int
+	)
+	for i = range postingGroups {
+		if postingGroups[i].addAll {
+			negativeCardinalities += postingGroups[i].cardinality
+			continue
+		}
+		break
+	}
+	// If the first posting group with add keys is already the last posting group
+	// then there is no need to set up lazy expanded posting groups.
+	if i >= len(postingGroups)-1 {
+		return postingGroups, false, nil
+	}
+
+	// Assume only seriesMatchRatio postings will be matched every posting group.
+	seriesMatched := postingGroups[i].cardinality - int64(math.Ceil(float64(negativeCardinalities)*seriesMatchRatio))
+	i++ // Start from next posting group as we always need to fetch at least one posting group with add keys.
 	for i < len(postingGroups) {
 		pg := postingGroups[i]
-		// Need to fetch more data on postings than series we avoid fetching, stop here and lazy expanding rest of matchers.
-		// If there is no posting group with add keys, don't skip any posting group until we have one.
-		// Fetch posting group with addAll is much more expensive due to fetch all postings.
-		if hasAdd && pg.cardinality*4 > int64(p*math.Ceil((1-seriesMatchRatio)*float64(seriesBytesToFetch))) {
+		var (
+			underfetchedSeries     int64
+			underfetchedSeriesSize int64
+		)
+		// Unlikely but keep it as a safeguard. When estimated matching series
+		// is <= 0, we stop early and mark rest of posting groups as lazy.
+		if seriesMatched <= 0 {
+			break
+		}
+		if pg.addAll {
+			// For posting group that has negative matchers, we assume we can underfetch
+			// min(pg.cardinality, current_series_matched) * match ratio series.
+			if pg.cardinality > seriesMatched {
+				underfetchedSeries = int64(math.Ceil(float64(seriesMatched) * seriesMatchRatio))
+			} else {
+				underfetchedSeries = int64(math.Ceil(float64(pg.cardinality) * seriesMatchRatio))
+			}
+			seriesMatched -= underfetchedSeries
+			underfetchedSeriesSize = underfetchedSeries * seriesMaxSize
+		} else {
+			underfetchedSeriesSize = seriesMaxSize * int64(math.Ceil(float64(seriesMatched)*(1-seriesMatchRatio)))
+			seriesMatched = int64(math.Ceil(float64(seriesMatched) * seriesMatchRatio))
+		}
+
+		// Need to fetch more data on postings than series we underfetch, stop here and lazy expanding rest of matchers.
+		if pg.cardinality*4 > underfetchedSeriesSize {
 			break
 		}
-		hasAdd = hasAdd || !pg.addAll
-		p = p * seriesMatchRatio
 		i++
 	}
for i < len(postingGroups) {
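Read outside the diff, the new estimation is easier to follow as a standalone program. The sketch below is illustrative, not the PR's code verbatim: postingGroup is reduced to the fields the loop reads, and markLazy and min64 are hypothetical names; the numbers in main mirror the new test case further down.

package main

import (
	"fmt"
	"math"
)

// postingGroup is a reduced stand-in for the real struct in
// pkg/store/lazy_postings.go; only the fields the estimation reads.
type postingGroup struct {
	name        string
	addAll      bool  // true when the group only has remove keys (negative matchers)
	cardinality int64 // number of postings in the group
	lazy        bool
}

// markLazy mirrors the estimation loop added by this PR, under the same
// assumptions: groups are sorted by cardinality, each posting costs 4 bytes,
// and each fetched group keeps roughly seriesMatchRatio of remaining series.
func markLazy(groups []*postingGroup, seriesMaxSize int64, seriesMatchRatio float64) {
	var negativeCardinalities int64
	i := 0
	// Leading addAll groups only remove series; accumulate their cardinality
	// and find the first group with add keys, the base of the estimate.
	for ; i < len(groups); i++ {
		if !groups[i].addAll {
			break
		}
		negativeCardinalities += groups[i].cardinality
	}
	// Nothing after the first add-key group: no lazy marking needed.
	if i >= len(groups)-1 {
		return
	}
	// Base estimate: the first add-key group's postings, minus the share the
	// remove-key groups are assumed to strip away.
	seriesMatched := groups[i].cardinality - int64(math.Ceil(float64(negativeCardinalities)*seriesMatchRatio))
	for i++; i < len(groups); i++ {
		pg := groups[i]
		if seriesMatched <= 0 {
			break // safeguard: estimate exhausted, rest goes lazy
		}
		var underfetchedSeriesSize int64
		if pg.addAll {
			// A remove-key group can drop at most
			// min(pg.cardinality, seriesMatched) * ratio series.
			underfetched := int64(math.Ceil(float64(min64(pg.cardinality, seriesMatched)) * seriesMatchRatio))
			seriesMatched -= underfetched
			underfetchedSeriesSize = underfetched * seriesMaxSize
		} else {
			// An add-key group filters out (1 - ratio) of the current estimate.
			underfetchedSeriesSize = seriesMaxSize * int64(math.Ceil(float64(seriesMatched)*(1-seriesMatchRatio)))
			seriesMatched = int64(math.Ceil(float64(seriesMatched) * seriesMatchRatio))
		}
		// Stop when fetching the postings costs more than the series bytes saved.
		if pg.cardinality*4 > underfetchedSeriesSize {
			break
		}
	}
	for ; i < len(groups); i++ {
		groups[i].lazy = true
	}
}

func min64(a, b int64) int64 {
	if a < b {
		return a
	}
	return b
}

func main() {
	groups := []*postingGroup{
		{name: "foo", addAll: true, cardinality: 1},
		{name: "bar", cardinality: 500},
		{name: "baz", cardinality: 501},
		{name: "cluster", cardinality: 250000},
	}
	markLazy(groups, 1000, 0.5)
	for _, g := range groups {
		fmt.Printf("%-7s lazy=%v\n", g.name, g.lazy) // only "cluster" prints lazy=true
	}
}

The design change is visible in the first few lines: the old code keyed seriesBytesToFetch to postingGroups[0].cardinality even when that group only had remove keys, while the new code walks past leading addAll groups and bases the estimate on the first group with add keys, discounted by the assumed removals.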
23 changes: 23 additions & 0 deletions pkg/store/lazy_postings_test.go
@@ -479,6 +479,29 @@ func TestOptimizePostingsFetchByDownloadedBytes(t *testing.T) {
{addAll: true, name: "bar", removeKeys: []string{"foo"}, cardinality: 250000, lazy: true},
},
},
+		{
+			name: "four posting groups with either add or remove keys, negative matcher group has the lowest cardinality, only the largest group is lazy",
+			inputPostings: map[string]map[string]index.Range{
+				"foo":     {"bar": index.Range{End: 8}},
+				"bar":     {"foo": index.Range{Start: 8, End: 2012}},
+				"baz":     {"foo": index.Range{Start: 2012, End: 4020}},
+				"cluster": {"us": index.Range{Start: 4020, End: 1004024}},
+			},
+			seriesMaxSize:    1000,
+			seriesMatchRatio: 0.5,
+			postingGroups: []*postingGroup{
+				{addAll: true, name: "foo", removeKeys: []string{"bar"}},
+				{name: "bar", addKeys: []string{"foo"}},
+				{name: "baz", addKeys: []string{"foo"}},
+				{name: "cluster", addKeys: []string{"us"}},
+			},
+			expectedPostingGroups: []*postingGroup{
+				{addAll: true, name: "foo", removeKeys: []string{"bar"}, cardinality: 1},
+				{name: "bar", addKeys: []string{"foo"}, cardinality: 500},
+				{name: "baz", addKeys: []string{"foo"}, cardinality: 501},
+				{name: "cluster", addKeys: []string{"us"}, cardinality: 250000, lazy: true},
+			},
+		},
} {
t.Run(tc.name, func(t *testing.T) {
headerReader := &mockIndexHeaderReader{postings: tc.inputPostings, err: tc.inputError}
Expand Down