From 47a990e7f78de8639af27afe3abbfd734e40ebf3 Mon Sep 17 00:00:00 2001 From: tpp <146148086+terry1purcell@users.noreply.github.com> Date: Wed, 13 Mar 2024 06:30:41 -0700 Subject: [PATCH] planner: adjust estimated rows to account for modified rows (#50970) close pingcap/tidb#47523 --- pkg/planner/cardinality/row_count_column.go | 14 ++--- pkg/planner/cardinality/row_count_index.go | 27 ++++++---- .../testdata/cardinality_suite_out.json | 6 +-- pkg/statistics/histogram.go | 51 +++++++++---------- 4 files changed, 53 insertions(+), 45 deletions(-) diff --git a/pkg/planner/cardinality/row_count_column.go b/pkg/planner/cardinality/row_count_column.go index a57b5f7873cea..d36be0aaf7ed4 100644 --- a/pkg/planner/cardinality/row_count_column.go +++ b/pkg/planner/cardinality/row_count_column.go @@ -282,15 +282,17 @@ func GetColumnRowCount(sctx context.PlanContext, c *statistics.Column, ranges [] cnt = mathutil.Clamp(cnt, 0, c.TotalRowCount()) // If the current table row count has changed, we should scale the row count accordingly. - cnt *= c.GetIncreaseFactor(realtimeRowCount) + increaseFactor := c.GetIncreaseFactor(realtimeRowCount) + cnt *= increaseFactor - histNDV := c.NDV - if c.StatsVer == statistics.Version2 { - histNDV = histNDV - int64(c.TopN.Num()) - } // handling the out-of-range part if (c.OutOfRange(lowVal) && !lowVal.IsNull()) || c.OutOfRange(highVal) { - cnt += c.Histogram.OutOfRangeRowCount(sctx, &lowVal, &highVal, modifyCount, histNDV) + histNDV := c.NDV + // Exclude the TopN + if c.StatsVer == statistics.Version2 { + histNDV -= int64(c.TopN.Num()) + } + cnt += c.Histogram.OutOfRangeRowCount(sctx, &lowVal, &highVal, modifyCount, histNDV, increaseFactor) } if debugTrace { diff --git a/pkg/planner/cardinality/row_count_index.go b/pkg/planner/cardinality/row_count_index.go index 738fa5580fe92..a7ec51310752f 100644 --- a/pkg/planner/cardinality/row_count_index.go +++ b/pkg/planner/cardinality/row_count_index.go @@ -222,7 +222,7 @@ func getIndexRowCountForStatsV2(sctx context.PlanContext, idx *statistics.Index, defer debugtrace.LeaveContextCommon(sctx) } totalCount := float64(0) - isSingleCol := len(idx.Info.Columns) == 1 + isSingleColIdx := len(idx.Info.Columns) == 1 for _, indexRange := range indexRanges { var count float64 lb, err := codec.EncodeKey(sc.TimeZone(), nil, indexRange.LowVal...) @@ -278,7 +278,7 @@ func getIndexRowCountForStatsV2(sctx context.PlanContext, idx *statistics.Index, l := types.NewBytesDatum(lb) r := types.NewBytesDatum(rb) lowIsNull := bytes.Equal(lb, nullKeyBytes) - if isSingleCol && lowIsNull { + if isSingleColIdx && lowIsNull { count += float64(idx.Histogram.NullCount) } expBackoffSuccess := false @@ -325,15 +325,24 @@ func getIndexRowCountForStatsV2(sctx context.PlanContext, idx *statistics.Index, } // If the current table row count has changed, we should scale the row count accordingly. - count *= idx.GetIncreaseFactor(realtimeRowCount) + increaseFactor := idx.GetIncreaseFactor(realtimeRowCount) + count *= increaseFactor - histNDV := idx.NDV - if idx.StatsVer == statistics.Version2 { - histNDV = histNDV - int64(idx.TopN.Num()) - } // handling the out-of-range part - if (outOfRangeOnIndex(idx, l) && !(isSingleCol && lowIsNull)) || outOfRangeOnIndex(idx, r) { - count += idx.Histogram.OutOfRangeRowCount(sctx, &l, &r, modifyCount, histNDV) + if (outOfRangeOnIndex(idx, l) && !(isSingleColIdx && lowIsNull)) || outOfRangeOnIndex(idx, r) { + histNDV := idx.NDV + // Exclude the TopN in Stats Version 2 + if idx.StatsVer == statistics.Version2 { + c, ok := coll.Columns[idx.Histogram.ID] + // If this is single column of a multi-column index - use the column's NDV rather than index NDV + isSingleColRange := len(indexRange.LowVal) == len(indexRange.HighVal) && len(indexRange.LowVal) == 1 + if isSingleColRange && !isSingleColIdx && ok && c != nil && c.Histogram.NDV > 0 { + histNDV = c.Histogram.NDV - int64(c.TopN.Num()) + } else { + histNDV -= int64(idx.TopN.Num()) + } + } + count += idx.Histogram.OutOfRangeRowCount(sctx, &l, &r, modifyCount, histNDV, increaseFactor) } if debugTrace { diff --git a/pkg/planner/cardinality/testdata/cardinality_suite_out.json b/pkg/planner/cardinality/testdata/cardinality_suite_out.json index dfd9563c71c83..e5b74e88452a0 100644 --- a/pkg/planner/cardinality/testdata/cardinality_suite_out.json +++ b/pkg/planner/cardinality/testdata/cardinality_suite_out.json @@ -24,7 +24,7 @@ { "Start": 800, "End": 900, - "Count": 723.504166655054 + "Count": 735.504166655054 }, { "Start": 900, @@ -79,7 +79,7 @@ { "Start": 800, "End": 1000, - "Count": 1181.696869573942 + "Count": 1193.696869573942 }, { "Start": 900, @@ -104,7 +104,7 @@ { "Start": 200, "End": 400, - "Count": 1190.2788209899081 + "Count": 1237.5288209899081 }, { "Start": 200, diff --git a/pkg/statistics/histogram.go b/pkg/statistics/histogram.go index 662889459d1e4..642110ee2621f 100644 --- a/pkg/statistics/histogram.go +++ b/pkg/statistics/histogram.go @@ -937,7 +937,7 @@ func (hg *Histogram) OutOfRange(val types.Datum) bool { func (hg *Histogram) OutOfRangeRowCount( sctx context.PlanContext, lDatum, rDatum *types.Datum, - modifyCount, histNDV int64, + modifyCount, histNDV int64, increaseFactor float64, ) (result float64) { debugTrace := sctx.GetSessionVars().StmtCtx.EnableOptimizerDebugTrace if debugTrace { @@ -1052,38 +1052,35 @@ func (hg *Histogram) OutOfRangeRowCount( rightPercent = (math.Pow(boundR-actualL, 2) - math.Pow(boundR-actualR, 2)) / math.Pow(histWidth, 2) } - totalPercent := leftPercent*0.5 + rightPercent*0.5 - if totalPercent > 1 { - totalPercent = 1 - } + totalPercent := min(leftPercent*0.5+rightPercent*0.5, 1.0) rowCount = totalPercent * hg.NotNullCount() - // Upper bound logic + // Upper & lower bound logic. + upperBound := rowCount + if histNDV > 0 { + upperBound = hg.NotNullCount() / float64(histNDV) + } allowUseModifyCount := sctx.GetSessionVars().GetOptObjective() != variable.OptObjectiveDeterminate - // Use the modifyCount as the upper bound. Note that modifyCount contains insert, delete and update. So this is - // a rather loose upper bound. - // There are some scenarios where we need to handle out-of-range estimation after both insert and delete happen. - // But we don't know how many increases are in the modifyCount. So we have to use this loose bound to ensure it - // can produce a reasonable results in this scenario. - if rowCount > float64(modifyCount) && allowUseModifyCount { - return float64(modifyCount) - } - - // In OptObjectiveDeterminate mode, we can't rely on the modify count anymore. - // An upper bound is necessary to make the estimation make sense for predicates with bound on only one end, like a > 1. - // But it's impossible to have a reliable upper bound in all cases. - // We use 1/NDV here (only the Histogram part is considered) and it seems reasonable and good enough for now. + if !allowUseModifyCount { - var upperBound float64 - if histNDV > 0 { - upperBound = hg.NotNullCount() / float64(histNDV) - } - if rowCount > upperBound { - return upperBound - } + // In OptObjectiveDeterminate mode, we can't rely on the modify count anymore. + // An upper bound is necessary to make the estimation make sense for predicates with bound on only one end, like a > 1. + // We use 1/NDV here (only the Histogram part is considered) and it seems reasonable and good enough for now. + return min(rowCount, upperBound) + } + + // If the modifyCount is large (compared to original table rows), then any out of range estimate is unreliable. + // Assume at least 1/NDV is returned + if float64(modifyCount) > hg.NotNullCount() && rowCount < upperBound { + rowCount = upperBound + } else if rowCount < upperBound { + // Adjust by increaseFactor if our estimate is low + rowCount *= increaseFactor } - return rowCount + + // Use modifyCount as a final bound + return min(rowCount, float64(modifyCount)) } // Copy deep copies the histogram.