From df0db2d9391f862c1f3beeec71abf040148d69db Mon Sep 17 00:00:00 2001 From: time-and-fate <25057648+time-and-fate@users.noreply.github.com> Date: Thu, 22 Jul 2021 21:27:25 +0800 Subject: [PATCH 01/20] use a new method to estimate out-of-range row count --- statistics/handle/update.go | 2 - statistics/histogram.go | 223 +++++++++++++++++++++++++++++------- statistics/scalar.go | 10 ++ statistics/table.go | 33 +++--- 4 files changed, 207 insertions(+), 61 deletions(-) diff --git a/statistics/handle/update.go b/statistics/handle/update.go index 7a1b4fc40e589..b45853319d927 100644 --- a/statistics/handle/update.go +++ b/statistics/handle/update.go @@ -1248,11 +1248,9 @@ func (h *Handle) RecalculateExpectCount(q *statistics.QueryFeedback) error { if isIndex { idx := t.Indices[id] expected, err = idx.GetRowCount(sc, nil, ranges, t.ModifyCount) - expected *= idx.GetIncreaseFactor(t.Count) } else { c := t.Columns[id] expected, err = c.GetColumnRowCount(sc, ranges, t.ModifyCount, true) - expected *= c.GetIncreaseFactor(t.Count) } q.Expected = int64(expected) return err diff --git a/statistics/histogram.go b/statistics/histogram.go index 80d61b0c3bdf5..f485e3d0a2bb7 100644 --- a/statistics/histogram.go +++ b/statistics/histogram.go @@ -832,6 +832,99 @@ func (hg *Histogram) outOfRange(val types.Datum) bool { chunk.Compare(hg.Bounds.GetRow(hg.Bounds.NumRows()-1), 0, &val) < 0 } +// outOfRangeRowCount estimate the row count of part of [lDatum, rDatum] which is out of range of the histogram. +// Here we assume the density of data is decreasing from the lower/upper bound of the histogram toward outside. +// The maximum row count it can get is the increaseCount. It reaches the maximum when out-of-range width reaches histogram range width. +// As it shows below. To calculate the out-of-range row count, we need to calculate the percentage of the shaded area. +// +// /│ │\ +// / │ │ \ +// /x│ │◄─histogram─►│ \ +// / xx│ │ range │ \ +// / │xxx│ │ │ \ +// / │xxx│ │ │ \ +//────┴────┴───┴──┴─────────────┴───────────┴───── +// ▲ ▲ ▲ ▲ ▲ ▲ +// │ │ │ │ │ │ +// boundL │ │histL histR boundR +// │ │ +// lDatum rDatum +func (hg *Histogram) outOfRangeRowCount(sc *stmtctx.StatementContext, lDatum, rDatum *types.Datum, increaseCount int64) float64 { + if hg.Len() == 0 { + return 0 + } + // make sure l < r + cmp, err := lDatum.CompareDatum(sc, rDatum) + if err != nil || cmp >= 0 { + return 0 + } + + // For bytes and string type, we need to cut the common prefix when converting them to scalar value. + // Here we calculate the length of common prefix. + commonPrefix := 0 + if hg.GetLower(0).Kind() == types.KindBytes || hg.GetLower(0).Kind() == types.KindString { + // Calculate the common prefix length among the lower and upper bound of histogram and the range we want to estimate. + commonPrefix = commonPrefixLength(hg.GetLower(0).GetBytes(), hg.GetUpper(hg.Len()-1).GetBytes()) + commonPrefix2 := commonPrefixLength(lDatum.GetBytes(), rDatum.GetBytes()) + if commonPrefix2 < commonPrefix { + commonPrefix = commonPrefix2 + } + commonPrefix3 := commonPrefixLength(hg.GetLower(0).GetBytes(), lDatum.GetBytes()) + if commonPrefix3 < commonPrefix { + commonPrefix = commonPrefix3 + } + } + + // Convert the range we want to estimate to scalar value(float64) + l := convertDatumToScalar(lDatum, commonPrefix) + r := convertDatumToScalar(rDatum, commonPrefix) + // Convert the lower and upper bound of the histogram to scalar value(float64) + histL := convertDatumToScalar(hg.GetLower(0), commonPrefix) + histR := convertDatumToScalar(hg.GetUpper(hg.Len()-1), commonPrefix) + histWidth := histR - histL + if histWidth <= 0 { + return 0 + } + boundL := histL - histWidth + boundR := histR + histWidth + + actualL := l + actualR := r + leftPercent := float64(0) + rightPercent := float64(0) + // Handling the out-of-range part on the left of the histogram range + if l < histL && r > boundL { + if l < boundL { + actualL = boundL + } + if r > histL { + actualR = histL + } + // Calculate the percentage of "the shaded area" on the left side. + leftPercent = (math.Pow(actualR-boundL, 2) - math.Pow(actualL-boundL, 2)) / math.Pow(histWidth, 2) + } + + actualL = l + actualR = r + // Handling the out-of-range part on the right of the histogram range + if l < boundR && r > histR { + if l < histR { + actualL = histR + } + if r > boundR { + r = boundR + } + // Calculate the percentage of "the shaded area" on the right side. + rightPercent = (math.Pow(boundR-actualL, 2) - math.Pow(boundR-actualR, 2)) / math.Pow(histWidth, 2) + } + + totalPercent := leftPercent + rightPercent + if totalPercent > 1 { + totalPercent = 1 + } + return totalPercent * float64(increaseCount) +} + // Copy deep copies the histogram. func (hg *Histogram) Copy() *Histogram { newHist := *hg @@ -967,7 +1060,7 @@ func (c *Column) IsInvalid(sc *stmtctx.StatementContext, collPseudo bool) bool { return c.TotalRowCount() == 0 || (c.Histogram.NDV > 0 && c.notNullCount() == 0) } -func (c *Column) equalRowCount(sc *stmtctx.StatementContext, val types.Datum, encodedVal []byte, modifyCount int64) (float64, error) { +func (c *Column) equalRowCount(sc *stmtctx.StatementContext, val types.Datum, encodedVal []byte, tableRowCount int64) (float64, error) { if val.IsNull() { return float64(c.NullCount), nil } @@ -977,7 +1070,7 @@ func (c *Column) equalRowCount(sc *stmtctx.StatementContext, val types.Datum, en return 0.0, nil } if c.Histogram.NDV > 0 && c.outOfRange(val, encodedVal) { - return outOfRangeEQSelectivity(c.Histogram.NDV, modifyCount, int64(c.TotalRowCount())) * c.TotalRowCount(), nil + return outOfRangeEQSelectivity(c.Histogram.NDV, tableRowCount, int64(c.TotalRowCount())) * c.TotalRowCount(), nil } if c.CMSketch != nil { count, err := queryValue(sc, c.CMSketch, c.TopN, val) @@ -985,14 +1078,12 @@ func (c *Column) equalRowCount(sc *stmtctx.StatementContext, val types.Datum, en } return c.Histogram.equalRowCount(val, false), nil } + + // Stats version == 2 // All the values are null. if c.Histogram.Bounds.NumRows() == 0 && c.TopN.Num() == 0 { return 0, nil } - if c.Histogram.NDV+int64(c.TopN.Num()) > 0 && c.outOfRange(val, encodedVal) { - return outOfRangeEQSelectivity(c.Histogram.NDV, modifyCount, int64(c.TotalRowCount())) * c.TotalRowCount(), nil - } - // Stats version == 2 // 1. try to find this value in TopN if c.TopN != nil { rowcount, ok := c.QueryTopN(encodedVal) @@ -1011,7 +1102,7 @@ func (c *Column) equalRowCount(sc *stmtctx.StatementContext, val types.Datum, en return float64(c.Histogram.Buckets[index/2].Repeat), nil } } - // 3. use uniform distribution assumption for the rest + // 3. use uniform distribution assumption for the rest (even when this value is not covered by the range of stats) cnt := c.Histogram.notNullCount() for _, bkt := range c.Histogram.Buckets { if cnt <= float64(bkt.Repeat) { @@ -1031,7 +1122,7 @@ func (c *Column) equalRowCount(sc *stmtctx.StatementContext, val types.Datum, en } // GetColumnRowCount estimates the row count by a slice of Range. -func (c *Column) GetColumnRowCount(sc *stmtctx.StatementContext, ranges []*ranger.Range, modifyCount int64, pkIsHandle bool) (float64, error) { +func (c *Column) GetColumnRowCount(sc *stmtctx.StatementContext, ranges []*ranger.Range, tableRowCount int64, pkIsHandle bool) (float64, error) { var rowCount float64 for _, rg := range ranges { highVal := *rg.HighVal[0].Clone() @@ -1055,7 +1146,7 @@ func (c *Column) GetColumnRowCount(sc *stmtctx.StatementContext, ranges []*range return 0, err } if cmp == 0 { - // the point case. + // case 1: it's a point if !rg.LowExclude && !rg.HighExclude { // In this case, the row count is at most 1. if pkIsHandle { @@ -1063,36 +1154,40 @@ func (c *Column) GetColumnRowCount(sc *stmtctx.StatementContext, ranges []*range continue } var cnt float64 - cnt, err = c.equalRowCount(sc, lowVal, lowEncoded, modifyCount) + cnt, err = c.equalRowCount(sc, lowVal, lowEncoded, tableRowCount) if err != nil { return 0, errors.Trace(err) } + // If the current table row count has changed, we should scale the row count accordingly. + cnt *= c.GetIncreaseFactor(tableRowCount) rowCount += cnt } continue } rangeVals := enumRangeValues(lowVal, highVal, rg.LowExclude, rg.HighExclude) - // The small range case. + + // case 2: it's a small range if rangeVals != nil { for _, val := range rangeVals { - cnt, err := c.equalRowCount(sc, val, lowEncoded, modifyCount) + cnt, err := c.equalRowCount(sc, val, lowEncoded, tableRowCount) if err != nil { return 0, err } + // If the current table row count has changed, we should scale the row count accordingly. + cnt *= c.GetIncreaseFactor(tableRowCount) rowCount += cnt } + continue } - // The interval case. + + // case 3: it's a interval cnt := c.BetweenRowCount(sc, lowVal, highVal, lowEncoded, highEncoded) - if (c.outOfRange(lowVal, lowEncoded) && !lowVal.IsNull()) || c.outOfRange(highVal, highEncoded) { - cnt += outOfRangeEQSelectivity(outOfRangeBetweenRate, modifyCount, int64(c.TotalRowCount())) * c.TotalRowCount() - } - // `betweenRowCount` returns count for [l, h) range, we adjust cnt for boudaries here. + // `betweenRowCount` returns count for [l, h) range, we adjust cnt for boundaries here. // Note that, `cnt` does not include null values, we need specially handle cases // where null is the lower bound. if rg.LowExclude && !lowVal.IsNull() { - lowCnt, err := c.equalRowCount(sc, lowVal, lowEncoded, modifyCount) + lowCnt, err := c.equalRowCount(sc, lowVal, lowEncoded, tableRowCount) if err != nil { return 0, errors.Trace(err) } @@ -1102,16 +1197,39 @@ func (c *Column) GetColumnRowCount(sc *stmtctx.StatementContext, ranges []*range cnt += float64(c.NullCount) } if !rg.HighExclude { - highCnt, err := c.equalRowCount(sc, highVal, highEncoded, modifyCount) + highCnt, err := c.equalRowCount(sc, highVal, highEncoded, tableRowCount) if err != nil { return 0, errors.Trace(err) } cnt += highCnt } + + if cnt > c.TotalRowCount() { + cnt = c.TotalRowCount() + } else if cnt < 0 { + cnt = 0 + } + + // If the current table row count has changed, we should scale the row count accordingly. + cnt *= c.GetIncreaseFactor(tableRowCount) + + // handling the out-of-range part + if (c.outOfRange(lowVal, lowEncoded) && !lowVal.IsNull()) || c.outOfRange(highVal, highEncoded) { + if c.StatsVer < 2 { + cnt += outOfRangeEQSelectivity(outOfRangeBetweenRate, tableRowCount, int64(c.TotalRowCount())) * c.TotalRowCount() + } else { + increaseCount := tableRowCount - int64(c.TotalRowCount()) + if increaseCount < 0 { + increaseCount = 0 + } + cnt += c.Histogram.outOfRangeRowCount(sc, &lowVal, &highVal, increaseCount) + } + } + rowCount += cnt } - if rowCount > c.TotalRowCount() { - rowCount = c.TotalRowCount() + if rowCount > float64(tableRowCount) { + rowCount = float64(tableRowCount) } else if rowCount < 0 { rowCount = 0 } @@ -1169,28 +1287,34 @@ func (idx *Index) MemoryUsage() (sum int64) { var nullKeyBytes, _ = codec.EncodeKey(nil, nil, types.NewDatum(nil)) -func (idx *Index) equalRowCount(b []byte, modifyCount int64) float64 { +func (idx *Index) equalRowCount(b []byte, tableRowCount int64) float64 { if len(idx.Info.Columns) == 1 { if bytes.Equal(b, nullKeyBytes) { return float64(idx.NullCount) } } val := types.NewBytesDatum(b) - if idx.NDV > 0 && idx.outOfRange(val) { - return outOfRangeEQSelectivity(idx.NDV, modifyCount, int64(idx.TotalRowCount())) * idx.TotalRowCount() + if idx.StatsVer < Version2 { + if idx.NDV > 0 && idx.outOfRange(val) { + return outOfRangeEQSelectivity(idx.NDV, tableRowCount, int64(idx.TotalRowCount())) * idx.TotalRowCount() + } + if idx.CMSketch != nil { + return float64(idx.QueryBytes(b)) + } + return idx.Histogram.equalRowCount(val, false) } - if idx.CMSketch != nil && idx.StatsVer < Version2 { - return float64(idx.QueryBytes(b)) + // stats version == 2 + // query the top-n first. + count, found := idx.TopN.QueryTopN(b) + if found { + return float64(count) } - // If it's version2, query the top-n first. - if idx.StatsVer >= Version2 { - count, found := idx.TopN.QueryTopN(b) - if found { - return float64(count) - } - return idx.Histogram.equalRowCount(val, true) + histCnt := idx.Histogram.equalRowCount(val, true) + if histCnt > 0 { + return histCnt } - return idx.Histogram.equalRowCount(val, false) + // the out-of-range case: + return idx.notNullCount() / float64(idx.NDV) } // QueryBytes is used to query the count of specified bytes. @@ -1204,7 +1328,7 @@ func (idx *Index) QueryBytes(d []byte) uint64 { // GetRowCount returns the row count of the given ranges. // It uses the modifyCount to adjust the influence of modifications on the table. -func (idx *Index) GetRowCount(sc *stmtctx.StatementContext, coll *HistColl, indexRanges []*ranger.Range, modifyCount int64) (float64, error) { +func (idx *Index) GetRowCount(sc *stmtctx.StatementContext, coll *HistColl, indexRanges []*ranger.Range, tableRowCount int64) (float64, error) { totalCount := float64(0) isSingleCol := len(idx.Info.Columns) == 1 for _, indexRange := range indexRanges { @@ -1218,6 +1342,7 @@ func (idx *Index) GetRowCount(sc *stmtctx.StatementContext, coll *HistColl, inde } fullLen := len(indexRange.LowVal) == len(indexRange.HighVal) && len(indexRange.LowVal) == len(idx.Info.Columns) if bytes.Equal(lb, rb) { + // case 1: it's a point if indexRange.LowExclude || indexRange.HighExclude { continue } @@ -1227,11 +1352,15 @@ func (idx *Index) GetRowCount(sc *stmtctx.StatementContext, coll *HistColl, inde totalCount += 1 continue } - count := idx.equalRowCount(lb, modifyCount) + count := idx.equalRowCount(lb, tableRowCount) + // If the current table row count has changed, we should scale the row count accordingly. + count *= idx.GetIncreaseFactor(tableRowCount) totalCount += count continue } } + + // case 2: it's a interval // The final interval is [low, high) if indexRange.LowExclude { lb = kv.Key(lb).PrefixNext() @@ -1242,9 +1371,6 @@ func (idx *Index) GetRowCount(sc *stmtctx.StatementContext, coll *HistColl, inde l := types.NewBytesDatum(lb) r := types.NewBytesDatum(rb) lowIsNull := bytes.Equal(lb, nullKeyBytes) - if (idx.outOfRange(l) && !(isSingleCol && lowIsNull)) || idx.outOfRange(r) { - totalCount += outOfRangeEQSelectivity(outOfRangeBetweenRate, modifyCount, int64(idx.TotalRowCount())) * idx.TotalRowCount() - } if isSingleCol && lowIsNull { totalCount += float64(idx.NullCount) } @@ -1264,9 +1390,24 @@ func (idx *Index) GetRowCount(sc *stmtctx.StatementContext, coll *HistColl, inde if !expBackoffSuccess { totalCount += idx.BetweenRowCount(l, r) } + + // If the current table row count has changed, we should scale the row count accordingly. + totalCount *= idx.GetIncreaseFactor(tableRowCount) + // handling the out-of-range part + if (idx.outOfRange(l) && !(isSingleCol && lowIsNull)) || idx.outOfRange(r) { + if idx.StatsVer < 2 { + totalCount += outOfRangeEQSelectivity(outOfRangeBetweenRate, tableRowCount, int64(idx.TotalRowCount())) * idx.TotalRowCount() + } else { + increaseCount := tableRowCount - int64(idx.TotalRowCount()) + if increaseCount < 0 { + increaseCount = 0 + } + totalCount += idx.Histogram.outOfRangeRowCount(sc, &l, &r, increaseCount) + } + } } - if totalCount > idx.TotalRowCount() { - totalCount = idx.TotalRowCount() + if totalCount > float64(tableRowCount) { + totalCount = float64(tableRowCount) } return totalCount, nil } diff --git a/statistics/scalar.go b/statistics/scalar.go index 25d0736197777..5f4b42db85246 100644 --- a/statistics/scalar.go +++ b/statistics/scalar.go @@ -45,6 +45,16 @@ func calcFraction(lower, upper, value float64) float64 { func convertDatumToScalar(value *types.Datum, commonPfxLen int) float64 { switch value.Kind() { + case types.KindFloat32: + return float64(value.GetFloat32()) + case types.KindFloat64: + return value.GetFloat64() + case types.KindInt64: + return float64(value.GetInt64()) + case types.KindUint64: + return float64(value.GetUint64()) + case types.KindMysqlDuration: + return float64(value.GetMysqlDuration().Duration) case types.KindMysqlDecimal: scalar, err := value.GetMysqlDecimal().ToFloat64() if err != nil { diff --git a/statistics/table.go b/statistics/table.go index 85807fbefc67f..1953e840c0d9c 100644 --- a/statistics/table.go +++ b/statistics/table.go @@ -264,7 +264,7 @@ func (t *Table) ColumnGreaterRowCount(sc *stmtctx.StatementContext, value types. if !ok || c.IsInvalid(sc, t.Pseudo) { return float64(t.Count) / pseudoLessRate } - return c.greaterRowCount(value) * c.GetIncreaseFactor(t.Count) + return c.greaterRowCount(value) } // ColumnLessRowCount estimates the row count where the column less than value. Note that null values are not counted. @@ -273,7 +273,7 @@ func (t *Table) ColumnLessRowCount(sc *stmtctx.StatementContext, value types.Dat if !ok || c.IsInvalid(sc, t.Pseudo) { return float64(t.Count) / pseudoLessRate } - return c.lessRowCount(value) * c.GetIncreaseFactor(t.Count) + return c.lessRowCount(value) } // ColumnBetweenRowCount estimates the row count where column greater or equal to a and less than b. @@ -294,7 +294,7 @@ func (t *Table) ColumnBetweenRowCount(sc *stmtctx.StatementContext, a, b types.D if a.IsNull() { count += float64(c.NullCount) } - return count * c.GetIncreaseFactor(t.Count), nil + return count, nil } // ColumnEqualRowCount estimates the row count where the column equals to value. @@ -308,7 +308,6 @@ func (t *Table) ColumnEqualRowCount(sc *stmtctx.StatementContext, value types.Da return 0, err } result, err := c.equalRowCount(sc, value, encodedVal, t.ModifyCount) - result *= c.GetIncreaseFactor(t.Count) return result, errors.Trace(err) } @@ -324,8 +323,7 @@ func (coll *HistColl) GetRowCountByIntColumnRanges(sc *stmtctx.StatementContext, } return getPseudoRowCountByUnsignedIntRanges(intRanges, float64(coll.Count)), nil } - result, err := c.GetColumnRowCount(sc, intRanges, coll.ModifyCount, true) - result *= c.GetIncreaseFactor(coll.Count) + result, err := c.GetColumnRowCount(sc, intRanges, coll.Count, true) return result, errors.Trace(err) } @@ -335,8 +333,7 @@ func (coll *HistColl) GetRowCountByColumnRanges(sc *stmtctx.StatementContext, co if !ok || c.IsInvalid(sc, coll.Pseudo) { return GetPseudoRowCountByColumnRanges(sc, float64(coll.Count), colRanges, 0) } - result, err := c.GetColumnRowCount(sc, colRanges, coll.ModifyCount, false) - result *= c.GetIncreaseFactor(coll.Count) + result, err := c.GetColumnRowCount(sc, colRanges, coll.Count, false) return result, errors.Trace(err) } @@ -355,9 +352,8 @@ func (coll *HistColl) GetRowCountByIndexRanges(sc *stmtctx.StatementContext, idx if idx.CMSketch != nil && idx.StatsVer == Version1 { result, err = coll.getIndexRowCount(sc, idxID, indexRanges) } else { - result, err = idx.GetRowCount(sc, coll, indexRanges, coll.ModifyCount) + result, err = idx.GetRowCount(sc, coll, indexRanges, coll.Count) } - result *= idx.GetIncreaseFactor(coll.Count) return result, errors.Trace(err) } @@ -473,16 +469,17 @@ func isSingleColIdxNullRange(idx *Index, ran *ranger.Range) bool { // It assumes all modifications are insertions and all new-inserted rows are uniformly distributed // and has the same distribution with analyzed rows, which means each unique value should have the // same number of rows(Tot/NDV) of it. -func outOfRangeEQSelectivity(ndv, modifyRows, totalRows int64) float64 { - if modifyRows == 0 { +func outOfRangeEQSelectivity(ndv, tableRowCount, totalRows int64) float64 { + increaseRowCount := tableRowCount - totalRows + if increaseRowCount <= 0 { return 0 // it must be 0 since the histogram contains the whole data } if ndv < outOfRangeBetweenRate { ndv = outOfRangeBetweenRate // avoid inaccurate selectivity caused by small NDV } - selectivity := 1 / float64(ndv) // TODO: After extracting TopN from histograms, we can minus the TopN fraction here. - if selectivity*float64(totalRows) > float64(modifyRows) { - selectivity = float64(modifyRows) / float64(totalRows) + selectivity := 1 / float64(ndv) + if selectivity*float64(totalRows) > float64(increaseRowCount) { + selectivity = float64(increaseRowCount) / float64(totalRows) } return selectivity } @@ -546,7 +543,7 @@ func (coll *HistColl) getEqualCondSelectivity(sc *stmtctx.StatementContext, idx // When the value is out of range, we could not found this value in the CM Sketch, // so we use heuristic methods to estimate the selectivity. if idx.NDV > 0 && coverAll { - return outOfRangeEQSelectivity(idx.NDV, coll.ModifyCount, int64(idx.TotalRowCount())), nil + return outOfRangeEQSelectivity(idx.NDV, coll.Count, int64(idx.TotalRowCount())), nil } // The equal condition only uses prefix columns of the index. colIDs := coll.Idx2ColumnIDs[idx.ID] @@ -559,7 +556,7 @@ func (coll *HistColl) getEqualCondSelectivity(sc *stmtctx.StatementContext, idx ndv = mathutil.MaxInt64(ndv, col.Histogram.NDV) } } - return outOfRangeEQSelectivity(ndv, coll.ModifyCount, int64(idx.TotalRowCount())), nil + return outOfRangeEQSelectivity(ndv, coll.Count, int64(idx.TotalRowCount())), nil } minRowCount, crossValidationSelectivity, err := coll.crossValidationSelectivity(sc, idx, usedColsLen, idxPointRange) @@ -591,7 +588,7 @@ func (coll *HistColl) getIndexRowCount(sc *stmtctx.StatementContext, idxID int64 // on single-column index, use previous way as well, because CMSketch does not contain null // values in this case. if rangePosition == 0 || isSingleColIdxNullRange(idx, ran) { - count, err := idx.GetRowCount(sc, nil, []*ranger.Range{ran}, coll.ModifyCount) + count, err := idx.GetRowCount(sc, nil, []*ranger.Range{ran}, coll.Count) if err != nil { return 0, errors.Trace(err) } From c8f545a8c596abd94b19372ddcf50a2898e27f64 Mon Sep 17 00:00:00 2001 From: time-and-fate <25057648+time-and-fate@users.noreply.github.com> Date: Fri, 23 Jul 2021 02:02:38 +0800 Subject: [PATCH 02/20] fix tests --- planner/core/testdata/analyze_suite_out.json | 17 +++-- statistics/handle/update.go | 4 +- statistics/selectivity_test.go | 66 +++++++++++--------- statistics/statistics_test.go | 2 +- statistics/table.go | 2 +- statistics/testdata/stats_suite_out.json | 15 +++-- 6 files changed, 57 insertions(+), 49 deletions(-) diff --git a/planner/core/testdata/analyze_suite_out.json b/planner/core/testdata/analyze_suite_out.json index cb0dd2137c515..416fa0940e412 100644 --- a/planner/core/testdata/analyze_suite_out.json +++ b/planner/core/testdata/analyze_suite_out.json @@ -60,8 +60,8 @@ "SQL": "explain format = 'brief' select * from t where a <= 5 and b <= 5", "RatioOfPseudoEstimate": 10, "Plan": [ - "TableReader 29.77 root data:Selection", - "└─Selection 29.77 cop[tikv] le(test.t.a, 5), le(test.t.b, 5)", + "TableReader 28.80 root data:Selection", + "└─Selection 28.80 cop[tikv] le(test.t.a, 5), le(test.t.b, 5)", " └─TableFullScan 80.00 cop[tikv] table:t keep order:false" ] }, @@ -454,18 +454,17 @@ { "SQL": "explain format = 'brief' select * from t where a = 7639902", "Plan": [ - "IndexReader 6.68 root index:IndexRangeScan", - "└─IndexRangeScan 6.68 cop[tikv] table:t, index:PRIMARY(a, c, b) range:[7639902,7639902], keep order:false" + "IndexReader 5.95 root index:IndexRangeScan", + "└─IndexRangeScan 5.95 cop[tikv] table:t, index:PRIMARY(a, c, b) range:[7639902,7639902], keep order:false" ] }, { "SQL": "explain format = 'brief' select c, b from t where a = 7639902 order by b asc limit 6", "Plan": [ - "Projection 6.00 root test.t.c, test.t.b", - "└─TopN 6.00 root test.t.b, offset:0, count:6", - " └─IndexReader 6.00 root index:TopN", - " └─TopN 6.00 cop[tikv] test.t.b, offset:0, count:6", - " └─IndexRangeScan 6.68 cop[tikv] table:t, index:PRIMARY(a, c, b) range:[7639902,7639902], keep order:false" + "Projection 5.95 root test.t.c, test.t.b", + "└─TopN 5.95 root test.t.b, offset:0, count:6", + " └─IndexReader 5.95 root index:IndexRangeScan", + " └─IndexRangeScan 5.95 cop[tikv] table:t, index:PRIMARY(a, c, b) range:[7639902,7639902], keep order:false" ] } ] diff --git a/statistics/handle/update.go b/statistics/handle/update.go index b45853319d927..537924b7cb5a1 100644 --- a/statistics/handle/update.go +++ b/statistics/handle/update.go @@ -1247,10 +1247,10 @@ func (h *Handle) RecalculateExpectCount(q *statistics.QueryFeedback) error { expected := 0.0 if isIndex { idx := t.Indices[id] - expected, err = idx.GetRowCount(sc, nil, ranges, t.ModifyCount) + expected, err = idx.GetRowCount(sc, nil, ranges, t.Count) } else { c := t.Columns[id] - expected, err = c.GetColumnRowCount(sc, ranges, t.ModifyCount, true) + expected, err = c.GetColumnRowCount(sc, ranges, t.Count, true) } q.Expected = int64(expected) return err diff --git a/statistics/selectivity_test.go b/statistics/selectivity_test.go index 3273f1b100914..b173cb5bc4db6 100644 --- a/statistics/selectivity_test.go +++ b/statistics/selectivity_test.go @@ -243,44 +243,54 @@ func (s *testStatsSuite) TestSelectivity(c *C) { longExpr += fmt.Sprintf(" and a > %d ", i) } tests := []struct { - exprs string - selectivity float64 + exprs string + selectivity float64 + selectivityAfterIncrease float64 }{ { - exprs: "a > 0 and a < 2", - selectivity: 0.01851851851, + exprs: "a > 0 and a < 2", + selectivity: 0.01851851851, + selectivityAfterIncrease: 0.01851851851, }, { - exprs: "a >= 1 and a < 2", - selectivity: 0.01851851851, + exprs: "a >= 1 and a < 2", + selectivity: 0.01851851851, + selectivityAfterIncrease: 0.01851851851, }, { - exprs: "a >= 1 and b > 1 and a < 2", - selectivity: 0.01783264746, + exprs: "a >= 1 and b > 1 and a < 2", + selectivity: 0.01783264746, + selectivityAfterIncrease: 0.01803635116, }, { - exprs: "a >= 1 and c > 1 and a < 2", - selectivity: 0.00617283950, + exprs: "a >= 1 and c > 1 and a < 2", + selectivity: 0.00617283950, + selectivityAfterIncrease: 0.00619135802, }, { - exprs: "a >= 1 and c >= 1 and a < 2", - selectivity: 0.01234567901, + exprs: "a >= 1 and c >= 1 and a < 2", + selectivity: 0.01234567901, + selectivityAfterIncrease: 0.01236419753, }, { - exprs: "d = 0 and e = 1", - selectivity: 0.11111111111, + exprs: "d = 0 and e = 1", + selectivity: 0.11111111111, + selectivityAfterIncrease: 0.11111111111, }, { - exprs: "b > 1", - selectivity: 0.96296296296, + exprs: "b > 1", + selectivity: 0.96296296296, + selectivityAfterIncrease: 0.97396296296, }, { - exprs: "a > 1 and b < 2 and c > 3 and d < 4 and e > 5", - selectivity: 0, + exprs: "a > 1 and b < 2 and c > 3 and d < 4 and e > 5", + selectivity: 0, + selectivityAfterIncrease: 0.00003333788, }, { - exprs: longExpr, - selectivity: 0.001, + exprs: longExpr, + selectivity: 0.001, + selectivityAfterIncrease: 0.001, }, } @@ -311,7 +321,7 @@ func (s *testStatsSuite) TestSelectivity(c *C) { histColl.Count *= 10 ratio, _, err = histColl.Selectivity(sctx, sel.Conditions, nil) c.Assert(err, IsNil, comment) - c.Assert(math.Abs(ratio-tt.selectivity) < eps, IsTrue, Commentf("for %s, needed: %v, got: %v", tt.exprs, tt.selectivity, ratio)) + c.Assert(math.Abs(ratio-tt.selectivityAfterIncrease) < eps, IsTrue, Commentf("for %s, needed: %v, got: %v", tt.exprs, tt.selectivityAfterIncrease, ratio)) } } @@ -389,14 +399,14 @@ func (s *testStatsSuite) TestOutOfRangeEQEstimation(c *C) { statsTbl := h.GetTableStats(table.Meta()) sc := &stmtctx.StatementContext{} col := statsTbl.Columns[table.Meta().Columns[0].ID] - count, err := col.GetColumnRowCount(sc, getRange(250, 250), 0, false) + count, err := col.GetColumnRowCount(sc, getRange(250, 250), statsTbl.Count, false) c.Assert(err, IsNil) c.Assert(count, Equals, float64(0)) for i := 0; i < 8; i++ { - count, err := col.GetColumnRowCount(sc, getRange(250, 250), int64(i+1), false) + count, err := col.GetColumnRowCount(sc, getRange(250, 250), statsTbl.Count+int64(i)+1, false) c.Assert(err, IsNil) - c.Assert(count, Equals, math.Min(float64(i+1), 4)) // estRows must be less than modifyCnt + c.Assert(count < math.Min(float64(i+1), 4), IsTrue) // estRows must be less than modifyCnt } } @@ -431,20 +441,20 @@ func (s *testStatsSuite) TestEstimationForUnknownValues(c *C) { count, err = statsTbl.GetRowCountByColumnRanges(sc, colID, getRange(9, 30)) c.Assert(err, IsNil) - c.Assert(count, Equals, 2.4000000000000004) + c.Assert(count, Equals, 2.3000000000000003) count, err = statsTbl.GetRowCountByColumnRanges(sc, colID, getRange(9, math.MaxInt64)) c.Assert(err, IsNil) - c.Assert(count, Equals, 2.4000000000000004) + c.Assert(count, Equals, 2.3000000000000003) idxID := table.Meta().Indices[0].ID count, err = statsTbl.GetRowCountByIndexRanges(sc, idxID, getRange(30, 30)) c.Assert(err, IsNil) - c.Assert(count, Equals, 0.2) + c.Assert(count, Equals, 0.1) count, err = statsTbl.GetRowCountByIndexRanges(sc, idxID, getRange(9, 30)) c.Assert(err, IsNil) - c.Assert(count, Equals, 2.2) + c.Assert(count, Equals, 2.1) testKit.MustExec("truncate table t") testKit.MustExec("insert into t values (null, null)") diff --git a/statistics/statistics_test.go b/statistics/statistics_test.go index 2b9f9771ed12a..f1de62b07cd29 100644 --- a/statistics/statistics_test.go +++ b/statistics/statistics_test.go @@ -639,7 +639,7 @@ func (s *testStatisticsSuite) TestIntColumnRanges(c *C) { tbl.Count *= 10 count, err = tbl.GetRowCountByIntColumnRanges(sc, 0, ran) c.Assert(err, IsNil) - c.Assert(int(count), Equals, 10) + c.Assert(int(count), Equals, 1) } func (s *testStatisticsSuite) TestIndexRanges(c *C) { diff --git a/statistics/table.go b/statistics/table.go index 1953e840c0d9c..ecadda9d5a5d6 100644 --- a/statistics/table.go +++ b/statistics/table.go @@ -517,7 +517,7 @@ func (coll *HistColl) crossValidationSelectivity(sc *stmtctx.StatementContext, i HighExclude: highExclude, } - rowCount, err := col.GetColumnRowCount(sc, []*ranger.Range{&rang}, coll.ModifyCount, col.IsHandle) + rowCount, err := col.GetColumnRowCount(sc, []*ranger.Range{&rang}, coll.Count, col.IsHandle) if err != nil { return 0, 0, err } diff --git a/statistics/testdata/stats_suite_out.json b/statistics/testdata/stats_suite_out.json index c70e967949881..6850d68170069 100644 --- a/statistics/testdata/stats_suite_out.json +++ b/statistics/testdata/stats_suite_out.json @@ -240,10 +240,9 @@ " └─TableFullScan_5 6.00 cop[tikv] table:tprefix keep order:false" ], [ - "IndexLookUp_11 0.00 root ", - "├─IndexRangeScan_8(Build) 0.00 cop[tikv] table:tprefix, index:prefixa(a) range:[\"88\",\"88\"], keep order:false", - "└─Selection_10(Probe) 0.00 cop[tikv] eq(test.tprefix.a, \"888\")", - " └─TableRowIDScan_9 0.00 cop[tikv] table:tprefix keep order:false" + "TableReader_7 0.67 root data:Selection_6", + "└─Selection_6 0.67 cop[tikv] eq(test.tprefix.a, \"888\")", + " └─TableFullScan_5 6.00 cop[tikv] table:tprefix keep order:false" ], [ "TableReader_7 1.00 root data:Selection_6", @@ -457,16 +456,16 @@ "└─IndexRangeScan_5 4.00 cop[tikv] table:topn_before_hist, index:idx(a) range:[1,1], keep order:false" ], [ - "IndexReader_6 0.00 root index:IndexRangeScan_5", - "└─IndexRangeScan_5 0.00 cop[tikv] table:topn_before_hist, index:idx(a) range:[2,2], keep order:false" + "IndexReader_6 0.60 root index:IndexRangeScan_5", + "└─IndexRangeScan_5 0.60 cop[tikv] table:topn_before_hist, index:idx(a) range:[2,2], keep order:false" ], [ "IndexReader_6 4.00 root index:IndexRangeScan_5", "└─IndexRangeScan_5 4.00 cop[tikv] table:topn_after_hist, index:idx(a) range:[7,7], keep order:false" ], [ - "IndexReader_6 0.00 root index:IndexRangeScan_5", - "└─IndexRangeScan_5 0.00 cop[tikv] table:topn_after_hist, index:idx(a) range:[6,6], keep order:false" + "IndexReader_6 0.60 root index:IndexRangeScan_5", + "└─IndexRangeScan_5 0.60 cop[tikv] table:topn_after_hist, index:idx(a) range:[6,6], keep order:false" ], [ "TableReader_7 4.00 root data:Selection_6", From 336287b8ebd49fe201b19e60111acecad885ce54 Mon Sep 17 00:00:00 2001 From: time-and-fate <25057648+time-and-fate@users.noreply.github.com> Date: Fri, 23 Jul 2021 14:18:03 +0800 Subject: [PATCH 03/20] modify out-of-range check --- statistics/cmsketch.go | 8 ----- statistics/histogram.go | 72 +++++++++++++++++------------------------ 2 files changed, 29 insertions(+), 51 deletions(-) diff --git a/statistics/cmsketch.go b/statistics/cmsketch.go index 07d90434a6cc7..c510186b16c40 100644 --- a/statistics/cmsketch.go +++ b/statistics/cmsketch.go @@ -530,14 +530,6 @@ func (c *TopN) Num() int { return len(c.TopN) } -// outOfRange checks whether the the given value falls back in [TopN.LowestOne, TopN.HighestOne]. -func (c *TopN) outOfRange(val []byte) bool { - if c == nil || len(c.TopN) == 0 { - return true - } - return bytes.Compare(c.TopN[0].Encoded, val) > 0 || bytes.Compare(val, c.TopN[c.Num()-1].Encoded) > 0 -} - // DecodedString returns the value with decoded result. func (c *TopN) DecodedString(ctx sessionctx.Context, colTypes []byte) (string, error) { builder := &strings.Builder{} diff --git a/statistics/histogram.go b/statistics/histogram.go index f485e3d0a2bb7..2f8ed8b6a7ca0 100644 --- a/statistics/histogram.go +++ b/statistics/histogram.go @@ -826,7 +826,7 @@ func (hg *Histogram) AvgCountPerNotNullValue(totalCount int64) float64 { func (hg *Histogram) outOfRange(val types.Datum) bool { if hg.Len() == 0 { - return true + return false } return chunk.Compare(hg.Bounds.GetRow(0), 0, &val) > 0 || chunk.Compare(hg.Bounds.GetRow(hg.Bounds.NumRows()-1), 0, &val) < 0 @@ -849,15 +849,10 @@ func (hg *Histogram) outOfRange(val types.Datum) bool { // boundL │ │histL histR boundR // │ │ // lDatum rDatum -func (hg *Histogram) outOfRangeRowCount(sc *stmtctx.StatementContext, lDatum, rDatum *types.Datum, increaseCount int64) float64 { +func (hg *Histogram) outOfRangeRowCount(lDatum, rDatum *types.Datum, increaseCount int64) float64 { if hg.Len() == 0 { return 0 } - // make sure l < r - cmp, err := lDatum.CompareDatum(sc, rDatum) - if err != nil || cmp >= 0 { - return 0 - } // For bytes and string type, we need to cut the common prefix when converting them to scalar value. // Here we calculate the length of common prefix. @@ -878,6 +873,10 @@ func (hg *Histogram) outOfRangeRowCount(sc *stmtctx.StatementContext, lDatum, rD // Convert the range we want to estimate to scalar value(float64) l := convertDatumToScalar(lDatum, commonPrefix) r := convertDatumToScalar(rDatum, commonPrefix) + // make sure l < r + if l >= r { + return 0 + } // Convert the lower and upper bound of the histogram to scalar value(float64) histL := convertDatumToScalar(hg.GetLower(0), commonPrefix) histR := convertDatumToScalar(hg.GetUpper(hg.Len()-1), commonPrefix) @@ -888,16 +887,19 @@ func (hg *Histogram) outOfRangeRowCount(sc *stmtctx.StatementContext, lDatum, rD boundL := histL - histWidth boundR := histR + histWidth - actualL := l - actualR := r leftPercent := float64(0) rightPercent := float64(0) + + // keep l and r unchanged, use actualL and actualR to calculate. + actualL := l + actualR := r // Handling the out-of-range part on the left of the histogram range - if l < histL && r > boundL { - if l < boundL { + if actualL < histL && actualR > boundL { + // make sure boundL <= actualL < actualR <= histL + if actualL < boundL { actualL = boundL } - if r > histL { + if actualR > histL { actualR = histL } // Calculate the percentage of "the shaded area" on the left side. @@ -907,12 +909,13 @@ func (hg *Histogram) outOfRangeRowCount(sc *stmtctx.StatementContext, lDatum, rD actualL = l actualR = r // Handling the out-of-range part on the right of the histogram range - if l < boundR && r > histR { - if l < histR { + if actualL < boundR && actualR > histR { + // make sure histR <= actualL < actualR <= boundR + if actualL < histR { actualL = histR } - if r > boundR { - r = boundR + if actualR > boundR { + actualR = boundR } // Calculate the percentage of "the shaded area" on the right side. rightPercent = (math.Pow(boundR-actualL, 2) - math.Pow(boundR-actualR, 2)) / math.Pow(histWidth, 2) @@ -1069,7 +1072,7 @@ func (c *Column) equalRowCount(sc *stmtctx.StatementContext, val types.Datum, en if c.Histogram.Bounds.NumRows() == 0 { return 0.0, nil } - if c.Histogram.NDV > 0 && c.outOfRange(val, encodedVal) { + if c.Histogram.NDV > 0 && c.outOfRange(val) { return outOfRangeEQSelectivity(c.Histogram.NDV, tableRowCount, int64(c.TotalRowCount())) * c.TotalRowCount(), nil } if c.CMSketch != nil { @@ -1181,7 +1184,7 @@ func (c *Column) GetColumnRowCount(sc *stmtctx.StatementContext, ranges []*range continue } - // case 3: it's a interval + // case 3: it's an interval cnt := c.BetweenRowCount(sc, lowVal, highVal, lowEncoded, highEncoded) // `betweenRowCount` returns count for [l, h) range, we adjust cnt for boundaries here. // Note that, `cnt` does not include null values, we need specially handle cases @@ -1214,7 +1217,7 @@ func (c *Column) GetColumnRowCount(sc *stmtctx.StatementContext, ranges []*range cnt *= c.GetIncreaseFactor(tableRowCount) // handling the out-of-range part - if (c.outOfRange(lowVal, lowEncoded) && !lowVal.IsNull()) || c.outOfRange(highVal, highEncoded) { + if (c.outOfRange(lowVal) && !lowVal.IsNull()) || c.outOfRange(highVal) { if c.StatsVer < 2 { cnt += outOfRangeEQSelectivity(outOfRangeBetweenRate, tableRowCount, int64(c.TotalRowCount())) * c.TotalRowCount() } else { @@ -1222,7 +1225,7 @@ func (c *Column) GetColumnRowCount(sc *stmtctx.StatementContext, ranges []*range if increaseCount < 0 { increaseCount = 0 } - cnt += c.Histogram.outOfRangeRowCount(sc, &lowVal, &highVal, increaseCount) + cnt += c.Histogram.outOfRangeRowCount(&lowVal, &highVal, increaseCount) } } @@ -1236,15 +1239,6 @@ func (c *Column) GetColumnRowCount(sc *stmtctx.StatementContext, ranges []*range return rowCount, nil } -func (c *Column) outOfRange(val types.Datum, encodedVal []byte) bool { - outOfHist := c.Histogram.outOfRange(val) - if !outOfHist { - return false - } - // Already out of hist. - return c.TopN.outOfRange(encodedVal) -} - // Index represents an index histogram. type Index struct { Histogram @@ -1360,7 +1354,7 @@ func (idx *Index) GetRowCount(sc *stmtctx.StatementContext, coll *HistColl, inde } } - // case 2: it's a interval + // case 2: it's an interval // The final interval is [low, high) if indexRange.LowExclude { lb = kv.Key(lb).PrefixNext() @@ -1402,7 +1396,7 @@ func (idx *Index) GetRowCount(sc *stmtctx.StatementContext, coll *HistColl, inde if increaseCount < 0 { increaseCount = 0 } - totalCount += idx.Histogram.outOfRangeRowCount(sc, &l, &r, increaseCount) + totalCount += idx.Histogram.outOfRangeRowCount(&l, &r, increaseCount) } } } @@ -1651,21 +1645,13 @@ func (coll *HistColl) NewHistCollBySelectivity(sc *stmtctx.StatementContext, sta } func (idx *Index) outOfRange(val types.Datum) bool { - outOfTopN := idx.TopN.outOfRange(val.GetBytes()) - // The val is in TopN, return false. - if !outOfTopN { + if !idx.Histogram.outOfRange(val) { return false } - - histEmpty := idx.Histogram.Len() == 0 - // HistEmpty->Hist out of range. - if histEmpty { - return true + if idx.Histogram.Len() > 0 && matchPrefix(idx.Bounds.GetRow(0), 0, &val) { + return false } - withInLowBoundOrPrefixMatch := chunk.Compare(idx.Bounds.GetRow(0), 0, &val) <= 0 || - matchPrefix(idx.Bounds.GetRow(0), 0, &val) - withInHighBound := chunk.Compare(idx.Bounds.GetRow(idx.Bounds.NumRows()-1), 0, &val) >= 0 - return !withInLowBoundOrPrefixMatch || !withInHighBound + return true } // matchPrefix checks whether ad is the prefix of value From 21f442f587e3b206dad6c24ff5c8439722a389c8 Mon Sep 17 00:00:00 2001 From: time-and-fate <25057648+time-and-fate@users.noreply.github.com> Date: Fri, 23 Jul 2021 15:52:55 +0800 Subject: [PATCH 04/20] fix tests --- .../r/explain_complex_stats.result | 28 +++++++++---------- cmd/explaintest/r/explain_union_scan.result | 6 ++-- statistics/table.go | 7 +++-- 3 files changed, 21 insertions(+), 20 deletions(-) diff --git a/cmd/explaintest/r/explain_complex_stats.result b/cmd/explaintest/r/explain_complex_stats.result index 46fed25d76094..f9f1922595248 100644 --- a/cmd/explaintest/r/explain_complex_stats.result +++ b/cmd/explaintest/r/explain_complex_stats.result @@ -115,14 +115,14 @@ PRIMARY KEY (aid,dic) load stats 's/explain_complex_stats_rr.json'; explain format = 'brief' SELECT ds, p1, p2, p3, p4, p5, p6_md5, p7_md5, count(dic) as install_device FROM dt use index (cm) WHERE (ds >= '2016-09-01') AND (ds <= '2016-11-03') AND (cm IN ('1062', '1086', '1423', '1424', '1425', '1426', '1427', '1428', '1429', '1430', '1431', '1432', '1433', '1434', '1435', '1436', '1437', '1438', '1439', '1440', '1441', '1442', '1443', '1444', '1445', '1446', '1447', '1448', '1449', '1450', '1451', '1452', '1488', '1489', '1490', '1491', '1492', '1493', '1494', '1495', '1496', '1497', '1550', '1551', '1552', '1553', '1554', '1555', '1556', '1557', '1558', '1559', '1597', '1598', '1599', '1600', '1601', '1602', '1603', '1604', '1605', '1606', '1607', '1608', '1609', '1610', '1611', '1612', '1613', '1614', '1615', '1616', '1623', '1624', '1625', '1626', '1627', '1628', '1629', '1630', '1631', '1632', '1709', '1719', '1720', '1843', '2813', '2814', '2815', '2816', '2817', '2818', '2819', '2820', '2821', '2822', '2823', '2824', '2825', '2826', '2827', '2828', '2829', '2830', '2831', '2832', '2833', '2834', '2835', '2836', '2837', '2838', '2839', '2840', '2841', '2842', '2843', '2844', '2845', '2846', '2847', '2848', '2849', '2850', '2851', '2852', '2853', '2854', '2855', '2856', '2857', '2858', '2859', '2860', '2861', '2862', '2863', '2864', '2865', '2866', '2867', '2868', '2869', '2870', '2871', '2872', '3139', '3140', '3141', '3142', '3143', '3144', '3145', '3146', '3147', '3148', '3149', '3150', '3151', '3152', '3153', '3154', '3155', '3156', '3157', '3158', '3386', '3387', '3388', '3389', '3390', '3391', '3392', '3393', '3394', '3395', '3664', '3665', '3666', '3667', '3668', '3670', '3671', '3672', '3673', '3674', '3676', '3677', '3678', '3679', '3680', '3681', '3682', '3683', '3684', '3685', '3686', '3687', '3688', '3689', '3690', '3691', '3692', '3693', '3694', '3695', '3696', '3697', '3698', '3699', '3700', '3701', '3702', '3703', '3704', '3705', '3706', '3707', '3708', '3709', '3710', '3711', '3712', '3713', '3714', '3715', '3960', '3961', '3962', '3963', '3964', '3965', '3966', '3967', '3968', '3978', '3979', '3980', '3981', '3982', '3983', '3984', '3985', '3986', '3987', '4208', '4209', '4210', '4211', '4212', '4304', '4305', '4306', '4307', '4308', '4866', '4867', '4868', '4869', '4870', '4871', '4872', '4873', '4874', '4875')) GROUP BY ds, p1, p2, p3, p4, p5, p6_md5, p7_md5 ORDER BY ds2 DESC; id estRows task access object operator info -Projection 21.53 root test.dt.ds, test.dt.p1, test.dt.p2, test.dt.p3, test.dt.p4, test.dt.p5, test.dt.p6_md5, test.dt.p7_md5, Column#21 -└─Sort 21.53 root test.dt.ds2:desc - └─HashAgg 21.53 root group by:test.dt.ds, test.dt.p1, test.dt.p2, test.dt.p3, test.dt.p4, test.dt.p5, test.dt.p6_md5, test.dt.p7_md5, funcs:count(Column#32)->Column#21, funcs:firstrow(test.dt.ds)->test.dt.ds, funcs:firstrow(Column#34)->test.dt.ds2, funcs:firstrow(test.dt.p1)->test.dt.p1, funcs:firstrow(test.dt.p2)->test.dt.p2, funcs:firstrow(test.dt.p3)->test.dt.p3, funcs:firstrow(test.dt.p4)->test.dt.p4, funcs:firstrow(test.dt.p5)->test.dt.p5, funcs:firstrow(test.dt.p6_md5)->test.dt.p6_md5, funcs:firstrow(test.dt.p7_md5)->test.dt.p7_md5 - └─IndexLookUp 21.53 root - ├─IndexRangeScan(Build) 128.32 cop[tikv] table:dt, index:cm(cm) range:[1062,1062], [1086,1086], [1423,1423], [1424,1424], [1425,1425], [1426,1426], [1427,1427], [1428,1428], [1429,1429], [1430,1430], [1431,1431], [1432,1432], [1433,1433], [1434,1434], [1435,1435], [1436,1436], [1437,1437], [1438,1438], [1439,1439], [1440,1440], [1441,1441], [1442,1442], [1443,1443], [1444,1444], [1445,1445], [1446,1446], [1447,1447], [1448,1448], [1449,1449], [1450,1450], [1451,1451], [1452,1452], [1488,1488], [1489,1489], [1490,1490], [1491,1491], [1492,1492], [1493,1493], [1494,1494], [1495,1495], [1496,1496], [1497,1497], [1550,1550], [1551,1551], [1552,1552], [1553,1553], [1554,1554], [1555,1555], [1556,1556], [1557,1557], [1558,1558], [1559,1559], [1597,1597], [1598,1598], [1599,1599], [1600,1600], [1601,1601], [1602,1602], [1603,1603], [1604,1604], [1605,1605], [1606,1606], [1607,1607], [1608,1608], [1609,1609], [1610,1610], [1611,1611], [1612,1612], [1613,1613], [1614,1614], [1615,1615], [1616,1616], [1623,1623], [1624,1624], [1625,1625], [1626,1626], [1627,1627], [1628,1628], [1629,1629], [1630,1630], [1631,1631], [1632,1632], [1709,1709], [1719,1719], [1720,1720], [1843,1843], [2813,2813], [2814,2814], [2815,2815], [2816,2816], [2817,2817], [2818,2818], [2819,2819], [2820,2820], [2821,2821], [2822,2822], [2823,2823], [2824,2824], [2825,2825], [2826,2826], [2827,2827], [2828,2828], [2829,2829], [2830,2830], [2831,2831], [2832,2832], [2833,2833], [2834,2834], [2835,2835], [2836,2836], [2837,2837], [2838,2838], [2839,2839], [2840,2840], [2841,2841], [2842,2842], [2843,2843], [2844,2844], [2845,2845], [2846,2846], [2847,2847], [2848,2848], [2849,2849], [2850,2850], [2851,2851], [2852,2852], [2853,2853], [2854,2854], [2855,2855], [2856,2856], [2857,2857], [2858,2858], [2859,2859], [2860,2860], [2861,2861], [2862,2862], [2863,2863], [2864,2864], [2865,2865], [2866,2866], [2867,2867], [2868,2868], [2869,2869], [2870,2870], [2871,2871], [2872,2872], [3139,3139], [3140,3140], [3141,3141], [3142,3142], [3143,3143], [3144,3144], [3145,3145], [3146,3146], [3147,3147], [3148,3148], [3149,3149], [3150,3150], [3151,3151], [3152,3152], [3153,3153], [3154,3154], [3155,3155], [3156,3156], [3157,3157], [3158,3158], [3386,3386], [3387,3387], [3388,3388], [3389,3389], [3390,3390], [3391,3391], [3392,3392], [3393,3393], [3394,3394], [3395,3395], [3664,3664], [3665,3665], [3666,3666], [3667,3667], [3668,3668], [3670,3670], [3671,3671], [3672,3672], [3673,3673], [3674,3674], [3676,3676], [3677,3677], [3678,3678], [3679,3679], [3680,3680], [3681,3681], [3682,3682], [3683,3683], [3684,3684], [3685,3685], [3686,3686], [3687,3687], [3688,3688], [3689,3689], [3690,3690], [3691,3691], [3692,3692], [3693,3693], [3694,3694], [3695,3695], [3696,3696], [3697,3697], [3698,3698], [3699,3699], [3700,3700], [3701,3701], [3702,3702], [3703,3703], [3704,3704], [3705,3705], [3706,3706], [3707,3707], [3708,3708], [3709,3709], [3710,3710], [3711,3711], [3712,3712], [3713,3713], [3714,3714], [3715,3715], [3960,3960], [3961,3961], [3962,3962], [3963,3963], [3964,3964], [3965,3965], [3966,3966], [3967,3967], [3968,3968], [3978,3978], [3979,3979], [3980,3980], [3981,3981], [3982,3982], [3983,3983], [3984,3984], [3985,3985], [3986,3986], [3987,3987], [4208,4208], [4209,4209], [4210,4210], [4211,4211], [4212,4212], [4304,4304], [4305,4305], [4306,4306], [4307,4307], [4308,4308], [4866,4866], [4867,4867], [4868,4868], [4869,4869], [4870,4870], [4871,4871], [4872,4872], [4873,4873], [4874,4874], [4875,4875], keep order:false - └─HashAgg(Probe) 21.53 cop[tikv] group by:test.dt.ds, test.dt.p1, test.dt.p2, test.dt.p3, test.dt.p4, test.dt.p5, test.dt.p6_md5, test.dt.p7_md5, funcs:count(test.dt.dic)->Column#32, funcs:firstrow(test.dt.ds2)->Column#34 - └─Selection 21.56 cop[tikv] ge(test.dt.ds, 2016-09-01 00:00:00.000000), le(test.dt.ds, 2016-11-03 00:00:00.000000) - └─TableRowIDScan 128.32 cop[tikv] table:dt keep order:false +Projection 21.47 root test.dt.ds, test.dt.p1, test.dt.p2, test.dt.p3, test.dt.p4, test.dt.p5, test.dt.p6_md5, test.dt.p7_md5, Column#21 +└─Sort 21.47 root test.dt.ds2:desc + └─HashAgg 21.47 root group by:test.dt.ds, test.dt.p1, test.dt.p2, test.dt.p3, test.dt.p4, test.dt.p5, test.dt.p6_md5, test.dt.p7_md5, funcs:count(Column#32)->Column#21, funcs:firstrow(test.dt.ds)->test.dt.ds, funcs:firstrow(Column#34)->test.dt.ds2, funcs:firstrow(test.dt.p1)->test.dt.p1, funcs:firstrow(test.dt.p2)->test.dt.p2, funcs:firstrow(test.dt.p3)->test.dt.p3, funcs:firstrow(test.dt.p4)->test.dt.p4, funcs:firstrow(test.dt.p5)->test.dt.p5, funcs:firstrow(test.dt.p6_md5)->test.dt.p6_md5, funcs:firstrow(test.dt.p7_md5)->test.dt.p7_md5 + └─IndexLookUp 21.47 root + ├─IndexRangeScan(Build) 128.00 cop[tikv] table:dt, index:cm(cm) range:[1062,1062], [1086,1086], [1423,1423], [1424,1424], [1425,1425], [1426,1426], [1427,1427], [1428,1428], [1429,1429], [1430,1430], [1431,1431], [1432,1432], [1433,1433], [1434,1434], [1435,1435], [1436,1436], [1437,1437], [1438,1438], [1439,1439], [1440,1440], [1441,1441], [1442,1442], [1443,1443], [1444,1444], [1445,1445], [1446,1446], [1447,1447], [1448,1448], [1449,1449], [1450,1450], [1451,1451], [1452,1452], [1488,1488], [1489,1489], [1490,1490], [1491,1491], [1492,1492], [1493,1493], [1494,1494], [1495,1495], [1496,1496], [1497,1497], [1550,1550], [1551,1551], [1552,1552], [1553,1553], [1554,1554], [1555,1555], [1556,1556], [1557,1557], [1558,1558], [1559,1559], [1597,1597], [1598,1598], [1599,1599], [1600,1600], [1601,1601], [1602,1602], [1603,1603], [1604,1604], [1605,1605], [1606,1606], [1607,1607], [1608,1608], [1609,1609], [1610,1610], [1611,1611], [1612,1612], [1613,1613], [1614,1614], [1615,1615], [1616,1616], [1623,1623], [1624,1624], [1625,1625], [1626,1626], [1627,1627], [1628,1628], [1629,1629], [1630,1630], [1631,1631], [1632,1632], [1709,1709], [1719,1719], [1720,1720], [1843,1843], [2813,2813], [2814,2814], [2815,2815], [2816,2816], [2817,2817], [2818,2818], [2819,2819], [2820,2820], [2821,2821], [2822,2822], [2823,2823], [2824,2824], [2825,2825], [2826,2826], [2827,2827], [2828,2828], [2829,2829], [2830,2830], [2831,2831], [2832,2832], [2833,2833], [2834,2834], [2835,2835], [2836,2836], [2837,2837], [2838,2838], [2839,2839], [2840,2840], [2841,2841], [2842,2842], [2843,2843], [2844,2844], [2845,2845], [2846,2846], [2847,2847], [2848,2848], [2849,2849], [2850,2850], [2851,2851], [2852,2852], [2853,2853], [2854,2854], [2855,2855], [2856,2856], [2857,2857], [2858,2858], [2859,2859], [2860,2860], [2861,2861], [2862,2862], [2863,2863], [2864,2864], [2865,2865], [2866,2866], [2867,2867], [2868,2868], [2869,2869], [2870,2870], [2871,2871], [2872,2872], [3139,3139], [3140,3140], [3141,3141], [3142,3142], [3143,3143], [3144,3144], [3145,3145], [3146,3146], [3147,3147], [3148,3148], [3149,3149], [3150,3150], [3151,3151], [3152,3152], [3153,3153], [3154,3154], [3155,3155], [3156,3156], [3157,3157], [3158,3158], [3386,3386], [3387,3387], [3388,3388], [3389,3389], [3390,3390], [3391,3391], [3392,3392], [3393,3393], [3394,3394], [3395,3395], [3664,3664], [3665,3665], [3666,3666], [3667,3667], [3668,3668], [3670,3670], [3671,3671], [3672,3672], [3673,3673], [3674,3674], [3676,3676], [3677,3677], [3678,3678], [3679,3679], [3680,3680], [3681,3681], [3682,3682], [3683,3683], [3684,3684], [3685,3685], [3686,3686], [3687,3687], [3688,3688], [3689,3689], [3690,3690], [3691,3691], [3692,3692], [3693,3693], [3694,3694], [3695,3695], [3696,3696], [3697,3697], [3698,3698], [3699,3699], [3700,3700], [3701,3701], [3702,3702], [3703,3703], [3704,3704], [3705,3705], [3706,3706], [3707,3707], [3708,3708], [3709,3709], [3710,3710], [3711,3711], [3712,3712], [3713,3713], [3714,3714], [3715,3715], [3960,3960], [3961,3961], [3962,3962], [3963,3963], [3964,3964], [3965,3965], [3966,3966], [3967,3967], [3968,3968], [3978,3978], [3979,3979], [3980,3980], [3981,3981], [3982,3982], [3983,3983], [3984,3984], [3985,3985], [3986,3986], [3987,3987], [4208,4208], [4209,4209], [4210,4210], [4211,4211], [4212,4212], [4304,4304], [4305,4305], [4306,4306], [4307,4307], [4308,4308], [4866,4866], [4867,4867], [4868,4868], [4869,4869], [4870,4870], [4871,4871], [4872,4872], [4873,4873], [4874,4874], [4875,4875], keep order:false + └─HashAgg(Probe) 21.47 cop[tikv] group by:test.dt.ds, test.dt.p1, test.dt.p2, test.dt.p3, test.dt.p4, test.dt.p5, test.dt.p6_md5, test.dt.p7_md5, funcs:count(test.dt.dic)->Column#32, funcs:firstrow(test.dt.ds2)->Column#34 + └─Selection 21.50 cop[tikv] ge(test.dt.ds, 2016-09-01 00:00:00.000000), le(test.dt.ds, 2016-11-03 00:00:00.000000) + └─TableRowIDScan 128.00 cop[tikv] table:dt keep order:false explain format = 'brief' select gad.id as gid,sdk.id as sid,gad.aid as aid,gad.cm as cm,sdk.dic as dic,sdk.ip as ip, sdk.t as t, gad.p1 as p1, gad.p2 as p2, gad.p3 as p3, gad.p4 as p4, gad.p5 as p5, gad.p6_md5 as p6, gad.p7_md5 as p7, gad.ext as ext, gad.t as gtime from st gad join (select id, aid, pt, dic, ip, t from dd where pt = 'android' and bm = 0 and t > 1478143908) sdk on gad.aid = sdk.aid and gad.ip = sdk.ip and sdk.t > gad.t where gad.t > 1478143908 and gad.bm = 0 and gad.pt = 'android' group by gad.aid, sdk.dic limit 2500; id estRows task access object operator info Projection 424.00 root test.st.id, test.dd.id, test.st.aid, test.st.cm, test.dd.dic, test.dd.ip, test.dd.t, test.st.p1, test.st.p2, test.st.p3, test.st.p4, test.st.p5, test.st.p6_md5, test.st.p7_md5, test.st.ext, test.st.t @@ -132,8 +132,8 @@ Projection 424.00 root test.st.id, test.dd.id, test.st.aid, test.st.cm, test.dd ├─TableReader(Build) 424.00 root data:Selection │ └─Selection 424.00 cop[tikv] eq(test.st.bm, 0), eq(test.st.pt, "android"), gt(test.st.t, 1478143908), not(isnull(test.st.ip)) │ └─TableRangeScan 1999.00 cop[tikv] table:gad range:[0,+inf], keep order:false - └─TableReader(Probe) 455.80 root data:Selection - └─Selection 455.80 cop[tikv] eq(test.dd.bm, 0), eq(test.dd.pt, "android"), gt(test.dd.t, 1478143908), not(isnull(test.dd.ip)), not(isnull(test.dd.t)) + └─TableReader(Probe) 450.56 root data:Selection + └─Selection 450.56 cop[tikv] eq(test.dd.bm, 0), eq(test.dd.pt, "android"), gt(test.dd.t, 1478143908), not(isnull(test.dd.ip)), not(isnull(test.dd.t)) └─TableRangeScan 2000.00 cop[tikv] table:dd range:[0,+inf], keep order:false explain format = 'brief' select gad.id as gid,sdk.id as sid,gad.aid as aid,gad.cm as cm,sdk.dic as dic,sdk.ip as ip, sdk.t as t, gad.p1 as p1, gad.p2 as p2, gad.p3 as p3, gad.p4 as p4, gad.p5 as p5, gad.p6_md5 as p6, gad.p7_md5 as p7, gad.ext as ext from st gad join dd sdk on gad.aid = sdk.aid and gad.dic = sdk.mac and gad.t < sdk.t where gad.t > 1477971479 and gad.bm = 0 and gad.pt = 'ios' and gad.dit = 'mac' and sdk.t > 1477971479 and sdk.bm = 0 and sdk.pt = 'ios' limit 3000; id estRows task access object operator info @@ -169,11 +169,11 @@ Projection 428.32 root test.dt.id, test.dt.aid, test.dt.pt, test.dt.dic, test.d └─TableRowIDScan 1.00 cop[tikv] table:rr keep order:false explain format = 'brief' select pc,cr,count(DISTINCT uid) as pay_users,count(oid) as pay_times,sum(am) as am from pp where ps=2 and ppt>=1478188800 and ppt<1478275200 and pi in ('510017','520017') and uid in ('18089709','18090780') group by pc,cr; id estRows task access object operator info -Projection 207.86 root test.pp.pc, test.pp.cr, Column#22, Column#23, Column#24 -└─HashAgg 207.86 root group by:test.pp.cr, test.pp.pc, funcs:count(distinct test.pp.uid)->Column#22, funcs:count(test.pp.oid)->Column#23, funcs:sum(test.pp.am)->Column#24, funcs:firstrow(test.pp.pc)->test.pp.pc, funcs:firstrow(test.pp.cr)->test.pp.cr - └─IndexLookUp 207.86 root +Projection 207.02 root test.pp.pc, test.pp.cr, Column#22, Column#23, Column#24 +└─HashAgg 207.02 root group by:test.pp.cr, test.pp.pc, funcs:count(distinct test.pp.uid)->Column#22, funcs:count(test.pp.oid)->Column#23, funcs:sum(test.pp.am)->Column#24, funcs:firstrow(test.pp.pc)->test.pp.pc, funcs:firstrow(test.pp.cr)->test.pp.cr + └─IndexLookUp 207.02 root ├─IndexRangeScan(Build) 627.00 cop[tikv] table:pp, index:ps(ps) range:[2,2], keep order:false - └─Selection(Probe) 207.86 cop[tikv] ge(test.pp.ppt, 1478188800), in(test.pp.pi, 510017, 520017), in(test.pp.uid, 18089709, 18090780), lt(test.pp.ppt, 1478275200) + └─Selection(Probe) 207.02 cop[tikv] ge(test.pp.ppt, 1478188800), in(test.pp.pi, 510017, 520017), in(test.pp.uid, 18089709, 18090780), lt(test.pp.ppt, 1478275200) └─TableRowIDScan 627.00 cop[tikv] table:pp keep order:false drop table if exists tbl_001; CREATE TABLE tbl_001 (a int, b int); diff --git a/cmd/explaintest/r/explain_union_scan.result b/cmd/explaintest/r/explain_union_scan.result index 1ef48623efd4a..610929935ec63 100644 --- a/cmd/explaintest/r/explain_union_scan.result +++ b/cmd/explaintest/r/explain_union_scan.result @@ -24,8 +24,8 @@ Limit 10.00 root offset:0, count:10 │ ├─IndexRangeScan(Build) 1.00 cop[tikv] table:t1, index:PRIMARY(id) range: decided by [eq(test.city.id, test.city.id)], keep order:false │ └─Selection(Probe) 1.00 cop[tikv] gt(test.city.province_id, 1), lt(test.city.province_id, 100) │ └─TableRowIDScan 1.00 cop[tikv] table:t1 keep order:false - └─UnionScan(Probe) 536284.00 root gt(test.city.province_id, 1), lt(test.city.province_id, 100), not(isnull(test.city.province_id)) - └─TableReader 536284.00 root data:Selection - └─Selection 536284.00 cop[tikv] gt(test.city.province_id, 1), lt(test.city.province_id, 100), not(isnull(test.city.province_id)) + └─UnionScan(Probe) 536040.03 root gt(test.city.province_id, 1), lt(test.city.province_id, 100), not(isnull(test.city.province_id)) + └─TableReader 536040.03 root data:Selection + └─Selection 536040.03 cop[tikv] gt(test.city.province_id, 1), lt(test.city.province_id, 100), not(isnull(test.city.province_id)) └─TableFullScan 536284.00 cop[tikv] table:t3 keep order:false commit; diff --git a/statistics/table.go b/statistics/table.go index ecadda9d5a5d6..46b1ce47f6c3b 100644 --- a/statistics/table.go +++ b/statistics/table.go @@ -264,7 +264,7 @@ func (t *Table) ColumnGreaterRowCount(sc *stmtctx.StatementContext, value types. if !ok || c.IsInvalid(sc, t.Pseudo) { return float64(t.Count) / pseudoLessRate } - return c.greaterRowCount(value) + return c.greaterRowCount(value) * c.GetIncreaseFactor(t.Count) } // ColumnLessRowCount estimates the row count where the column less than value. Note that null values are not counted. @@ -273,7 +273,7 @@ func (t *Table) ColumnLessRowCount(sc *stmtctx.StatementContext, value types.Dat if !ok || c.IsInvalid(sc, t.Pseudo) { return float64(t.Count) / pseudoLessRate } - return c.lessRowCount(value) + return c.lessRowCount(value) * c.GetIncreaseFactor(t.Count) } // ColumnBetweenRowCount estimates the row count where column greater or equal to a and less than b. @@ -294,7 +294,7 @@ func (t *Table) ColumnBetweenRowCount(sc *stmtctx.StatementContext, a, b types.D if a.IsNull() { count += float64(c.NullCount) } - return count, nil + return count * c.GetIncreaseFactor(t.Count), nil } // ColumnEqualRowCount estimates the row count where the column equals to value. @@ -308,6 +308,7 @@ func (t *Table) ColumnEqualRowCount(sc *stmtctx.StatementContext, value types.Da return 0, err } result, err := c.equalRowCount(sc, value, encodedVal, t.ModifyCount) + result *= c.GetIncreaseFactor(t.Count) return result, errors.Trace(err) } From 6051b0c7a269f126ecd735d7aa01218daa87f4d3 Mon Sep 17 00:00:00 2001 From: time-and-fate <25057648+time-and-fate@users.noreply.github.com> Date: Fri, 23 Jul 2021 19:59:40 +0800 Subject: [PATCH 05/20] add test --- statistics/selectivity_test.go | 40 ++++++++++---- statistics/testdata/stats_suite_in.json | 57 +++++++++++++++++++ statistics/testdata/stats_suite_out.json | 70 ++++++++++++++++++++++++ 3 files changed, 157 insertions(+), 10 deletions(-) diff --git a/statistics/selectivity_test.go b/statistics/selectivity_test.go index b173cb5bc4db6..bc6ff8697f773 100644 --- a/statistics/selectivity_test.go +++ b/statistics/selectivity_test.go @@ -382,16 +382,16 @@ func getRange(start, end int64) []*ranger.Range { return []*ranger.Range{ran} } -func (s *testStatsSuite) TestOutOfRangeEQEstimation(c *C) { +func (s *testStatsSuite) TestOutOfRangeEstimation(c *C) { defer cleanEnv(c, s.store, s.do) testKit := testkit.NewTestKit(c, s.store) testKit.MustExec("use test") testKit.MustExec("drop table if exists t") testKit.MustExec("create table t(a int)") - for i := 0; i < 1000; i++ { - testKit.MustExec(fmt.Sprintf("insert into t values (%v)", i/4)) // 0 ~ 249 + for i := 0; i < 3000; i++ { + testKit.MustExec(fmt.Sprintf("insert into t values (%v)", i/5)) // [0, 600) } - testKit.MustExec("analyze table t") + testKit.MustExec("analyze table t with 1000 samples") h := s.do.StatsHandle() table, err := s.do.InfoSchema().TableByName(model.NewCIStr("test"), model.NewCIStr("t")) @@ -399,14 +399,34 @@ func (s *testStatsSuite) TestOutOfRangeEQEstimation(c *C) { statsTbl := h.GetTableStats(table.Meta()) sc := &stmtctx.StatementContext{} col := statsTbl.Columns[table.Meta().Columns[0].ID] - count, err := col.GetColumnRowCount(sc, getRange(250, 250), statsTbl.Count, false) + count, err := col.GetColumnRowCount(sc, getRange(600, 600), statsTbl.Count, false) c.Assert(err, IsNil) - c.Assert(count, Equals, float64(0)) - - for i := 0; i < 8; i++ { - count, err := col.GetColumnRowCount(sc, getRange(250, 250), statsTbl.Count+int64(i)+1, false) + // Because the ANALYZE collect data by random sampling, so the result is not an accurate value. + // so we use a range here. + c.Assert(count < 4.5, IsTrue) + c.Assert(count > 3.5, IsTrue) + + var input []struct { + Start int64 + End int64 + } + var output []struct { + Start int64 + End int64 + Count float64 + } + s.testData.GetTestCases(c, &input, &output) + increasedTblRowCount := int64(float64(statsTbl.Count) * 1.5) + for i, ran := range input { + count, err = col.GetColumnRowCount(sc, getRange(ran.Start, ran.End), increasedTblRowCount, false) c.Assert(err, IsNil) - c.Assert(count < math.Min(float64(i+1), 4), IsTrue) // estRows must be less than modifyCnt + s.testData.OnRecord(func() { + output[i].Start = ran.Start + output[i].End = ran.End + output[i].Count = count + }) + c.Assert(count < output[i].Count*1.2, IsTrue) + c.Assert(count > output[i].Count*0.8, IsTrue) } } diff --git a/statistics/testdata/stats_suite_in.json b/statistics/testdata/stats_suite_in.json index 23caf6f572a49..ad82329f3b1bd 100644 --- a/statistics/testdata/stats_suite_in.json +++ b/statistics/testdata/stats_suite_in.json @@ -137,5 +137,62 @@ "select * from t where a > 8 or d < 4 or c > 7 or b < 5", "select * from t where a < 8 and (b > 10 or c < 3 or b > 4) and a > 2" ] + }, + { + "Name": "TestOutOfRangeEstimation", + "Cases": [ + { + "Start": 500, + "End": 600 + }, + { + "Start": 600, + "End": 650 + }, + { + "Start": 650, + "End": 700 + }, + { + "Start": 700, + "End": 750 + }, + { + "Start": 750, + "End": 800 + }, + { + "Start": 850, + "End": 900 + }, + { + "Start": 900, + "End": 1000 + }, + { + "Start": 1000, + "End": 1100 + }, + { + "Start": 1100, + "End": 1200 + }, + { + "Start": 1200, + "End": 1300 + }, + { + "Start": 0, + "End": 599 + }, + { + "Start": 600, + "End": 1200 + }, + { + "Start": 0, + "End": 1200 + } + ] } ] diff --git a/statistics/testdata/stats_suite_out.json b/statistics/testdata/stats_suite_out.json index 6850d68170069..4925142a4fe1a 100644 --- a/statistics/testdata/stats_suite_out.json +++ b/statistics/testdata/stats_suite_out.json @@ -609,5 +609,75 @@ "Selectivity": 0 } ] + }, + { + "Name": "TestOutOfRangeEstimation", + "Cases": [ + { + "Start": 500, + "End": 600, + "Count": 802.9395216313737 + }, + { + "Start": 600, + "End": 650, + "Count": 246.23240144080611 + }, + { + "Start": 650, + "End": 700, + "Count": 225.18916179197572 + }, + { + "Start": 700, + "End": 750, + "Count": 204.1459221431453 + }, + { + "Start": 750, + "End": 800, + "Count": 183.1026824943149 + }, + { + "Start": 850, + "End": 900, + "Count": 141.01620319665406 + }, + { + "Start": 900, + "End": 1000, + "Count": 212.9840827956541 + }, + { + "Start": 1000, + "End": 1100, + "Count": 128.81112420033242 + }, + { + "Start": 1100, + "End": 1200, + "Count": 44.70550397188701 + }, + { + "Start": 1200, + "End": 1300, + "Count": 5.9186046511627906 + }, + { + "Start": 0, + "End": 599, + "Count": 4500 + }, + { + "Start": 600, + "End": 1200, + "Count": 1500.8976876709517 + }, + { + "Start": 0, + "End": 1200, + "Count": 4500 + } + ] } ] From d5279ce49e4816d1574f058f532b447959d617ec Mon Sep 17 00:00:00 2001 From: time-and-fate <25057648+time-and-fate@users.noreply.github.com> Date: Tue, 27 Jul 2021 12:04:07 +0800 Subject: [PATCH 06/20] change calculation method --- cmd/explaintest/r/explain_union_scan.result | 6 +++--- statistics/histogram.go | 20 +++++++++---------- statistics/selectivity_test.go | 10 +++++----- statistics/testdata/stats_suite_out.json | 22 ++++++++++----------- 4 files changed, 29 insertions(+), 29 deletions(-) diff --git a/cmd/explaintest/r/explain_union_scan.result b/cmd/explaintest/r/explain_union_scan.result index 610929935ec63..1ef48623efd4a 100644 --- a/cmd/explaintest/r/explain_union_scan.result +++ b/cmd/explaintest/r/explain_union_scan.result @@ -24,8 +24,8 @@ Limit 10.00 root offset:0, count:10 │ ├─IndexRangeScan(Build) 1.00 cop[tikv] table:t1, index:PRIMARY(id) range: decided by [eq(test.city.id, test.city.id)], keep order:false │ └─Selection(Probe) 1.00 cop[tikv] gt(test.city.province_id, 1), lt(test.city.province_id, 100) │ └─TableRowIDScan 1.00 cop[tikv] table:t1 keep order:false - └─UnionScan(Probe) 536040.03 root gt(test.city.province_id, 1), lt(test.city.province_id, 100), not(isnull(test.city.province_id)) - └─TableReader 536040.03 root data:Selection - └─Selection 536040.03 cop[tikv] gt(test.city.province_id, 1), lt(test.city.province_id, 100), not(isnull(test.city.province_id)) + └─UnionScan(Probe) 536284.00 root gt(test.city.province_id, 1), lt(test.city.province_id, 100), not(isnull(test.city.province_id)) + └─TableReader 536284.00 root data:Selection + └─Selection 536284.00 cop[tikv] gt(test.city.province_id, 1), lt(test.city.province_id, 100), not(isnull(test.city.province_id)) └─TableFullScan 536284.00 cop[tikv] table:t3 keep order:false commit; diff --git a/statistics/histogram.go b/statistics/histogram.go index 2f8ed8b6a7ca0..bb72d6cfb88d0 100644 --- a/statistics/histogram.go +++ b/statistics/histogram.go @@ -921,11 +921,15 @@ func (hg *Histogram) outOfRangeRowCount(lDatum, rDatum *types.Datum, increaseCou rightPercent = (math.Pow(boundR-actualL, 2) - math.Pow(boundR-actualR, 2)) / math.Pow(histWidth, 2) } - totalPercent := leftPercent + rightPercent + totalPercent := leftPercent*0.5 + rightPercent*0.5 if totalPercent > 1 { totalPercent = 1 } - return totalPercent * float64(increaseCount) + rowCount := totalPercent * hg.notNullCount() + if rowCount > float64(increaseCount) { + return float64(increaseCount) + } + return rowCount } // Copy deep copies the histogram. @@ -1218,15 +1222,11 @@ func (c *Column) GetColumnRowCount(sc *stmtctx.StatementContext, ranges []*range // handling the out-of-range part if (c.outOfRange(lowVal) && !lowVal.IsNull()) || c.outOfRange(highVal) { - if c.StatsVer < 2 { - cnt += outOfRangeEQSelectivity(outOfRangeBetweenRate, tableRowCount, int64(c.TotalRowCount())) * c.TotalRowCount() - } else { - increaseCount := tableRowCount - int64(c.TotalRowCount()) - if increaseCount < 0 { - increaseCount = 0 - } - cnt += c.Histogram.outOfRangeRowCount(&lowVal, &highVal, increaseCount) + increaseCount := tableRowCount - int64(c.TotalRowCount()) + if increaseCount < 0 { + increaseCount = 0 } + cnt += c.Histogram.outOfRangeRowCount(&lowVal, &highVal, increaseCount) } rowCount += cnt diff --git a/statistics/selectivity_test.go b/statistics/selectivity_test.go index bc6ff8697f773..02e9dca25e51e 100644 --- a/statistics/selectivity_test.go +++ b/statistics/selectivity_test.go @@ -260,7 +260,7 @@ func (s *testStatsSuite) TestSelectivity(c *C) { { exprs: "a >= 1 and b > 1 and a < 2", selectivity: 0.01783264746, - selectivityAfterIncrease: 0.01803635116, + selectivityAfterIncrease: 0.01801783264, }, { exprs: "a >= 1 and c > 1 and a < 2", @@ -280,12 +280,12 @@ func (s *testStatsSuite) TestSelectivity(c *C) { { exprs: "b > 1", selectivity: 0.96296296296, - selectivityAfterIncrease: 0.97396296296, + selectivityAfterIncrease: 0.97296296296, }, { exprs: "a > 1 and b < 2 and c > 3 and d < 4 and e > 5", selectivity: 0, - selectivityAfterIncrease: 0.00003333788, + selectivityAfterIncrease: 0.00003239205, }, { exprs: longExpr, @@ -461,11 +461,11 @@ func (s *testStatsSuite) TestEstimationForUnknownValues(c *C) { count, err = statsTbl.GetRowCountByColumnRanges(sc, colID, getRange(9, 30)) c.Assert(err, IsNil) - c.Assert(count, Equals, 2.3000000000000003) + c.Assert(count, Equals, 7.2) count, err = statsTbl.GetRowCountByColumnRanges(sc, colID, getRange(9, math.MaxInt64)) c.Assert(err, IsNil) - c.Assert(count, Equals, 2.3000000000000003) + c.Assert(count, Equals, 7.2) idxID := table.Meta().Indices[0].ID count, err = statsTbl.GetRowCountByIndexRanges(sc, idxID, getRange(30, 30)) diff --git a/statistics/testdata/stats_suite_out.json b/statistics/testdata/stats_suite_out.json index 4925142a4fe1a..4193ee02f5201 100644 --- a/statistics/testdata/stats_suite_out.json +++ b/statistics/testdata/stats_suite_out.json @@ -616,52 +616,52 @@ { "Start": 500, "End": 600, - "Count": 802.9395216313737 + "Count": 722.1217353719635 }, { "Start": 600, "End": 650, - "Count": 246.23240144080611 + "Count": 484.7090567562406 }, { "Start": 650, "End": 700, - "Count": 225.18916179197572 + "Count": 442.9031532052471 }, { "Start": 700, "End": 750, - "Count": 204.1459221431453 + "Count": 401.0972496542537 }, { "Start": 750, "End": 800, - "Count": 183.1026824943149 + "Count": 359.2913461032602 }, { "Start": 850, "End": 900, - "Count": 141.01620319665406 + "Count": 275.6795390012733 }, { "Start": 900, "End": 1000, - "Count": 212.9840827956541 + "Count": 420.3279652877107 }, { "Start": 1000, "End": 1100, - "Count": 128.81112420033242 + "Count": 253.10435108373687 }, { "Start": 1100, "End": 1200, - "Count": 44.70550397188701 + "Count": 85.91418160260388 }, { "Start": 1200, "End": 1300, - "Count": 5.9186046511627906 + "Count": 5.61340206185567 }, { "Start": 0, @@ -671,7 +671,7 @@ { "Start": 600, "End": 1200, - "Count": 1500.8976876709517 + "Count": 1505.6134020618556 }, { "Start": 0, From 1065d5b172e73e7da5fb5e4e647e552a3ae6c541 Mon Sep 17 00:00:00 2001 From: time-and-fate <25057648+time-and-fate@users.noreply.github.com> Date: Tue, 27 Jul 2021 13:47:54 +0800 Subject: [PATCH 07/20] fix test --- statistics/selectivity_test.go | 10 +++++----- statistics/testdata/stats_suite_out.json | 22 +++++++++++----------- 2 files changed, 16 insertions(+), 16 deletions(-) diff --git a/statistics/selectivity_test.go b/statistics/selectivity_test.go index 02e9dca25e51e..6ef601bec45bc 100644 --- a/statistics/selectivity_test.go +++ b/statistics/selectivity_test.go @@ -391,7 +391,7 @@ func (s *testStatsSuite) TestOutOfRangeEstimation(c *C) { for i := 0; i < 3000; i++ { testKit.MustExec(fmt.Sprintf("insert into t values (%v)", i/5)) // [0, 600) } - testKit.MustExec("analyze table t with 1000 samples") + testKit.MustExec("analyze table t with 2000 samples") h := s.do.StatsHandle() table, err := s.do.InfoSchema().TableByName(model.NewCIStr("test"), model.NewCIStr("t")) @@ -403,8 +403,8 @@ func (s *testStatsSuite) TestOutOfRangeEstimation(c *C) { c.Assert(err, IsNil) // Because the ANALYZE collect data by random sampling, so the result is not an accurate value. // so we use a range here. - c.Assert(count < 4.5, IsTrue) - c.Assert(count > 3.5, IsTrue) + c.Assert(count < 5.5, IsTrue, Commentf("expected: around 5.0, got: %v", count)) + c.Assert(count > 4.5, IsTrue, Commentf("expected: around 5.0, got: %v", count)) var input []struct { Start int64 @@ -425,8 +425,8 @@ func (s *testStatsSuite) TestOutOfRangeEstimation(c *C) { output[i].End = ran.End output[i].Count = count }) - c.Assert(count < output[i].Count*1.2, IsTrue) - c.Assert(count > output[i].Count*0.8, IsTrue) + c.Assert(count < output[i].Count*1.2, IsTrue, Commentf("for [%v, %v], needed: around %v, got: %v", ran.Start, ran.End, output[i].Count, count)) + c.Assert(count > output[i].Count*0.8, IsTrue, Commentf("for [%v, %v], needed: around %v, got: %v", ran.Start, ran.End, output[i].Count, count)) } } diff --git a/statistics/testdata/stats_suite_out.json b/statistics/testdata/stats_suite_out.json index 4193ee02f5201..bf5244500b4bc 100644 --- a/statistics/testdata/stats_suite_out.json +++ b/statistics/testdata/stats_suite_out.json @@ -616,52 +616,52 @@ { "Start": 500, "End": 600, - "Count": 722.1217353719635 + "Count": 761.2535173044046 }, { "Start": 600, "End": 650, - "Count": 484.7090567562406 + "Count": 247.29717799654313 }, { "Start": 650, "End": 700, - "Count": 442.9031532052471 + "Count": 226.39422622104638 }, { "Start": 700, "End": 750, - "Count": 401.0972496542537 + "Count": 205.49127444554966 }, { "Start": 750, "End": 800, - "Count": 359.2913461032602 + "Count": 184.58832267005292 }, { "Start": 850, "End": 900, - "Count": 275.6795390012733 + "Count": 142.78241911905948 }, { "Start": 900, "End": 1000, - "Count": 420.3279652877107 + "Count": 215.10663226227817 }, { "Start": 1000, "End": 1100, - "Count": 253.10435108373687 + "Count": 131.49482516029124 }, { "Start": 1100, "End": 1200, - "Count": 85.91418160260388 + "Count": 47.89974041972475 }, { "Start": 1200, "End": 1300, - "Count": 5.61340206185567 + "Count": 7.749350649350649 }, { "Start": 0, @@ -671,7 +671,7 @@ { "Start": 600, "End": 1200, - "Count": 1505.6134020618556 + "Count": 1502.7451839942967 }, { "Start": 0, From fc21d226aafca6479e80737e05a168f2d3bbf2b7 Mon Sep 17 00:00:00 2001 From: time-and-fate <25057648+time-and-fate@users.noreply.github.com> Date: Tue, 27 Jul 2021 15:52:16 +0800 Subject: [PATCH 08/20] handle MinNotNull and MaxValue and unsigned case --- statistics/histogram.go | 12 ++++++++++++ statistics/scalar.go | 4 ++++ 2 files changed, 16 insertions(+) diff --git a/statistics/histogram.go b/statistics/histogram.go index bb72d6cfb88d0..0cfd68169d08f 100644 --- a/statistics/histogram.go +++ b/statistics/histogram.go @@ -873,6 +873,18 @@ func (hg *Histogram) outOfRangeRowCount(lDatum, rDatum *types.Datum, increaseCou // Convert the range we want to estimate to scalar value(float64) l := convertDatumToScalar(lDatum, commonPrefix) r := convertDatumToScalar(rDatum, commonPrefix) + // If this is an unsigned column, we need to make sure values are not negative. + // Normal negative value should have become 0. But this still might happen when met MinNotNull here. + // Maybe it's better to do this transformation in the ranger like the normal negative value. + if mysql.HasUnsignedFlag(hg.Tp.Flag) { + if l < 0 { + l = 0 + } + if r < 0 { + r = 0 + } + } + // make sure l < r if l >= r { return 0 diff --git a/statistics/scalar.go b/statistics/scalar.go index 5f4b42db85246..df5a85c1631cd 100644 --- a/statistics/scalar.go +++ b/statistics/scalar.go @@ -80,6 +80,10 @@ func convertDatumToScalar(value *types.Datum, commonPfxLen int) float64 { return 0 } return convertBytesToScalar(bytes[commonPfxLen:]) + case types.KindMinNotNull: + return -math.MaxFloat64 + case types.KindMaxValue: + return math.MaxFloat64 default: // do not know how to convert return 0 From 82274c89f1773c08dd2699ddf9695cb22b833ecf Mon Sep 17 00:00:00 2001 From: time-and-fate <25057648+time-and-fate@users.noreply.github.com> Date: Tue, 27 Jul 2021 16:53:07 +0800 Subject: [PATCH 09/20] fix test --- statistics/selectivity_test.go | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/statistics/selectivity_test.go b/statistics/selectivity_test.go index 6ef601bec45bc..1e0ad6180844b 100644 --- a/statistics/selectivity_test.go +++ b/statistics/selectivity_test.go @@ -260,7 +260,7 @@ func (s *testStatsSuite) TestSelectivity(c *C) { { exprs: "a >= 1 and b > 1 and a < 2", selectivity: 0.01783264746, - selectivityAfterIncrease: 0.01801783264, + selectivityAfterIncrease: 0.01851851852, }, { exprs: "a >= 1 and c > 1 and a < 2", @@ -280,12 +280,12 @@ func (s *testStatsSuite) TestSelectivity(c *C) { { exprs: "b > 1", selectivity: 0.96296296296, - selectivityAfterIncrease: 0.97296296296, + selectivityAfterIncrease: 1, }, { exprs: "a > 1 and b < 2 and c > 3 and d < 4 and e > 5", selectivity: 0, - selectivityAfterIncrease: 0.00003239205, + selectivityAfterIncrease: 0.00008258847, }, { exprs: longExpr, From bd7fd1bf769d045d9292024ae24283cc9d470be7 Mon Sep 17 00:00:00 2001 From: time-and-fate <25057648+time-and-fate@users.noreply.github.com> Date: Tue, 27 Jul 2021 17:41:42 +0800 Subject: [PATCH 10/20] fix test --- statistics/handle/update_test.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/statistics/handle/update_test.go b/statistics/handle/update_test.go index 9b16edb0bf7af..8ad77a0a633c1 100644 --- a/statistics/handle/update_test.go +++ b/statistics/handle/update_test.go @@ -1815,8 +1815,8 @@ func (s *testStatsSuite) TestAbnormalIndexFeedback(c *C) { sql: "select * from t where a = 2 and b > 10", hist: "column:2 ndv:20 totColSize:20\n" + "num: 5 lower_bound: -9223372036854775808 upper_bound: 7 repeats: 0 ndv: 0\n" + - "num: 4 lower_bound: 7 upper_bound: 14 repeats: 0 ndv: 0\n" + - "num: 5 lower_bound: 14 upper_bound: 9223372036854775807 repeats: 0 ndv: 0", + "num: 6 lower_bound: 7 upper_bound: 14 repeats: 0 ndv: 0\n" + + "num: 8 lower_bound: 14 upper_bound: 9223372036854775807 repeats: 0 ndv: 0", rangeID: tblInfo.Columns[1].ID, idxID: tblInfo.Indices[0].ID, eqCount: 3, From fdf00e12d1c7a1f7590c91f12c1ae49f3fa4009b Mon Sep 17 00:00:00 2001 From: time-and-fate <25057648+time-and-fate@users.noreply.github.com> Date: Tue, 27 Jul 2021 18:23:24 +0800 Subject: [PATCH 11/20] add test for unsigned case --- statistics/selectivity_test.go | 6 +- statistics/testdata/stats_suite_in.json | 68 ++++++++++------ statistics/testdata/stats_suite_out.json | 99 +++++++++++++++--------- 3 files changed, 109 insertions(+), 64 deletions(-) diff --git a/statistics/selectivity_test.go b/statistics/selectivity_test.go index 1e0ad6180844b..f02a44ef6b749 100644 --- a/statistics/selectivity_test.go +++ b/statistics/selectivity_test.go @@ -387,9 +387,9 @@ func (s *testStatsSuite) TestOutOfRangeEstimation(c *C) { testKit := testkit.NewTestKit(c, s.store) testKit.MustExec("use test") testKit.MustExec("drop table if exists t") - testKit.MustExec("create table t(a int)") + testKit.MustExec("create table t(a int unsigned)") for i := 0; i < 3000; i++ { - testKit.MustExec(fmt.Sprintf("insert into t values (%v)", i/5)) // [0, 600) + testKit.MustExec(fmt.Sprintf("insert into t values (%v)", i/5+300)) // [300, 900) } testKit.MustExec("analyze table t with 2000 samples") @@ -399,7 +399,7 @@ func (s *testStatsSuite) TestOutOfRangeEstimation(c *C) { statsTbl := h.GetTableStats(table.Meta()) sc := &stmtctx.StatementContext{} col := statsTbl.Columns[table.Meta().Columns[0].ID] - count, err := col.GetColumnRowCount(sc, getRange(600, 600), statsTbl.Count, false) + count, err := col.GetColumnRowCount(sc, getRange(900, 900), statsTbl.Count, false) c.Assert(err, IsNil) // Because the ANALYZE collect data by random sampling, so the result is not an accurate value. // so we use a range here. diff --git a/statistics/testdata/stats_suite_in.json b/statistics/testdata/stats_suite_in.json index ad82329f3b1bd..c5182f253d76e 100644 --- a/statistics/testdata/stats_suite_in.json +++ b/statistics/testdata/stats_suite_in.json @@ -142,56 +142,76 @@ "Name": "TestOutOfRangeEstimation", "Cases": [ { - "Start": 500, - "End": 600 + "Start": 800, + "End": 900 }, { - "Start": 600, - "End": 650 + "Start": 900, + "End": 950 }, { - "Start": 650, - "End": 700 + "Start": 950, + "End": 1000 }, { - "Start": 700, - "End": 750 + "Start": 1000, + "End": 1050 }, { - "Start": 750, - "End": 800 + "Start": 1050, + "End": 1100 }, { - "Start": 850, - "End": 900 + "Start": 1150, + "End": 1200 + }, + { + "Start": 1200, + "End": 1300 + }, + { + "Start": 1300, + "End": 1400 + }, + { + "Start": 1400, + "End": 1500 + }, + { + "Start": 1500, + "End": 1600 + }, + { + "Start": 300, + "End": 899 }, { "Start": 900, - "End": 1000 + "End": 1500 }, { - "Start": 1000, - "End": 1100 + "Start": 300, + "End": 1500 }, { - "Start": 1100, - "End": 1200 + "Start": 200, + "End": 300 }, { - "Start": 1200, - "End": 1300 + "Start": 100, + "End": 200 }, { "Start": 0, - "End": 599 + "End": 100 }, { - "Start": 600, - "End": 1200 + "Start": -100, + "End": 100 }, { - "Start": 0, - "End": 1200 + "Start": -100, + "End": 0 } ] } diff --git a/statistics/testdata/stats_suite_out.json b/statistics/testdata/stats_suite_out.json index bf5244500b4bc..0dc6e595ac5ea 100644 --- a/statistics/testdata/stats_suite_out.json +++ b/statistics/testdata/stats_suite_out.json @@ -614,69 +614,94 @@ "Name": "TestOutOfRangeEstimation", "Cases": [ { - "Start": 500, - "End": 600, - "Count": 761.2535173044046 + "Start": 800, + "End": 900, + "Count": 762.7262445771319 }, { - "Start": 600, - "End": 650, - "Count": 247.29717799654313 + "Start": 900, + "End": 950, + "Count": 247.2699052692704 }, { - "Start": 650, - "End": 700, - "Count": 226.39422622104638 + "Start": 950, + "End": 1000, + "Count": 226.36695349377365 }, { - "Start": 700, - "End": 750, - "Count": 205.49127444554966 + "Start": 1000, + "End": 1050, + "Count": 205.46400171827693 }, { - "Start": 750, - "End": 800, - "Count": 184.58832267005292 + "Start": 1050, + "End": 1100, + "Count": 184.5610499427802 }, { - "Start": 850, - "End": 900, - "Count": 142.78241911905948 + "Start": 1150, + "End": 1200, + "Count": 142.75514639178675 }, { - "Start": 900, - "End": 1000, - "Count": 215.10663226227817 + "Start": 1200, + "End": 1300, + "Count": 215.07935953500544 }, { - "Start": 1000, - "End": 1100, - "Count": 131.49482516029124 + "Start": 1300, + "End": 1400, + "Count": 131.4675524330185 }, { - "Start": 1100, - "End": 1200, - "Count": 47.89974041972475 + "Start": 1400, + "End": 1500, + "Count": 47.87246769245203 }, { - "Start": 1200, - "End": 1300, - "Count": 7.749350649350649 + "Start": 1500, + "End": 1600, + "Count": 7.722077922077922 }, { - "Start": 0, - "End": 599, + "Start": 300, + "End": 899, "Count": 4500 }, { - "Start": 600, - "End": 1200, - "Count": 1502.7451839942967 + "Start": 900, + "End": 1500, + "Count": 1502.717911267024 }, { - "Start": 0, - "End": 1200, + "Start": 300, + "End": 1500, "Count": 4500 + }, + { + "Start": 200, + "End": 300, + "Count": 466.750898911986 + }, + { + "Start": 100, + "End": 200, + "Count": 383.13909180999906 + }, + { + "Start": 0, + "End": 100, + "Count": 299.5272847080122 + }, + { + "Start": -100, + "End": 100, + "Count": 299.5272847080122 + }, + { + "Start": -100, + "End": 0, + "Count": 7.722077922077922 } ] } From c00e0b0fc2bb59da75d53f01b4364aabc2efb0b9 Mon Sep 17 00:00:00 2001 From: time-and-fate <25057648+time-and-fate@users.noreply.github.com> Date: Tue, 27 Jul 2021 19:50:58 +0800 Subject: [PATCH 12/20] fix test --- planner/core/testdata/analyze_suite_out.json | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/planner/core/testdata/analyze_suite_out.json b/planner/core/testdata/analyze_suite_out.json index 416fa0940e412..ac434e5d18f25 100644 --- a/planner/core/testdata/analyze_suite_out.json +++ b/planner/core/testdata/analyze_suite_out.json @@ -463,8 +463,9 @@ "Plan": [ "Projection 5.95 root test.t.c, test.t.b", "└─TopN 5.95 root test.t.b, offset:0, count:6", - " └─IndexReader 5.95 root index:IndexRangeScan", - " └─IndexRangeScan 5.95 cop[tikv] table:t, index:PRIMARY(a, c, b) range:[7639902,7639902], keep order:false" + " └─IndexReader 5.95 root index:TopN", + " └─TopN 5.95 cop[tikv] test.t.b, offset:0, count:6", + " └─IndexRangeScan 5.95 cop[tikv] table:t, index:PRIMARY(a, c, b) range:[7639902,7639902], keep order:false" ] } ] From e1c8220a40a4fc9e82d00385ef482c6479a71ab2 Mon Sep 17 00:00:00 2001 From: time-and-fate <25057648+time-and-fate@users.noreply.github.com> Date: Wed, 28 Jul 2021 01:24:27 +0800 Subject: [PATCH 13/20] small fixup --- statistics/histogram.go | 13 +++++-------- statistics/selectivity_test.go | 8 ++++---- 2 files changed, 9 insertions(+), 12 deletions(-) diff --git a/statistics/histogram.go b/statistics/histogram.go index 0cfd68169d08f..905ec97756796 100644 --- a/statistics/histogram.go +++ b/statistics/histogram.go @@ -1399,17 +1399,14 @@ func (idx *Index) GetRowCount(sc *stmtctx.StatementContext, coll *HistColl, inde // If the current table row count has changed, we should scale the row count accordingly. totalCount *= idx.GetIncreaseFactor(tableRowCount) + // handling the out-of-range part if (idx.outOfRange(l) && !(isSingleCol && lowIsNull)) || idx.outOfRange(r) { - if idx.StatsVer < 2 { - totalCount += outOfRangeEQSelectivity(outOfRangeBetweenRate, tableRowCount, int64(idx.TotalRowCount())) * idx.TotalRowCount() - } else { - increaseCount := tableRowCount - int64(idx.TotalRowCount()) - if increaseCount < 0 { - increaseCount = 0 - } - totalCount += idx.Histogram.outOfRangeRowCount(&l, &r, increaseCount) + increaseCount := tableRowCount - int64(idx.TotalRowCount()) + if increaseCount < 0 { + increaseCount = 0 } + totalCount += idx.Histogram.outOfRangeRowCount(&l, &r, increaseCount) } } if totalCount > float64(tableRowCount) { diff --git a/statistics/selectivity_test.go b/statistics/selectivity_test.go index f02a44ef6b749..001dfac5f9671 100644 --- a/statistics/selectivity_test.go +++ b/statistics/selectivity_test.go @@ -265,12 +265,12 @@ func (s *testStatsSuite) TestSelectivity(c *C) { { exprs: "a >= 1 and c > 1 and a < 2", selectivity: 0.00617283950, - selectivityAfterIncrease: 0.00619135802, + selectivityAfterIncrease: 0.00617283950, }, { exprs: "a >= 1 and c >= 1 and a < 2", selectivity: 0.01234567901, - selectivityAfterIncrease: 0.01236419753, + selectivityAfterIncrease: 0.01234567901, }, { exprs: "d = 0 and e = 1", @@ -285,7 +285,7 @@ func (s *testStatsSuite) TestSelectivity(c *C) { { exprs: "a > 1 and b < 2 and c > 3 and d < 4 and e > 5", selectivity: 0, - selectivityAfterIncrease: 0.00008258847, + selectivityAfterIncrease: 0, }, { exprs: longExpr, @@ -474,7 +474,7 @@ func (s *testStatsSuite) TestEstimationForUnknownValues(c *C) { count, err = statsTbl.GetRowCountByIndexRanges(sc, idxID, getRange(9, 30)) c.Assert(err, IsNil) - c.Assert(count, Equals, 2.1) + c.Assert(count, Equals, 7.0) testKit.MustExec("truncate table t") testKit.MustExec("insert into t values (null, null)") From 9e5d424e3edfee3ccbb52991cde0a84d0e61f546 Mon Sep 17 00:00:00 2001 From: time-and-fate <25057648+time-and-fate@users.noreply.github.com> Date: Thu, 29 Jul 2021 18:24:40 +0800 Subject: [PATCH 14/20] improve comments and variable names --- statistics/histogram.go | 57 +++++++++++++++++++++-------------------- statistics/table.go | 8 +++--- 2 files changed, 33 insertions(+), 32 deletions(-) diff --git a/statistics/histogram.go b/statistics/histogram.go index 905ec97756796..997ed721734a1 100644 --- a/statistics/histogram.go +++ b/statistics/histogram.go @@ -549,12 +549,12 @@ func (hg *Histogram) mergeBuckets(bucketIdx int) { } // GetIncreaseFactor get the increase factor to adjust the final estimated count when the table is modified. -func (idx *Index) GetIncreaseFactor(totalCount int64) float64 { +func (idx *Index) GetIncreaseFactor(realtimeRowCount int64) float64 { columnCount := idx.TotalRowCount() if columnCount == 0 { return 1.0 } - return float64(totalCount) / columnCount + return float64(realtimeRowCount) / columnCount } // BetweenRowCount estimates the row count for interval [l, r). @@ -836,6 +836,7 @@ func (hg *Histogram) outOfRange(val types.Datum) bool { // Here we assume the density of data is decreasing from the lower/upper bound of the histogram toward outside. // The maximum row count it can get is the increaseCount. It reaches the maximum when out-of-range width reaches histogram range width. // As it shows below. To calculate the out-of-range row count, we need to calculate the percentage of the shaded area. +// Note that we assume histL-boundL == histR-histL == boundR-histR here. // // /│ │\ // / │ │ \ @@ -905,7 +906,7 @@ func (hg *Histogram) outOfRangeRowCount(lDatum, rDatum *types.Datum, increaseCou // keep l and r unchanged, use actualL and actualR to calculate. actualL := l actualR := r - // Handling the out-of-range part on the left of the histogram range + // If the range overlaps with (boundL,histL), we need to handle the out-of-range part on the left of the histogram range if actualL < histL && actualR > boundL { // make sure boundL <= actualL < actualR <= histL if actualL < boundL { @@ -920,7 +921,7 @@ func (hg *Histogram) outOfRangeRowCount(lDatum, rDatum *types.Datum, increaseCou actualL = l actualR = r - // Handling the out-of-range part on the right of the histogram range + // If the range overlaps with (histR,boundR), we need to handle the out-of-range part on the right of the histogram range if actualL < boundR && actualR > histR { // make sure histR <= actualL < actualR <= boundR if actualL < histR { @@ -1040,13 +1041,13 @@ func (c *Column) notNullCount() float64 { } // GetIncreaseFactor get the increase factor to adjust the final estimated count when the table is modified. -func (c *Column) GetIncreaseFactor(totalCount int64) float64 { +func (c *Column) GetIncreaseFactor(realtimeRowCount int64) float64 { columnCount := c.TotalRowCount() if columnCount == 0 { // avoid dividing by 0 return 1.0 } - return float64(totalCount) / columnCount + return float64(realtimeRowCount) / columnCount } // MemoryUsage returns the total memory usage of Histogram and CMSketch in Column. @@ -1079,7 +1080,7 @@ func (c *Column) IsInvalid(sc *stmtctx.StatementContext, collPseudo bool) bool { return c.TotalRowCount() == 0 || (c.Histogram.NDV > 0 && c.notNullCount() == 0) } -func (c *Column) equalRowCount(sc *stmtctx.StatementContext, val types.Datum, encodedVal []byte, tableRowCount int64) (float64, error) { +func (c *Column) equalRowCount(sc *stmtctx.StatementContext, val types.Datum, encodedVal []byte, realtimeRowCount int64) (float64, error) { if val.IsNull() { return float64(c.NullCount), nil } @@ -1089,7 +1090,7 @@ func (c *Column) equalRowCount(sc *stmtctx.StatementContext, val types.Datum, en return 0.0, nil } if c.Histogram.NDV > 0 && c.outOfRange(val) { - return outOfRangeEQSelectivity(c.Histogram.NDV, tableRowCount, int64(c.TotalRowCount())) * c.TotalRowCount(), nil + return outOfRangeEQSelectivity(c.Histogram.NDV, realtimeRowCount, int64(c.TotalRowCount())) * c.TotalRowCount(), nil } if c.CMSketch != nil { count, err := queryValue(sc, c.CMSketch, c.TopN, val) @@ -1141,7 +1142,7 @@ func (c *Column) equalRowCount(sc *stmtctx.StatementContext, val types.Datum, en } // GetColumnRowCount estimates the row count by a slice of Range. -func (c *Column) GetColumnRowCount(sc *stmtctx.StatementContext, ranges []*ranger.Range, tableRowCount int64, pkIsHandle bool) (float64, error) { +func (c *Column) GetColumnRowCount(sc *stmtctx.StatementContext, ranges []*ranger.Range, realtimeRowCount int64, pkIsHandle bool) (float64, error) { var rowCount float64 for _, rg := range ranges { highVal := *rg.HighVal[0].Clone() @@ -1173,12 +1174,12 @@ func (c *Column) GetColumnRowCount(sc *stmtctx.StatementContext, ranges []*range continue } var cnt float64 - cnt, err = c.equalRowCount(sc, lowVal, lowEncoded, tableRowCount) + cnt, err = c.equalRowCount(sc, lowVal, lowEncoded, realtimeRowCount) if err != nil { return 0, errors.Trace(err) } // If the current table row count has changed, we should scale the row count accordingly. - cnt *= c.GetIncreaseFactor(tableRowCount) + cnt *= c.GetIncreaseFactor(realtimeRowCount) rowCount += cnt } continue @@ -1188,12 +1189,12 @@ func (c *Column) GetColumnRowCount(sc *stmtctx.StatementContext, ranges []*range // case 2: it's a small range if rangeVals != nil { for _, val := range rangeVals { - cnt, err := c.equalRowCount(sc, val, lowEncoded, tableRowCount) + cnt, err := c.equalRowCount(sc, val, lowEncoded, realtimeRowCount) if err != nil { return 0, err } // If the current table row count has changed, we should scale the row count accordingly. - cnt *= c.GetIncreaseFactor(tableRowCount) + cnt *= c.GetIncreaseFactor(realtimeRowCount) rowCount += cnt } @@ -1206,7 +1207,7 @@ func (c *Column) GetColumnRowCount(sc *stmtctx.StatementContext, ranges []*range // Note that, `cnt` does not include null values, we need specially handle cases // where null is the lower bound. if rg.LowExclude && !lowVal.IsNull() { - lowCnt, err := c.equalRowCount(sc, lowVal, lowEncoded, tableRowCount) + lowCnt, err := c.equalRowCount(sc, lowVal, lowEncoded, realtimeRowCount) if err != nil { return 0, errors.Trace(err) } @@ -1216,7 +1217,7 @@ func (c *Column) GetColumnRowCount(sc *stmtctx.StatementContext, ranges []*range cnt += float64(c.NullCount) } if !rg.HighExclude { - highCnt, err := c.equalRowCount(sc, highVal, highEncoded, tableRowCount) + highCnt, err := c.equalRowCount(sc, highVal, highEncoded, realtimeRowCount) if err != nil { return 0, errors.Trace(err) } @@ -1230,11 +1231,11 @@ func (c *Column) GetColumnRowCount(sc *stmtctx.StatementContext, ranges []*range } // If the current table row count has changed, we should scale the row count accordingly. - cnt *= c.GetIncreaseFactor(tableRowCount) + cnt *= c.GetIncreaseFactor(realtimeRowCount) // handling the out-of-range part if (c.outOfRange(lowVal) && !lowVal.IsNull()) || c.outOfRange(highVal) { - increaseCount := tableRowCount - int64(c.TotalRowCount()) + increaseCount := realtimeRowCount - int64(c.TotalRowCount()) if increaseCount < 0 { increaseCount = 0 } @@ -1243,8 +1244,8 @@ func (c *Column) GetColumnRowCount(sc *stmtctx.StatementContext, ranges []*range rowCount += cnt } - if rowCount > float64(tableRowCount) { - rowCount = float64(tableRowCount) + if rowCount > float64(realtimeRowCount) { + rowCount = float64(realtimeRowCount) } else if rowCount < 0 { rowCount = 0 } @@ -1293,7 +1294,7 @@ func (idx *Index) MemoryUsage() (sum int64) { var nullKeyBytes, _ = codec.EncodeKey(nil, nil, types.NewDatum(nil)) -func (idx *Index) equalRowCount(b []byte, tableRowCount int64) float64 { +func (idx *Index) equalRowCount(b []byte, realtimeRowCount int64) float64 { if len(idx.Info.Columns) == 1 { if bytes.Equal(b, nullKeyBytes) { return float64(idx.NullCount) @@ -1302,7 +1303,7 @@ func (idx *Index) equalRowCount(b []byte, tableRowCount int64) float64 { val := types.NewBytesDatum(b) if idx.StatsVer < Version2 { if idx.NDV > 0 && idx.outOfRange(val) { - return outOfRangeEQSelectivity(idx.NDV, tableRowCount, int64(idx.TotalRowCount())) * idx.TotalRowCount() + return outOfRangeEQSelectivity(idx.NDV, realtimeRowCount, int64(idx.TotalRowCount())) * idx.TotalRowCount() } if idx.CMSketch != nil { return float64(idx.QueryBytes(b)) @@ -1334,7 +1335,7 @@ func (idx *Index) QueryBytes(d []byte) uint64 { // GetRowCount returns the row count of the given ranges. // It uses the modifyCount to adjust the influence of modifications on the table. -func (idx *Index) GetRowCount(sc *stmtctx.StatementContext, coll *HistColl, indexRanges []*ranger.Range, tableRowCount int64) (float64, error) { +func (idx *Index) GetRowCount(sc *stmtctx.StatementContext, coll *HistColl, indexRanges []*ranger.Range, realtimeRowCount int64) (float64, error) { totalCount := float64(0) isSingleCol := len(idx.Info.Columns) == 1 for _, indexRange := range indexRanges { @@ -1358,9 +1359,9 @@ func (idx *Index) GetRowCount(sc *stmtctx.StatementContext, coll *HistColl, inde totalCount += 1 continue } - count := idx.equalRowCount(lb, tableRowCount) + count := idx.equalRowCount(lb, realtimeRowCount) // If the current table row count has changed, we should scale the row count accordingly. - count *= idx.GetIncreaseFactor(tableRowCount) + count *= idx.GetIncreaseFactor(realtimeRowCount) totalCount += count continue } @@ -1398,19 +1399,19 @@ func (idx *Index) GetRowCount(sc *stmtctx.StatementContext, coll *HistColl, inde } // If the current table row count has changed, we should scale the row count accordingly. - totalCount *= idx.GetIncreaseFactor(tableRowCount) + totalCount *= idx.GetIncreaseFactor(realtimeRowCount) // handling the out-of-range part if (idx.outOfRange(l) && !(isSingleCol && lowIsNull)) || idx.outOfRange(r) { - increaseCount := tableRowCount - int64(idx.TotalRowCount()) + increaseCount := realtimeRowCount - int64(idx.TotalRowCount()) if increaseCount < 0 { increaseCount = 0 } totalCount += idx.Histogram.outOfRangeRowCount(&l, &r, increaseCount) } } - if totalCount > float64(tableRowCount) { - totalCount = float64(tableRowCount) + if totalCount > float64(realtimeRowCount) { + totalCount = float64(realtimeRowCount) } return totalCount, nil } diff --git a/statistics/table.go b/statistics/table.go index 46b1ce47f6c3b..b8c9df6e1aeeb 100644 --- a/statistics/table.go +++ b/statistics/table.go @@ -470,8 +470,8 @@ func isSingleColIdxNullRange(idx *Index, ran *ranger.Range) bool { // It assumes all modifications are insertions and all new-inserted rows are uniformly distributed // and has the same distribution with analyzed rows, which means each unique value should have the // same number of rows(Tot/NDV) of it. -func outOfRangeEQSelectivity(ndv, tableRowCount, totalRows int64) float64 { - increaseRowCount := tableRowCount - totalRows +func outOfRangeEQSelectivity(ndv, realtimeRowCount, columnRowCount int64) float64 { + increaseRowCount := realtimeRowCount - columnRowCount if increaseRowCount <= 0 { return 0 // it must be 0 since the histogram contains the whole data } @@ -479,8 +479,8 @@ func outOfRangeEQSelectivity(ndv, tableRowCount, totalRows int64) float64 { ndv = outOfRangeBetweenRate // avoid inaccurate selectivity caused by small NDV } selectivity := 1 / float64(ndv) - if selectivity*float64(totalRows) > float64(increaseRowCount) { - selectivity = float64(increaseRowCount) / float64(totalRows) + if selectivity*float64(columnRowCount) > float64(increaseRowCount) { + selectivity = float64(increaseRowCount) / float64(columnRowCount) } return selectivity } From 6854f6039ea6290b6bf21245e4c3cba60c97a342 Mon Sep 17 00:00:00 2001 From: time-and-fate <25057648+time-and-fate@users.noreply.github.com> Date: Thu, 29 Jul 2021 18:30:59 +0800 Subject: [PATCH 15/20] add test case --- statistics/testdata/stats_suite_in.json | 12 ++++++ statistics/testdata/stats_suite_out.json | 47 ++++++++++++++++-------- 2 files changed, 43 insertions(+), 16 deletions(-) diff --git a/statistics/testdata/stats_suite_in.json b/statistics/testdata/stats_suite_in.json index c5182f253d76e..2d16ea92385f1 100644 --- a/statistics/testdata/stats_suite_in.json +++ b/statistics/testdata/stats_suite_in.json @@ -185,6 +185,10 @@ "Start": 300, "End": 899 }, + { + "Start": 800, + "End": 1000 + }, { "Start": 900, "End": 1500 @@ -201,6 +205,14 @@ "Start": 100, "End": 200 }, + { + "Start": 200, + "End": 400 + }, + { + "Start": 200, + "End": 1000 + }, { "Start": 0, "End": 100 diff --git a/statistics/testdata/stats_suite_out.json b/statistics/testdata/stats_suite_out.json index 0dc6e595ac5ea..d6bd7b8201d5d 100644 --- a/statistics/testdata/stats_suite_out.json +++ b/statistics/testdata/stats_suite_out.json @@ -616,62 +616,67 @@ { "Start": 800, "End": 900, - "Count": 762.7262445771319 + "Count": 774.6790371213752 }, { "Start": 900, "End": 950, - "Count": 247.2699052692704 + "Count": 247.22269781351372 }, { "Start": 950, "End": 1000, - "Count": 226.36695349377365 + "Count": 226.31974603801697 }, { "Start": 1000, "End": 1050, - "Count": 205.46400171827693 + "Count": 205.41679426252026 }, { "Start": 1050, "End": 1100, - "Count": 184.5610499427802 + "Count": 184.5138424870235 }, { "Start": 1150, "End": 1200, - "Count": 142.75514639178675 + "Count": 142.70793893603008 }, { "Start": 1200, "End": 1300, - "Count": 215.07935953500544 + "Count": 215.03215207924876 }, { "Start": 1300, "End": 1400, - "Count": 131.4675524330185 + "Count": 131.42034497726183 }, { "Start": 1400, "End": 1500, - "Count": 47.87246769245203 + "Count": 47.82526023669535 }, { "Start": 1500, "End": 1600, - "Count": 7.722077922077922 + "Count": 7.674870466321244 }, { "Start": 300, "End": 899, "Count": 4500 }, + { + "Start": 800, + "End": 1000, + "Count": 1232.8717400402634 + }, { "Start": 900, "End": 1500, - "Count": 1502.717911267024 + "Count": 1502.6707038112672 }, { "Start": 300, @@ -681,27 +686,37 @@ { "Start": 200, "End": 300, - "Count": 466.750898911986 + "Count": 466.7036914562293 }, { "Start": 100, "End": 200, - "Count": 383.13909180999906 + "Count": 383.0918843542424 + }, + { + "Start": 200, + "End": 400, + "Count": 1213.7036914562293 + }, + { + "Start": 200, + "End": 1000, + "Count": 4500 }, { "Start": 0, "End": 100, - "Count": 299.5272847080122 + "Count": 299.4800772522555 }, { "Start": -100, "End": 100, - "Count": 299.5272847080122 + "Count": 299.4800772522555 }, { "Start": -100, "End": 0, - "Count": 7.722077922077922 + "Count": 7.674870466321244 } ] } From c8043984074a0e291e49806213dc307755312e75 Mon Sep 17 00:00:00 2001 From: time-and-fate <25057648+time-and-fate@users.noreply.github.com> Date: Thu, 29 Jul 2021 18:53:25 +0800 Subject: [PATCH 16/20] improve `commonPrefixLength` to simplify code --- statistics/histogram.go | 13 ++++--------- statistics/scalar.go | 20 ++++++++++++++------ 2 files changed, 18 insertions(+), 15 deletions(-) diff --git a/statistics/histogram.go b/statistics/histogram.go index 997ed721734a1..2ea58d7c7c949 100644 --- a/statistics/histogram.go +++ b/statistics/histogram.go @@ -860,15 +860,10 @@ func (hg *Histogram) outOfRangeRowCount(lDatum, rDatum *types.Datum, increaseCou commonPrefix := 0 if hg.GetLower(0).Kind() == types.KindBytes || hg.GetLower(0).Kind() == types.KindString { // Calculate the common prefix length among the lower and upper bound of histogram and the range we want to estimate. - commonPrefix = commonPrefixLength(hg.GetLower(0).GetBytes(), hg.GetUpper(hg.Len()-1).GetBytes()) - commonPrefix2 := commonPrefixLength(lDatum.GetBytes(), rDatum.GetBytes()) - if commonPrefix2 < commonPrefix { - commonPrefix = commonPrefix2 - } - commonPrefix3 := commonPrefixLength(hg.GetLower(0).GetBytes(), lDatum.GetBytes()) - if commonPrefix3 < commonPrefix { - commonPrefix = commonPrefix3 - } + commonPrefix = commonPrefixLength(hg.GetLower(0).GetBytes(), + hg.GetUpper(hg.Len()-1).GetBytes(), + lDatum.GetBytes(), + rDatum.GetBytes()) } // Convert the range we want to estimate to scalar value(float64) diff --git a/statistics/scalar.go b/statistics/scalar.go index df5a85c1631cd..b0eb0bf5c0ea3 100644 --- a/statistics/scalar.go +++ b/statistics/scalar.go @@ -143,14 +143,22 @@ func (hg *Histogram) calcFraction(index int, value *types.Datum) float64 { return 0.5 } -func commonPrefixLength(lower, upper []byte) int { - minLen := len(lower) - if minLen > len(upper) { - minLen = len(upper) +func commonPrefixLength(strs ...[]byte) int { + if len(strs) == 0 { + return -1 + } + minLen := len(strs[0]) + for _, str := range strs { + if len(str) < minLen { + minLen = len(str) + } } for i := 0; i < minLen; i++ { - if lower[i] != upper[i] { - return i + a := strs[0][i] + for _, str := range strs { + if str[i] != a { + return i + } } } return minLen From e78460d0ac83b7804796dd4a8944ae88095db758 Mon Sep 17 00:00:00 2001 From: time-and-fate <25057648+time-and-fate@users.noreply.github.com> Date: Thu, 29 Jul 2021 19:54:29 +0800 Subject: [PATCH 17/20] fix uniform distribution calculation and cleanup code --- statistics/cmsketch.go | 2 +- statistics/histogram.go | 71 ++++++++++++++--------------------- statistics/statistics_test.go | 8 ++-- 3 files changed, 33 insertions(+), 48 deletions(-) diff --git a/statistics/cmsketch.go b/statistics/cmsketch.go index c510186b16c40..80a72b77b8c63 100644 --- a/statistics/cmsketch.go +++ b/statistics/cmsketch.go @@ -767,7 +767,7 @@ func MergePartTopN2GlobalTopN(sc *stmtctx.StatementContext, version int, topNs [ datum = d } // Get the row count which the value is equal to the encodedVal from histogram. - count := hists[j].equalRowCount(datum, isIndex) + count, _ := hists[j].equalRowCount(datum, isIndex) if count != 0 { counter[encodedVal] += count // Remove the value corresponding to encodedVal from the histogram. diff --git a/statistics/histogram.go b/statistics/histogram.go index 1e70901665db6..597985e12ecdf 100644 --- a/statistics/histogram.go +++ b/statistics/histogram.go @@ -425,35 +425,37 @@ func (hg *Histogram) ToString(idxCols int) string { } // equalRowCount estimates the row count where the column equals to value. -func (hg *Histogram) equalRowCount(value types.Datum, hasBucketNDV bool) float64 { +// matched: return true if this returned row count is from Bucket.Repeat or bucket NDV, which is more accurate than if not. +func (hg *Histogram) equalRowCount(value types.Datum, hasBucketNDV bool) (count float64, matched bool) { index, match := hg.Bounds.LowerBound(0, &value) // Since we store the lower and upper bound together, if the index is an odd number, then it points to a upper bound. if index%2 == 1 { if match { - return float64(hg.Buckets[index/2].Repeat) + return float64(hg.Buckets[index/2].Repeat), true } if hasBucketNDV && hg.Buckets[index/2].NDV > 1 { - return float64(hg.bucketCount(index/2)-hg.Buckets[index/2].Repeat) / float64(hg.Buckets[index/2].NDV-1) + return float64(hg.bucketCount(index/2)-hg.Buckets[index/2].Repeat) / float64(hg.Buckets[index/2].NDV-1), true } - return hg.notNullCount() / float64(hg.NDV) + return hg.notNullCount() / float64(hg.NDV), false } if match { cmp := chunk.GetCompareFunc(hg.Tp) if cmp(hg.Bounds.GetRow(index), 0, hg.Bounds.GetRow(index+1), 0) == 0 { - return float64(hg.Buckets[index/2].Repeat) + return float64(hg.Buckets[index/2].Repeat), true } if hasBucketNDV && hg.Buckets[index/2].NDV > 1 { - return float64(hg.bucketCount(index/2)-hg.Buckets[index/2].Repeat) / float64(hg.Buckets[index/2].NDV-1) + return float64(hg.bucketCount(index/2)-hg.Buckets[index/2].Repeat) / float64(hg.Buckets[index/2].NDV-1), true } - return hg.notNullCount() / float64(hg.NDV) + return hg.notNullCount() / float64(hg.NDV), false } - return 0 + return 0, false } // greaterRowCount estimates the row count where the column greater than value. // It's deprecated. Only used for test. func (hg *Histogram) greaterRowCount(value types.Datum) float64 { - gtCount := hg.notNullCount() - hg.lessRowCount(value) - hg.equalRowCount(value, false) + histRowCount, _ := hg.equalRowCount(value, false) + gtCount := hg.notNullCount() - hg.lessRowCount(value) - histRowCount return math.Max(0, gtCount) } @@ -1091,7 +1093,8 @@ func (c *Column) equalRowCount(sc *stmtctx.StatementContext, val types.Datum, en count, err := queryValue(sc, c.CMSketch, c.TopN, val) return float64(count), errors.Trace(err) } - return c.Histogram.equalRowCount(val, false), nil + histRowCount, _ := c.Histogram.equalRowCount(val, false) + return histRowCount, nil } // Stats version == 2 @@ -1106,34 +1109,13 @@ func (c *Column) equalRowCount(sc *stmtctx.StatementContext, val types.Datum, en return float64(rowcount), nil } } - // 2. try to find this value in bucket.repeats(the last value in every bucket) - index, match := c.Histogram.Bounds.LowerBound(0, &val) - if index%2 == 1 && match { - return float64(c.Histogram.Buckets[index/2].Repeat), nil - } - if match { - cmp := chunk.GetCompareFunc(c.Histogram.Tp) - if cmp(c.Histogram.Bounds.GetRow(index), 0, c.Histogram.Bounds.GetRow(index+1), 0) == 0 { - return float64(c.Histogram.Buckets[index/2].Repeat), nil - } + // 2. try to find this value in bucket.Repeat(the last value in every bucket) + histCnt, matched := c.Histogram.equalRowCount(val, true) + if matched { + return histCnt, nil } // 3. use uniform distribution assumption for the rest (even when this value is not covered by the range of stats) - cnt := c.Histogram.notNullCount() - for _, bkt := range c.Histogram.Buckets { - if cnt <= float64(bkt.Repeat) { - return 0, nil - } - cnt -= float64(bkt.Repeat) - } - topNLen := int64(0) - if c.TopN != nil { - topNLen = int64(len(c.TopN.TopN)) - } - ndv := c.Histogram.NDV - topNLen - int64(len(c.Histogram.Buckets)) - if ndv <= 0 { - return 0, nil - } - return cnt / float64(ndv), nil + return c.Histogram.notNullCount() / float64(c.Histogram.NDV-int64(len(c.TopN.TopN))), nil } // GetColumnRowCount estimates the row count by a slice of Range. @@ -1307,20 +1289,22 @@ func (idx *Index) equalRowCount(b []byte, realtimeRowCount int64) float64 { if idx.CMSketch != nil { return float64(idx.QueryBytes(b)) } - return idx.Histogram.equalRowCount(val, false) + histRowCount, _ := idx.Histogram.equalRowCount(val, false) + return histRowCount } // stats version == 2 - // query the top-n first. + // 1. try to find this value in TopN count, found := idx.TopN.QueryTopN(b) if found { return float64(count) } - histCnt := idx.Histogram.equalRowCount(val, true) - if histCnt > 0 { + // 2. try to find this value in bucket.Repeat(the last value in every bucket) + histCnt, matched := idx.Histogram.equalRowCount(val, true) + if matched { return histCnt } - // the out-of-range case: - return idx.notNullCount() / float64(idx.NDV) + // 3. use uniform distribution assumption for the rest (even when this value is not covered by the range of stats) + return idx.Histogram.notNullCount() / float64(idx.NDV-int64(len(idx.TopN.TopN))) } // QueryBytes is used to query the count of specified bytes. @@ -2126,7 +2110,8 @@ func MergePartitionHist2GlobalHist(sc *stmtctx.StatementContext, hists []*Histog for _, bucket := range globalBuckets { var repeat float64 for _, hist := range hists { - repeat += hist.equalRowCount(*bucket.upper, isIndex) // only hists of indexes have bucket.NDV + histRowCount, _ := hist.equalRowCount(*bucket.upper, isIndex) + repeat += histRowCount // only hists of indexes have bucket.NDV } if int64(repeat) > bucket.Repeat { bucket.Repeat = int64(repeat) diff --git a/statistics/statistics_test.go b/statistics/statistics_test.go index f1de62b07cd29..a6f1d31afbec7 100644 --- a/statistics/statistics_test.go +++ b/statistics/statistics_test.go @@ -259,7 +259,7 @@ func (s *testStatisticsSuite) TestBuild(c *C) { checkRepeats(c, col) col.PreCalculateScalar() c.Check(col.Len(), Equals, 226) - count := col.equalRowCount(types.NewIntDatum(1000), false) + count, _ := col.equalRowCount(types.NewIntDatum(1000), false) c.Check(int(count), Equals, 0) count = col.lessRowCount(types.NewIntDatum(1000)) c.Check(int(count), Equals, 10000) @@ -271,7 +271,7 @@ func (s *testStatisticsSuite) TestBuild(c *C) { c.Check(int(count), Equals, 100000) count = col.greaterRowCount(types.NewIntDatum(200000000)) c.Check(count, Equals, 0.0) - count = col.equalRowCount(types.NewIntDatum(200000000), false) + count, _ = col.equalRowCount(types.NewIntDatum(200000000), false) c.Check(count, Equals, 0.0) count = col.BetweenRowCount(types.NewIntDatum(3000), types.NewIntDatum(3500)) c.Check(int(count), Equals, 4994) @@ -327,7 +327,7 @@ func (s *testStatisticsSuite) TestBuild(c *C) { checkRepeats(c, col) col.PreCalculateScalar() c.Check(int(tblCount), Equals, 100000) - count = col.equalRowCount(encodeKey(types.NewIntDatum(10000)), false) + count, _ = col.equalRowCount(encodeKey(types.NewIntDatum(10000)), false) c.Check(int(count), Equals, 1) count = col.lessRowCount(encodeKey(types.NewIntDatum(20000))) c.Check(int(count), Equals, 19999) @@ -344,7 +344,7 @@ func (s *testStatisticsSuite) TestBuild(c *C) { checkRepeats(c, col) col.PreCalculateScalar() c.Check(int(tblCount), Equals, 100000) - count = col.equalRowCount(types.NewIntDatum(10000), false) + count, _ = col.equalRowCount(types.NewIntDatum(10000), false) c.Check(int(count), Equals, 1) count = col.lessRowCount(types.NewIntDatum(20000)) c.Check(int(count), Equals, 20000) From 6dd8a95751587ae4c4d01334397bb7c61c59cb99 Mon Sep 17 00:00:00 2001 From: time-and-fate <25057648+time-and-fate@users.noreply.github.com> Date: Fri, 30 Jul 2021 12:02:32 +0800 Subject: [PATCH 18/20] fix test --- statistics/histogram.go | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/statistics/histogram.go b/statistics/histogram.go index 597985e12ecdf..31acce2cbd050 100644 --- a/statistics/histogram.go +++ b/statistics/histogram.go @@ -1115,7 +1115,11 @@ func (c *Column) equalRowCount(sc *stmtctx.StatementContext, val types.Datum, en return histCnt, nil } // 3. use uniform distribution assumption for the rest (even when this value is not covered by the range of stats) - return c.Histogram.notNullCount() / float64(c.Histogram.NDV-int64(len(c.TopN.TopN))), nil + histNDV := float64(c.Histogram.NDV - int64(len(c.TopN.TopN))) + if histNDV <= 0 { + return 0, nil + } + return c.Histogram.notNullCount() / histNDV, nil } // GetColumnRowCount estimates the row count by a slice of Range. @@ -1304,7 +1308,11 @@ func (idx *Index) equalRowCount(b []byte, realtimeRowCount int64) float64 { return histCnt } // 3. use uniform distribution assumption for the rest (even when this value is not covered by the range of stats) - return idx.Histogram.notNullCount() / float64(idx.NDV-int64(len(idx.TopN.TopN))) + histNDV := float64(idx.Histogram.NDV - int64(len(idx.TopN.TopN))) + if histNDV <= 0 { + return 0 + } + return idx.Histogram.notNullCount() / histNDV } // QueryBytes is used to query the count of specified bytes. From ba43c325001515ed70a62239058555c8d365439c Mon Sep 17 00:00:00 2001 From: time-and-fate <25057648+time-and-fate@users.noreply.github.com> Date: Fri, 30 Jul 2021 12:36:09 +0800 Subject: [PATCH 19/20] fix test --- statistics/histogram.go | 4 +- statistics/testdata/stats_suite_out.json | 90 ++++++++++++------------ 2 files changed, 47 insertions(+), 47 deletions(-) diff --git a/statistics/histogram.go b/statistics/histogram.go index 31acce2cbd050..7b2c7610b4b7d 100644 --- a/statistics/histogram.go +++ b/statistics/histogram.go @@ -1115,7 +1115,7 @@ func (c *Column) equalRowCount(sc *stmtctx.StatementContext, val types.Datum, en return histCnt, nil } // 3. use uniform distribution assumption for the rest (even when this value is not covered by the range of stats) - histNDV := float64(c.Histogram.NDV - int64(len(c.TopN.TopN))) + histNDV := float64(c.Histogram.NDV - int64(c.TopN.Num())) if histNDV <= 0 { return 0, nil } @@ -1308,7 +1308,7 @@ func (idx *Index) equalRowCount(b []byte, realtimeRowCount int64) float64 { return histCnt } // 3. use uniform distribution assumption for the rest (even when this value is not covered by the range of stats) - histNDV := float64(idx.Histogram.NDV - int64(len(idx.TopN.TopN))) + histNDV := float64(idx.Histogram.NDV - int64(idx.TopN.Num())) if histNDV <= 0 { return 0 } diff --git a/statistics/testdata/stats_suite_out.json b/statistics/testdata/stats_suite_out.json index f4c1cb8a2c133..c4135c1f8b0b3 100644 --- a/statistics/testdata/stats_suite_out.json +++ b/statistics/testdata/stats_suite_out.json @@ -160,8 +160,8 @@ " └─TableFullScan_5 8.00 cop[tikv] table:tint keep order:false" ], [ - "TableReader_7 0.75 root data:Selection_6", - "└─Selection_6 0.75 cop[tikv] eq(test.tint.a, 4)", + "TableReader_7 1.00 root data:Selection_6", + "└─Selection_6 1.00 cop[tikv] eq(test.tint.a, 4)", " └─TableFullScan_5 8.00 cop[tikv] table:tint keep order:false" ], [ @@ -175,9 +175,9 @@ "└─TableRowIDScan_9(Probe) 1.00 cop[tikv] table:tdouble keep order:false" ], [ - "IndexLookUp_10 0.75 root ", - "├─IndexRangeScan_8(Build) 0.75 cop[tikv] table:tdouble, index:singular(a) range:[4,4], keep order:false", - "└─TableRowIDScan_9(Probe) 0.75 cop[tikv] table:tdouble keep order:false" + "IndexLookUp_10 1.00 root ", + "├─IndexRangeScan_8(Build) 1.00 cop[tikv] table:tdouble, index:singular(a) range:[4,4], keep order:false", + "└─TableRowIDScan_9(Probe) 1.00 cop[tikv] table:tdouble keep order:false" ], [ "IndexLookUp_10 1.00 root ", @@ -190,9 +190,9 @@ "└─TableRowIDScan_9(Probe) 1.00 cop[tikv] table:tdecimal keep order:false" ], [ - "IndexLookUp_10 0.75 root ", - "├─IndexRangeScan_8(Build) 0.75 cop[tikv] table:tdecimal, index:singular(a) range:[4.00000000000000000000,4.00000000000000000000], keep order:false", - "└─TableRowIDScan_9(Probe) 0.75 cop[tikv] table:tdecimal keep order:false" + "IndexLookUp_10 1.00 root ", + "├─IndexRangeScan_8(Build) 1.00 cop[tikv] table:tdecimal, index:singular(a) range:[4.00000000000000000000,4.00000000000000000000], keep order:false", + "└─TableRowIDScan_9(Probe) 1.00 cop[tikv] table:tdecimal keep order:false" ], [ "IndexLookUp_10 1.00 root ", @@ -205,8 +205,8 @@ " └─TableFullScan_5 8.00 cop[tikv] table:tstring keep order:false" ], [ - "TableReader_7 0.75 root data:Selection_6", - "└─Selection_6 0.75 cop[tikv] eq(test.tstring.a, \"4\")", + "TableReader_7 1.00 root data:Selection_6", + "└─Selection_6 1.00 cop[tikv] eq(test.tstring.a, \"4\")", " └─TableFullScan_5 8.00 cop[tikv] table:tstring keep order:false" ], [ @@ -240,8 +240,8 @@ " └─TableFullScan_5 6.00 cop[tikv] table:tprefix keep order:false" ], [ - "TableReader_7 0.67 root data:Selection_6", - "└─Selection_6 0.67 cop[tikv] eq(test.tprefix.a, \"888\")", + "TableReader_7 1.00 root data:Selection_6", + "└─Selection_6 1.00 cop[tikv] eq(test.tprefix.a, \"888\")", " └─TableFullScan_5 6.00 cop[tikv] table:tprefix keep order:false" ], [ @@ -250,8 +250,8 @@ " └─TableFullScan_5 8.00 cop[tikv] table:tint keep order:false" ], [ - "TableReader_7 0.75 root data:Selection_6", - "└─Selection_6 0.75 cop[tikv] eq(test.tint.b, 4), eq(test.tint.c, 4)", + "TableReader_7 1.00 root data:Selection_6", + "└─Selection_6 1.00 cop[tikv] eq(test.tint.b, 4), eq(test.tint.c, 4)", " └─TableFullScan_5 8.00 cop[tikv] table:tint keep order:false" ], [ @@ -265,9 +265,9 @@ "└─TableRowIDScan_9(Probe) 1.00 cop[tikv] table:tdouble keep order:false" ], [ - "IndexLookUp_10 0.75 root ", - "├─IndexRangeScan_8(Build) 0.75 cop[tikv] table:tdouble, index:multi(b, c) range:[4 4,4 4], keep order:false", - "└─TableRowIDScan_9(Probe) 0.75 cop[tikv] table:tdouble keep order:false" + "IndexLookUp_10 1.00 root ", + "├─IndexRangeScan_8(Build) 1.00 cop[tikv] table:tdouble, index:multi(b, c) range:[4 4,4 4], keep order:false", + "└─TableRowIDScan_9(Probe) 1.00 cop[tikv] table:tdouble keep order:false" ], [ "IndexLookUp_10 1.00 root ", @@ -280,9 +280,9 @@ "└─TableRowIDScan_9(Probe) 1.00 cop[tikv] table:tdecimal keep order:false" ], [ - "IndexLookUp_10 0.75 root ", - "├─IndexRangeScan_8(Build) 0.75 cop[tikv] table:tdecimal, index:multi(b, c) range:[4.00000000000000000000 4.00000000000000000000,4.00000000000000000000 4.00000000000000000000], keep order:false", - "└─TableRowIDScan_9(Probe) 0.75 cop[tikv] table:tdecimal keep order:false" + "IndexLookUp_10 1.00 root ", + "├─IndexRangeScan_8(Build) 1.00 cop[tikv] table:tdecimal, index:multi(b, c) range:[4.00000000000000000000 4.00000000000000000000,4.00000000000000000000 4.00000000000000000000], keep order:false", + "└─TableRowIDScan_9(Probe) 1.00 cop[tikv] table:tdecimal keep order:false" ], [ "IndexLookUp_10 1.00 root ", @@ -295,8 +295,8 @@ " └─TableFullScan_5 8.00 cop[tikv] table:tstring keep order:false" ], [ - "TableReader_7 0.75 root data:Selection_6", - "└─Selection_6 0.75 cop[tikv] eq(test.tstring.b, \"4\"), eq(test.tstring.c, \"4\")", + "TableReader_7 1.00 root data:Selection_6", + "└─Selection_6 1.00 cop[tikv] eq(test.tstring.b, \"4\"), eq(test.tstring.c, \"4\")", " └─TableFullScan_5 8.00 cop[tikv] table:tstring keep order:false" ], [ @@ -456,16 +456,16 @@ "└─IndexRangeScan_5 4.00 cop[tikv] table:topn_before_hist, index:idx(a) range:[1,1], keep order:false" ], [ - "IndexReader_6 0.60 root index:IndexRangeScan_5", - "└─IndexRangeScan_5 0.60 cop[tikv] table:topn_before_hist, index:idx(a) range:[2,2], keep order:false" + "IndexReader_6 1.00 root index:IndexRangeScan_5", + "└─IndexRangeScan_5 1.00 cop[tikv] table:topn_before_hist, index:idx(a) range:[2,2], keep order:false" ], [ "IndexReader_6 4.00 root index:IndexRangeScan_5", "└─IndexRangeScan_5 4.00 cop[tikv] table:topn_after_hist, index:idx(a) range:[7,7], keep order:false" ], [ - "IndexReader_6 0.60 root index:IndexRangeScan_5", - "└─IndexRangeScan_5 0.60 cop[tikv] table:topn_after_hist, index:idx(a) range:[6,6], keep order:false" + "IndexReader_6 1.00 root index:IndexRangeScan_5", + "└─IndexRangeScan_5 1.00 cop[tikv] table:topn_after_hist, index:idx(a) range:[6,6], keep order:false" ], [ "TableReader_7 4.00 root data:Selection_6", @@ -656,67 +656,67 @@ { "Start": 800, "End": 900, - "Count": 774.6790371213752 + "Count": 743.004166655054 }, { "Start": 900, "End": 950, - "Count": 247.22269781351372 + "Count": 247.04782734719248 }, { "Start": 950, "End": 1000, - "Count": 226.31974603801697 + "Count": 226.14487557169574 }, { "Start": 1000, "End": 1050, - "Count": 205.41679426252026 + "Count": 205.24192379619902 }, { "Start": 1050, "End": 1100, - "Count": 184.5138424870235 + "Count": 184.33897202070227 }, { "Start": 1150, "End": 1200, - "Count": 142.70793893603008 + "Count": 142.53306846970884 }, { "Start": 1200, "End": 1300, - "Count": 215.03215207924876 + "Count": 214.85728161292752 }, { "Start": 1300, "End": 1400, - "Count": 131.42034497726183 + "Count": 131.2454745109406 }, { "Start": 1400, "End": 1500, - "Count": 47.82526023669535 + "Count": 47.650389770374105 }, { "Start": 1500, "End": 1600, - "Count": 7.674870466321244 + "Count": 7.5 }, { "Start": 300, "End": 899, - "Count": 4500 + "Count": 4498.5 }, { "Start": 800, "End": 1000, - "Count": 1232.8717400402634 + "Count": 1201.196869573942 }, { "Start": 900, "End": 1500, - "Count": 1502.6707038112672 + "Count": 1502.495833344946 }, { "Start": 300, @@ -726,17 +726,17 @@ { "Start": 200, "End": 300, - "Count": 466.7036914562293 + "Count": 466.52882098990807 }, { "Start": 100, "End": 200, - "Count": 383.0918843542424 + "Count": 382.91701388792114 }, { "Start": 200, "End": 400, - "Count": 1213.7036914562293 + "Count": 1211.5288209899081 }, { "Start": 200, @@ -746,17 +746,17 @@ { "Start": 0, "End": 100, - "Count": 299.4800772522555 + "Count": 299.3052067859343 }, { "Start": -100, "End": 100, - "Count": 299.4800772522555 + "Count": 299.3052067859343 }, { "Start": -100, "End": 0, - "Count": 7.674870466321244 + "Count": 7.5 } ] } From 568dbd99bcc584377774568c88916de58c1a92d4 Mon Sep 17 00:00:00 2001 From: time-and-fate <25057648+time-and-fate@users.noreply.github.com> Date: Fri, 30 Jul 2021 14:47:55 +0800 Subject: [PATCH 20/20] small fixup --- statistics/scalar.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/statistics/scalar.go b/statistics/scalar.go index b0eb0bf5c0ea3..e8f585f450a32 100644 --- a/statistics/scalar.go +++ b/statistics/scalar.go @@ -145,7 +145,7 @@ func (hg *Histogram) calcFraction(index int, value *types.Datum) float64 { func commonPrefixLength(strs ...[]byte) int { if len(strs) == 0 { - return -1 + return 0 } minLen := len(strs[0]) for _, str := range strs {