Skip to content

Commit

Permalink
planner: fix the inappropriate heuristic rule to estimate the EQ sele…
Browse files Browse the repository at this point in the history
…ctivity when out of range (#18543) (#18997)

Signed-off-by: ti-srebot <[email protected]>
  • Loading branch information
ti-srebot authored Sep 1, 2020
1 parent d76bab9 commit 83fc2d8
Show file tree
Hide file tree
Showing 7 changed files with 78 additions and 34 deletions.
18 changes: 9 additions & 9 deletions cmd/explaintest/r/explain_union_scan.result
Original file line number Diff line number Diff line change
Expand Up @@ -16,16 +16,16 @@ Limit_20 10.00 root offset:0, count:10
└─HashJoin_22 10.00 root left outer join, equal:[eq(test.city.province_id, test.city.province_id)]
├─Limit_25(Build) 10.00 root offset:0, count:10
│ └─IndexJoin_38 10.00 root inner join, inner:UnionScan_37, outer key:test.city.id, inner key:test.city.id
│ ├─UnionScan_47(Build) 10.33 root
│ │ └─TableReader_49 10.33 root data:TableFullScan_48
│ │ └─TableFullScan_48 10.33 cop[tikv] table:t2 keep order:false
│ └─UnionScan_37(Probe) 0.97 root gt(test.city.province_id, 1), lt(test.city.province_id, 100)
│ └─IndexLookUp_36 0.97 root
│ ├─UnionScan_47(Build) 10.00 root
│ │ └─TableReader_49 10.00 root data:TableFullScan_48
│ │ └─TableFullScan_48 10.00 cop[tikv] table:t2 keep order:false
│ └─UnionScan_37(Probe) 1.00 root gt(test.city.province_id, 1), lt(test.city.province_id, 100)
│ └─IndexLookUp_36 1.00 root
│ ├─IndexRangeScan_33(Build) 1.00 cop[tikv] table:t1, index:PRIMARY(id) range: decided by [eq(test.city.id, test.city.id)], keep order:false
│ └─Selection_35(Probe) 0.97 cop[tikv] gt(test.city.province_id, 1), lt(test.city.province_id, 100)
│ └─Selection_35(Probe) 1.00 cop[tikv] gt(test.city.province_id, 1), lt(test.city.province_id, 100)
│ └─TableRowIDScan_34 1.00 cop[tikv] table:t1 keep order:false
└─UnionScan_57(Probe) 519304.44 root gt(test.city.province_id, 1), lt(test.city.province_id, 100), not(isnull(test.city.province_id))
└─TableReader_60 519304.44 root data:Selection_59
└─Selection_59 519304.44 cop[tikv] gt(test.city.province_id, 1), lt(test.city.province_id, 100), not(isnull(test.city.province_id))
└─UnionScan_57(Probe) 536284.00 root gt(test.city.province_id, 1), lt(test.city.province_id, 100), not(isnull(test.city.province_id))
└─TableReader_60 536284.00 root data:Selection_59
└─Selection_59 536284.00 cop[tikv] gt(test.city.province_id, 1), lt(test.city.province_id, 100), not(isnull(test.city.province_id))
└─TableFullScan_58 536284.00 cop[tikv] table:t3 keep order:false
commit;
4 changes: 2 additions & 2 deletions planner/core/cbo_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -410,8 +410,8 @@ func (s *testAnalyzeSuite) TestOutdatedAnalyze(c *C) {
c.Assert(h.Update(dom.InfoSchema()), IsNil)
statistics.RatioOfPseudoEstimate.Store(10.0)
testKit.MustQuery("explain select * from t where a <= 5 and b <= 5").Check(testkit.Rows(
"TableReader_7 35.91 root data:Selection_6",
"└─Selection_6 35.91 cop[tikv] le(test.t.a, 5), le(test.t.b, 5)",
"TableReader_7 29.77 root data:Selection_6",
"└─Selection_6 29.77 cop[tikv] le(test.t.a, 5), le(test.t.b, 5)",
" └─TableFullScan_5 80.00 cop[tikv] table:t keep order:false",
))
statistics.RatioOfPseudoEstimate.Store(0.7)
Expand Down
13 changes: 7 additions & 6 deletions planner/core/testdata/analyze_suite_out.json
Original file line number Diff line number Diff line change
Expand Up @@ -347,17 +347,18 @@
{
"SQL": "explain select * from t where a = 7639902",
"Plan": [
"IndexReader_6 2.03 root index:IndexRangeScan_5",
"└─IndexRangeScan_5 2.03 cop[tikv] table:t, index:PRIMARY(a, c, b) range:[7639902,7639902], keep order:false"
"IndexReader_6 6.68 root index:IndexRangeScan_5",
"└─IndexRangeScan_5 6.68 cop[tikv] table:t, index:PRIMARY(a, c, b) range:[7639902,7639902], keep order:false"
]
},
{
"SQL": "explain select c, b from t where a = 7639902 order by b asc limit 6",
"Plan": [
"Projection_7 2.03 root test.t.c, test.t.b",
"└─TopN_10 2.03 root test.t.b:asc, offset:0, count:6",
" └─IndexReader_18 2.03 root index:IndexRangeScan_17",
" └─IndexRangeScan_17 2.03 cop[tikv] table:t, index:PRIMARY(a, c, b) range:[7639902,7639902], keep order:false"
"Projection_7 6.00 root test.t.c, test.t.b",
"└─TopN_8 6.00 root test.t.b:asc, offset:0, count:6",
" └─IndexReader_16 6.00 root index:TopN_15",
" └─TopN_15 6.00 cop[tikv] test.t.b:asc, offset:0, count:6",
" └─IndexRangeScan_14 6.68 cop[tikv] table:t, index:PRIMARY(a, c, b) range:[7639902,7639902], keep order:false"
]
}
]
Expand Down
4 changes: 2 additions & 2 deletions statistics/handle/update_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1524,8 +1524,8 @@ func (s *testStatsSuite) TestAbnormalIndexFeedback(c *C) {
sql: "select * from t where a = 2 and b > 10",
hist: "column:2 ndv:20 totColSize:20\n" +
"num: 5 lower_bound: -9223372036854775808 upper_bound: 7 repeats: 0\n" +
"num: 6 lower_bound: 7 upper_bound: 14 repeats: 0\n" +
"num: 7 lower_bound: 14 upper_bound: 9223372036854775807 repeats: 0",
"num: 4 lower_bound: 7 upper_bound: 14 repeats: 0\n" +
"num: 5 lower_bound: 14 upper_bound: 9223372036854775807 repeats: 0",
rangeID: tblInfo.Columns[1].ID,
idxID: tblInfo.Indices[0].ID,
eqCount: 3,
Expand Down
13 changes: 7 additions & 6 deletions statistics/histogram.go
Original file line number Diff line number Diff line change
Expand Up @@ -763,7 +763,7 @@ func (c *Column) equalRowCount(sc *stmtctx.StatementContext, val types.Datum, mo
return 0.0, nil
}
if c.NDV > 0 && c.outOfRange(val) {
return float64(modifyCount) / float64(c.NDV), nil
return outOfRangeEQSelectivity(c.NDV, modifyCount, int64(c.TotalRowCount())) * c.TotalRowCount(), nil
}
if c.CMSketch != nil {
count, err := c.CMSketch.queryValue(sc, val)
Expand Down Expand Up @@ -810,9 +810,10 @@ func (c *Column) GetColumnRowCount(sc *stmtctx.StatementContext, ranges []*range
continue
}
// The interval case.
cnt := c.BetweenRowCount(rg.LowVal[0], rg.HighVal[0])
if (c.outOfRange(rg.LowVal[0]) && !rg.LowVal[0].IsNull()) || c.outOfRange(rg.HighVal[0]) {
cnt += float64(modifyCount) / outOfRangeBetweenRate
lowVal, highVal := rg.LowVal[0], rg.HighVal[0]
cnt := c.BetweenRowCount(lowVal, highVal)
if (c.outOfRange(lowVal) && !lowVal.IsNull()) || c.outOfRange(highVal) {
cnt += outOfRangeEQSelectivity(outOfRangeBetweenRate, modifyCount, int64(c.TotalRowCount())) * c.TotalRowCount()
}
// `betweenRowCount` returns count for [l, h) range, we adjust cnt for boundaries here.
// Note that, `cnt` does not include null values, we need specially handle cases
Expand Down Expand Up @@ -874,7 +875,7 @@ func (idx *Index) equalRowCount(sc *stmtctx.StatementContext, b []byte, modifyCo
}
val := types.NewBytesDatum(b)
if idx.NDV > 0 && idx.outOfRange(val) {
return float64(modifyCount) / (float64(idx.NDV)), nil
return outOfRangeEQSelectivity(idx.NDV, modifyCount, int64(idx.TotalRowCount())) * idx.TotalRowCount(), nil
}
if idx.CMSketch != nil {
return float64(idx.CMSketch.QueryBytes(b)), nil
Expand Down Expand Up @@ -926,7 +927,7 @@ func (idx *Index) GetRowCount(sc *stmtctx.StatementContext, indexRanges []*range
totalCount += idx.BetweenRowCount(l, r)
lowIsNull := bytes.Equal(lb, nullKeyBytes)
if (idx.outOfRange(l) && !(isSingleCol && lowIsNull)) || idx.outOfRange(r) {
totalCount += float64(modifyCount) / outOfRangeBetweenRate
totalCount += outOfRangeEQSelectivity(outOfRangeBetweenRate, modifyCount, int64(idx.TotalRowCount())) * idx.TotalRowCount()
}
if isSingleCol && lowIsNull {
totalCount += float64(idx.NullCount)
Expand Down
34 changes: 31 additions & 3 deletions statistics/selectivity_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -369,6 +369,34 @@ func getRange(start, end int64) []*ranger.Range {
return []*ranger.Range{ran}
}

// TestOutOfRangeEQEstimation checks the estimated row count of an equality
// lookup on 250, a value that was never inserted (the table holds 0 ~ 249,
// four rows each), under different modify counts:
//   - with a modify count of 0 the estimate must be exactly 0;
//   - with a modify count of 1..8 the estimate must be min(modifyCount, 4),
//     i.e. it never exceeds the modify count and is capped at 4 rows.
func (s *testStatsSuite) TestOutOfRangeEQEstimation(c *C) {
	defer cleanEnv(c, s.store, s.do)

	tk := testkit.NewTestKit(c, s.store)
	for _, stmt := range []string{
		"use test",
		"drop table if exists t",
		"create table t(a int)",
	} {
		tk.MustExec(stmt)
	}
	// Insert every value in 0 ~ 249 four times, 1000 rows in total.
	for v := 0; v < 250; v++ {
		for dup := 0; dup < 4; dup++ {
			tk.MustExec(fmt.Sprintf("insert into t values (%v)", v))
		}
	}
	tk.MustExec("analyze table t")

	statsHandle := s.do.StatsHandle()
	tbl, err := s.do.InfoSchema().TableByName(model.NewCIStr("test"), model.NewCIStr("t"))
	c.Assert(err, IsNil)
	tblInfo := tbl.Meta()
	colStats := statsHandle.GetTableStats(tblInfo).Columns[tblInfo.Columns[0].ID]
	stmtCtx := &stmtctx.StatementContext{}

	// No modification since analyze: the lookup must be estimated as empty.
	estRows, err := colStats.GetColumnRowCount(stmtCtx, getRange(250, 250), 0, false)
	c.Assert(err, IsNil)
	c.Assert(estRows, Equals, float64(0))

	for modifyCnt := int64(1); modifyCnt <= 8; modifyCnt++ {
		estRows, err := colStats.GetColumnRowCount(stmtCtx, getRange(250, 250), modifyCnt, false)
		c.Assert(err, IsNil)
		// The estimate may not exceed the modify count and is capped at 4.
		c.Assert(estRows, Equals, math.Min(float64(modifyCnt), 4))
	}
}

func (s *testStatsSuite) TestEstimationForUnknownValues(c *C) {
defer cleanEnv(c, s.store, s.do)
testKit := testkit.NewTestKit(c, s.store)
Expand All @@ -395,15 +423,15 @@ func (s *testStatsSuite) TestEstimationForUnknownValues(c *C) {
colID := table.Meta().Columns[0].ID
count, err := statsTbl.GetRowCountByColumnRanges(sc, colID, getRange(30, 30))
c.Assert(err, IsNil)
c.Assert(count, Equals, 2.0)
c.Assert(count, Equals, 0.2)

count, err = statsTbl.GetRowCountByColumnRanges(sc, colID, getRange(9, 30))
c.Assert(err, IsNil)
c.Assert(count, Equals, 4.2)
c.Assert(count, Equals, 2.4000000000000004)

count, err = statsTbl.GetRowCountByColumnRanges(sc, colID, getRange(9, math.MaxInt64))
c.Assert(err, IsNil)
c.Assert(count, Equals, 4.2)
c.Assert(count, Equals, 2.4000000000000004)

idxID := table.Meta().Indices[0].ID
count, err = statsTbl.GetRowCountByIndexRanges(sc, idxID, getRange(30, 30))
Expand Down
26 changes: 20 additions & 6 deletions statistics/table.go
Original file line number Diff line number Diff line change
Expand Up @@ -392,6 +392,24 @@ func isSingleColIdxNullRange(idx *Index, ran *ranger.Range) bool {
return false
}

// outOfRangeEQSelectivity estimates the selectivity of an equality condition
// whose value falls outside the range covered by the histogram.
//
// The estimate assumes that every modification is an insertion and that newly
// inserted rows are uniformly distributed in the same way as the analyzed
// rows, so each distinct value owns the same number of rows (Tot/NDV).
//
// ndv is the number of distinct values, modifyRows the number of rows modified
// since the statistics were collected, and totalRows the row count known to
// the statistics. The result is a fraction of totalRows.
func outOfRangeEQSelectivity(ndv, modifyRows, totalRows int64) float64 {
	if modifyRows == 0 {
		// Nothing was modified, so the histogram contains the whole data and
		// an out-of-range value cannot match any row.
		return 0
	}

	// Raise a small NDV to outOfRangeBetweenRate to avoid an inaccurate
	// selectivity caused by it.
	effectiveNDV := ndv
	if effectiveNDV < outOfRangeBetweenRate {
		effectiveNDV = outOfRangeBetweenRate
	}

	// TODO: After extracting TopN from histograms, we can minus the TopN fraction here.
	perValue := 1 / float64(effectiveNDV)

	// The estimated row count must not exceed the number of modified rows.
	if perValue*float64(totalRows) > float64(modifyRows) {
		return float64(modifyRows) / float64(totalRows)
	}
	return perValue
}

// getEqualCondSelectivity gets the selectivity of the equal conditions.
func (coll *HistColl) getEqualCondSelectivity(idx *Index, bytes []byte, usedColsLen int) float64 {
coverAll := len(idx.Info.Columns) == usedColsLen
Expand All @@ -404,8 +422,7 @@ func (coll *HistColl) getEqualCondSelectivity(idx *Index, bytes []byte, usedCols
// When the value is out of range, we could not find this value in the CM Sketch,
// so we use heuristic methods to estimate the selectivity.
if idx.NDV > 0 && coverAll {
// for equality queries
return float64(coll.ModifyCount) / float64(idx.NDV) / idx.TotalRowCount()
return outOfRangeEQSelectivity(idx.NDV, coll.ModifyCount, int64(idx.TotalRowCount()))
}
// The equal condition only uses prefix columns of the index.
colIDs := coll.Idx2ColumnIDs[idx.ID]
Expand All @@ -416,10 +433,7 @@ func (coll *HistColl) getEqualCondSelectivity(idx *Index, bytes []byte, usedCols
}
ndv = mathutil.MaxInt64(ndv, coll.Columns[colID].NDV)
}
if ndv > 0 {
return float64(coll.ModifyCount) / float64(ndv) / idx.TotalRowCount()
}
return float64(coll.ModifyCount) / outOfRangeBetweenRate / idx.TotalRowCount()
return outOfRangeEQSelectivity(ndv, coll.ModifyCount, int64(idx.TotalRowCount()))
}
return float64(idx.CMSketch.QueryBytes(bytes)) / float64(idx.TotalRowCount())
}
Expand Down

0 comments on commit 83fc2d8

Please sign in to comment.