Skip to content

Commit

Permalink
*: improve NULL count estimation for single column index (#9474) (#9979)
Browse files Browse the repository at this point in the history
  • Loading branch information
eurekaka authored and zz-jason committed Apr 2, 2019
1 parent 46d5d0e commit 59eb553
Show file tree
Hide file tree
Showing 12 changed files with 303 additions and 104 deletions.
4 changes: 2 additions & 2 deletions cmd/explaintest/r/index_join.result
Original file line number Diff line number Diff line change
Expand Up @@ -6,15 +6,15 @@ insert into t2 values(1, 1);
analyze table t1, t2;
explain select /*+ TIDB_INLJ(t1, t2) */ * from t1 join t2 on t1.a=t2.a;
id count task operator info
IndexJoin_14 1.25 root inner join, inner:IndexLookUp_13, outer key:test.t2.a, inner key:test.t1.a
IndexJoin_14 5.00 root inner join, inner:IndexLookUp_13, outer key:test.t2.a, inner key:test.t1.a
├─IndexLookUp_13 5.00 root
│ ├─IndexScan_11 5.00 cop table:t1, index:a, range: decided by [test.t2.a], keep order:false
│ └─TableScan_12 5.00 cop table:t1, keep order:false
└─TableReader_16 1.00 root data:TableScan_15
└─TableScan_15 1.00 cop table:t2, range:[-inf,+inf], keep order:false
explain select * from t1 join t2 on t1.a=t2.a;
id count task operator info
IndexJoin_14 1.25 root inner join, inner:IndexLookUp_13, outer key:test.t2.a, inner key:test.t1.a
IndexJoin_14 5.00 root inner join, inner:IndexLookUp_13, outer key:test.t2.a, inner key:test.t1.a
├─IndexLookUp_13 5.00 root
│ ├─IndexScan_11 5.00 cop table:t1, index:a, range: decided by [test.t2.a], keep order:false
│ └─TableScan_12 5.00 cop table:t1, keep order:false
Expand Down
98 changes: 76 additions & 22 deletions executor/analyze.go
Original file line number Diff line number Diff line change
Expand Up @@ -150,8 +150,9 @@ func analyzeIndexPushdown(idxExec *AnalyzeIndexExec) statistics.AnalyzeResult {
Cms: []*statistics.CMSketch{cms},
IsIndex: 1,
}
result.Count = hist.NullCount
if hist.Len() > 0 {
result.Count = hist.Buckets[hist.Len()-1].Count
result.Count += hist.Buckets[hist.Len()-1].Count
}
return result
}
Expand All @@ -165,12 +166,16 @@ type AnalyzeIndexExec struct {
priority int
analyzePB *tipb.AnalyzeReq
result distsql.SelectResult
countNullRes distsql.SelectResult
maxNumBuckets uint64
}

func (e *AnalyzeIndexExec) open() error {
// fetchAnalyzeResult builds and dispatches the `kv.Request` from given ranges, and stores the `SelectResult`
// in corresponding fields based on the input `isNullRange` argument, which indicates if the range is the
// special null range for single-column index to get the null count.
func (e *AnalyzeIndexExec) fetchAnalyzeResult(ranges []*ranger.Range, isNullRange bool) error {
var builder distsql.RequestBuilder
kvReq, err := builder.SetIndexRanges(e.ctx.GetSessionVars().StmtCtx, e.physicalTableID, e.idxInfo.ID, ranger.FullRange()).
kvReq, err := builder.SetIndexRanges(e.ctx.GetSessionVars().StmtCtx, e.physicalTableID, e.idxInfo.ID, ranges).
SetAnalyzeRequest(e.analyzePB).
SetKeepOrder(true).
SetConcurrency(e.concurrency).
Expand All @@ -179,29 +184,51 @@ func (e *AnalyzeIndexExec) open() error {
return errors.Trace(err)
}
ctx := context.TODO()
e.result, err = distsql.Analyze(ctx, e.ctx.GetClient(), kvReq, e.ctx.GetSessionVars().KVVars, e.ctx.GetSessionVars().InRestrictedSQL)
result, err := distsql.Analyze(ctx, e.ctx.GetClient(), kvReq, e.ctx.GetSessionVars().KVVars, e.ctx.GetSessionVars().InRestrictedSQL)
if err != nil {
return errors.Trace(err)
}
e.result.Fetch(ctx)
result.Fetch(ctx)
if isNullRange {
e.countNullRes = result
} else {
e.result = result
}
return nil
}

func (e *AnalyzeIndexExec) buildStats() (hist *statistics.Histogram, cms *statistics.CMSketch, err error) {
if err = e.open(); err != nil {
return nil, nil, errors.Trace(err)
func (e *AnalyzeIndexExec) open() error {
ranges := ranger.FullRange()
// For single-column index, we do not load null rows from TiKV, so the built histogram does not include
// null values, and its `NullCount` is set from the result of another distsql call that fetches the null rows.
// For multi-column index, we cannot define null for the rows, so we still use the full range, and the rows
// containing null fields are included in the built histogram. Note that the `NullCount` of histograms for
// multi-column index is therefore always 0.
if len(e.idxInfo.Columns) == 1 {
ranges = ranger.FullNotNullRange()
}
defer func() {
if err1 := e.result.Close(); err1 != nil {
hist = nil
cms = nil
err = errors.Trace(err1)
err := e.fetchAnalyzeResult(ranges, false)
if err != nil {
return err
}
if len(e.idxInfo.Columns) == 1 {
ranges = ranger.NullRange()
err = e.fetchAnalyzeResult(ranges, true)
if err != nil {
return err
}
}()
hist = &statistics.Histogram{}
cms = statistics.NewCMSketch(defaultCMSketchDepth, defaultCMSketchWidth)
}
return nil
}

func (e *AnalyzeIndexExec) buildStatsFromResult(result distsql.SelectResult, needCMS bool) (*statistics.Histogram, *statistics.CMSketch, error) {
hist := &statistics.Histogram{}
var cms *statistics.CMSketch
if needCMS {
cms = statistics.NewCMSketch(defaultCMSketchDepth, defaultCMSketchWidth)
}
for {
data, err := e.result.NextRaw(context.TODO())
data, err := result.NextRaw(context.TODO())
if err != nil {
return nil, nil, errors.Trace(err)
}
Expand All @@ -215,15 +242,42 @@ func (e *AnalyzeIndexExec) buildStats() (hist *statistics.Histogram, cms *statis
}
hist, err = statistics.MergeHistograms(e.ctx.GetSessionVars().StmtCtx, hist, statistics.HistogramFromProto(resp.Hist), int(e.maxNumBuckets))
if err != nil {
return nil, nil, errors.Trace(err)
return nil, nil, err
}
if resp.Cms != nil {
err := cms.MergeCMSketch(statistics.CMSketchFromProto(resp.Cms))
if err != nil {
return nil, nil, errors.Trace(err)
if needCMS {
if resp.Cms == nil {
log.Warnf("nil CMS in response, table is %s, index is %s", e.idxInfo.Table.O, e.idxInfo.Name.O)
} else {
err := cms.MergeCMSketch(statistics.CMSketchFromProto(resp.Cms))
if err != nil {
return nil, nil, errors.Trace(err)
}
}
}
}
return hist, cms, nil
}

// buildStats builds the histogram and CM sketch of the index from the analyze
// results opened by `open`.
//
// The histogram and CM sketch are built from `e.result`. When `e.countNullRes`
// is set, the count of the last bucket built from it is stored as the
// `NullCount` of the returned histogram. Both results are closed before
// returning. An error from closing them is reported only when building the
// stats succeeded, so a successful close can never mask an earlier failure;
// in that case the partially built stats are discarded.
func (e *AnalyzeIndexExec) buildStats() (hist *statistics.Histogram, cms *statistics.CMSketch, err error) {
	if err = e.open(); err != nil {
		return nil, nil, err
	}
	defer func() {
		// Always close both results, but keep the first error: overwriting
		// `err` unconditionally would turn a build failure into a nil error
		// whenever the close itself succeeds.
		closeErr := closeAll(e.result, e.countNullRes)
		if closeErr != nil && err == nil {
			hist, cms, err = nil, nil, closeErr
		}
	}()
	hist, cms, err = e.buildStatsFromResult(e.result, true)
	if err != nil {
		return nil, nil, err
	}
	if e.countNullRes != nil {
		// Use a distinct name so the named result `err` is not shadowed.
		nullHist, _, nullErr := e.buildStatsFromResult(e.countNullRes, false)
		if nullErr != nil {
			return nil, nil, nullErr
		}
		// The null-range histogram only serves to count null rows; the
		// cumulative count of its last bucket becomes the null count.
		if l := nullHist.Len(); l > 0 {
			hist.NullCount = nullHist.Buckets[l-1].Count
		}
	}
	hist.ID = e.idxInfo.ID
	return hist, cms, nil
}
Expand Down
4 changes: 2 additions & 2 deletions executor/analyze_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ PARTITION BY RANGE ( a ) (
for _, def := range pi.Definitions {
statsTbl := handle.GetPartitionStats(table.Meta(), def.ID)
c.Assert(statsTbl.Pseudo, IsFalse)
c.Assert(len(statsTbl.Columns), Equals, 2)
c.Assert(len(statsTbl.Columns), Equals, 3)
c.Assert(len(statsTbl.Indices), Equals, 1)
for _, col := range statsTbl.Columns {
c.Assert(col.Len(), Greater, 0)
Expand All @@ -81,7 +81,7 @@ PARTITION BY RANGE ( a ) (
statsTbl := handle.GetPartitionStats(table.Meta(), def.ID)
if i == 0 {
c.Assert(statsTbl.Pseudo, IsFalse)
c.Assert(len(statsTbl.Columns), Equals, 2)
c.Assert(len(statsTbl.Columns), Equals, 3)
c.Assert(len(statsTbl.Indices), Equals, 1)
} else {
c.Assert(statsTbl.Pseudo, IsTrue)
Expand Down
53 changes: 52 additions & 1 deletion executor/show_stats_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -109,5 +109,56 @@ func (s *testSuite) TestShowStatsHasNullValue(c *C) {
tk.MustExec("create table t (a int, index idx(a))")
tk.MustExec("insert into t values(NULL)")
tk.MustExec("analyze table t")
tk.MustQuery("show stats_buckets").Check(testkit.Rows("test t idx 1 0 1 1 NULL NULL"))
// Null values are excluded from histogram for single-column index.
tk.MustQuery("show stats_buckets").Check(testkit.Rows())
tk.MustExec("insert into t values(1)")
tk.MustExec("analyze table t")
tk.MustQuery("show stats_buckets").Sort().Check(testkit.Rows(
"test t a 0 0 1 1 1 1",
"test t idx 1 0 1 1 1 1",
))
tk.MustExec("drop table t")
tk.MustExec("create table t (a int, b int, index idx(a, b))")
tk.MustExec("insert into t values(NULL, NULL)")
tk.MustExec("analyze table t")
tk.MustQuery("show stats_buckets").Check(testkit.Rows("test t idx 1 0 1 1 (NULL, NULL) (NULL, NULL)"))

tk.MustExec("drop table t")
tk.MustExec("create table t(a int, b int, c int, index idx_b(b), index idx_c_a(c, a))")
tk.MustExec("insert into t values(1,null,1),(2,null,2),(3,3,3),(4,null,4),(null,null,null)")
res := tk.MustQuery("show stats_histograms where table_name = 't'")
c.Assert(len(res.Rows()), Equals, 0)
tk.MustExec("analyze table t index idx_b")
res = tk.MustQuery("show stats_histograms where table_name = 't' and column_name = 'idx_b'")
c.Assert(len(res.Rows()), Equals, 1)
c.Assert(res.Rows()[0][6], Equals, "4")
res = tk.MustQuery("show stats_histograms where table_name = 't' and column_name = 'b'")
c.Assert(len(res.Rows()), Equals, 0)
tk.MustExec("analyze table t index idx_c_a")
res = tk.MustQuery("show stats_histograms where table_name = 't' and column_name = 'idx_c_a'")
c.Assert(len(res.Rows()), Equals, 1)
c.Assert(res.Rows()[0][6], Equals, "0")
res = tk.MustQuery("show stats_histograms where table_name = 't' and column_name = 'c'")
c.Assert(len(res.Rows()), Equals, 0)
res = tk.MustQuery("show stats_histograms where table_name = 't' and column_name = 'a'")
c.Assert(len(res.Rows()), Equals, 0)
tk.MustExec("truncate table t")
tk.MustExec("insert into t values(1,null,1),(2,null,2),(3,3,3),(4,null,4),(null,null,null)")
res = tk.MustQuery("show stats_histograms where table_name = 't'")
c.Assert(len(res.Rows()), Equals, 0)
tk.MustExec("analyze table t index")
res = tk.MustQuery("show stats_histograms where table_name = 't'").Sort()
c.Assert(len(res.Rows()), Equals, 2)
c.Assert(res.Rows()[0][6], Equals, "4")
c.Assert(res.Rows()[1][6], Equals, "0")
tk.MustExec("truncate table t")
tk.MustExec("insert into t values(1,null,1),(2,null,2),(3,3,3),(4,null,4),(null,null,null)")
tk.MustExec("analyze table t")
res = tk.MustQuery("show stats_histograms where table_name = 't'").Sort()
c.Assert(len(res.Rows()), Equals, 5)
c.Assert(res.Rows()[0][6], Equals, "1")
c.Assert(res.Rows()[1][6], Equals, "4")
c.Assert(res.Rows()[2][6], Equals, "1")
c.Assert(res.Rows()[3][6], Equals, "4")
c.Assert(res.Rows()[4][6], Equals, "0")
}
14 changes: 7 additions & 7 deletions planner/core/cbo_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -278,7 +278,7 @@ func (s *testAnalyzeSuite) TestIndexRead(c *C) {
},
{
sql: "select count(*) from t where e > 1 group by b",
best: "IndexLookUp(Index(t.b)[[NULL,+inf]], Table(t)->Sel([gt(test.t.e, 1)]))->StreamAgg",
best: "TableReader(Table(t)->Sel([gt(test.t.e, 1)])->HashAgg)->HashAgg",
},
{
sql: "select count(e) from t where t.b <= 20",
Expand Down Expand Up @@ -453,7 +453,7 @@ func (s *testAnalyzeSuite) TestAnalyze(c *C) {
}{
{
sql: "analyze table t3",
best: "Analyze{Index(a),Table(b)}",
best: "Analyze{Index(a),Table(a, b)}",
},
// Test analyze full table.
{
Expand Down Expand Up @@ -676,11 +676,11 @@ func (s *testAnalyzeSuite) TestCorrelatedEstimation(c *C) {
" ├─TableReader_12 10.00 root data:TableScan_11",
" │ └─TableScan_11 10.00 cop table:t, range:[-inf,+inf], keep order:false",
" └─MaxOneRow_13 1.00 root ",
" └─Projection_14 0.80 root concat(cast(t1.a), \",\", cast(t1.b))",
" └─IndexLookUp_21 0.80 root ",
" ├─IndexScan_18 1.25 cop table:t1, index:c, range: decided by [eq(t1.c, test.t.c)], keep order:false",
" └─Selection_20 0.80 cop eq(t1.a, test.t.a)",
" └─TableScan_19 1.25 cop table:t, keep order:false",
" └─Projection_14 0.10 root concat(cast(t1.a), \",\", cast(t1.b))",
" └─IndexLookUp_21 0.10 root ",
" ├─IndexScan_18 1.00 cop table:t1, index:c, range: decided by [eq(t1.c, test.t.c)], keep order:false",
" └─Selection_20 0.10 cop eq(t1.a, test.t.a)",
" └─TableScan_19 1.00 cop table:t, keep order:false",
))
}

Expand Down
29 changes: 5 additions & 24 deletions planner/core/planbuilder.go
Original file line number Diff line number Diff line change
Expand Up @@ -643,35 +643,16 @@ func (b *planBuilder) buildCheckIndexSchema(tn *ast.TableName, indexName string)
// getColsInfo returns the info of index columns, normal columns and primary key.
func getColsInfo(tn *ast.TableName) (indicesInfo []*model.IndexInfo, colsInfo []*model.ColumnInfo, pkCol *model.ColumnInfo) {
tbl := tn.TableInfo
// idxNames contains all the normal columns that can be analyzed more effectively, because those columns occur as index
// columns or primary key columns with integer type.
var idxNames []string
if tbl.PKIsHandle {
for _, col := range tbl.Columns {
if mysql.HasPriKeyFlag(col.Flag) {
idxNames = append(idxNames, col.Name.L)
pkCol = col
}
for _, col := range tbl.Columns {
if tbl.PKIsHandle && mysql.HasPriKeyFlag(col.Flag) {
pkCol = col
} else {
colsInfo = append(colsInfo, col)
}
}
for _, idx := range tn.TableInfo.Indices {
if idx.State == model.StatePublic {
indicesInfo = append(indicesInfo, idx)
if len(idx.Columns) == 1 {
idxNames = append(idxNames, idx.Columns[0].Name.L)
}
}
}
for _, col := range tbl.Columns {
isIndexCol := false
for _, idx := range idxNames {
if idx == col.Name.L {
isIndexCol = true
break
}
}
if !isIndexCol {
colsInfo = append(colsInfo, col)
}
}
return
Expand Down
12 changes: 11 additions & 1 deletion statistics/feedback.go
Original file line number Diff line number Diff line change
Expand Up @@ -1073,6 +1073,7 @@ func dumpFeedbackForIndex(h *Handle, q *QueryFeedback, t *Table) error {
}

func (q *QueryFeedback) dumpRangeFeedback(h *Handle, ran *ranger.Range, rangeCount float64) error {
lowIsNull := ran.LowVal[0].IsNull()
if q.tp == indexType {
sc := &stmtctx.StatementContext{TimeZone: time.UTC}
lower, err := codec.EncodeKey(sc, nil, ran.LowVal[0])
Expand All @@ -1099,8 +1100,17 @@ func (q *QueryFeedback) dumpRangeFeedback(h *Handle, ran *ranger.Range, rangeCou
ranges := q.hist.SplitRange([]*ranger.Range{ran})
counts := make([]float64, 0, len(ranges))
sum := 0.0
for _, r := range ranges {
for i, r := range ranges {
// Although `SplitRange` may produce ranges like `[l, r]`, we still use
// `betweenRowCount` to compute the estimation, since the ranges of feedback are all in `[l, r)`
// form. That is to say, we ignore the exclusiveness of the ranges from `SplitRange` and only use
// their boundary values.
count := q.hist.betweenRowCount(r.LowVal[0], r.HighVal[0])
// We have to include `NullCount` of histogram for [l, r) cases where l is null because `betweenRowCount`
// does not include null values of lower bound.
if i == 0 && lowIsNull {
count += float64(q.hist.NullCount)
}
sum += count
counts = append(counts, count)
}
Expand Down
Loading

0 comments on commit 59eb553

Please sign in to comment.