*: improve NULL count estimation for single column index (#9474) (#9979)

pingcap · Apr 2, 2019 · 59eb553 · 59eb553
1 parent 46d5d0e
commit 59eb553
Show file tree

Hide file tree

Showing 12 changed files with 303 additions and 104 deletions.
diff --git a/cmd/explaintest/r/index_join.result b/cmd/explaintest/r/index_join.result
@@ -6,15 +6,15 @@ insert into t2 values(1, 1);
 analyze table t1, t2;
 explain select /*+ TIDB_INLJ(t1, t2) */ * from t1 join t2 on t1.a=t2.a;
 id	count	task	operator info
-IndexJoin_14	1.25	root	inner join, inner:IndexLookUp_13, outer key:test.t2.a, inner key:test.t1.a
+IndexJoin_14	5.00	root	inner join, inner:IndexLookUp_13, outer key:test.t2.a, inner key:test.t1.a
 ├─IndexLookUp_13	5.00	root	
 │ ├─IndexScan_11	5.00	cop	table:t1, index:a, range: decided by [test.t2.a], keep order:false
 │ └─TableScan_12	5.00	cop	table:t1, keep order:false
 └─TableReader_16	1.00	root	data:TableScan_15
   └─TableScan_15	1.00	cop	table:t2, range:[-inf,+inf], keep order:false
 explain select * from t1 join t2 on t1.a=t2.a;
 id	count	task	operator info
-IndexJoin_14	1.25	root	inner join, inner:IndexLookUp_13, outer key:test.t2.a, inner key:test.t1.a
+IndexJoin_14	5.00	root	inner join, inner:IndexLookUp_13, outer key:test.t2.a, inner key:test.t1.a
 ├─IndexLookUp_13	5.00	root	
 │ ├─IndexScan_11	5.00	cop	table:t1, index:a, range: decided by [test.t2.a], keep order:false
 │ └─TableScan_12	5.00	cop	table:t1, keep order:false

diff --git a/executor/analyze.go b/executor/analyze.go
@@ -150,8 +150,9 @@ func analyzeIndexPushdown(idxExec *AnalyzeIndexExec) statistics.AnalyzeResult {
 		Cms:             []*statistics.CMSketch{cms},
 		IsIndex:         1,
 	}
+	result.Count = hist.NullCount
 	if hist.Len() > 0 {
-		result.Count = hist.Buckets[hist.Len()-1].Count
+		result.Count += hist.Buckets[hist.Len()-1].Count
 	}
 	return result
 }
@@ -165,12 +166,16 @@ type AnalyzeIndexExec struct {
 	priority        int
 	analyzePB       *tipb.AnalyzeReq
 	result          distsql.SelectResult
+	countNullRes    distsql.SelectResult
 	maxNumBuckets   uint64
 }
 
-func (e *AnalyzeIndexExec) open() error {
+// fetchAnalyzeResult builds and dispatches the `kv.Request` from given ranges, and stores the `SelectResult`
+// in corresponding fields based on the input `isNullRange` argument, which indicates if the range is the
+// special null range for single-column index to get the null count.
+func (e *AnalyzeIndexExec) fetchAnalyzeResult(ranges []*ranger.Range, isNullRange bool) error {
 	var builder distsql.RequestBuilder
-	kvReq, err := builder.SetIndexRanges(e.ctx.GetSessionVars().StmtCtx, e.physicalTableID, e.idxInfo.ID, ranger.FullRange()).
+	kvReq, err := builder.SetIndexRanges(e.ctx.GetSessionVars().StmtCtx, e.physicalTableID, e.idxInfo.ID, ranges).
 		SetAnalyzeRequest(e.analyzePB).
 		SetKeepOrder(true).
 		SetConcurrency(e.concurrency).
@@ -179,29 +184,51 @@ func (e *AnalyzeIndexExec) open() error {
 		return errors.Trace(err)
 	}
 	ctx := context.TODO()
-	e.result, err = distsql.Analyze(ctx, e.ctx.GetClient(), kvReq, e.ctx.GetSessionVars().KVVars, e.ctx.GetSessionVars().InRestrictedSQL)
+	result, err := distsql.Analyze(ctx, e.ctx.GetClient(), kvReq, e.ctx.GetSessionVars().KVVars, e.ctx.GetSessionVars().InRestrictedSQL)
 	if err != nil {
 		return errors.Trace(err)
 	}
-	e.result.Fetch(ctx)
+	result.Fetch(ctx)
+	if isNullRange {
+		e.countNullRes = result
+	} else {
+		e.result = result
+	}
 	return nil
 }
 
-func (e *AnalyzeIndexExec) buildStats() (hist *statistics.Histogram, cms *statistics.CMSketch, err error) {
-	if err = e.open(); err != nil {
-		return nil, nil, errors.Trace(err)
+func (e *AnalyzeIndexExec) open() error {
+	ranges := ranger.FullRange()
+	// For single-column index, we do not load null rows from TiKV, so the built histogram would not include
+	// null values, and its `NullCount` would be set by result of another distsql call to get null rows.
+	// For multi-column index, we cannot define null for the rows, so we still use full range, and the rows
+	// containing null fields would exist in built histograms. Note that, the `NullCount` of histograms for
+	// multi-column index is always 0 then.
+	if len(e.idxInfo.Columns) == 1 {
+		ranges = ranger.FullNotNullRange()
 	}
-	defer func() {
-		if err1 := e.result.Close(); err1 != nil {
-			hist = nil
-			cms = nil
-			err = errors.Trace(err1)
+	err := e.fetchAnalyzeResult(ranges, false)
+	if err != nil {
+		return err
+	}
+	if len(e.idxInfo.Columns) == 1 {
+		ranges = ranger.NullRange()
+		err = e.fetchAnalyzeResult(ranges, true)
+		if err != nil {
+			return err
 		}
-	}()
-	hist = &statistics.Histogram{}
-	cms = statistics.NewCMSketch(defaultCMSketchDepth, defaultCMSketchWidth)
+	}
+	return nil
+}
+
+func (e *AnalyzeIndexExec) buildStatsFromResult(result distsql.SelectResult, needCMS bool) (*statistics.Histogram, *statistics.CMSketch, error) {
+	hist := &statistics.Histogram{}
+	var cms *statistics.CMSketch
+	if needCMS {
+		cms = statistics.NewCMSketch(defaultCMSketchDepth, defaultCMSketchWidth)
+	}
 	for {
-		data, err := e.result.NextRaw(context.TODO())
+		data, err := result.NextRaw(context.TODO())
 		if err != nil {
 			return nil, nil, errors.Trace(err)
 		}
@@ -215,15 +242,42 @@ func (e *AnalyzeIndexExec) buildStats() (hist *statistics.Histogram, cms *statis
 		}
 		hist, err = statistics.MergeHistograms(e.ctx.GetSessionVars().StmtCtx, hist, statistics.HistogramFromProto(resp.Hist), int(e.maxNumBuckets))
 		if err != nil {
-			return nil, nil, errors.Trace(err)
+			return nil, nil, err
 		}
-		if resp.Cms != nil {
-			err := cms.MergeCMSketch(statistics.CMSketchFromProto(resp.Cms))
-			if err != nil {
-				return nil, nil, errors.Trace(err)
+		if needCMS {
+			if resp.Cms == nil {
+				log.Warnf("nil CMS in response, table is %s, index is %s", e.idxInfo.Table.O, e.idxInfo.Name.O)
+			} else {
+				err := cms.MergeCMSketch(statistics.CMSketchFromProto(resp.Cms))
+				if err != nil {
+					return nil, nil, errors.Trace(err)
+				}
 			}
 		}
 	}
+	return hist, cms, nil
+}
+
+// buildStats opens the analyze request(s) for the index and merges their
+// responses into a histogram and a CM sketch. When a separate null-range
+// result exists (`e.countNullRes`), the count of its last bucket becomes the
+// `NullCount` of the returned histogram. Both select results are closed before
+// returning; on any error the returned histogram and CM sketch are nil.
+func (e *AnalyzeIndexExec) buildStats() (hist *statistics.Histogram, cms *statistics.CMSketch, err error) {
+	if err = e.open(); err != nil {
+		return nil, nil, err
+	}
+	defer func() {
+		// Report the close error only when nothing failed earlier; assigning it
+		// unconditionally would overwrite a build error with nil and make the
+		// caller see (nil, nil, nil).
+		if closeErr := closeAll(e.result, e.countNullRes); closeErr != nil && err == nil {
+			hist, cms, err = nil, nil, closeErr
+		}
+	}()
+	hist, cms, err = e.buildStatsFromResult(e.result, true)
+	if err != nil {
+		return nil, nil, err
+	}
+	if e.countNullRes != nil {
+		// The null-range result only contributes a row count, so no CM sketch
+		// is built for it.
+		nullHist, _, nullErr := e.buildStatsFromResult(e.countNullRes, false)
+		if nullErr != nil {
+			return nil, nil, nullErr
+		}
+		if l := nullHist.Len(); l > 0 {
+			hist.NullCount = nullHist.Buckets[l-1].Count
+		}
+	}
 	hist.ID = e.idxInfo.ID
 	return hist, cms, nil
 }

diff --git a/executor/analyze_test.go b/executor/analyze_test.go
@@ -55,7 +55,7 @@ PARTITION BY RANGE ( a ) (
 	for _, def := range pi.Definitions {
 		statsTbl := handle.GetPartitionStats(table.Meta(), def.ID)
 		c.Assert(statsTbl.Pseudo, IsFalse)
-		c.Assert(len(statsTbl.Columns), Equals, 2)
+		c.Assert(len(statsTbl.Columns), Equals, 3)
 		c.Assert(len(statsTbl.Indices), Equals, 1)
 		for _, col := range statsTbl.Columns {
 			c.Assert(col.Len(), Greater, 0)
@@ -81,7 +81,7 @@ PARTITION BY RANGE ( a ) (
 		statsTbl := handle.GetPartitionStats(table.Meta(), def.ID)
 		if i == 0 {
 			c.Assert(statsTbl.Pseudo, IsFalse)
-			c.Assert(len(statsTbl.Columns), Equals, 2)
+			c.Assert(len(statsTbl.Columns), Equals, 3)
 			c.Assert(len(statsTbl.Indices), Equals, 1)
 		} else {
 			c.Assert(statsTbl.Pseudo, IsTrue)

diff --git a/executor/show_stats_test.go b/executor/show_stats_test.go
@@ -109,5 +109,56 @@ func (s *testSuite) TestShowStatsHasNullValue(c *C) {
 	tk.MustExec("create table t (a int, index idx(a))")
 	tk.MustExec("insert into t values(NULL)")
 	tk.MustExec("analyze table t")
-	tk.MustQuery("show stats_buckets").Check(testkit.Rows("test t idx 1 0 1 1 NULL NULL"))
+	// Null values are excluded from histogram for single-column index.
+	tk.MustQuery("show stats_buckets").Check(testkit.Rows())
+	tk.MustExec("insert into t values(1)")
+	tk.MustExec("analyze table t")
+	tk.MustQuery("show stats_buckets").Sort().Check(testkit.Rows(
+		"test t a 0 0 1 1 1 1",
+		"test t idx 1 0 1 1 1 1",
+	))
+	tk.MustExec("drop table t")
+	tk.MustExec("create table t (a int, b int, index idx(a, b))")
+	tk.MustExec("insert into t values(NULL, NULL)")
+	tk.MustExec("analyze table t")
+	tk.MustQuery("show stats_buckets").Check(testkit.Rows("test t idx 1 0 1 1 (NULL, NULL) (NULL, NULL)"))
+
+	tk.MustExec("drop table t")
+	tk.MustExec("create table t(a int, b int, c int, index idx_b(b), index idx_c_a(c, a))")
+	tk.MustExec("insert into t values(1,null,1),(2,null,2),(3,3,3),(4,null,4),(null,null,null)")
+	res := tk.MustQuery("show stats_histograms where table_name = 't'")
+	c.Assert(len(res.Rows()), Equals, 0)
+	tk.MustExec("analyze table t index idx_b")
+	res = tk.MustQuery("show stats_histograms where table_name = 't' and column_name = 'idx_b'")
+	c.Assert(len(res.Rows()), Equals, 1)
+	c.Assert(res.Rows()[0][6], Equals, "4")
+	res = tk.MustQuery("show stats_histograms where table_name = 't' and column_name = 'b'")
+	c.Assert(len(res.Rows()), Equals, 0)
+	tk.MustExec("analyze table t index idx_c_a")
+	res = tk.MustQuery("show stats_histograms where table_name = 't' and column_name = 'idx_c_a'")
+	c.Assert(len(res.Rows()), Equals, 1)
+	c.Assert(res.Rows()[0][6], Equals, "0")
+	res = tk.MustQuery("show stats_histograms where table_name = 't' and column_name = 'c'")
+	c.Assert(len(res.Rows()), Equals, 0)
+	res = tk.MustQuery("show stats_histograms where table_name = 't' and column_name = 'a'")
+	c.Assert(len(res.Rows()), Equals, 0)
+	tk.MustExec("truncate table t")
+	tk.MustExec("insert into t values(1,null,1),(2,null,2),(3,3,3),(4,null,4),(null,null,null)")
+	res = tk.MustQuery("show stats_histograms where table_name = 't'")
+	c.Assert(len(res.Rows()), Equals, 0)
+	tk.MustExec("analyze table t index")
+	res = tk.MustQuery("show stats_histograms where table_name = 't'").Sort()
+	c.Assert(len(res.Rows()), Equals, 2)
+	c.Assert(res.Rows()[0][6], Equals, "4")
+	c.Assert(res.Rows()[1][6], Equals, "0")
+	tk.MustExec("truncate table t")
+	tk.MustExec("insert into t values(1,null,1),(2,null,2),(3,3,3),(4,null,4),(null,null,null)")
+	tk.MustExec("analyze table t")
+	res = tk.MustQuery("show stats_histograms where table_name = 't'").Sort()
+	c.Assert(len(res.Rows()), Equals, 5)
+	c.Assert(res.Rows()[0][6], Equals, "1")
+	c.Assert(res.Rows()[1][6], Equals, "4")
+	c.Assert(res.Rows()[2][6], Equals, "1")
+	c.Assert(res.Rows()[3][6], Equals, "4")
+	c.Assert(res.Rows()[4][6], Equals, "0")
 }
diff --git a/planner/core/cbo_test.go b/planner/core/cbo_test.go
@@ -278,7 +278,7 @@ func (s *testAnalyzeSuite) TestIndexRead(c *C) {
 		},
 		{
 			sql:  "select count(*) from t where e > 1 group by b",
-			best: "IndexLookUp(Index(t.b)[[NULL,+inf]], Table(t)->Sel([gt(test.t.e, 1)]))->StreamAgg",
+			best: "TableReader(Table(t)->Sel([gt(test.t.e, 1)])->HashAgg)->HashAgg",
 		},
 		{
 			sql:  "select count(e) from t where t.b <= 20",
@@ -453,7 +453,7 @@ func (s *testAnalyzeSuite) TestAnalyze(c *C) {
 	}{
 		{
 			sql:  "analyze table t3",
-			best: "Analyze{Index(a),Table(b)}",
+			best: "Analyze{Index(a),Table(a, b)}",
 		},
 		// Test analyze full table.
 		{
@@ -676,11 +676,11 @@ func (s *testAnalyzeSuite) TestCorrelatedEstimation(c *C) {
 			"  ├─TableReader_12 10.00 root data:TableScan_11",
 			"  │ └─TableScan_11 10.00 cop table:t, range:[-inf,+inf], keep order:false",
 			"  └─MaxOneRow_13 1.00 root ",
-			"    └─Projection_14 0.80 root concat(cast(t1.a), \",\", cast(t1.b))",
-			"      └─IndexLookUp_21 0.80 root ",
-			"        ├─IndexScan_18 1.25 cop table:t1, index:c, range: decided by [eq(t1.c, test.t.c)], keep order:false",
-			"        └─Selection_20 0.80 cop eq(t1.a, test.t.a)",
-			"          └─TableScan_19 1.25 cop table:t, keep order:false",
+			"    └─Projection_14 0.10 root concat(cast(t1.a), \",\", cast(t1.b))",
+			"      └─IndexLookUp_21 0.10 root ",
+			"        ├─IndexScan_18 1.00 cop table:t1, index:c, range: decided by [eq(t1.c, test.t.c)], keep order:false",
+			"        └─Selection_20 0.10 cop eq(t1.a, test.t.a)",
+			"          └─TableScan_19 1.00 cop table:t, keep order:false",
 		))
 }
 

diff --git a/planner/core/planbuilder.go b/planner/core/planbuilder.go
@@ -643,35 +643,16 @@ func (b *planBuilder) buildCheckIndexSchema(tn *ast.TableName, indexName string)
 // getColsInfo returns the info of index columns, normal columns and primary key.
 func getColsInfo(tn *ast.TableName) (indicesInfo []*model.IndexInfo, colsInfo []*model.ColumnInfo, pkCol *model.ColumnInfo) {
 	tbl := tn.TableInfo
-	// idxNames contains all the normal columns that can be analyzed more effectively, because those columns occur as index
-	// columns or primary key columns with integer type.
-	var idxNames []string
-	if tbl.PKIsHandle {
-		for _, col := range tbl.Columns {
-			if mysql.HasPriKeyFlag(col.Flag) {
-				idxNames = append(idxNames, col.Name.L)
-				pkCol = col
-			}
+	for _, col := range tbl.Columns {
+		if tbl.PKIsHandle && mysql.HasPriKeyFlag(col.Flag) {
+			pkCol = col
+		} else {
+			colsInfo = append(colsInfo, col)
 		}
 	}
 	for _, idx := range tn.TableInfo.Indices {
 		if idx.State == model.StatePublic {
 			indicesInfo = append(indicesInfo, idx)
-			if len(idx.Columns) == 1 {
-				idxNames = append(idxNames, idx.Columns[0].Name.L)
-			}
-		}
-	}
-	for _, col := range tbl.Columns {
-		isIndexCol := false
-		for _, idx := range idxNames {
-			if idx == col.Name.L {
-				isIndexCol = true
-				break
-			}
-		}
-		if !isIndexCol {
-			colsInfo = append(colsInfo, col)
 		}
 	}
 	return

diff --git a/statistics/feedback.go b/statistics/feedback.go
@@ -1073,6 +1073,7 @@ func dumpFeedbackForIndex(h *Handle, q *QueryFeedback, t *Table) error {
 }
 
 func (q *QueryFeedback) dumpRangeFeedback(h *Handle, ran *ranger.Range, rangeCount float64) error {
+	lowIsNull := ran.LowVal[0].IsNull()
 	if q.tp == indexType {
 		sc := &stmtctx.StatementContext{TimeZone: time.UTC}
 		lower, err := codec.EncodeKey(sc, nil, ran.LowVal[0])
@@ -1099,8 +1100,17 @@ func (q *QueryFeedback) dumpRangeFeedback(h *Handle, ran *ranger.Range, rangeCou
 	ranges := q.hist.SplitRange([]*ranger.Range{ran})
 	counts := make([]float64, 0, len(ranges))
 	sum := 0.0
-	for _, r := range ranges {
+	for i, r := range ranges {
+		// Though after `SplitRange`, we may have ranges like `[l, r]`, we still use
+		// `betweenRowCount` to compute the estimation since the ranges of feedback are all in `[l, r)`
+		// form, that is to say, we ignore the exclusiveness of ranges from `SplitRange` and just use
+		// its result of boundary values.
 		count := q.hist.betweenRowCount(r.LowVal[0], r.HighVal[0])
+		// We have to include `NullCount` of histogram for [l, r) cases where l is null because `betweenRowCount`
+		// does not include null values of lower bound.
+		if i == 0 && lowIsNull {
+			count += float64(q.hist.NullCount)
+		}
 		sum += count
 		counts = append(counts, count)
 	}