From 60b29efa6b76c9ea32ee1b60b0b09bc85044ff31 Mon Sep 17 00:00:00 2001 From: Kenan Yao Date: Wed, 13 Mar 2019 18:09:46 +0800 Subject: [PATCH] statistics: set correlation when loading needed column histogram --- statistics/handle.go | 2 +- statistics/histogram.go | 23 ++++++++++++++++++++++- statistics/table.go | 22 +++------------------- statistics/update_test.go | 23 +++++++++++++++++++++++ 4 files changed, 49 insertions(+), 21 deletions(-) diff --git a/statistics/handle.go b/statistics/handle.go index 4b48aedc92edd..a2c15a33c5616 100644 --- a/statistics/handle.go +++ b/statistics/handle.go @@ -256,7 +256,7 @@ func (h *Handle) LoadNeededHistograms() error { histogramNeededColumns.delete(col) continue } - hg, err := h.histogramFromStorage(col.tableID, c.ID, &c.Info.FieldType, c.NDV, 0, c.LastUpdateVersion, c.NullCount, c.TotColSize) + hg, err := h.histogramFromStorage(col.tableID, c.ID, &c.Info.FieldType, c.NDV, 0, c.LastUpdateVersion, c.NullCount, c.TotColSize, c.Correlation) if err != nil { return errors.Trace(err) } diff --git a/statistics/histogram.go b/statistics/histogram.go index 9d8263ae5cbc4..052ca6ccab340 100644 --- a/statistics/histogram.go +++ b/statistics/histogram.go @@ -301,7 +301,7 @@ func (h *Handle) SaveMetaToStorage(tableID, count, modifyCount int64) (err error return } -func (h *Handle) histogramFromStorage(tableID int64, colID int64, tp *types.FieldType, distinct int64, isIndex int, ver uint64, nullCount int64, totColSize int64) (*Histogram, error) { +func (h *Handle) histogramFromStorage(tableID int64, colID int64, tp *types.FieldType, distinct int64, isIndex int, ver uint64, nullCount int64, totColSize int64, corr float64) (*Histogram, error) { selSQL := fmt.Sprintf("select count, repeats, lower_bound, upper_bound from mysql.stats_buckets where table_id = %d and is_index = %d and hist_id = %d order by bucket_id", tableID, isIndex, colID) rows, fields, err := h.restrictedExec.ExecRestrictedSQL(nil, selSQL) if err != nil { @@ -309,6 +309,7 @@ func (h *Handle) histogramFromStorage(tableID int64, colID int64, tp *types.Fiel } bucketSize := len(rows) hg := NewHistogram(colID, distinct, nullCount, ver, tp, bucketSize, totColSize) + hg.Correlation = corr totalCount := int64(0) for i := 0; i < bucketSize; i++ { count := rows[i].GetInt64(0) @@ -759,6 +760,21 @@ func (c *Column) String() string { return c.Histogram.ToString(0) } +var histogramNeededColumns = neededColumnMap{cols: map[tableColumnID]struct{}{}} + +// IsInvalid checks if this column is invalid. If this column has histogram but not loaded yet, then we mark it +// as need histogram. +func (c *Column) IsInvalid(sc *stmtctx.StatementContext, collPseudo bool) bool { + if collPseudo && c.NotAccurate() { + return true + } + if c.NDV > 0 && c.Len() == 0 && sc != nil { + sc.SetHistogramsNotLoad() + histogramNeededColumns.insert(tableColumnID{tableID: c.PhysicalID, columnID: c.Info.ID}) + } + return c.totalRowCount() == 0 || (c.NDV > 0 && c.Len() == 0) +} + func (c *Column) equalRowCount(sc *stmtctx.StatementContext, val types.Datum, modifyCount int64) (float64, error) { if val.IsNull() { return float64(c.NullCount), nil @@ -839,6 +855,11 @@ func (idx *Index) String() string { return idx.Histogram.ToString(len(idx.Info.Columns)) } +// IsInvalid checks if this index is invalid. +func (idx *Index) IsInvalid(collPseudo bool) bool { + return (collPseudo && idx.NotAccurate()) || idx.totalRowCount() == 0 +} + func (idx *Index) equalRowCount(sc *stmtctx.StatementContext, b []byte, modifyCount int64) float64 { val := types.NewBytesDatum(b) if idx.NDV > 0 && idx.outOfRange(val) { diff --git a/statistics/table.go b/statistics/table.go index a87a34eba9e06..8ac5926cc878f 100644 --- a/statistics/table.go +++ b/statistics/table.go @@ -122,7 +122,7 @@ func (h *Handle) indexStatsFromStorage(row chunk.Row, table *Table, tableInfo *m continue } if idx == nil || idx.LastUpdateVersion < histVer { - hg, err := h.histogramFromStorage(table.PhysicalID, histID, types.NewFieldType(mysql.TypeBlob), distinct, 1, histVer, nullCount, 0) + hg, err := h.histogramFromStorage(table.PhysicalID, histID, types.NewFieldType(mysql.TypeBlob), distinct, 1, histVer, nullCount, 0, 0) if err != nil { return errors.Trace(err) } @@ -189,11 +189,10 @@ func (h *Handle) columnStatsFromStorage(row chunk.Row, table *Table, tableInfo * break } if col == nil || col.LastUpdateVersion < histVer || loadAll { - hg, err := h.histogramFromStorage(table.PhysicalID, histID, &colInfo.FieldType, distinct, 0, histVer, nullCount, totColSize) + hg, err := h.histogramFromStorage(table.PhysicalID, histID, &colInfo.FieldType, distinct, 0, histVer, nullCount, totColSize, correlation) if err != nil { return errors.Trace(err) } - hg.Correlation = correlation cms, err := h.cmSketchFromStorage(table.PhysicalID, 0, colInfo.ID) if err != nil { return errors.Trace(err) @@ -333,8 +332,6 @@ func (n *neededColumnMap) delete(col tableColumnID) { n.m.Unlock() } -var histogramNeededColumns = neededColumnMap{cols: map[tableColumnID]struct{}{}} - // RatioOfPseudoEstimate means if modifyCount / statsTblCount is greater than this ratio, we think the stats is invalid // and use pseudo estimation. var RatioOfPseudoEstimate = 0.7 @@ -347,19 +344,6 @@ func (t *Table) IsOutdated() bool { return false } -// IsInvalid checks if this column is invalid. If this column has histogram but not loaded yet, then we mark it -// as need histogram. -func (c *Column) IsInvalid(sc *stmtctx.StatementContext, collPseudo bool) bool { - if collPseudo && c.NotAccurate() { - return true - } - if c.NDV > 0 && c.Len() == 0 && sc != nil { - sc.SetHistogramsNotLoad() - histogramNeededColumns.insert(tableColumnID{tableID: c.PhysicalID, columnID: c.Info.ID}) - } - return c.totalRowCount() == 0 || (c.NDV > 0 && c.Len() == 0) -} - // ColumnGreaterRowCount estimates the row count where the column greater than value. func (t *Table) ColumnGreaterRowCount(sc *stmtctx.StatementContext, value types.Datum, colID int64) float64 { c, ok := t.Columns[colID] @@ -429,7 +413,7 @@ func (coll *HistColl) GetRowCountByColumnRanges(sc *stmtctx.StatementContext, co // GetRowCountByIndexRanges estimates the row count by a slice of Range. func (coll *HistColl) GetRowCountByIndexRanges(sc *stmtctx.StatementContext, idxID int64, indexRanges []*ranger.Range) (float64, error) { idx := coll.Indices[idxID] - if idx == nil || coll.Pseudo && idx.NotAccurate() || idx.Len() == 0 { + if idx == nil || idx.IsInvalid(coll.Pseudo) { colsLen := -1 if idx != nil && idx.Info.Unique { colsLen = len(idx.Info.Columns) diff --git a/statistics/update_test.go b/statistics/update_test.go index 4f8e20ef20d9a..d6c9b4e4812b5 100644 --- a/statistics/update_test.go +++ b/statistics/update_test.go @@ -1493,3 +1493,26 @@ func (s *testStatsSuite) TestUnsignedFeedbackRanges(c *C) { c.Assert(tbl.Columns[1].ToString(0), Equals, tests[i].hist) } } + +func (s *testStatsSuite) TestLoadHistCorrelation(c *C) { + defer cleanEnv(c, s.store, s.do) + testKit := testkit.NewTestKit(c, s.store) + h := s.do.StatsHandle() + origLease := h.Lease + h.Lease = time.Second + defer func() { h.Lease = origLease }() + testKit.MustExec("use test") + testKit.MustExec("create table t(c int)") + testKit.MustExec("insert into t values(1),(2),(3),(4),(5)") + c.Assert(h.DumpStatsDeltaToKV(statistics.DumpAll), IsNil) + testKit.MustExec("analyze table t") + h.Clear() + c.Assert(h.Update(s.do.InfoSchema()), IsNil) + result := testKit.MustQuery("show stats_histograms where Table_name = 't'") + c.Assert(len(result.Rows()), Equals, 0) + testKit.MustExec("explain select * from t where c = 1") + c.Assert(h.LoadNeededHistograms(), IsNil) + result = testKit.MustQuery("show stats_histograms where Table_name = 't'") + c.Assert(len(result.Rows()), Equals, 1) + c.Assert(result.Rows()[0][9], Equals, "1") +}