diff --git a/executor/analyze.go b/executor/analyze.go index 9c0e671d77270..12dde20fe0df1 100644 --- a/executor/analyze.go +++ b/executor/analyze.go @@ -24,6 +24,7 @@ import ( "sync/atomic" "time" + "github.com/cznic/mathutil" "github.com/pingcap/errors" "github.com/pingcap/kvproto/pkg/debugpb" "github.com/pingcap/parser/model" @@ -160,8 +161,8 @@ var errAnalyzeWorkerPanic = errors.New("analyze worker panic") func (e *AnalyzeExec) analyzeWorker(taskCh <-chan *analyzeTask, resultCh chan<- analyzeResult, isCloseChanThread bool) { defer func() { e.wg.Done() - e.wg.Wait() if isCloseChanThread { + e.wg.Wait() close(resultCh) } if r := recover(); r != nil { @@ -190,6 +191,8 @@ func (e *AnalyzeExec) analyzeWorker(taskCh <-chan *analyzeTask, resultCh chan<- task.job.Start() resultCh <- analyzeIndexPushdown(task.idxExec) case fastTask: + task.fastExec.job = task.job + task.job.Start() for _, result := range analyzeFastExec(task.fastExec) { resultCh <- result } @@ -507,7 +510,7 @@ func (e *AnalyzeColumnsExec) buildStats() (hists []*statistics.Histogram, cms [] func analyzeFastExec(exec *AnalyzeFastExec) []analyzeResult { hists, cms, err := exec.buildStats() if err != nil { - return []analyzeResult{{Err: err}} + return []analyzeResult{{Err: err, job: exec.job}} } var results []analyzeResult hasPKInfo := 0 @@ -522,6 +525,7 @@ func analyzeFastExec(exec *AnalyzeFastExec) []analyzeResult { Cms: []*statistics.CMSketch{cms[i]}, IsIndex: 1, Count: hists[i].NullCount, + job: exec.job, } if hists[i].Len() > 0 { idxResult.Count += hists[i].Buckets[hists[i].Len()-1].Count @@ -535,6 +539,7 @@ func analyzeFastExec(exec *AnalyzeFastExec) []analyzeResult { Hist: hists[:hasPKInfo+len(exec.colsInfo)], Cms: cms[:hasPKInfo+len(exec.colsInfo)], Count: hist.NullCount, + job: exec.job, } if hist.Len() > 0 { colResult.Count += hist.Buckets[hist.Len()-1].Count @@ -560,6 +565,7 @@ type AnalyzeFastExec struct { idxsInfo []*model.IndexInfo concurrency int maxNumBuckets uint64 + tblInfo *model.TableInfo cache *tikv.RegionCache wg *sync.WaitGroup sampLocs chan *tikv.KeyLocation @@ -569,6 +575,7 @@ type AnalyzeFastExec struct { scanTasks []*tikv.KeyLocation collectors []*statistics.SampleCollector randSeed int64 + job *statistics.AnalyzeJob } func (e *AnalyzeFastExec) getSampRegionsRowCount(bo *tikv.Backoffer, needRebuild *bool, err *error, sampTasks *[]*AnalyzeFastTask) { @@ -627,12 +634,12 @@ func (e *AnalyzeFastExec) getSampRegionsRowCount(bo *tikv.Backoffer, needRebuild } } -// buildSampTask return tow variable, the first bool is whether the task meeting region error +// buildSampTask returns tow variables, the first bool is whether the task meets region error // and need to rebuild. func (e *AnalyzeFastExec) buildSampTask() (needRebuild bool, err error) { // Do get regions row count. bo := tikv.NewBackoffer(context.Background(), 500) - atomic.StoreUint64(&e.rowCount, 0) + e.rowCount = 0 needRebuildForRoutine := make([]bool, e.concurrency) errs := make([]error, e.concurrency) sampTasksForRoutine := make([][]*AnalyzeFastTask, e.concurrency) @@ -721,6 +728,11 @@ func (e *AnalyzeFastExec) updateCollectorSamples(sValue []byte, sKey kv.Key, sam if err != nil { return err } + var rowID int64 + rowID, err = tablecodec.DecodeRowKey(sKey) + if err != nil { + return err + } // Update the primary key collector. if hasPKInfo > 0 { v, ok := values[e.pkInfo.ID] @@ -735,7 +747,7 @@ func (e *AnalyzeFastExec) updateCollectorSamples(sValue []byte, sKey kv.Key, sam if e.collectors[0].Samples[samplePos] == nil { e.collectors[0].Samples[samplePos] = &statistics.SampleItem{} } - e.collectors[0].Samples[samplePos].Ordinal = int(samplePos) + e.collectors[0].Samples[samplePos].RowID = rowID e.collectors[0].Samples[samplePos].Value = v } // Update the columns' collectors. @@ -747,7 +759,7 @@ func (e *AnalyzeFastExec) updateCollectorSamples(sValue []byte, sKey kv.Key, sam if e.collectors[hasPKInfo+j].Samples[samplePos] == nil { e.collectors[hasPKInfo+j].Samples[samplePos] = &statistics.SampleItem{} } - e.collectors[hasPKInfo+j].Samples[samplePos].Ordinal = int(samplePos) + e.collectors[hasPKInfo+j].Samples[samplePos].RowID = rowID e.collectors[hasPKInfo+j].Samples[samplePos].Value = v } // Update the indexes' collectors. @@ -773,7 +785,7 @@ func (e *AnalyzeFastExec) updateCollectorSamples(sValue []byte, sKey kv.Key, sam if e.collectors[len(e.colsInfo)+hasPKInfo+j].Samples[samplePos] == nil { e.collectors[len(e.colsInfo)+hasPKInfo+j].Samples[samplePos] = &statistics.SampleItem{} } - e.collectors[len(e.colsInfo)+hasPKInfo+j].Samples[samplePos].Ordinal = int(samplePos) + e.collectors[len(e.colsInfo)+hasPKInfo+j].Samples[samplePos].RowID = rowID e.collectors[len(e.colsInfo)+hasPKInfo+j].Samples[samplePos].Value = types.NewBytesDatum(bytes) } return nil @@ -844,9 +856,7 @@ func (e *AnalyzeFastExec) handleScanTasks(bo *tikv.Backoffer) error { } func (e *AnalyzeFastExec) handleSampTasks(bo *tikv.Backoffer, workID int, err *error) { - defer func() { - e.wg.Done() - }() + defer e.wg.Done() var snapshot kv.Snapshot snapshot, *err = e.ctx.GetStore().(tikv.Storage).GetSnapshot(kv.MaxVersion) rander := rand.New(rand.NewSource(e.randSeed + int64(workID))) @@ -869,9 +879,6 @@ func (e *AnalyzeFastExec) handleSampTasks(bo *tikv.Backoffer, workID int, err *e if *err != nil { return } - if maxRowID <= minRowID { - continue - } keys := make([]kv.Key, 0, task.SampSize) for i := 0; i < int(task.SampSize); i++ { @@ -897,8 +904,37 @@ func (e *AnalyzeFastExec) handleSampTasks(bo *tikv.Backoffer, workID int, err *e } func (e *AnalyzeFastExec) buildHist(ID int64, collector *statistics.SampleCollector, tp *types.FieldType) (*statistics.Histogram, error) { - // TODO: build histogram and cmsketch here for one collector. - return nil, nil + // build collector properties. + collector.Samples = collector.Samples[:e.sampCursor] + sort.Slice(collector.Samples, func(i, j int) bool { return collector.Samples[i].RowID < collector.Samples[j].RowID }) + collector.CalcTotalSize() + data := make([][]byte, 0, len(collector.Samples)) + for i, sample := range collector.Samples { + sample.Ordinal = i + if sample.Value.IsNull() { + collector.NullCount++ + continue + } + bytes, err := tablecodec.EncodeValue(e.ctx.GetSessionVars().StmtCtx, sample.Value) + if err != nil { + return nil, err + } + data = append(data, bytes) + } + stats := domain.GetDomain(e.ctx).StatsHandle() + rowCount := int64(e.rowCount) + if stats.Lease > 0 { + rowCount = mathutil.MinInt64(stats.GetTableStats(e.tblInfo).Count, rowCount) + } + // build CMSketch + var ndv, scaleRatio uint64 + collector.CMSketch, ndv, scaleRatio = statistics.NewCMSketchWithTopN(defaultCMSketchDepth, defaultCMSketchWidth, data, 20, uint64(rowCount)) + // build Histogram + hist, err := statistics.BuildColumnHist(e.ctx, int64(e.maxNumBuckets), ID, collector, tp, rowCount, int64(ndv), collector.NullCount*int64(scaleRatio)) + if err != nil { + return nil, err + } + return hist, nil } func (e *AnalyzeFastExec) runTasks() ([]*statistics.Histogram, []*statistics.CMSketch, error) { @@ -974,6 +1010,8 @@ func (e *AnalyzeFastExec) buildStats() (hists []*statistics.Histogram, cms []*st return nil, nil, errors.Errorf(errMsg, maxBuildTimes) } + defer e.job.Update(int64(e.rowCount)) + // If total row count of the table is smaller than 2*MaxSampleSize, we // translate all the sample tasks to scan tasks. if e.rowCount < uint64(MaxSampleSize)*2 { @@ -1009,6 +1047,7 @@ type AnalyzeTestFastExec struct { IdxsInfo []*model.IndexInfo Concurrency int Collectors []*statistics.SampleCollector + TblInfo *model.TableInfo } // TestFastSample only test the fast sample in unit test. @@ -1020,6 +1059,8 @@ func (e *AnalyzeTestFastExec) TestFastSample() error { e.concurrency = e.Concurrency e.physicalTableID = e.PhysicalTableID e.wg = &sync.WaitGroup{} + e.job = &statistics.AnalyzeJob{} + e.tblInfo = e.TblInfo _, _, err := e.buildStats() e.Collectors = e.collectors return err diff --git a/executor/analyze_test.go b/executor/analyze_test.go index 1c9c8c7060649..e4407e88e2df3 100644 --- a/executor/analyze_test.go +++ b/executor/analyze_test.go @@ -183,6 +183,7 @@ func (s *testSuite1) TestAnalyzeFastSample(c *C) { IdxsInfo: indicesInfo, Concurrency: 1, PhysicalTableID: tbl.(table.PhysicalTable).GetPhysicalID(), + TblInfo: tblInfo, } err = mockExec.TestFastSample() c.Assert(err, IsNil) @@ -197,5 +198,87 @@ func (s *testSuite1) TestAnalyzeFastSample(c *C) { vals[i] = append(vals[i], s) } } - c.Assert(fmt.Sprintln(vals), Equals, "[[0 34 35 57 4 24 6 25 58 9 10 11 12 30 14 52 29 17 44 54] [0 34 35 57 4 24 6 25 58 9 10 11 12 30 14 52 29 17 44 54]]\n") + c.Assert(fmt.Sprintln(vals), Equals, "[[0 4 6 9 10 11 12 14 17 24 25 29 30 34 35 44 52 54 57 58] [0 4 6 9 10 11 12 14 17 24 25 29 30 34 35 44 52 54 57 58]]\n") +} + +func (s *testSuite1) TestFastAnalyze(c *C) { + cluster := mocktikv.NewCluster() + mocktikv.BootstrapWithSingleStore(cluster) + store, err := mockstore.NewMockTikvStore( + mockstore.WithCluster(cluster), + ) + c.Assert(err, IsNil) + var dom *domain.Domain + dom, err = session.BootstrapSession(store) + c.Assert(err, IsNil) + tk := testkit.NewTestKit(c, store) + executor.MaxSampleSize = 1000 + executor.RandSeed = 123 + + tk.MustExec("use test") + tk.MustExec("drop table if exists t") + tk.MustExec("create table t(a int primary key, b int, index index_b(b))") + tk.MustExec("set @@session.tidb_enable_fast_analyze=1") + tk.MustExec("set @@session.tidb_build_stats_concurrency=1") + for i := 0; i < 3000; i++ { + tk.MustExec(fmt.Sprintf("insert into t values (%d, %d)", i, i)) + } + tblInfo, err := dom.InfoSchema().TableByName(model.NewCIStr("test"), model.NewCIStr("t")) + c.Assert(err, IsNil) + tid := tblInfo.Meta().ID + + // construct 5 regions split by {600, 1200, 1800, 2400} + splitKeys := generateTableSplitKeyForInt(tid, []int{600, 1200, 1800, 2400}) + manipulateCluster(cluster, splitKeys) + + tk.MustExec("analyze table t with 5 buckets") + + is := executor.GetInfoSchema(tk.Se.(sessionctx.Context)) + table, err := is.TableByName(model.NewCIStr("test"), model.NewCIStr("t")) + c.Assert(err, IsNil) + tableInfo := table.Meta() + tbl := dom.StatsHandle().GetTableStats(tableInfo) + sTbl := fmt.Sprintln(tbl) + matched := false + if sTbl == "Table:37 Count:3000\n"+ + "column:1 ndv:3000 totColSize:0\n"+ + "num: 603 lower_bound: 6 upper_bound: 612 repeats: 1\n"+ + "num: 603 lower_bound: 621 upper_bound: 1205 repeats: 1\n"+ + "num: 603 lower_bound: 1207 upper_bound: 1830 repeats: 1\n"+ + "num: 603 lower_bound: 1831 upper_bound: 2387 repeats: 1\n"+ + "num: 588 lower_bound: 2390 upper_bound: 2997 repeats: 1\n"+ + "column:2 ndv:3000 totColSize:0\n"+ + "num: 603 lower_bound: 6 upper_bound: 612 repeats: 1\n"+ + "num: 603 lower_bound: 621 upper_bound: 1205 repeats: 1\n"+ + "num: 603 lower_bound: 1207 upper_bound: 1830 repeats: 1\n"+ + "num: 603 lower_bound: 1831 upper_bound: 2387 repeats: 1\n"+ + "num: 588 lower_bound: 2390 upper_bound: 2997 repeats: 1\n"+ + "index:1 ndv:3000\n"+ + "num: 603 lower_bound: 6 upper_bound: 612 repeats: 1\n"+ + "num: 603 lower_bound: 621 upper_bound: 1205 repeats: 1\n"+ + "num: 603 lower_bound: 1207 upper_bound: 1830 repeats: 1\n"+ + "num: 603 lower_bound: 1831 upper_bound: 2387 repeats: 1\n"+ + "num: 588 lower_bound: 2390 upper_bound: 2997 repeats: 1\n" || + sTbl == "Table:37 Count:3000\n"+ + "column:2 ndv:3000 totColSize:0\n"+ + "num: 603 lower_bound: 6 upper_bound: 612 repeats: 1\n"+ + "num: 603 lower_bound: 621 upper_bound: 1205 repeats: 1\n"+ + "num: 603 lower_bound: 1207 upper_bound: 1830 repeats: 1\n"+ + "num: 603 lower_bound: 1831 upper_bound: 2387 repeats: 1\n"+ + "num: 588 lower_bound: 2390 upper_bound: 2997 repeats: 1\n"+ + "column:1 ndv:3000 totColSize:0\n"+ + "num: 603 lower_bound: 6 upper_bound: 612 repeats: 1\n"+ + "num: 603 lower_bound: 621 upper_bound: 1205 repeats: 1\n"+ + "num: 603 lower_bound: 1207 upper_bound: 1830 repeats: 1\n"+ + "num: 603 lower_bound: 1831 upper_bound: 2387 repeats: 1\n"+ + "num: 588 lower_bound: 2390 upper_bound: 2997 repeats: 1\n"+ + "index:1 ndv:3000\n"+ + "num: 603 lower_bound: 6 upper_bound: 612 repeats: 1\n"+ + "num: 603 lower_bound: 621 upper_bound: 1205 repeats: 1\n"+ + "num: 603 lower_bound: 1207 upper_bound: 1830 repeats: 1\n"+ + "num: 603 lower_bound: 1831 upper_bound: 2387 repeats: 1\n"+ + "num: 588 lower_bound: 2390 upper_bound: 2997 repeats: 1\n" { + matched = true + } + c.Assert(matched, Equals, true) } diff --git a/executor/builder.go b/executor/builder.go index 55142e42995cf..11de4897155ac 100644 --- a/executor/builder.go +++ b/executor/builder.go @@ -1435,9 +1435,11 @@ func (b *executorBuilder) buildAnalyzeFastColumn(e *AnalyzeExec, task plannercor colsInfo: task.ColsInfo, pkInfo: task.PKInfo, maxNumBuckets: maxNumBuckets, + tblInfo: task.TblInfo, concurrency: concurrency, wg: &sync.WaitGroup{}, }, + job: &statistics.AnalyzeJob{DBName: task.DBName, TableName: task.TableName, PartitionName: task.PartitionName, JobInfo: "fast analyze columns"}, }) } } @@ -1464,9 +1466,11 @@ func (b *executorBuilder) buildAnalyzeFastIndex(e *AnalyzeExec, task plannercore physicalTableID: task.PhysicalTableID, idxsInfo: []*model.IndexInfo{task.IndexInfo}, maxNumBuckets: maxNumBuckets, + tblInfo: task.TblInfo, concurrency: concurrency, wg: &sync.WaitGroup{}, }, + job: &statistics.AnalyzeJob{DBName: task.DBName, TableName: task.TableName, PartitionName: "fast analyze index " + task.IndexInfo.Name.O}, }) } } diff --git a/planner/core/common_plans.go b/planner/core/common_plans.go index 933c48e4697cc..0c63590a7e507 100644 --- a/planner/core/common_plans.go +++ b/planner/core/common_plans.go @@ -427,20 +427,20 @@ type analyzeInfo struct { PartitionName string // PhysicalTableID is the id for a partition or a table. PhysicalTableID int64 - PKInfo *model.ColumnInfo - ColsInfo []*model.ColumnInfo } // AnalyzeColumnsTask is used for analyze columns. type AnalyzeColumnsTask struct { PKInfo *model.ColumnInfo ColsInfo []*model.ColumnInfo + TblInfo *model.TableInfo analyzeInfo } // AnalyzeIndexTask is used for analyze index. type AnalyzeIndexTask struct { IndexInfo *model.IndexInfo + TblInfo *model.TableInfo analyzeInfo } diff --git a/planner/core/planbuilder.go b/planner/core/planbuilder.go index 64bef437ed633..2d231b07dadc4 100644 --- a/planner/core/planbuilder.go +++ b/planner/core/planbuilder.go @@ -812,6 +812,7 @@ func (b *PlanBuilder) buildAnalyzeTable(as *ast.AnalyzeTableStmt) (Plan, error) p.IdxTasks = append(p.IdxTasks, AnalyzeIndexTask{ IndexInfo: idx, analyzeInfo: info, + TblInfo: tbl.TableInfo, }) } } @@ -822,6 +823,7 @@ func (b *PlanBuilder) buildAnalyzeTable(as *ast.AnalyzeTableStmt) (Plan, error) PKInfo: pkInfo, ColsInfo: colInfo, analyzeInfo: info, + TblInfo: tbl.TableInfo, }) } } @@ -843,7 +845,7 @@ func (b *PlanBuilder) buildAnalyzeIndex(as *ast.AnalyzeTableStmt) (Plan, error) } for i, id := range physicalIDs { info := analyzeInfo{DBName: as.TableNames[0].Schema.O, TableName: as.TableNames[0].Name.O, PartitionName: names[i], PhysicalTableID: id} - p.IdxTasks = append(p.IdxTasks, AnalyzeIndexTask{IndexInfo: idx, analyzeInfo: info}) + p.IdxTasks = append(p.IdxTasks, AnalyzeIndexTask{IndexInfo: idx, analyzeInfo: info, TblInfo: tblInfo}) } } return p, nil @@ -860,7 +862,7 @@ func (b *PlanBuilder) buildAnalyzeAllIndex(as *ast.AnalyzeTableStmt) (Plan, erro if idx.State == model.StatePublic { for i, id := range physicalIDs { info := analyzeInfo{DBName: as.TableNames[0].Schema.O, TableName: as.TableNames[0].Name.O, PartitionName: names[i], PhysicalTableID: id} - p.IdxTasks = append(p.IdxTasks, AnalyzeIndexTask{IndexInfo: idx, analyzeInfo: info}) + p.IdxTasks = append(p.IdxTasks, AnalyzeIndexTask{IndexInfo: idx, analyzeInfo: info, TblInfo: tblInfo}) } } } diff --git a/statistics/builder.go b/statistics/builder.go index 9ede4ad58cb0b..c675656135557 100644 --- a/statistics/builder.go +++ b/statistics/builder.go @@ -93,15 +93,20 @@ func (b *SortedBuilder) Iterate(data types.Datum) error { return nil } -// BuildColumn builds histogram from samples for column. -func BuildColumn(ctx sessionctx.Context, numBuckets, id int64, collector *SampleCollector, tp *types.FieldType) (*Histogram, error) { - count := collector.Count - ndv := collector.FMSketch.NDV() +// BuildColumnHist build a histogram for a column. +// numBuckets: number of buckets for the histogram. +// id: the id of the table. +// collector: the collector of samples. +// tp: the FieldType for the column. +// count: represents the row count for the column. +// ndv: represents the number of distinct values for the column. +// nullCount: represents the number of null values for the column. +func BuildColumnHist(ctx sessionctx.Context, numBuckets, id int64, collector *SampleCollector, tp *types.FieldType, count int64, ndv int64, nullCount int64) (*Histogram, error) { if ndv > count { ndv = count } if count == 0 || len(collector.Samples) == 0 { - return NewHistogram(id, ndv, collector.NullCount, 0, tp, 0, collector.TotalSize), nil + return NewHistogram(id, ndv, nullCount, 0, tp, 0, collector.TotalSize), nil } sc := ctx.GetSessionVars().StmtCtx samples := collector.Samples @@ -109,7 +114,7 @@ func BuildColumn(ctx sessionctx.Context, numBuckets, id int64, collector *Sample if err != nil { return nil, err } - hg := NewHistogram(id, ndv, collector.NullCount, 0, tp, int(numBuckets), collector.TotalSize) + hg := NewHistogram(id, ndv, nullCount, 0, tp, int(numBuckets), collector.TotalSize) sampleNum := int64(len(samples)) // As we use samples to build the histogram, the bucket number and repeat should multiply a factor. @@ -174,3 +179,8 @@ func BuildColumn(ctx sessionctx.Context, numBuckets, id int64, collector *Sample hg.Correlation = (itemsCount*corrXYSum - corrXSum*corrXSum) / (itemsCount*corrX2Sum - corrXSum*corrXSum) return hg, nil } + +// BuildColumn builds histogram from samples for column. +func BuildColumn(ctx sessionctx.Context, numBuckets, id int64, collector *SampleCollector, tp *types.FieldType) (*Histogram, error) { + return BuildColumnHist(ctx, numBuckets, id, collector, tp, collector.Count, collector.FMSketch.NDV(), collector.NullCount) +} diff --git a/statistics/cmsketch.go b/statistics/cmsketch.go index 6961d4733880e..e82ed041e1e11 100644 --- a/statistics/cmsketch.go +++ b/statistics/cmsketch.go @@ -109,8 +109,8 @@ func newTopNHelper(sample [][]byte, numTop uint32) *topNHelper { return &topNHelper{uint64(len(sample)), numTop, counter, sorted, onlyOnceItems, sumTopN, last} } -// NewCMSketchWithTopN returns a new CM sketch with TopN elements. -func NewCMSketchWithTopN(d, w int32, sample [][]byte, numTop uint32, rowCount uint64) *CMSketch { +// NewCMSketchWithTopN returns a new CM sketch with TopN elements, the estimate NDV and the scale ratio. +func NewCMSketchWithTopN(d, w int32, sample [][]byte, numTop uint32, rowCount uint64) (*CMSketch, uint64, uint64) { helper := newTopNHelper(sample, numTop) // rowCount is not a accurate value when fast analyzing // In some cases, if user triggers fast analyze when rowCount is close to sampleSize, unexpected bahavior might happen. @@ -118,7 +118,7 @@ func NewCMSketchWithTopN(d, w int32, sample [][]byte, numTop uint32, rowCount ui estimateNDV, scaleRatio := calculateEstimateNDV(helper, rowCount) c := buildCMSWithTopN(helper, d, w, scaleRatio) c.calculateDefaultVal(helper, estimateNDV, scaleRatio, rowCount) - return c + return c, estimateNDV, scaleRatio } func buildCMSWithTopN(helper *topNHelper, d, w int32, scaleRatio uint64) (c *CMSketch) { diff --git a/statistics/cmsketch_test.go b/statistics/cmsketch_test.go index f95e0b12e2fb4..084fe473f3adf 100644 --- a/statistics/cmsketch_test.go +++ b/statistics/cmsketch_test.go @@ -43,7 +43,8 @@ func prepareCMSWithTopN(d, w int32, vals []*types.Datum, n uint32, total uint64) } data = append(data, bytes) } - return NewCMSketchWithTopN(d, w, data, n, total), nil + cms, _, _ := NewCMSketchWithTopN(d, w, data, n, total) + return cms, nil } func buildCMSketchAndMap(d, w int32, seed int64, total, imax uint64, s float64) (*CMSketch, map[int64]uint32, error) { diff --git a/statistics/sample.go b/statistics/sample.go index 6805943046c8f..31b1f712b1719 100644 --- a/statistics/sample.go +++ b/statistics/sample.go @@ -36,6 +36,9 @@ type SampleItem struct { // Ordinal is original position of this item in SampleCollector before sorting. This // is used for computing correlation. Ordinal int + // RowID is the row id of the sample in its key. + // This property is used to calculate Ordinal in fast analyze. + RowID int64 } // SortSampleItems sorts a slice of SampleItem. @@ -173,6 +176,14 @@ func (c *SampleCollector) collect(sc *stmtctx.StatementContext, d types.Datum) e return nil } +// CalcTotalSize is to calculate total size based on samples. +func (c *SampleCollector) CalcTotalSize() { + c.TotalSize = 0 + for _, item := range c.Samples { + c.TotalSize += int64(len(item.Value.GetBytes())) + } +} + // SampleBuilder is used to build samples for columns. // Also, if primary key is handle, it will directly build histogram for it. type SampleBuilder struct {