From 2cea9949f20350930ba58366be5a28f606db4649 Mon Sep 17 00:00:00 2001 From: Weizhen Wang Date: Tue, 18 Jun 2024 12:26:46 +0800 Subject: [PATCH] statistics: stop loading too many stats when to init stats (#53999) close pingcap/tidb#54000 --- pkg/statistics/handle/BUILD.bazel | 2 + pkg/statistics/handle/bootstrap.go | 108 ++++++++++++++++++++--------- 2 files changed, 76 insertions(+), 34 deletions(-) diff --git a/pkg/statistics/handle/BUILD.bazel b/pkg/statistics/handle/BUILD.bazel index 9b8d1bda85aa0..be127c465623e 100644 --- a/pkg/statistics/handle/BUILD.bazel +++ b/pkg/statistics/handle/BUILD.bazel @@ -17,6 +17,7 @@ go_library( "//pkg/parser/terror", "//pkg/sessionctx", "//pkg/sessionctx/sysproctrack", + "//pkg/sessionctx/variable", "//pkg/statistics", "//pkg/statistics/handle/autoanalyze", "//pkg/statistics/handle/cache", @@ -34,6 +35,7 @@ go_library( "//pkg/types", "//pkg/util/chunk", "//pkg/util/logutil", + "//pkg/util/memory", "@com_github_pingcap_errors//:errors", "@com_github_pingcap_failpoint//:failpoint", "@org_uber_go_zap//:zap", diff --git a/pkg/statistics/handle/bootstrap.go b/pkg/statistics/handle/bootstrap.go index a35122c1e5786..ad0202458f61a 100644 --- a/pkg/statistics/handle/bootstrap.go +++ b/pkg/statistics/handle/bootstrap.go @@ -28,6 +28,7 @@ import ( "github.com/pingcap/tidb/pkg/parser/mysql" "github.com/pingcap/tidb/pkg/parser/terror" "github.com/pingcap/tidb/pkg/sessionctx" + "github.com/pingcap/tidb/pkg/sessionctx/variable" "github.com/pingcap/tidb/pkg/statistics" "github.com/pingcap/tidb/pkg/statistics/handle/cache" "github.com/pingcap/tidb/pkg/statistics/handle/initstats" @@ -37,6 +38,7 @@ import ( "github.com/pingcap/tidb/pkg/types" "github.com/pingcap/tidb/pkg/util/chunk" "github.com/pingcap/tidb/pkg/util/logutil" + "github.com/pingcap/tidb/pkg/util/memory" "go.uber.org/zap" ) @@ -177,7 +179,7 @@ func (h *Handle) initStatsHistograms4ChunkLite(is infoschema.InfoSchema, cache s } } -func (h *Handle) initStatsHistograms4Chunk(is infoschema.InfoSchema, cache statstypes.StatsCache, iter *chunk.Iterator4Chunk) { +func (h *Handle) initStatsHistograms4Chunk(is infoschema.InfoSchema, cache statstypes.StatsCache, iter *chunk.Iterator4Chunk, isCacheFull bool) { var table *statistics.Table for row := iter.Begin(); row != iter.End(); row = iter.Next() { tblID, statsVer := row.GetInt64(0), row.GetInt64(8) @@ -210,10 +212,17 @@ func (h *Handle) initStatsHistograms4Chunk(is infoschema.InfoSchema, cache stats if idxInfo == nil { continue } - cms, topN, err := statistics.DecodeCMSketchAndTopN(row.GetBytes(6), nil) - if err != nil { - cms = nil - terror.Log(errors.Trace(err)) + + var cms *statistics.CMSketch + var topN *statistics.TopN + var err error + if !isCacheFull { + // stats cache is full. we should not put it into cache. but we must set LastAnalyzeVersion + cms, topN, err = statistics.DecodeCMSketchAndTopN(row.GetBytes(6), nil) + if err != nil { + cms = nil + terror.Log(errors.Trace(err)) + } } hist := statistics.NewHistogram(id, ndv, nullCount, version, types.NewFieldType(mysql.TypeBlob), chunk.InitialCapacity, 0) index := &statistics.Index{ @@ -226,7 +235,8 @@ func (h *Handle) initStatsHistograms4Chunk(is infoschema.InfoSchema, cache stats PhysicalID: tblID, } if statsVer != statistics.Version0 { - index.StatsLoadedStatus = statistics.NewStatsFullLoadStatus() + // We first set the StatsLoadedStatus as AllEvicted. when completing to load bucket, we will set it as ALlLoad. + index.StatsLoadedStatus = statistics.NewStatsAllEvictedStatus() // The LastAnalyzeVersion is added by ALTER table so its value might be 0. table.LastAnalyzeVersion = max(table.LastAnalyzeVersion, version) } @@ -254,6 +264,8 @@ func (h *Handle) initStatsHistograms4Chunk(is infoschema.InfoSchema, cache stats Flag: row.GetInt64(10), StatsVer: statsVer, } + // primary key column has no stats info, because primary key's is_index is false. so it cannot load the topn + col.StatsLoadedStatus = statistics.NewStatsAllEvictedStatus() lastAnalyzePos.Copy(&col.LastAnalyzePos) table.Columns[hist.ID] = col table.ColAndIdxExistenceMap.InsertCol(colInfo.ID, colInfo, statsVer != statistics.Version0 || ndv > 0 || nullCount > 0) @@ -309,12 +321,12 @@ func (h *Handle) initStatsHistograms(is infoschema.InfoSchema, cache statstypes. if req.NumRows() == 0 { break } - h.initStatsHistograms4Chunk(is, cache, iter) + h.initStatsHistograms4Chunk(is, cache, iter, false) } return nil } -func (h *Handle) initStatsHistogramsByPaging(is infoschema.InfoSchema, cache statstypes.StatsCache, task initstats.Task) error { +func (h *Handle) initStatsHistogramsByPaging(is infoschema.InfoSchema, cache statstypes.StatsCache, task initstats.Task, totalMemory uint64) error { se, err := h.Pool.SPool().Get() if err != nil { return err @@ -324,6 +336,7 @@ func (h *Handle) initStatsHistogramsByPaging(is infoschema.InfoSchema, cache sta h.Pool.SPool().Put(se) } }() + sctx := se.(sessionctx.Context) // Why do we need to add `is_index=1` in the SQL? // because it is aligned to the `initStatsTopN` function, which only loads the topn of the index too. @@ -345,16 +358,16 @@ func (h *Handle) initStatsHistogramsByPaging(is infoschema.InfoSchema, cache sta if req.NumRows() == 0 { break } - h.initStatsHistograms4Chunk(is, cache, iter) + h.initStatsHistograms4Chunk(is, cache, iter, isFullCache(cache, totalMemory)) } return nil } -func (h *Handle) initStatsHistogramsConcurrency(is infoschema.InfoSchema, cache statstypes.StatsCache) error { +func (h *Handle) initStatsHistogramsConcurrency(is infoschema.InfoSchema, cache statstypes.StatsCache, totalMemory uint64) error { var maxTid = maxTidRecord.tid.Load() tid := int64(0) ls := initstats.NewRangeWorker("histogram", func(task initstats.Task) error { - return h.initStatsHistogramsByPaging(is, cache, task) + return h.initStatsHistogramsByPaging(is, cache, task, totalMemory) }, uint64(maxTid), uint64(initStatsStep)) ls.LoadStats() for tid <= maxTid { @@ -368,7 +381,10 @@ func (h *Handle) initStatsHistogramsConcurrency(is infoschema.InfoSchema, cache return nil } -func (*Handle) initStatsTopN4Chunk(cache statstypes.StatsCache, iter *chunk.Iterator4Chunk) { +func (*Handle) initStatsTopN4Chunk(cache statstypes.StatsCache, iter *chunk.Iterator4Chunk, totalMemory uint64) { + if isFullCache(cache, totalMemory) { + return + } affectedIndexes := make(map[*statistics.Index]struct{}) var table *statistics.Table for row := iter.Begin(); row != iter.End(); row = iter.Next() { @@ -404,7 +420,7 @@ func (*Handle) initStatsTopN4Chunk(cache statstypes.StatsCache, iter *chunk.Iter } } -func (h *Handle) initStatsTopN(cache statstypes.StatsCache) error { +func (h *Handle) initStatsTopN(cache statstypes.StatsCache, totalMemory uint64) error { sql := "select /*+ ORDER_INDEX(mysql.stats_top_n,tbl)*/ HIGH_PRIORITY table_id, hist_id, value, count from mysql.stats_top_n where is_index = 1 order by table_id" rc, err := util.Exec(h.initStatsCtx, sql) if err != nil { @@ -422,12 +438,12 @@ func (h *Handle) initStatsTopN(cache statstypes.StatsCache) error { if req.NumRows() == 0 { break } - h.initStatsTopN4Chunk(cache, iter) + h.initStatsTopN4Chunk(cache, iter, totalMemory) } return nil } -func (h *Handle) initStatsTopNByPaging(cache statstypes.StatsCache, task initstats.Task) error { +func (h *Handle) initStatsTopNByPaging(cache statstypes.StatsCache, task initstats.Task, totalMemory uint64) error { se, err := h.Pool.SPool().Get() if err != nil { return err @@ -455,19 +471,28 @@ func (h *Handle) initStatsTopNByPaging(cache statstypes.StatsCache, task initsta if req.NumRows() == 0 { break } - h.initStatsTopN4Chunk(cache, iter) + h.initStatsTopN4Chunk(cache, iter, totalMemory) } return nil } -func (h *Handle) initStatsTopNConcurrency(cache statstypes.StatsCache) error { +func (h *Handle) initStatsTopNConcurrency(cache statstypes.StatsCache, totalMemory uint64) error { + if isFullCache(cache, totalMemory) { + return nil + } var maxTid = maxTidRecord.tid.Load() tid := int64(0) ls := initstats.NewRangeWorker("TopN", func(task initstats.Task) error { - return h.initStatsTopNByPaging(cache, task) + if isFullCache(cache, totalMemory) { + return nil + } + return h.initStatsTopNByPaging(cache, task, totalMemory) }, uint64(maxTid), uint64(initStatsStep)) ls.LoadStats() for tid <= maxTid { + if isFullCache(cache, totalMemory) { + break + } ls.SendTask(initstats.Task{ StartTid: tid, EndTid: tid + initStatsStep, @@ -534,6 +559,9 @@ func (*Handle) initStatsBuckets4Chunk(cache statstypes.StatsCache, iter *chunk.I tableID, isIndex, histID := row.GetInt64(0), row.GetInt64(1), row.GetInt64(2) if table == nil || table.PhysicalID != tableID { if table != nil { + for _, index := range table.Indices { + index.StatsLoadedStatus = statistics.NewStatsFullLoadStatus() + } cache.Put(table.PhysicalID, table) // put this table in the cache because all statstics of the table have been read. } var ok bool @@ -584,9 +612,12 @@ func (*Handle) initStatsBuckets4Chunk(cache statstypes.StatsCache, iter *chunk.I } } -func (h *Handle) initStatsBuckets(cache statstypes.StatsCache) error { +func (h *Handle) initStatsBuckets(cache statstypes.StatsCache, totalMemory uint64) error { + if isFullCache(cache, totalMemory) { + return nil + } if config.GetGlobalConfig().Performance.ConcurrentlyInitStats { - err := h.initStatsBucketsConcurrency(cache) + err := h.initStatsBucketsConcurrency(cache, totalMemory) if err != nil { return errors.Trace(err) } @@ -663,10 +694,16 @@ func (h *Handle) initStatsBucketsByPaging(cache statstypes.StatsCache, task init return nil } -func (h *Handle) initStatsBucketsConcurrency(cache statstypes.StatsCache) error { +func (h *Handle) initStatsBucketsConcurrency(cache statstypes.StatsCache, totalMemory uint64) error { + if isFullCache(cache, totalMemory) { + return nil + } var maxTid = maxTidRecord.tid.Load() tid := int64(0) ls := initstats.NewRangeWorker("bucket", func(task initstats.Task) error { + if isFullCache(cache, totalMemory) { + return nil + } return h.initStatsBucketsByPaging(cache, task) }, uint64(maxTid), uint64(initStatsStep)) ls.LoadStats() @@ -676,6 +713,9 @@ func (h *Handle) initStatsBucketsConcurrency(cache statstypes.StatsCache) error EndTid: tid + initStatsStep, }) tid += initStatsStep + if isFullCache(cache, totalMemory) { + break + } } ls.Wait() return nil @@ -715,6 +755,10 @@ func (h *Handle) InitStatsLite(is infoschema.InfoSchema) (err error) { // 1. Basic stats meta data is loaded.(count, modify count, etc.) // 2. Column/index stats are loaded. (histogram, topn, buckets, FMSketch) func (h *Handle) InitStats(is infoschema.InfoSchema) (err error) { + totalMemory, err := memory.MemTotal() + if err != nil { + return err + } loadFMSketch := config.GetGlobalConfig().Performance.EnableLoadFMSketch defer func() { _, err1 := util.Exec(h.initStatsCtx, "commit") @@ -733,7 +777,7 @@ func (h *Handle) InitStats(is infoschema.InfoSchema) (err error) { } statslogutil.StatsLogger().Info("complete to load the meta") if config.GetGlobalConfig().Performance.ConcurrentlyInitStats { - err = h.initStatsHistogramsConcurrency(is, cache) + err = h.initStatsHistogramsConcurrency(is, cache, totalMemory) } else { err = h.initStatsHistograms(is, cache) } @@ -742,9 +786,9 @@ func (h *Handle) InitStats(is infoschema.InfoSchema) (err error) { return errors.Trace(err) } if config.GetGlobalConfig().Performance.ConcurrentlyInitStats { - err = h.initStatsTopNConcurrency(cache) + err = h.initStatsTopNConcurrency(cache, totalMemory) } else { - err = h.initStatsTopN(cache) + err = h.initStatsTopN(cache, totalMemory) } statslogutil.StatsLogger().Info("complete to load the topn") if err != nil { @@ -757,20 +801,16 @@ func (h *Handle) InitStats(is infoschema.InfoSchema) (err error) { } statslogutil.StatsLogger().Info("complete to load the FM Sketch") } - err = h.initStatsBuckets(cache) + err = h.initStatsBuckets(cache, totalMemory) statslogutil.StatsLogger().Info("complete to load the bucket") if err != nil { return errors.Trace(err) } - // Set columns' stats status. - for _, table := range cache.Values() { - for _, col := range table.Columns { - if col.StatsAvailable() { - // primary key column has no stats info, because primary key's is_index is false. so it cannot load the topn - col.StatsLoadedStatus = statistics.NewStatsAllEvictedStatus() - } - } - } h.Replace(cache) return nil } + +func isFullCache(cache statstypes.StatsCache, total uint64) bool { + memQuota := variable.StatsCacheMemQuota.Load() + return (uint64(cache.MemConsumed()) >= total/4) || (cache.MemConsumed() >= memQuota && memQuota != 0) +}