From a7f78c77041395ac20ecc85224e94dcaebbe38c6 Mon Sep 17 00:00:00 2001 From: Haibin Xie Date: Wed, 17 Oct 2018 14:45:50 +0800 Subject: [PATCH] stats: fix histogram boundaries overflow error (#7883) --- statistics/feedback.go | 104 +++++++++++++++++++++++++++++------- statistics/feedback_test.go | 2 +- statistics/update.go | 4 -- statistics/update_test.go | 71 +++++++++++++++++++++++- types/etc_test.go | 2 +- types/helper.go | 5 +- 6 files changed, 159 insertions(+), 29 deletions(-) diff --git a/statistics/feedback.go b/statistics/feedback.go index 6477aa57e42ad..81eccc6787e88 100644 --- a/statistics/feedback.go +++ b/statistics/feedback.go @@ -243,12 +243,71 @@ type BucketFeedback struct { upper *types.Datum // The upper bound of the new bucket. } +// outOfRange checks if the `val` is between `min` and `max`. +func outOfRange(sc *stmtctx.StatementContext, min, max, val *types.Datum) (int, error) { + result, err := val.CompareDatum(sc, min) + if err != nil { + return 0, err + } + if result < 0 { + return result, nil + } + result, err = val.CompareDatum(sc, max) + if err != nil { + return 0, err + } + if result > 0 { + return result, nil + } + return 0, nil +} + +// adjustFeedbackBoundaries adjust the feedback boundaries according to the `min` and `max`. +// If the feedback has no intersection with `min` and `max`, we could just skip this feedback. +func (f *feedback) adjustFeedbackBoundaries(sc *stmtctx.StatementContext, min, max *types.Datum) (bool, error) { + result, err := outOfRange(sc, min, max, f.lower) + if err != nil { + return false, err + } + if result > 0 { + return true, nil + } + if result < 0 { + f.lower = min + } + result, err = outOfRange(sc, min, max, f.upper) + if err != nil { + return false, err + } + if result < 0 { + return true, nil + } + if result > 0 { + f.upper = max + } + return false, nil +} + // buildBucketFeedback build the feedback for each bucket from the histogram feedback. func buildBucketFeedback(h *Histogram, feedback *QueryFeedback) (map[int]*BucketFeedback, int) { bktID2FB := make(map[int]*BucketFeedback) + if len(feedback.feedback) == 0 { + return bktID2FB, 0 + } total := 0 - for _, ran := range feedback.feedback { - idx, _ := h.Bounds.LowerBound(0, ran.lower) + sc := &stmtctx.StatementContext{TimeZone: time.UTC} + kind := feedback.feedback[0].lower.Kind() + min, max := getMinValue(kind, h.tp), getMaxValue(kind, h.tp) + for _, fb := range feedback.feedback { + skip, err := fb.adjustFeedbackBoundaries(sc, &min, &max) + if err != nil { + log.Debugf("adjust feedback boundaries failed, err: %v", errors.ErrorStack(err)) + continue + } + if skip { + continue + } + idx, _ := h.Bounds.LowerBound(0, fb.lower) bktIdx := 0 // The last bucket also stores the feedback that falls outside the upper bound. if idx >= h.Bounds.NumRows()-2 { @@ -256,7 +315,7 @@ func buildBucketFeedback(h *Histogram, feedback *QueryFeedback) (map[int]*Bucket } else { bktIdx = idx / 2 // Make sure that this feedback lies within the bucket. - if chunk.Compare(h.Bounds.GetRow(2*bktIdx+1), 0, ran.upper) < 0 { + if chunk.Compare(h.Bounds.GetRow(2*bktIdx+1), 0, fb.upper) < 0 { continue } } @@ -266,23 +325,23 @@ func buildBucketFeedback(h *Histogram, feedback *QueryFeedback) (map[int]*Bucket bkt = &BucketFeedback{lower: h.GetLower(bktIdx), upper: h.GetUpper(bktIdx)} bktID2FB[bktIdx] = bkt } - bkt.feedback = append(bkt.feedback, ran) + bkt.feedback = append(bkt.feedback, fb) // Update the bound if necessary. - res, err := bkt.lower.CompareDatum(nil, ran.lower) + res, err := bkt.lower.CompareDatum(nil, fb.lower) if err != nil { - log.Debugf("compare datum %v with %v failed, err: %v", bkt.lower, ran.lower, errors.ErrorStack(err)) + log.Debugf("compare datum %v with %v failed, err: %v", bkt.lower, fb.lower, errors.ErrorStack(err)) continue } if res > 0 { - bkt.lower = ran.lower + bkt.lower = fb.lower } - res, err = bkt.upper.CompareDatum(nil, ran.upper) + res, err = bkt.upper.CompareDatum(nil, fb.upper) if err != nil { - log.Debugf("compare datum %v with %v failed, err: %v", bkt.upper, ran.upper, errors.ErrorStack(err)) + log.Debugf("compare datum %v with %v failed, err: %v", bkt.upper, fb.upper, errors.ErrorStack(err)) continue } if res < 0 { - bkt.upper = ran.upper + bkt.upper = fb.upper } } return bktID2FB, total @@ -528,7 +587,12 @@ func splitBuckets(h *Histogram, feedback *QueryFeedback) ([]bucket, []bool, int6 func UpdateHistogram(h *Histogram, feedback *QueryFeedback) *Histogram { buckets, isNewBuckets, totalCount := splitBuckets(h, feedback) buckets = mergeBuckets(buckets, isNewBuckets, float64(totalCount)) - return buildNewHistogram(h, buckets) + hist := buildNewHistogram(h, buckets) + // Update the NDV of primary key column. + if feedback.tp == pkType { + hist.NDV = int64(hist.totalRowCount()) + } + return hist } // UpdateCMSketch updates the CMSketch by feedback. @@ -1077,13 +1141,13 @@ func supportColumnType(k byte) bool { func getMaxValue(k byte, ft *types.FieldType) (max types.Datum) { switch k { case types.KindInt64: - max.SetInt64(math.MaxInt64) + max.SetInt64(types.SignedUpperBound[ft.Tp]) case types.KindUint64: - max.SetUint64(math.MaxUint64) + max.SetUint64(types.UnsignedUpperBound[ft.Tp]) case types.KindFloat32: - max.SetFloat32(math.MaxFloat32) + max.SetFloat32(float32(types.GetMaxFloat(ft.Flen, ft.Decimal))) case types.KindFloat64: - max.SetFloat64(math.MaxFloat64) + max.SetFloat64(types.GetMaxFloat(ft.Flen, ft.Decimal)) case types.KindString, types.KindBytes: val := types.MaxValueDatum() bytes, err := codec.EncodeKey(nil, nil, val) @@ -1093,7 +1157,7 @@ func getMaxValue(k byte, ft *types.FieldType) (max types.Datum) { } max.SetBytes(bytes) case types.KindMysqlDecimal: - max.SetMysqlDecimal(types.NewMaxOrMinDec(false, mysql.MaxDecimalWidth, 0)) + max.SetMysqlDecimal(types.NewMaxOrMinDec(false, ft.Flen, ft.Decimal)) case types.KindMysqlDuration: max.SetMysqlDuration(types.Duration{Duration: math.MaxInt64}) case types.KindMysqlTime: @@ -1109,13 +1173,13 @@ func getMaxValue(k byte, ft *types.FieldType) (max types.Datum) { func getMinValue(k byte, ft *types.FieldType) (min types.Datum) { switch k { case types.KindInt64: - min.SetInt64(math.MinInt64) + min.SetInt64(types.SignedLowerBound[ft.Tp]) case types.KindUint64: min.SetUint64(0) case types.KindFloat32: - min.SetFloat32(-math.MaxFloat32) + min.SetFloat32(float32(-types.GetMaxFloat(ft.Flen, ft.Decimal))) case types.KindFloat64: - min.SetFloat64(-math.MaxFloat64) + min.SetFloat64(-types.GetMaxFloat(ft.Flen, ft.Decimal)) case types.KindString, types.KindBytes: val := types.MinNotNullDatum() bytes, err := codec.EncodeKey(nil, nil, val) @@ -1125,7 +1189,7 @@ func getMinValue(k byte, ft *types.FieldType) (min types.Datum) { } min.SetBytes(bytes) case types.KindMysqlDecimal: - min.SetMysqlDecimal(types.NewMaxOrMinDec(true, mysql.MaxDecimalWidth, 0)) + min.SetMysqlDecimal(types.NewMaxOrMinDec(true, ft.Flen, ft.Decimal)) case types.KindMysqlDuration: min.SetMysqlDuration(types.Duration{Duration: math.MinInt64}) case types.KindMysqlTime: diff --git a/statistics/feedback_test.go b/statistics/feedback_test.go index 9b0907a2b6a0e..f5b83b44ee663 100644 --- a/statistics/feedback_test.go +++ b/statistics/feedback_test.go @@ -70,7 +70,7 @@ func (s *testFeedbackSuite) TestUpdateHistogram(c *C) { defaultBucketCount = 7 defer func() { defaultBucketCount = originBucketCount }() c.Assert(UpdateHistogram(q.Hist(), q).ToString(0), Equals, - "column:0 ndv:0 totColSize:0\n"+ + "column:0 ndv:10057 totColSize:0\n"+ "num: 10000 lower_bound: 0 upper_bound: 1 repeats: 0\n"+ "num: 8 lower_bound: 2 upper_bound: 7 repeats: 0\n"+ "num: 11 lower_bound: 8 upper_bound: 19 repeats: 0\n"+ diff --git a/statistics/update.go b/statistics/update.go index 8c80baec91585..b6e1f7b099eb1 100644 --- a/statistics/update.go +++ b/statistics/update.go @@ -558,10 +558,6 @@ func (h *Handle) handleSingleHistogramUpdate(is infoschema.InfoSchema, rows []ch log.Debugf("decode feedback failed, err: %v", errors.ErrorStack(err)) } } - // Update the NDV of primary key column. - if table.Meta().PKIsHandle && q.tp == pkType { - hist.NDV = int64(hist.totalRowCount()) - } err = h.dumpStatsUpdateToKV(physicalTableID, isIndex, q, hist, cms) return errors.Trace(err) } diff --git a/statistics/update_test.go b/statistics/update_test.go index fd8af3c44966f..ab3b69cb83603 100644 --- a/statistics/update_test.go +++ b/statistics/update_test.go @@ -632,7 +632,7 @@ func (s *testStatsUpdateSuite) TestQueryFeedback(c *C) { { // test primary key feedback sql: "select * from t where t.a <= 5", - hist: "column:1 ndv:3 totColSize:0\n" + + hist: "column:1 ndv:4 totColSize:0\n" + "num: 1 lower_bound: -9223372036854775808 upper_bound: 1 repeats: 0\n" + "num: 1 lower_bound: 2 upper_bound: 2 repeats: 1\n" + "num: 2 lower_bound: 3 upper_bound: 5 repeats: 0", @@ -1156,3 +1156,72 @@ func (s *testStatsUpdateSuite) TestIndexQueryFeedback(c *C) { c.Assert(tbl.Indices[t.idxID].CMSketch.QueryBytes(val), Equals, t.eqCount) } } + +func (s *testStatsUpdateSuite) TestFeedbackRanges(c *C) { + defer cleanEnv(c, s.store, s.do) + testKit := testkit.NewTestKit(c, s.store) + h := s.do.StatsHandle() + oriProbability := statistics.FeedbackProbability + oriNumber := statistics.MaxNumberOfRanges + defer func() { + statistics.FeedbackProbability = oriProbability + statistics.MaxNumberOfRanges = oriNumber + }() + statistics.FeedbackProbability = 1 + + testKit.MustExec("use test") + testKit.MustExec("create table t (a tinyint, b tinyint, primary key(a), index idx(a, b))") + for i := 0; i < 20; i++ { + testKit.MustExec(fmt.Sprintf("insert into t values (%d, %d)", i, i)) + } + h.HandleDDLEvent(<-h.DDLEventCh()) + c.Assert(h.DumpStatsDeltaToKV(statistics.DumpAll), IsNil) + testKit.MustExec("analyze table t with 3 buckets") + for i := 30; i < 40; i++ { + testKit.MustExec(fmt.Sprintf("insert into t values (%d, %d)", i, i)) + } + c.Assert(h.DumpStatsDeltaToKV(statistics.DumpAll), IsNil) + tests := []struct { + sql string + hist string + colID int64 + }{ + { + sql: "select * from t where a <= 50 or (a > 130 and a < 140)", + hist: "column:1 ndv:30 totColSize:0\n" + + "num: 8 lower_bound: -128 upper_bound: 7 repeats: 0\n" + + "num: 8 lower_bound: 8 upper_bound: 15 repeats: 0\n" + + "num: 14 lower_bound: 16 upper_bound: 50 repeats: 0", + colID: 1, + }, + { + sql: "select * from t where a >= 10", + hist: "column:1 ndv:30 totColSize:0\n" + + "num: 8 lower_bound: -128 upper_bound: 7 repeats: 0\n" + + "num: 8 lower_bound: 8 upper_bound: 15 repeats: 0\n" + + "num: 14 lower_bound: 16 upper_bound: 127 repeats: 0", + colID: 1, + }, + { + sql: "select * from t use index(idx) where a = 1 and (b <= 50 or (b > 130 and b < 140))", + hist: "column:2 ndv:20 totColSize:20\n" + + "num: 7 lower_bound: -128 upper_bound: 6 repeats: 0\n" + + "num: 7 lower_bound: 7 upper_bound: 13 repeats: 1\n" + + "num: 6 lower_bound: 14 upper_bound: 19 repeats: 1", + colID: 2, + }, + } + is := s.do.InfoSchema() + table, err := is.TableByName(model.NewCIStr("test"), model.NewCIStr("t")) + for i, t := range tests { + testKit.MustQuery(t.sql) + c.Assert(h.DumpStatsDeltaToKV(statistics.DumpAll), IsNil) + c.Assert(h.DumpStatsFeedbackToKV(), IsNil) + c.Assert(h.HandleUpdateStats(s.do.InfoSchema()), IsNil) + c.Assert(err, IsNil) + h.Update(is) + tblInfo := table.Meta() + tbl := h.GetTableStats(tblInfo) + c.Assert(tbl.Columns[t.colID].ToString(0), Equals, tests[i].hist) + } +} diff --git a/types/etc_test.go b/types/etc_test.go index 4a5e161f2e32e..60d272bff2866 100644 --- a/types/etc_test.go +++ b/types/etc_test.go @@ -122,7 +122,7 @@ func (s *testTypeEtcSuite) TestMaxFloat(c *C) { } for _, t := range tbl { - f := getMaxFloat(t.Flen, t.Decimal) + f := GetMaxFloat(t.Flen, t.Decimal) c.Assert(f, Equals, t.Expect) } } diff --git a/types/helper.go b/types/helper.go index 385921f479b9d..ec95d824d2f1e 100644 --- a/types/helper.go +++ b/types/helper.go @@ -59,7 +59,8 @@ func Truncate(f float64, dec int) float64 { return math.Trunc(tmp) / shift } -func getMaxFloat(flen int, decimal int) float64 { +// GetMaxFloat gets the max float for given flen and decimal. +func GetMaxFloat(flen int, decimal int) float64 { intPartLen := flen - decimal f := math.Pow10(intPartLen) f -= math.Pow10(-decimal) @@ -74,7 +75,7 @@ func TruncateFloat(f float64, flen int, decimal int) (float64, error) { return 0, ErrOverflow.GenWithStackByArgs("DOUBLE", "") } - maxF := getMaxFloat(flen, decimal) + maxF := GetMaxFloat(flen, decimal) if !math.IsInf(f, 0) { f = Round(f, decimal)