diff --git a/cmd/explaintest/r/new_character_set_invalid.result b/cmd/explaintest/r/new_character_set_invalid.result new file mode 100644 index 0000000000000..aaeb66a8d9b44 --- /dev/null +++ b/cmd/explaintest/r/new_character_set_invalid.result @@ -0,0 +1,23 @@ +set @@sql_mode = 'strict_trans_tables'; +drop table if exists t; +create table t (a varchar(255) charset gbk, b varchar(255) charset ascii, c varchar(255) charset utf8); +insert into t values ('中文', 'asdf', '字符集'); +insert into t values ('À', 'ø', '😂'); +Error 1366: Incorrect string value '\xC3\x80' for column 'a' +insert into t values ('中文À中文', 'asdføfdsa', '字符集😂字符集'); +Error 1366: Incorrect string value '\xC3\x80\xE4\xB8\xAD\xE6...' for column 'a' +insert into t values (0x4040ffff, 0x4040ffff, 0x4040ffff); +Error 1366: Incorrect string value '\xFF\xFF' for column 'a' +select * from t; +a b c +中文 asdf 字符集 +set @@sql_mode = ''; +insert into t values ('À', 'ø', '😂'); +insert into t values ('中文À中文', 'asdføfdsa', '字符集😂字符集'); +insert into t values (0x4040ffff, 0x4040ffff, 0x4040ffff); +select * from t; +a b c +中文 asdf 字符集 +? ? ? +中文?中文 asdf?fdsa 字符集?字符集 +@@ @@ @@ diff --git a/cmd/explaintest/t/new_character_set_invalid.test b/cmd/explaintest/t/new_character_set_invalid.test new file mode 100644 index 0000000000000..34031d0b83ef8 --- /dev/null +++ b/cmd/explaintest/t/new_character_set_invalid.test @@ -0,0 +1,17 @@ +set @@sql_mode = 'strict_trans_tables'; +drop table if exists t; +create table t (a varchar(255) charset gbk, b varchar(255) charset ascii, c varchar(255) charset utf8); +insert into t values ('中文', 'asdf', '字符集'); +-- error 1366: Incorrect string value '\xC3\x80' for column 'a' +insert into t values ('À', 'ø', '😂'); +-- error 1366: Incorrect string value '\xC3\x80\xE4\xB8\xAD\xE6...' for column 'a' +insert into t values ('中文À中文', 'asdføfdsa', '字符集😂字符集'); +-- error 1366: Incorrect string value '\xFF\xFF' for column 'a' +insert into t values (0x4040ffff, 0x4040ffff, 0x4040ffff); +select * from t; + +set @@sql_mode = ''; +insert into t values ('À', 'ø', '😂'); +insert into t values ('中文À中文', 'asdføfdsa', '字符集😂字符集'); +insert into t values (0x4040ffff, 0x4040ffff, 0x4040ffff); +select * from t; diff --git a/executor/insert_test.go b/executor/insert_test.go index e2935f12735a5..ae2af1d0a8ddb 100644 --- a/executor/insert_test.go +++ b/executor/insert_test.go @@ -331,7 +331,7 @@ func (s *testSuite3) TestInsertWrongValueForField(c *C) { tk.MustExec(`create table t1(a char(10) charset utf8);`) tk.MustExec(`insert into t1 values('我');`) tk.MustExec(`alter table t1 add column b char(10) charset ascii as ((a));`) - tk.MustQuery(`select * from t1;`).Check(testkit.Rows(`我 `)) + tk.MustQuery(`select * from t1;`).Check(testkit.Rows("我 ?")) tk.MustExec(`drop table if exists t;`) tk.MustExec(`create table t (a year);`) diff --git a/expression/collation.go b/expression/collation.go index 7db5645941601..7312e8f4c8413 100644 --- a/expression/collation.go +++ b/expression/collation.go @@ -327,12 +327,7 @@ func safeConvert(ctx sessionctx.Context, ec *ExprCollation, args ...Expression) func isValidString(str string, dstChs string) bool { switch dstChs { case charset.CharsetASCII: - for _, c := range str { - if c >= 0x80 { - return false - } - } - return true + return charset.StringValidatorASCII{}.Validate(str) == -1 case charset.CharsetLatin1: // For backward compatibility, we do not block SQL like select '啊' = convert('a' using latin1) collate latin1_bin; return true @@ -343,9 +338,7 @@ func isValidString(str string, dstChs string) bool { // Convert to binary is always safe. return true default: - e, _ := charset.Lookup(dstChs) - _, err := e.NewEncoder().String(str) - return err == nil + return charset.StringValidatorOther{Charset: dstChs}.Validate(str) == -1 } } diff --git a/parser/charset/encoding_table.go b/parser/charset/encoding_table.go index ea7e6d8915798..b1e1f1c293e4b 100644 --- a/parser/charset/encoding_table.go +++ b/parser/charset/encoding_table.go @@ -15,7 +15,10 @@ package charset import ( "strings" + go_unicode "unicode" + "unicode/utf8" + "github.com/cznic/mathutil" "golang.org/x/text/encoding" "golang.org/x/text/encoding/charmap" "golang.org/x/text/encoding/japanese" @@ -273,11 +276,10 @@ func FindNextCharacterLength(label string) func([]byte) int { var encodingNextCharacterLength = map[string]func([]byte) int{ // https://en.wikipedia.org/wiki/GBK_(character_encoding)#Layout_diagram - "gbk": characterLengthGBK, - "utf-8": characterLengthUTF8, - "binary": func(bs []byte) int { - return 1 - }, + "gbk": characterLengthGBK, + "utf-8": characterLengthUTF8, + "binary": characterLengthOne, + "windows-1252": characterLengthOne, } func characterLengthGBK(bs []byte) int { @@ -298,3 +300,183 @@ func characterLengthUTF8(bs []byte) int { } return 4 } + +func characterLengthOne(_ []byte) int { + return 1 +} + +// TruncateStrategy indicates the way to handle the invalid strings in specific charset. +// - TruncateStrategyEmpty: returns an empty string. +// - TruncateStrategyTrim: returns the valid prefix part of string. +// - TruncateStrategyReplace: returns the whole string, but the invalid characters are replaced with '?'. +type TruncateStrategy int8 + +const ( + TruncateStrategyEmpty TruncateStrategy = iota + TruncateStrategyTrim + TruncateStrategyReplace +) + +var _ StringValidator = StringValidatorASCII{} +var _ StringValidator = StringValidatorUTF8{} +var _ StringValidator = StringValidatorOther{} + +// StringValidator is used to check if a string is valid in the specific charset. +type StringValidator interface { + Validate(str string) (invalidPos int) + Truncate(str string, strategy TruncateStrategy) (result string, invalidPos int) +} + +// StringValidatorASCII checks whether a string is valid ASCII string. +type StringValidatorASCII struct{} + +// Validate checks whether the string is valid in the given charset. +func (s StringValidatorASCII) Validate(str string) int { + _, invalidPos := s.Truncate(str, TruncateStrategyEmpty) + return invalidPos +} + +// Truncate implement the interface StringValidator. +func (s StringValidatorASCII) Truncate(str string, strategy TruncateStrategy) (string, int) { + invalidPos := -1 + for i := 0; i < len(str); i++ { + if str[i] > go_unicode.MaxASCII { + invalidPos = i + break + } + } + if invalidPos == -1 { + // Quick check passed. + return str, -1 + } + switch strategy { + case TruncateStrategyEmpty: + return "", invalidPos + case TruncateStrategyTrim: + return str[:invalidPos], invalidPos + case TruncateStrategyReplace: + result := make([]byte, 0, len(str)) + for i, w := 0, 0; i < len(str); i += w { + w = 1 + if str[i] > go_unicode.MaxASCII { + w = characterLengthUTF8(Slice(str)[i:]) + w = mathutil.Min(w, len(str)-i) + result = append(result, '?') + continue + } + result = append(result, str[i:i+w]...) + } + return string(result), invalidPos + } + return str, -1 +} + +// StringValidatorUTF8 checks whether a string is valid UTF8 string. +type StringValidatorUTF8 struct { + IsUTF8MB4 bool // Distinguish between "utf8" and "utf8mb4" + CheckMB4ValueInUTF8 bool +} + +// Validate checks whether the string is valid in the given charset. +func (s StringValidatorUTF8) Validate(str string) int { + _, invalidPos := s.Truncate(str, TruncateStrategyEmpty) + return invalidPos +} + +// Truncate implement the interface StringValidator. +func (s StringValidatorUTF8) Truncate(str string, strategy TruncateStrategy) (string, int) { + if str == "" { + return str, -1 + } + if s.IsUTF8MB4 && utf8.ValidString(str) { + // Quick check passed. + return str, -1 + } + doMB4CharCheck := !s.IsUTF8MB4 && s.CheckMB4ValueInUTF8 + var result []byte + if strategy == TruncateStrategyReplace { + result = make([]byte, 0, len(str)) + } + invalidPos := -1 + for i, w := 0, 0; i < len(str); i += w { + var rv rune + rv, w = utf8.DecodeRuneInString(str[i:]) + if (rv == utf8.RuneError && w == 1) || (w > 3 && doMB4CharCheck) { + if invalidPos == -1 { + invalidPos = i + } + switch strategy { + case TruncateStrategyEmpty: + return "", invalidPos + case TruncateStrategyTrim: + return str[:i], invalidPos + case TruncateStrategyReplace: + result = append(result, '?') + continue + } + } + if strategy == TruncateStrategyReplace { + result = append(result, str[i:i+w]...) + } + } + if strategy == TruncateStrategyReplace { + return string(result), invalidPos + } + return str, -1 +} + +// StringValidatorOther checks whether a string is valid string in given charset. +type StringValidatorOther struct { + Charset string +} + +// Validate checks whether the string is valid in the given charset. +func (s StringValidatorOther) Validate(str string) int { + _, invalidPos := s.Truncate(str, TruncateStrategyEmpty) + return invalidPos +} + +// Truncate implement the interface StringValidator. +func (s StringValidatorOther) Truncate(str string, strategy TruncateStrategy) (string, int) { + if str == "" { + return str, -1 + } + enc := NewEncoding(s.Charset) + if !enc.enabled() { + return str, -1 + } + var result []byte + if strategy == TruncateStrategyReplace { + result = make([]byte, 0, len(str)) + } + var buf [4]byte + strBytes := Slice(str) + transformer := enc.enc.NewEncoder() + invalidPos := -1 + for i, w := 0, 0; i < len(str); i += w { + w = characterLengthUTF8(strBytes[i:]) + w = mathutil.Min(w, len(str)-i) + _, _, err := transformer.Transform(buf[:], strBytes[i:i+w], true) + if err != nil { + if invalidPos == -1 { + invalidPos = i + } + switch strategy { + case TruncateStrategyEmpty: + return "", invalidPos + case TruncateStrategyTrim: + return str[:i], invalidPos + case TruncateStrategyReplace: + result = append(result, '?') + continue + } + } + if strategy == TruncateStrategyReplace { + result = append(result, strBytes[i:i+w]...) + } + } + if strategy == TruncateStrategyReplace { + return string(result), invalidPos + } + return str, -1 +} diff --git a/parser/charset/encoding_test.go b/parser/charset/encoding_test.go index fd6a4d062c467..4adfd916655db 100644 --- a/parser/charset/encoding_test.go +++ b/parser/charset/encoding_test.go @@ -16,6 +16,7 @@ package charset_test import ( "fmt" "testing" + "unicode/utf8" "github.com/pingcap/tidb/parser/charset" "github.com/stretchr/testify/require" @@ -91,3 +92,112 @@ func TestEncoding(t *testing.T) { require.Equal(t, tc.result, string(result), cmt) } } + +func TestStringValidatorASCII(t *testing.T) { + v := charset.StringValidatorASCII{} + testCases := []struct { + str string + strategy charset.TruncateStrategy + expected string + invalidPos int + }{ + {"", charset.TruncateStrategyEmpty, "", -1}, + {"qwerty", charset.TruncateStrategyEmpty, "qwerty", -1}, + {"qwÊrty", charset.TruncateStrategyEmpty, "", 2}, + {"qwÊrty", charset.TruncateStrategyTrim, "qw", 2}, + {"qwÊrty", charset.TruncateStrategyReplace, "qw?rty", 2}, + {"中文", charset.TruncateStrategyEmpty, "", 0}, + {"中文?qwert", charset.TruncateStrategyTrim, "", 0}, + {"中文?qwert", charset.TruncateStrategyReplace, "???qwert", 0}, + } + for _, tc := range testCases { + msg := fmt.Sprintf("%v", tc) + actual, invalidPos := v.Truncate(tc.str, tc.strategy) + require.Equal(t, tc.expected, actual, msg) + require.Equal(t, tc.invalidPos, invalidPos, msg) + } + require.Equal(t, -1, v.Validate("qwerty")) + require.Equal(t, 2, v.Validate("qwÊrty")) + require.Equal(t, 0, v.Validate("中文")) +} + +func TestStringValidatorUTF8(t *testing.T) { + // Test charset "utf8mb4". + v := charset.StringValidatorUTF8{IsUTF8MB4: true} + oxfffefd := string([]byte{0xff, 0xfe, 0xfd}) + testCases := []struct { + str string + strategy charset.TruncateStrategy + expected string + invalidPos int + }{ + {"", charset.TruncateStrategyEmpty, "", -1}, + {"qwerty", charset.TruncateStrategyEmpty, "qwerty", -1}, + {"qwÊrty", charset.TruncateStrategyEmpty, "qwÊrty", -1}, + {"qwÊ合法字符串", charset.TruncateStrategyEmpty, "qwÊ合法字符串", -1}, + {"😂", charset.TruncateStrategyEmpty, "😂", -1}, + {oxfffefd, charset.TruncateStrategyEmpty, "", 0}, + {oxfffefd, charset.TruncateStrategyReplace, "???", 0}, + {"中文" + oxfffefd, charset.TruncateStrategyTrim, "中文", 6}, + {"中文" + oxfffefd, charset.TruncateStrategyReplace, "中文???", 6}, + {string(utf8.RuneError), charset.TruncateStrategyEmpty, "�", -1}, + } + for _, tc := range testCases { + msg := fmt.Sprintf("%v", tc) + actual, invalidPos := v.Truncate(tc.str, tc.strategy) + require.Equal(t, tc.expected, actual, msg) + require.Equal(t, tc.invalidPos, invalidPos, msg) + } + // Test charset "utf8" with checking mb4 value. + v = charset.StringValidatorUTF8{IsUTF8MB4: false, CheckMB4ValueInUTF8: true} + testCases = []struct { + str string + strategy charset.TruncateStrategy + expected string + invalidPos int + }{ + {"", charset.TruncateStrategyEmpty, "", -1}, + {"qwerty", charset.TruncateStrategyEmpty, "qwerty", -1}, + {"qwÊrty", charset.TruncateStrategyEmpty, "qwÊrty", -1}, + {"qwÊ合法字符串", charset.TruncateStrategyEmpty, "qwÊ合法字符串", -1}, + {"😂", charset.TruncateStrategyEmpty, "", 0}, + {"😂", charset.TruncateStrategyReplace, "?", 0}, + {"valid_str😂", charset.TruncateStrategyReplace, "valid_str?", 9}, + {oxfffefd, charset.TruncateStrategyEmpty, "", 0}, + {oxfffefd, charset.TruncateStrategyReplace, "???", 0}, + {"中文" + oxfffefd, charset.TruncateStrategyTrim, "中文", 6}, + {"中文" + oxfffefd, charset.TruncateStrategyReplace, "中文???", 6}, + {string(utf8.RuneError), charset.TruncateStrategyEmpty, "�", -1}, + } + for _, tc := range testCases { + msg := fmt.Sprintf("%v", tc) + actual, invalidPos := v.Truncate(tc.str, tc.strategy) + require.Equal(t, tc.expected, actual, msg) + require.Equal(t, tc.invalidPos, invalidPos, msg) + } +} + +func TestStringValidatorGBK(t *testing.T) { + v := charset.StringValidatorOther{Charset: "gbk"} + testCases := []struct { + str string + strategy charset.TruncateStrategy + expected string + invalidPos int + }{ + {"", charset.TruncateStrategyEmpty, "", -1}, + {"asdf", charset.TruncateStrategyEmpty, "asdf", -1}, + {"中文", charset.TruncateStrategyEmpty, "中文", -1}, + {"À", charset.TruncateStrategyEmpty, "", 0}, + {"À", charset.TruncateStrategyReplace, "?", 0}, + {"中文À中文", charset.TruncateStrategyTrim, "中文", 6}, + {"中文À中文", charset.TruncateStrategyReplace, "中文?中文", 6}, + {"asdfÀ", charset.TruncateStrategyReplace, "asdf?", 4}, + } + for _, tc := range testCases { + msg := fmt.Sprintf("%v", tc) + actual, invalidPos := v.Truncate(tc.str, tc.strategy) + require.Equal(t, tc.expected, actual, msg) + require.Equal(t, tc.invalidPos, invalidPos, msg) + } +} diff --git a/table/column.go b/table/column.go index 827087ad1acf2..445a169a82b59 100644 --- a/table/column.go +++ b/table/column.go @@ -24,7 +24,6 @@ import ( "strings" "time" "unicode" - "unicode/utf8" "github.com/pingcap/tidb/config" "github.com/pingcap/tidb/expression" @@ -171,9 +170,8 @@ func truncateTrailingSpaces(v *types.Datum) { v.SetString(str, v.Collation()) } -func handleWrongCharsetValue(ctx sessionctx.Context, col *model.ColumnInfo, casted *types.Datum, str string, i int) (types.Datum, error) { +func handleWrongCharsetValue(ctx sessionctx.Context, col *model.ColumnInfo, str string, i int) error { sc := ctx.GetSessionVars().StmtCtx - var strval strings.Builder for j := 0; j < 6; j++ { if len(str) > (i + j) { @@ -187,14 +185,11 @@ func handleWrongCharsetValue(ctx sessionctx.Context, col *model.ColumnInfo, cast if len(str) > i+6 { strval.WriteString(`...`) } - // TODO: Add 'at row %d' err := ErrTruncatedWrongValueForField.FastGen("Incorrect string value '%s' for column '%s'", strval.String(), col.Name) logutil.BgLogger().Error("incorrect string value", zap.Uint64("conn", ctx.GetSessionVars().ConnectionID), zap.Error(err)) - // Truncate to valid utf8 string. - truncateVal := types.NewStringDatum(str[:i]) err = sc.HandleTruncate(err) - return truncateVal, err + return err } func handleZeroDatetime(ctx sessionctx.Context, col *model.ColumnInfo, casted types.Datum, str string, tmIsInvalid bool) (types.Datum, bool, error) { @@ -319,61 +314,48 @@ func CastValue(ctx sessionctx.Context, val types.Datum, col *model.ColumnInfo, r truncateTrailingSpaces(&casted) } - if col.Charset == charset.CharsetASCII { - if ctx.GetSessionVars().SkipASCIICheck { - return casted, nil - } - + if v := makeStringValidator(ctx, col); v != nil { str := casted.GetString() - for i := 0; i < len(str); i++ { - if str[i] > unicode.MaxASCII { - casted, err = handleWrongCharsetValue(ctx, col, &casted, str, i) - break - } + strategy := charset.TruncateStrategyReplace + if val.Collation() == charset.CollationBin { + strategy = charset.TruncateStrategyTrim } - if forceIgnoreTruncate { - err = nil - } - return casted, err - } - - if ctx.GetSessionVars().SkipUTF8Check { - return casted, nil - } - - if !mysql.IsUTF8Charset(col.Charset) { - return casted, nil - } - str := casted.GetString() - utf8Charset := col.Charset == mysql.UTF8Charset - doMB4CharCheck := utf8Charset && config.GetGlobalConfig().CheckMb4ValueInUTF8 - fastCheck := (col.Charset == mysql.UTF8MB4Charset) && utf8.ValidString(str) - if !fastCheck { - // The following check is slow, if we fast check success, we can avoid this. - for i, w := 0, 0; i < len(str); i += w { - runeValue, width := utf8.DecodeRuneInString(str[i:]) - if runeValue == utf8.RuneError { - if strings.HasPrefix(str[i:], string(utf8.RuneError)) { - w = width - continue - } - casted, err = handleWrongCharsetValue(ctx, col, &casted, str, i) - break - } else if width > 3 && doMB4CharCheck { - // Handle non-BMP characters. - casted, err = handleWrongCharsetValue(ctx, col, &casted, str, i) - break - } - w = width + if newStr, invalidPos := v.Truncate(str, strategy); invalidPos >= 0 { + casted = types.NewStringDatum(newStr) + err = handleWrongCharsetValue(ctx, col, str, invalidPos) } } - if forceIgnoreTruncate { err = nil } return casted, err } +func makeStringValidator(ctx sessionctx.Context, col *model.ColumnInfo) charset.StringValidator { + switch col.Charset { + case charset.CharsetASCII: + if ctx.GetSessionVars().SkipASCIICheck { + return nil + } + return charset.StringValidatorASCII{} + case charset.CharsetUTF8: + if ctx.GetSessionVars().SkipUTF8Check { + return nil + } + needCheckMB4 := config.GetGlobalConfig().CheckMb4ValueInUTF8 + return charset.StringValidatorUTF8{IsUTF8MB4: false, CheckMB4ValueInUTF8: needCheckMB4} + case charset.CharsetUTF8MB4: + if ctx.GetSessionVars().SkipUTF8Check { + return nil + } + return charset.StringValidatorUTF8{IsUTF8MB4: true} + case charset.CharsetLatin1, charset.CharsetBinary: + return nil + default: + return charset.StringValidatorOther{Charset: col.Charset} + } +} + // ColDesc describes column information like MySQL desc and show columns do. type ColDesc struct { Field string