Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

table, parser: check for invalid GBK characters before insertion #28814

Merged
merged 25 commits into from
Nov 22, 2021
Merged
Show file tree
Hide file tree
Changes from 7 commits
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
8783c7d
table, parser: check for invalid GBK characters before insertion
tangenta Oct 14, 2021
76da0c5
fix encode char length calculation
tangenta Oct 14, 2021
062beea
format parser/charset/encoding_table.go
tangenta Oct 14, 2021
8b2e958
address comments
tangenta Oct 15, 2021
4a17d28
remove unnecessary field
tangenta Oct 15, 2021
3097c3a
format file
tangenta Oct 15, 2021
ad7cb70
Merge remote-tracking branch 'upstream/master' into gbk-invalid
tangenta Oct 21, 2021
9ddd077
Merge branch 'master' into gbk-invalid
tangenta Nov 1, 2021
dda97b3
Merge remote-tracking branch 'upstream/master' into gbk-invalid
tangenta Nov 1, 2021
e8f180a
replace the invalid chars with '?'
tangenta Nov 2, 2021
13cf296
fix the truncation for ascii
tangenta Nov 2, 2021
650b63b
fix integration test TestStringValidatorUTF8
tangenta Nov 2, 2021
7a74b29
fix truncate algo
tangenta Nov 13, 2021
b1911e7
fix go import order
tangenta Nov 13, 2021
35e203f
Merge remote-tracking branch 'upstream/master' into gbk-invalid
tangenta Nov 13, 2021
c5ed6a8
fix integration test TestInsertWrongValueForField
tangenta Nov 13, 2021
ddb172a
Merge branch 'master' into gbk-invalid
tangenta Nov 18, 2021
fefc72c
Merge branch 'master' into gbk-invalid
tangenta Nov 18, 2021
bbded58
parser/charset: avoid unnecessary copying for valid ASCII string
tangenta Nov 19, 2021
3b685fc
Merge remote-tracking branch 'upstream/master' into gbk-invalid
tangenta Nov 19, 2021
7743db1
fix truncate bugs and add test
tangenta Nov 20, 2021
1b917dd
Merge remote-tracking branch 'upstream/master' into gbk-invalid
tangenta Nov 20, 2021
74f1b49
charset: make format and address comment
tangenta Nov 22, 2021
25bb576
Merge branch 'master' into gbk-invalid
ti-chi-bot Nov 22, 2021
48dcf12
Merge branch 'master' into gbk-invalid
ti-chi-bot Nov 22, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@ cmd/explaintest/explain-test.out
cmd/explaintest/explaintest_tidb-server
cmd/explaintest/portgenerator
cmd/explaintest/s/
cmd/explaintest/importer
cmd/pluginpkg/pluginpkg
*.fail.go
tools/bin/
vendor
Expand Down
19 changes: 19 additions & 0 deletions cmd/explaintest/r/new_character_set_invalid.result
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
set @@sql_mode = 'strict_trans_tables';
drop table if exists t;
create table t (a varchar(255) charset gbk);
insert into t values ('中文');
insert into t values ('À');
Error 1366: Incorrect string value '\xC3\x80' for column 'a'
insert into t values ('中文À中文');
Error 1366: Incorrect string value '\xC3\x80\xE4\xB8\xAD\xE6...' for column 'a'
select a from t;
a
中文
set @@sql_mode = '';
insert into t values ('À');
insert into t values ('中文À中文');
select a from t;
a
中文

中文
tangenta marked this conversation as resolved.
Show resolved Hide resolved
14 changes: 14 additions & 0 deletions cmd/explaintest/t/new_character_set_invalid.test
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
set @@sql_mode = 'strict_trans_tables';
drop table if exists t;
create table t (a varchar(255) charset gbk);
insert into t values ('中文');
-- error 1366: Incorrect string value '\xC3\x80' for column 'a'
insert into t values ('À');
-- error 1366: Incorrect string value '\xC3\x80\xE4\xB8\xAD\xE6...' for column 'a'
insert into t values ('中文À中文');
select a from t;

set @@sql_mode = '';
insert into t values ('À');
insert into t values ('中文À中文');
select a from t;
11 changes: 2 additions & 9 deletions expression/collation.go
Original file line number Diff line number Diff line change
Expand Up @@ -339,12 +339,7 @@ func safeConvert(ctx sessionctx.Context, ec *ExprCollation, args ...Expression)
func isValidString(str string, dstChs string) bool {
switch dstChs {
case charset.CharsetASCII:
for _, c := range str {
if c >= 0x80 {
return false
}
}
return true
return charset.StringValidatorASCII{}.Validate(str) == -1
case charset.CharsetLatin1:
// For backward compatibility, we do not block SQL like select '啊' = convert('a' using latin1) collate latin1_bin;
return true
Expand All @@ -355,9 +350,7 @@ func isValidString(str string, dstChs string) bool {
// Convert to binary is always safe.
return true
default:
e, _ := charset.Lookup(dstChs)
_, err := e.NewEncoder().String(str)
return err == nil
return charset.StringValidatorOther{Charset: dstChs}.Validate(str) == -1
}
}

Expand Down
19 changes: 19 additions & 0 deletions parser/charset/encoding.go
Original file line number Diff line number Diff line change
Expand Up @@ -127,6 +127,25 @@ func (e *Encoding) DecodeString(src string) (string, error) {
return string(bs), err
}


// IsValid checks whether src (UTF-8 bytes) can be encoded into the charset of e.
// It feeds src to the encoder one chunk at a time and returns the byte offset
// at which the encoder first reports an error, or -1 if all of src is encoded
// successfully.
func (e *Encoding) IsValid(src []byte) (invalidPos int) {
tangenta marked this conversation as resolved.
Show resolved Hide resolved
// NOTE: despite its name, dec is the encoder returned by e.enc.NewEncoder().
dec := e.enc.NewEncoder()
// Scratch output buffer; the encoded bytes are discarded, only the error matters.
dest := [4]byte{}
var srcOffset int
for srcOffset < len(src) {
// Presumably the byte length of the UTF-8 character starting at srcOffset
// (characterLengthUTF8 is defined elsewhere in this package).
srcNextLen := characterLengthUTF8(src[srcOffset:])
// Clamp the chunk end so a truncated trailing character never slices past src.
srcEnd := mathutil.Min(srcOffset+srcNextLen, len(src))
_, nSrc, err := dec.Transform(dest[:], src[srcOffset:srcEnd], false)
if err != nil {
// The chunk starting at srcOffset cannot be encoded.
return srcOffset
}
srcOffset += nSrc
}
return -1
}

func (e *Encoding) transform(transformer transform.Transformer, dest, src []byte, isDecoding bool) ([]byte, error) {
if len(dest) < len(src) {
dest = make([]byte, len(src)*2)
Expand Down
71 changes: 71 additions & 0 deletions parser/charset/encoding_table.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,8 @@ package charset

import (
"strings"
go_unicode "unicode"
"unicode/utf8"

"golang.org/x/text/encoding"
"golang.org/x/text/encoding/charmap"
Expand Down Expand Up @@ -298,3 +300,72 @@ func characterLengthUTF8(bs []byte) int {
}
return 4
}

// Compile-time assertions that every validator implements StringValidator.
var (
	_ StringValidator = StringValidatorASCII{}
	_ StringValidator = StringValidatorUTF8{}
	_ StringValidator = StringValidatorOther{}
)

// StringValidator is implemented by types that can tell whether a string is
// valid in one specific charset.
type StringValidator interface {
	// Validate returns the byte offset of the first invalid position in str,
	// or -1 when no invalid position is found.
	Validate(str string) (invalidPos int)
}

// StringValidatorASCII validates that a string consists of ASCII bytes only.
type StringValidatorASCII struct{}

// Validate scans str byte by byte and returns the offset of the first byte
// greater than unicode.MaxASCII, or -1 when every byte is ASCII.
func (s StringValidatorASCII) Validate(str string) (invalidPos int) {
	for pos, b := range []byte(str) {
		if b > go_unicode.MaxASCII {
			return pos
		}
	}
	return -1
}

// StringValidatorUTF8 validates strings for the "utf8" and "utf8mb4" charsets.
type StringValidatorUTF8 struct {
	// IsUTF8MB4 selects "utf8mb4" (true) or "utf8" (false).
	IsUTF8MB4 bool
	// CheckMB4ValueInUTF8 makes 4-byte characters invalid; it only takes
	// effect when IsUTF8MB4 is false.
	CheckMB4ValueInUTF8 bool
}

// Validate returns the byte offset of the first invalid position in str, or
// -1 when str is valid. A position is invalid when it holds malformed UTF-8,
// or when it holds a character wider than 3 bytes while the validator is in
// "utf8" mode with CheckMB4ValueInUTF8 enabled.
func (s StringValidatorUTF8) Validate(str string) (invalidPos int) {
	// Fast path: utf8mb4 accepts any well-formed UTF-8 string.
	if s.IsUTF8MB4 && utf8.ValidString(str) {
		return -1
	}
	rejectMB4 := s.CheckMB4ValueInUTF8 && !s.IsUTF8MB4
	for pos := 0; pos < len(str); {
		r, size := utf8.DecodeRuneInString(str[pos:])
		switch {
		case r == utf8.RuneError && !strings.HasPrefix(str[pos:], string(utf8.RuneError)):
			// A decode failure, as opposed to a literal U+FFFD in the input.
			return pos
		case r != utf8.RuneError && size > 3 && rejectMB4:
			// Non-BMP character where only 3-byte characters are allowed.
			return pos
		}
		pos += size
	}
	return -1
}

// StringValidatorOther validates strings against the charset named by Charset.
type StringValidatorOther struct {
	Charset string
}

// Validate returns the byte offset of the first position in str that the
// encoding for s.Charset rejects, or -1 when nothing is rejected. When the
// encoding is not enabled, no check is done and -1 is returned.
func (s StringValidatorOther) Validate(str string) (invalidPos int) {
	if enc := NewEncoding(s.Charset); enc.enabled() {
		return enc.IsValid([]byte(str))
	}
	return -1
}
39 changes: 39 additions & 0 deletions parser/charset/encoding_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -92,3 +92,42 @@ func (s *testEncodingSuite) TestEncoding(c *C) {
c.Assert(string(result), Equals, tc.result, cmt)
}
}

// TestValidatorASCII checks the invalid offset reported by StringValidatorASCII.
func (s *testEncodingSuite) TestValidatorASCII(c *C) {
	validator := charset.StringValidatorASCII{}
	cases := []struct {
		input      string
		invalidPos int
	}{
		{"qwerty", -1},
		{"qwÊrty", 2},
		{"中文", 0},
	}
	for _, tc := range cases {
		c.Assert(validator.Validate(tc.input), Equals, tc.invalidPos)
	}
}

// TestValidatorUTF8 checks StringValidatorUTF8 in its three configurations:
// "utf8mb4", "utf8" without the 4-byte check, and "utf8" with the 4-byte check.
// Besides the basic cases it covers a literal U+FFFD (which must be accepted)
// and invalid positions that are not at offset 0.
func (s *testEncodingSuite) TestValidatorUTF8(c *C) {
	invalid := string([]byte{0xff, 0xfe, 0xfd})
	type validateCase struct {
		input      string
		invalidPos int
	}
	groups := []struct {
		validator charset.StringValidatorUTF8
		cases     []validateCase
	}{
		{
			// Charset "utf8mb4".
			validator: charset.StringValidatorUTF8{IsUTF8MB4: true},
			cases: []validateCase{
				{"qwerty", -1},
				{"qwÊrty", -1},
				{"qwÊ合法字符串", -1},
				{"😂", -1},
				{"\uFFFD", -1},
				{invalid, 0},
				{"ab\xff", 2},
			},
		},
		{
			// Charset "utf8" without checking mb4 value.
			validator: charset.StringValidatorUTF8{IsUTF8MB4: false, CheckMB4ValueInUTF8: false},
			cases: []validateCase{
				{"qwerty", -1},
				{"qwÊrty", -1},
				{"qwÊ合法字符串", -1},
				{"😂", -1},
				{"\uFFFD", -1},
				{invalid, 0},
			},
		},
		{
			// Charset "utf8" with checking mb4 value.
			validator: charset.StringValidatorUTF8{IsUTF8MB4: false, CheckMB4ValueInUTF8: true},
			cases: []validateCase{
				{"qwerty", -1},
				{"😂", 0}, // 4-bytes character is invalid.
				{"qw😂", 2},
				{invalid, 0},
			},
		},
	}
	for _, g := range groups {
		for _, tc := range g.cases {
			c.Assert(g.validator.Validate(tc.input), Equals, tc.invalidPos)
		}
	}
}

// TestValidatorGBK checks the invalid offset reported by StringValidatorOther
// when it is configured for the "gbk" charset.
func (s *testEncodingSuite) TestValidatorGBK(c *C) {
	validator := charset.StringValidatorOther{Charset: "gbk"}
	cases := []struct {
		input      string
		invalidPos int
	}{
		{"asdf", -1},
		{"中文", -1},
		{"À", 0},
		{"asdfÀ", 4},
		{"中文À", 6},
	}
	for _, tc := range cases {
		c.Assert(validator.Validate(tc.input), Equals, tc.invalidPos)
	}
}
77 changes: 29 additions & 48 deletions table/column.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,6 @@ import (
"strings"
"time"
"unicode"
"unicode/utf8"

"github.com/pingcap/tidb/config"
"github.com/pingcap/tidb/expression"
Expand Down Expand Up @@ -171,7 +170,7 @@ func truncateTrailingSpaces(v *types.Datum) {
v.SetString(str, v.Collation())
}

func handleWrongCharsetValue(ctx sessionctx.Context, col *model.ColumnInfo, casted *types.Datum, str string, i int) (types.Datum, error) {
func handleWrongCharsetValue(ctx sessionctx.Context, col *model.ColumnInfo, str string, i int) (types.Datum, error) {
sc := ctx.GetSessionVars().StmtCtx

var strval strings.Builder
Expand Down Expand Up @@ -319,61 +318,43 @@ func CastValue(ctx sessionctx.Context, val types.Datum, col *model.ColumnInfo, r
truncateTrailingSpaces(&casted)
}

if col.Charset == charset.CharsetASCII {
if ctx.GetSessionVars().SkipASCIICheck {
return casted, nil
}

if v := makeStringValidator(ctx, col); v != nil {
str := casted.GetString()
for i := 0; i < len(str); i++ {
if str[i] > unicode.MaxASCII {
casted, err = handleWrongCharsetValue(ctx, col, &casted, str, i)
break
}
}
if forceIgnoreTruncate {
err = nil
if invalidPos := v.Validate(str); invalidPos >= 0 {
casted, err = handleWrongCharsetValue(ctx, col, str, invalidPos)
}
return casted, err
}

if ctx.GetSessionVars().SkipUTF8Check {
return casted, nil
}

if !mysql.IsUTF8Charset(col.Charset) {
return casted, nil
}
str := casted.GetString()
utf8Charset := col.Charset == mysql.UTF8Charset
doMB4CharCheck := utf8Charset && config.GetGlobalConfig().CheckMb4ValueInUTF8
fastCheck := (col.Charset == mysql.UTF8MB4Charset) && utf8.ValidString(str)
if !fastCheck {
// The following check is slow, if we fast check success, we can avoid this.
for i, w := 0, 0; i < len(str); i += w {
runeValue, width := utf8.DecodeRuneInString(str[i:])
if runeValue == utf8.RuneError {
if strings.HasPrefix(str[i:], string(utf8.RuneError)) {
w = width
continue
}
casted, err = handleWrongCharsetValue(ctx, col, &casted, str, i)
break
} else if width > 3 && doMB4CharCheck {
// Handle non-BMP characters.
casted, err = handleWrongCharsetValue(ctx, col, &casted, str, i)
break
}
w = width
}
}

if forceIgnoreTruncate {
err = nil
}
return casted, err
}

// makeStringValidator picks the charset.StringValidator to use for string
// values of col, based on col.Charset. It returns nil when no validation
// should be done: the session variable SkipASCIICheck (for ascii) or
// SkipUTF8Check (for utf8 and utf8mb4) is set, or the column charset is latin1
// or binary.
func makeStringValidator(ctx sessionctx.Context, col *model.ColumnInfo) charset.StringValidator {
switch col.Charset {
case charset.CharsetASCII:
tangenta marked this conversation as resolved.
Show resolved Hide resolved
if ctx.GetSessionVars().SkipASCIICheck {
return nil
}
return charset.StringValidatorASCII{}
case charset.CharsetUTF8:
if ctx.GetSessionVars().SkipUTF8Check {
return nil
}
// Whether 4-byte characters are rejected in "utf8" comes from the global config.
needCheckMB4 := config.GetGlobalConfig().CheckMb4ValueInUTF8
return charset.StringValidatorUTF8{IsUTF8MB4: false, CheckMB4ValueInUTF8: needCheckMB4}
case charset.CharsetUTF8MB4:
if ctx.GetSessionVars().SkipUTF8Check {
return nil
}
return charset.StringValidatorUTF8{IsUTF8MB4: true}
case charset.CharsetLatin1, charset.CharsetBinary:
// These charsets are never validated here.
return nil
default:
// Any other charset is validated through its own encoding.
return charset.StringValidatorOther{Charset: col.Charset}
}
}

// ColDesc describes column information like MySQL desc and show columns do.
type ColDesc struct {
Field string
Expand Down