Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

expression: convert charset by wrapping internal builtin function #29736

Merged
merged 20 commits into from
Nov 17, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
20 commits
Select commit Hold shift + click to select a range
a53dd9f
expression: convert charset by wrapping implicit builtin function
tangenta Nov 12, 2021
65c57f6
expression/builtin_cast.go: fix comment of exported function
tangenta Nov 12, 2021
79fc364
expression/builtin_charset.go: fix vectored builtinCharsetConvSig for…
tangenta Nov 12, 2021
782fc31
expression: use wrapping method for length, ascii and to_base64
tangenta Nov 15, 2021
69cfec5
expression: wrap with convert_charset() in utf8 strings
tangenta Nov 15, 2021
f71aeb7
expression: fold constant recursively in convert_charset()
tangenta Nov 15, 2021
a8b5dca
Merge branch 'master' into convert-encoding
tangenta Nov 15, 2021
5f42077
expression: rename convert_charset to to_binary and support fold cons…
tangenta Nov 15, 2021
e9e2c73
Merge branch 'master' into convert-encoding
tangenta Nov 15, 2021
d271cec
expression: enable wrapping to_binary() for new charset only
tangenta Nov 16, 2021
4b08452
Merge branch 'master' into convert-encoding
tangenta Nov 16, 2021
bfe99e3
fix WrapWithToBinary
tangenta Nov 16, 2021
a62efa9
expression: add function class and add to_binary() to it for constant…
tangenta Nov 16, 2021
95ed705
Merge remote-tracking branch 'upstream/master' into convert-encoding
tangenta Nov 16, 2021
8a62098
test: add a test for to_binary constant folding
tangenta Nov 16, 2021
df296ce
gofmt: format code
tangenta Nov 16, 2021
c811e51
expression: use getFunction in WrapWithToBinary
tangenta Nov 16, 2021
9451434
expression: skip show to_binary builtin function
tangenta Nov 16, 2021
41263b9
expression: update tipb to set pbcode for to_binary
tangenta Nov 17, 2021
0e8bbf0
Merge branch 'master' into convert-encoding
ti-chi-bot Nov 17, 2021
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions expression/builtin.go
Original file line number Diff line number Diff line change
Expand Up @@ -140,6 +140,7 @@ func newBaseBuiltinFuncWithTp(ctx sessionctx.Context, funcName string, args []Ex
args[i] = WrapWithCastAsDecimal(ctx, args[i])
case types.ETString:
args[i] = WrapWithCastAsString(ctx, args[i])
args[i] = WrapWithToBinary(ctx, args[i], funcName)
case types.ETDatetime:
args[i] = WrapWithCastAsTime(ctx, args[i], types.NewFieldType(mysql.TypeDatetime))
case types.ETTimestamp:
Expand Down Expand Up @@ -879,6 +880,9 @@ var funcs = map[string]functionClass{
ast.NextVal: &nextValFunctionClass{baseFunctionClass{ast.NextVal, 1, 1}},
ast.LastVal: &lastValFunctionClass{baseFunctionClass{ast.LastVal, 1, 1}},
ast.SetVal: &setValFunctionClass{baseFunctionClass{ast.SetVal, 2, 2}},

// TiDB implicit internal functions.
InternalFuncToBinary: &tidbConvertCharsetFunctionClass{baseFunctionClass{InternalFuncToBinary, 1, 1}},
}

// IsFunctionSupported check if given function name is a builtin sql function.
Expand All @@ -902,6 +906,7 @@ func GetDisplayName(name string) string {
func GetBuiltinList() []string {
res := make([]string, 0, len(funcs))
notImplementedFunctions := []string{ast.RowFunc, ast.IsTruthWithNull}
implicitFunctions := []string{InternalFuncToBinary}
for funcName := range funcs {
skipFunc := false
// Skip not implemented functions
Expand All @@ -910,6 +915,11 @@ func GetBuiltinList() []string {
skipFunc = true
}
}
for _, implicitFunc := range implicitFunctions {
if funcName == implicitFunc {
skipFunc = true
}
}
// Skip literal functions
// (their names are not readable: 'tidb`.(dateliteral, for example)
// See: https://github.com/pingcap/parser/pull/591
Expand Down
136 changes: 136 additions & 0 deletions expression/builtin_convert_charset.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,136 @@
// Copyright 2021 PingCAP, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package expression

import (
"fmt"

"github.com/pingcap/tidb/parser/ast"
"github.com/pingcap/tidb/parser/charset"
"github.com/pingcap/tidb/parser/model"
"github.com/pingcap/tidb/sessionctx"
"github.com/pingcap/tidb/types"
"github.com/pingcap/tidb/util/chunk"
"github.com/pingcap/tipb/go-tipb"
)

// InternalFuncToBinary is the name of the internal to_binary function. It accepts a string
// and returns that string encoded in the charset recorded in the argument's field type.
const InternalFuncToBinary = "to_binary"

// tidbConvertCharsetFunctionClass is the function class of the internal to_binary function.
// It embeds baseFunctionClass and adds no fields of its own.
type tidbConvertCharsetFunctionClass struct {
baseFunctionClass
}

func (c *tidbConvertCharsetFunctionClass) getFunction(ctx sessionctx.Context, args []Expression) (builtinFunc, error) {
if err := c.verifyArgs(args); err != nil {
return nil, c.verifyArgs(args)
}
argTp := args[0].GetType().EvalType()
var sig builtinFunc
switch argTp {
case types.ETString:
tangenta marked this conversation as resolved.
Show resolved Hide resolved
bf, err := newBaseBuiltinFuncWithTp(ctx, c.funcName, args, types.ETString, types.ETString)
if err != nil {
return nil, err
}
sig = &builtinInternalToBinarySig{bf}
sig.setPbCode(tipb.ScalarFuncSig_ToBinary)
default:
return nil, fmt.Errorf("unexpected argTp: %d", argTp)
}
return sig, nil
}

// Compile-time check that *builtinInternalToBinarySig satisfies builtinFunc.
var _ builtinFunc = &builtinInternalToBinarySig{}

// builtinInternalToBinarySig is the signature produced for the internal to_binary function.
// It embeds baseBuiltinFunc and carries no additional state.
type builtinInternalToBinarySig struct {
baseBuiltinFunc
}

// Clone returns a fresh builtinInternalToBinarySig whose embedded base function
// is populated from the receiver via cloneFrom.
func (b *builtinInternalToBinarySig) Clone() builtinFunc {
	cloned := new(builtinInternalToBinarySig)
	cloned.cloneFrom(&b.baseBuiltinFunc)
	return cloned
}

// evalString evaluates the single argument as a string and encodes it with the
// charset taken from that argument's field type.
//
// A NULL argument or an evaluation error is passed straight back with an empty
// result. Otherwise the encoded string is returned together with any error
// reported by the encoder.
func (b *builtinInternalToBinarySig) evalString(row chunk.Row) (res string, isNull bool, err error) {
	arg := b.args[0]
	var raw string
	raw, isNull, err = arg.EvalString(b.ctx, row)
	if err != nil || isNull {
		return "", isNull, err
	}
	encoder := charset.NewEncoding(arg.GetType().Charset)
	res, err = encoder.EncodeString(raw)
	return res, false, err
}

// vectorized always reports true; the batch implementation for this signature
// is vecEvalString.
func (b *builtinInternalToBinarySig) vectorized() bool {
return true
}

// vecEvalString is the batch counterpart of evalString.
//
// It evaluates the argument for every row of input into a temporary column
// borrowed from the buffer allocator, then appends to result either NULL (for
// NULL rows) or the row's string encoded with the argument's charset. The
// first evaluation or encoding error aborts the loop and is returned.
func (b *builtinInternalToBinarySig) vecEvalString(input *chunk.Chunk, result *chunk.Column) error {
	rowCnt := input.NumRows()

	argCol, err := b.bufAllocator.get()
	if err != nil {
		return err
	}
	// Hand the temporary column back to the allocator on every exit path.
	defer b.bufAllocator.put(argCol)

	if err = b.args[0].VecEvalString(b.ctx, input, argCol); err != nil {
		return err
	}

	encoder := charset.NewEncoding(b.args[0].GetType().Charset)
	result.ReserveString(rowCnt)
	for row := 0; row < rowCnt; row++ {
		if argCol.IsNull(row) {
			result.AppendNull()
			continue
		}
		encoded, encErr := encoder.EncodeString(argCol.GetString(row))
		if encErr != nil {
			return encErr
		}
		result.AppendString(encoded)
	}
	return nil
}

// toBinaryMap contains the builtin functions which arguments need to be converted to the correct charset.
var toBinaryMap = map[string]struct{}{
ast.Hex: {}, ast.Length: {}, ast.OctetLength: {}, ast.ASCII: {},
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

How about other functions like upper

Copy link
Contributor Author

@tangenta tangenta Nov 17, 2021

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Let's add them later in the next PR.

ast.ToBase64: {},
}

// WrapWithToBinary wraps `expr` with the internal to_binary function when needed.
//
// The expression is wrapped only if both of the following hold:
//   - charset.GetDefaultCollationLegacy returns an error for the expression's charset, and
//   - funcName is listed in toBinaryMap.
//
// In every other case, and also when the to_binary signature cannot be built,
// expr is returned unchanged. A wrapped expression is passed through
// FoldConstant before being returned.
func WrapWithToBinary(ctx sessionctx.Context, expr Expression, funcName string) Expression {
	exprTp := expr.GetType()
	if _, legacyErr := charset.GetDefaultCollationLegacy(exprTp.Charset); legacyErr == nil {
		return expr
	}
	if _, listed := toBinaryMap[funcName]; !listed {
		return expr
	}

	sig, err := funcs[InternalFuncToBinary].getFunction(ctx, []Expression{expr})
	if err != nil {
		// Best effort: fall back to the unwrapped expression.
		return expr
	}
	wrapped := &ScalarFunction{
		FuncName: model.NewCIStr(InternalFuncToBinary),
		RetType:  exprTp,
		Function: sig,
	}
	return FoldConstant(wrapped)
}
34 changes: 2 additions & 32 deletions expression/builtin_string.go
Original file line number Diff line number Diff line change
Expand Up @@ -222,15 +222,6 @@ func (b *builtinLengthSig) evalInt(row chunk.Row) (int64, bool, error) {
if isNull || err != nil {
return 0, isNull, err
}

argTp := b.args[0].GetType()
if !types.IsBinaryStr(argTp) {
dBytes, err := charset.NewEncoding(argTp.Charset).EncodeString(val)
if err == nil {
return int64(len(dBytes)), false, nil
}
}

return int64(len([]byte(val))), false, nil
}

Expand Down Expand Up @@ -272,13 +263,6 @@ func (b *builtinASCIISig) evalInt(row chunk.Row) (int64, bool, error) {
if len(val) == 0 {
return 0, false, nil
}
argTp := b.args[0].GetType()
if !types.IsBinaryStr(argTp) {
dBytes, err := charset.NewEncoding(argTp.Charset).EncodeString(val)
if err == nil {
return int64(dBytes[0]), false, nil
}
}
return int64(val[0]), false, nil
}

Expand Down Expand Up @@ -1664,7 +1648,7 @@ func (c *hexFunctionClass) getFunction(ctx sessionctx.Context, args []Expression
argFieldTp := args[0].GetType()
// Use UTF8MB4 as default.
bf.tp.Flen = argFieldTp.Flen * 4 * 2
sig := &builtinHexStrArgSig{bf, charset.NewEncoding(argFieldTp.Charset)}
sig := &builtinHexStrArgSig{bf}
sig.setPbCode(tipb.ScalarFuncSig_HexStrArg)
return sig, nil
case types.ETInt, types.ETReal, types.ETDecimal:
Expand All @@ -1684,15 +1668,11 @@ func (c *hexFunctionClass) getFunction(ctx sessionctx.Context, args []Expression

type builtinHexStrArgSig struct {
baseBuiltinFunc
encoding *charset.Encoding
}

func (b *builtinHexStrArgSig) Clone() builtinFunc {
newSig := &builtinHexStrArgSig{}
newSig.cloneFrom(&b.baseBuiltinFunc)
if b.encoding != nil {
newSig.encoding = charset.NewEncoding(b.encoding.Name())
}
return newSig
}

Expand All @@ -1703,12 +1683,7 @@ func (b *builtinHexStrArgSig) evalString(row chunk.Row) (string, bool, error) {
if isNull || err != nil {
return d, isNull, err
}
dBytes := hack.Slice(d)
dBytes, err = b.encoding.Encode(nil, dBytes)
if err != nil {
return d, false, err
}
return strings.ToUpper(hex.EncodeToString(dBytes)), false, nil
return strings.ToUpper(hex.EncodeToString(hack.Slice(d))), false, nil
}

type builtinHexIntArgSig struct {
Expand Down Expand Up @@ -3634,11 +3609,6 @@ func (b *builtinToBase64Sig) evalString(row chunk.Row) (d string, isNull bool, e
if isNull || err != nil {
return "", isNull, err
}
argTp := b.args[0].GetType()
str, err = charset.NewEncoding(argTp.Charset).EncodeString(str)
if err != nil {
return "", false, err
}
needEncodeLen := base64NeededEncodedLength(len(str))
if needEncodeLen == -1 {
return "", true, nil
Expand Down
44 changes: 2 additions & 42 deletions expression/builtin_string_vec.go
Original file line number Diff line number Diff line change
Expand Up @@ -447,7 +447,6 @@ func (b *builtinHexStrArgSig) vecEvalString(input *chunk.Chunk, result *chunk.Co
return err
}
defer b.bufAllocator.put(buf0)
var encodedBuf []byte
if err := b.args[0].VecEvalString(b.ctx, input, buf0); err != nil {
return err
}
Expand All @@ -457,13 +456,7 @@ func (b *builtinHexStrArgSig) vecEvalString(input *chunk.Chunk, result *chunk.Co
result.AppendNull()
continue
}
buf0Bytes := buf0.GetBytes(i)
encodedBuf, err = b.encoding.Encode(encodedBuf, buf0Bytes)
if err != nil {
return err
}
buf0Bytes = encodedBuf
result.AppendString(strings.ToUpper(hex.EncodeToString(buf0Bytes)))
result.AppendString(strings.ToUpper(hex.EncodeToString(buf0.GetBytes(i))))
}
return nil
}
Expand Down Expand Up @@ -912,11 +905,6 @@ func (b *builtinASCIISig) vecEvalInt(input *chunk.Chunk, result *chunk.Column) e
if err = b.args[0].VecEvalString(b.ctx, input, buf); err != nil {
return err
}

argTp := b.args[0].GetType()
enc := charset.NewEncoding(argTp.Charset)
isBinaryStr := types.IsBinaryStr(argTp)

result.ResizeInt64(n, false)
result.MergeNulls(buf)
i64s := result.Int64s()
Expand All @@ -929,14 +917,6 @@ func (b *builtinASCIISig) vecEvalInt(input *chunk.Chunk, result *chunk.Column) e
i64s[i] = 0
continue
}
if !isBinaryStr {
dBytes, err := enc.EncodeString(str)
if err != nil {
return err
}
i64s[i] = int64(dBytes[0])
continue
}
i64s[i] = int64(str[0])
}
return nil
Expand Down Expand Up @@ -2162,27 +2142,14 @@ func (b *builtinLengthSig) vecEvalInt(input *chunk.Chunk, result *chunk.Column)
return err
}

argTp := b.args[0].GetType()
enc := charset.NewEncoding(argTp.Charset)
isBinaryStr := types.IsBinaryStr(argTp)

result.ResizeInt64(n, false)
result.MergeNulls(buf)
i64s := result.Int64s()
var encodeBuf []byte
for i := 0; i < n; i++ {
if result.IsNull(i) {
continue
}
str := buf.GetBytes(i)
if !isBinaryStr {
dBytes, err := enc.Encode(encodeBuf, str)
if err != nil {
return err
}
i64s[i] = int64(len(dBytes))
continue
}
i64s[i] = int64(len(str))
}
return nil
Expand Down Expand Up @@ -2470,20 +2437,13 @@ func (b *builtinToBase64Sig) vecEvalString(input *chunk.Chunk, result *chunk.Col
if err := b.args[0].VecEvalString(b.ctx, input, buf); err != nil {
return err
}

argTp := b.args[0].GetType()
enc := charset.NewEncoding(argTp.Charset)

result.ReserveString(n)
for i := 0; i < n; i++ {
if buf.IsNull(i) {
result.AppendNull()
continue
}
str, err := enc.EncodeString(buf.GetString(i))
if err != nil {
return err
}
str := buf.GetString(i)
needEncodeLen := base64NeededEncodedLength(len(str))
if needEncodeLen == -1 {
result.AppendNull()
Expand Down
Loading