Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improve the performance of length and ascii functions #9345

Merged
merged 12 commits into from
Sep 3, 2024
2 changes: 1 addition & 1 deletion dbms/src/Functions/FunctionsNull.h
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ class FunctionCoalesce : public IFunction
public:
static constexpr auto name = "coalesce";
static FunctionPtr create(const Context & context);
FunctionCoalesce(const Context & context)
explicit FunctionCoalesce(const Context & context)
: context(context)
{}

Expand Down
64 changes: 35 additions & 29 deletions dbms/src/Functions/FunctionsString.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
// limitations under the License.

#include <Columns/ColumnArray.h>
#include <Columns/IColumn.h>
#include <Common/TargetSpecific.h>
#include <Common/UTF8Helpers.h>
#include <Common/Volnitsky.h>
Expand All @@ -35,6 +36,7 @@
#include <ext/range.h>
#include <magic_enum.hpp>


namespace DB
{
namespace ErrorCodes
Expand Down Expand Up @@ -4474,9 +4476,11 @@ class FunctionASCII : public IFunction
std::string getName() const override { return name; }
size_t getNumberOfArguments() const override { return 1; }

bool useDefaultImplementationForConstants() const override { return true; }

DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override
{
if (arguments.size() != 1)
if unlikely (arguments.size() != 1)
throw Exception(
fmt::format(
"Number of arguments for function {} doesn't match: passed {}, should be 1.",
Expand All @@ -4490,30 +4494,31 @@ class FunctionASCII : public IFunction
void executeImpl(Block & block, const ColumnNumbers & arguments, size_t result) const override
{
const IColumn * c0_col = block.getByPosition(arguments[0]).column.get();
const auto * c0_const = checkAndGetColumn<ColumnConst>(c0_col);
const auto * c0_string = checkAndGetColumn<ColumnString>(c0_col);

Field res_field;
int val_num = c0_col->size();
auto col_res = ColumnInt64::create();
col_res->reserve(val_num);
if (c0_const == nullptr && c0_string == nullptr)
if unlikely (c0_string == nullptr)
throw Exception(
fmt::format("Illegal argument of function {}", getName()),
ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);

for (int i = 0; i < val_num; i++)
{
c0_col->get(i, res_field);
String handled_str = res_field.get<String>();
Int64 res = handled_str.empty() ? 0 : static_cast<Int64>(handled_str[0]);
col_res->insert(res);
}
size_t val_num = c0_col->size();
auto col_res = ColumnInt64::create();
col_res->reserve(val_num);
xzhangxian1008 marked this conversation as resolved.
Show resolved Hide resolved

const auto & chars = c0_string->getChars();
const auto & offsets = c0_string->getOffsets();

if (val_num > 0)
col_res->insert(getResult(chars, 0));


for (size_t i = 1; i < val_num; i++)
col_res->insert(getResult(chars, offsets[i - 1]));

block.getByPosition(result).column = std::move(col_res);
}

private:
/// Returns the byte stored at position `offset` of `chars`, widened to Int64.
/// Callers pass the offset at which a row begins, so the result is that row's first byte.
/// NOTE(review): for an empty row this presumably reads the per-row terminator byte and
/// therefore yields 0 — confirm against the ColumnString layout.
static Int64 getResult(const ColumnString::Chars_t & chars, size_t offset)
{
    const auto first_byte = chars[offset];
    return static_cast<Int64>(first_byte);
}
xzhangxian1008 marked this conversation as resolved.
Show resolved Hide resolved
};

class FunctionLength : public IFunction
Expand All @@ -4527,9 +4532,11 @@ class FunctionLength : public IFunction
std::string getName() const override { return name; }
size_t getNumberOfArguments() const override { return 1; }

bool useDefaultImplementationForConstants() const override { return true; }

DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override
{
if (arguments.size() != 1)
if unlikely (arguments.size() != 1)
throw Exception(
fmt::format(
"Number of arguments for function {} doesn't match: passed {}, should be 1.",
Expand All @@ -4543,24 +4550,23 @@ class FunctionLength : public IFunction
void executeImpl(Block & block, const ColumnNumbers & arguments, size_t result) const override
{
const IColumn * c0_col = block.getByPosition(arguments[0]).column.get();
const auto * c0_const = checkAndGetColumn<ColumnConst>(c0_col);
const auto * c0_string = checkAndGetColumn<ColumnString>(c0_col);

Field res_field;
int val_num = c0_col->size();
auto col_res = ColumnInt64::create();
col_res->reserve(val_num);
if (c0_const == nullptr && c0_string == nullptr)
if unlikely (c0_string == nullptr)
throw Exception(
fmt::format("Illegal argument of function {}", getName()),
ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);

for (int i = 0; i < val_num; i++)
{
c0_col->get(i, res_field);
String handled_str = res_field.get<String>();
col_res->insert(static_cast<Int64>(handled_str.size()));
}
size_t val_num = c0_col->size();
auto col_res = ColumnInt64::create();
col_res->reserve(val_num);
xzhangxian1008 marked this conversation as resolved.
Show resolved Hide resolved

const auto & offsets = c0_string->getOffsets();

if (val_num > 0)
col_res->insert(offsets[0] - 1);

for (size_t i = 1; i < val_num; i++)
col_res->insert(offsets[i] - offsets[i - 1] - 1);

block.getByPosition(result).column = std::move(col_res);
}
Expand Down
33 changes: 17 additions & 16 deletions dbms/src/Functions/GatherUtils/Algorithms.h
Original file line number Diff line number Diff line change
Expand Up @@ -192,7 +192,7 @@ void concat(const std::vector<std::unique_ptr<IArraySource>> & array_sources, Si
size_t sources_num = array_sources.size();
std::vector<char> is_const(sources_num);

auto checkAndGetSizeToReserve = [](auto source, IArraySource * array_source) {
auto check_and_get_size_to_reserve = [](auto source, IArraySource * array_source) {
if (source == nullptr)
throw Exception(
"Concat function expected " + demangle(typeid(Source).name()) + " or "
Expand All @@ -205,17 +205,18 @@ void concat(const std::vector<std::unique_ptr<IArraySource>> & array_sources, Si
size_t size_to_reserve = 0;
for (auto i : ext::range(0, sources_num))
{
auto & source = array_sources[i];
const auto & source = array_sources[i];
is_const[i] = source->isConst();
if (is_const[i])
size_to_reserve += checkAndGetSizeToReserve(typeid_cast<ConstSource<Source> *>(source.get()), source.get());
size_to_reserve
+= check_and_get_size_to_reserve(typeid_cast<ConstSource<Source> *>(source.get()), source.get());
else
size_to_reserve += checkAndGetSizeToReserve(typeid_cast<Source *>(source.get()), source.get());
size_to_reserve += check_and_get_size_to_reserve(typeid_cast<Source *>(source.get()), source.get());
}

sink.reserve(size_to_reserve);

auto writeNext = [&sink](auto source) {
auto write_next = [&sink](auto source) {
writeSlice(source->getWhole(), sink);
source->next();
};
Expand All @@ -224,11 +225,11 @@ void concat(const std::vector<std::unique_ptr<IArraySource>> & array_sources, Si
{
for (auto i : ext::range(0, sources_num))
{
auto & source = array_sources[i];
const auto & source = array_sources[i];
if (is_const[i])
writeNext(static_cast<ConstSource<Source> *>(source.get()));
write_next(static_cast<ConstSource<Source> *>(source.get()));
else
writeNext(static_cast<Source *>(source.get()));
write_next(static_cast<Source *>(source.get()));
}
sink.next();
}
Expand Down Expand Up @@ -389,11 +390,11 @@ void NO_INLINE pad(SourceA && src, SourceB && padding, Sink && sink, ssize_t len
size_t left = static_cast<size_t>(length) - slice.size;
if (is_left)
{
StringSource::Slice padSlice = padding.getWhole();
while (left > padSlice.size && padSlice.size != 0)
StringSource::Slice pad_slice = padding.getWhole();
while (left > pad_slice.size && pad_slice.size != 0)
{
writeSlice(padSlice, sink);
left -= padSlice.size;
writeSlice(pad_slice, sink);
left -= pad_slice.size;
}

writeSlice(padding.getSliceFromLeft(0, left), sink);
Expand All @@ -402,11 +403,11 @@ void NO_INLINE pad(SourceA && src, SourceB && padding, Sink && sink, ssize_t len
else
{
writeSlice(slice, sink);
StringSource::Slice padSlice = padding.getWhole();
while (left > padSlice.size && padSlice.size != 0)
StringSource::Slice pad_slice = padding.getWhole();
while (left > pad_slice.size && pad_slice.size != 0)
{
writeSlice(padSlice, sink);
left -= padSlice.size;
writeSlice(pad_slice, sink);
left -= pad_slice.size;
}

writeSlice(padding.getSliceFromLeft(0, left), sink);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,8 @@
#include <TestUtils/FunctionTestUtils.h>
#include <benchmark/benchmark.h>

/// this is a hack, include the cpp file so we can test MatchImpl directly
#include <Functions/FunctionsString.cpp>
/// this is a hack, include the cpp file so we can test functions directly
#include <Functions/FunctionsString.cpp> // NOLINT
#include <Functions/FunctionsStringSearch.cpp> // NOLINT

namespace DB
Expand Down Expand Up @@ -306,5 +306,65 @@ BENCH_LIKE_COLLATOR(ASCII_BIN);
BENCH_LIKE_COLLATOR(BINARY);
BENCH_LIKE_COLLATOR(LATIN1_BIN);

class LengthBench : public benchmark::Fixture
{
public:
using ColStringType = typename TypeTraits<String>::FieldType;

ColumnsWithTypeAndName data1{toVec<String>("col", std::vector<ColStringType>(data_num, ""))};
ColumnsWithTypeAndName data2{toVec<String>("col", std::vector<ColStringType>(data_num, "aaaaaaaaaa"))};
ColumnsWithTypeAndName data3{toVec<String>("col", std::vector<ColStringType>(data_num, "啊aaaaaaaa"))};

void SetUp(const benchmark::State &) override {}
};

/// Measures FunctionLength::executeImpl over the three fixture data sets
/// (data1, data2, data3). Every benchmark iteration runs the function once per block.
/// The run is pinned to 10 iterations by the registration below.
BENCHMARK_DEFINE_F(LengthBench, bench)
(benchmark::State & state)
try
{
    FunctionLength function_length;
    std::vector<Block> blocks{Block(data1), Block(data2), Block(data3)};
    // Append the result slot at position 1. executeImpl stores a ColumnInt64 there,
    // so the slot is declared as Int64 to keep the declared type consistent with the column.
    for (auto & block : blocks)
        block.insert({nullptr, std::make_shared<DataTypeNumber<Int64>>(), "res"});
    // The single argument is the input column at position 0.
    const ColumnNumbers arguments{0};
    for (auto _ : state)
    {
        for (auto & block : blocks)
            function_length.executeImpl(block, arguments, 1);
    }
}
CATCH
BENCHMARK_REGISTER_F(LengthBench, bench)->Iterations(10);

class ASCIIBench : public benchmark::Fixture
{
public:
using ColStringType = typename TypeTraits<String>::FieldType;

ColumnsWithTypeAndName data1{toVec<String>("col", std::vector<ColStringType>(data_num, ""))};
ColumnsWithTypeAndName data2{toVec<String>("col", std::vector<ColStringType>(data_num, "aaaaaaaaaa"))};
ColumnsWithTypeAndName data3{toVec<String>("col", std::vector<ColStringType>(data_num, "啊aaaaaaaa"))};

void SetUp(const benchmark::State &) override {}
};

/// Measures FunctionASCII::executeImpl over the three fixture data sets
/// (data1, data2, data3). Every benchmark iteration runs the function once per block.
/// The run is pinned to 10 iterations by the registration below.
BENCHMARK_DEFINE_F(ASCIIBench, bench)
(benchmark::State & state)
try
{
    FunctionASCII function_ascii;
    std::vector<Block> blocks{Block(data1), Block(data2), Block(data3)};
    // Append the result slot at position 1. executeImpl stores a ColumnInt64 there,
    // so the slot is declared as Int64 to keep the declared type consistent with the column.
    for (auto & block : blocks)
        block.insert({nullptr, std::make_shared<DataTypeNumber<Int64>>(), "res"});
    // The single argument is the input column at position 0.
    const ColumnNumbers arguments{0};
    for (auto _ : state)
    {
        for (auto & block : blocks)
            function_ascii.executeImpl(block, arguments, 1);
    }
}
CATCH
BENCHMARK_REGISTER_F(ASCIIBench, bench)->Iterations(10);

} // namespace tests
} // namespace DB
14 changes: 7 additions & 7 deletions dbms/src/Functions/tests/gtest_strings_ascii.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -73,15 +73,15 @@ TEST_F(StringASCII, strAndStrTest)
test_block.insert({nullptr, func->getReturnType(), "res"});
func->execute(test_block, cns, 1);
const IColumn * res = test_block.getByPosition(1).column.get();
const ColumnInt64 * res_string = checkAndGetColumn<ColumnInt64>(res);
const auto * actual_res = checkAndGetColumn<ColumnInt64>(res);

Field res_field;
std::vector<Int64> results{104, 72, 50, 35, 0};
for (size_t t = 0; t < results.size(); t++)
std::vector<Int64> expect_results{104, 72, 50, 35, 0};
for (size_t t = 0; t < expect_results.size(); t++)
{
res_string->get(t, res_field);
actual_res->get(t, res_field);
Int64 res_val = res_field.get<Int64>();
EXPECT_EQ(results[t], res_val);
EXPECT_EQ(expect_results[t], res_val);
}
}
}
Expand Down Expand Up @@ -133,7 +133,7 @@ TEST_F(StringASCII, nullTest)
MutableColumnPtr mutable_result_null_map_column = (*std::move(result_null_map_column)).mutate();
NullMap & result_null_map = static_cast<ColumnUInt8 &>(*mutable_result_null_map_column).getData();
const IColumn * res = test_block.getByPosition(1).column.get();
const ColumnNullable * res_nullable_string = checkAndGetColumn<ColumnNullable>(res);
const auto * res_nullable_string = checkAndGetColumn<ColumnNullable>(res);
const IColumn & res_string = res_nullable_string->getNestedColumn();

Field res_field;
Expand All @@ -151,4 +151,4 @@ TEST_F(StringASCII, nullTest)
}

} // namespace tests
} // namespace DB
} // namespace DB
8 changes: 4 additions & 4 deletions dbms/src/Functions/tests/gtest_strings_length.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -45,8 +45,8 @@ TEST_F(StringLength, strAndStrTest)

auto & factory = FunctionFactory::instance();

std::vector<String> strs{"hi~", "23333", "pingcap", "你好", "233哈哈", ""};
std::vector<Int64> results{3, 5, 7, 6, 9, 0};
std::vector<String> strs{"hi~", "23333", "pingcap", "你好", "233哈哈", "", "asdの的", "ヽ( ̄▽ ̄)و"};
std::vector<Int64> results{3, 5, 7, 6, 9, 0, 9, 16};

for (int i = 0; i < 2; i++)
{
Expand Down Expand Up @@ -74,7 +74,7 @@ TEST_F(StringLength, strAndStrTest)
test_block.insert({nullptr, func->getReturnType(), "res"});
func->execute(test_block, cns, 1);
const IColumn * res = test_block.getByPosition(1).column.get();
const ColumnInt64 * res_string = checkAndGetColumn<ColumnInt64>(res);
const auto * res_string = checkAndGetColumn<ColumnInt64>(res);

Field res_field;

Expand Down Expand Up @@ -134,7 +134,7 @@ TEST_F(StringLength, nullTest)
MutableColumnPtr mutable_result_null_map_column = (*std::move(result_null_map_column)).mutate();
NullMap & result_null_map = static_cast<ColumnUInt8 &>(*mutable_result_null_map_column).getData();
const IColumn * res = test_block.getByPosition(1).column.get();
const ColumnNullable * res_nullable_string = checkAndGetColumn<ColumnNullable>(res);
const auto * res_nullable_string = checkAndGetColumn<ColumnNullable>(res);
const IColumn & res_string = res_nullable_string->getNestedColumn();

Field res_field;
Expand Down
6 changes: 3 additions & 3 deletions dbms/src/Functions/tests/gtest_strings_position.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,7 @@ TEST_F(StringPosition, strAndStrTest)

bp->build(ctns)->execute(test_block, cns, 2);
const IColumn * res = test_block.getByPosition(2).column.get();
const ColumnInt64 * res_string = checkAndGetColumn<ColumnInt64>(res);
const auto * res_string = checkAndGetColumn<ColumnInt64>(res);

Field res_field;

Expand Down Expand Up @@ -162,7 +162,7 @@ TEST_F(StringPosition, utf8StrAndStrTest)

bp->build(ctns)->execute(test_block, cns, 2);
const IColumn * res = test_block.getByPosition(2).column.get();
const ColumnInt64 * res_string = checkAndGetColumn<ColumnInt64>(res);
const auto * res_string = checkAndGetColumn<ColumnInt64>(res);

Field res_field;

Expand Down Expand Up @@ -236,7 +236,7 @@ TEST_F(StringPosition, nullTest)
MutableColumnPtr mutable_result_null_map_column = (*std::move(result_null_map_column)).mutate();
NullMap & result_null_map = static_cast<ColumnUInt8 &>(*mutable_result_null_map_column).getData();
const IColumn * res = test_block.getByPosition(2).column.get();
const ColumnNullable * res_nullable_string = checkAndGetColumn<ColumnNullable>(res);
const auto * res_nullable_string = checkAndGetColumn<ColumnNullable>(res);
const IColumn & res_string = res_nullable_string->getNestedColumn();

Field res_field;
Expand Down
Loading