Skip to content

Commit

Permalink
feat(codec): encode/decode map value to codec row (#3713)
Browse files Browse the repository at this point in the history
* feat(engine): array & map data type interface for row level

This updates the proto definition of `hybridse::type::ColumnDef`: the old data
type field `ColumnDef::type`, as well as `ColumnDef::is_not_null`, are now considered deprecated, and `ColumnDef::schema` is the data type field
that new code should use. The old fields still work but may be removed in a future release.

* feat(codec): encode map type into row

* feat: rm ImmutableMap
  • Loading branch information
aceforeverd authored Feb 29, 2024
1 parent 26f80b8 commit 7f1bd83
Show file tree
Hide file tree
Showing 48 changed files with 2,220 additions and 1,042 deletions.
37 changes: 34 additions & 3 deletions cases/query/udf_query.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -572,11 +572,12 @@ cases:
map('1', 2, '3', 4, '5', 6, '7', 8, '9', 10, '11', 12)['10'] as e8,
# first match on duplicate keys
map('1', 2, '1', 4, '1', 6, '7', 8, '9', 10, '11', 12)['1'] as e9,
map("c", 99, "d", NULL)["d"] as e10,
# map("c", 99, "d", NULL)["d"] as e10,
expect:
columns: ["e1 string", "e2 int", "e3 string", "e4 int", "e5 string", "e6 timestamp", "e7 int", "e8 int", "e9 int", "e10 int"]
# FIXME
columns: ["e1 string", "e2 int", "e3 string", "e4 int", "e5 string", "e6 timestamp", "e7 int", "e8 int", "e9 int"]
data: |
2, 100, NULL, 101, f, 2000, 10, NULL, 2, NULL
2, 100, NULL, 101, f, 2000, 10, NULL, 2
- id: 14
mode: request-unsupport
sql: |
Expand All @@ -588,3 +589,33 @@ cases:
columns: ["e1 bool", "e2 bool", "e3 bool"]
data: |
true, false, true
- id: 15
mode: request-unsupport
sql: |
select map(1, 2, 3, 4) as c1
- id: 16
mode: request-unsupport
# this covers basic codec for map data type
sql: |
select
c1[3] as o1, c2[1] as o2, c3['6'] as o3, c4[timestamp(8000)] as o4,
c5[int64(12)] as o5
from (select
map(1, 2, 3, 4) as c1,
map(1, '2', 3, '4') as c2,
map('5', timestamp(8000), '6', timestamp(9000)) as c3,
map(timestamp(8000), date("2012-12-12"), timestamp(9000), date("2014-11-11")) as c4,
map(int64(10), int16(11), int64(12), int16(13)) as c5
)
expect:
columns: ["o1 int", "o2 string", "o3 timestamp", "o4 date", "o5 int16"]
data: |
4, 2, 9000, 2012-12-12, 13
- id: 17
mode: request-unsupport
sql: |
select c1 + 8 from (select 9 as c1)
2 changes: 1 addition & 1 deletion hybridse/examples/toydb/src/tablet/tablet_catalog.cc
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ bool TabletTableHandler::Init() {
// init types var
for (int32_t i = 0; i < schema_.size(); i++) {
const type::ColumnDef& column = schema_.Get(i);
codec::ColInfo col_info(column.name(), column.type(), i, 0);
codec::ColInfo col_info(column.name(), column.schema(), i, 0);
types_.insert(std::make_pair(column.name(), col_info));
}

Expand Down
31 changes: 31 additions & 0 deletions hybridse/include/base/fe_status.h
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,37 @@ static inline std::initializer_list<int> __output_literal_args(STREAM& stream,

#define MAX_STATUS_TRACE_SIZE 4096

// Evaluates `expr` exactly once and checks the result, which must provide `ok()`
// (e.g. an absl::Status).
// If the result is not OK, ends the current function by returning that result;
// otherwise execution continues after the macro.
//
// The body is wrapped in `do { ... } while (false)` so the macro expands to a single
// statement and can be used safely in an unbraced if/else. A trailing ';' is required.
#define CHECK_ABSL_STATUS(expr) \
    do {                        \
        auto _s = (expr);       \
        if (!_s.ok()) {         \
            return _s;          \
        }                       \
    } while (false)

// Checks an absl::StatusOr<T>-like value (anything providing `ok()` and `status()`).
// If it is not OK, ends the current function by returning its `status()`;
// otherwise execution continues after the macro.
//
// The argument is parenthesized and bound to a reference, so it is evaluated exactly
// once even when an expression (rather than a named object) is passed, and operators
// inside the argument cannot re-associate with the macro body.
// The body is wrapped in `do { ... } while (false)` so the macro expands to a single
// statement and can be used safely in an unbraced if/else. A trailing ';' is required.
#define CHECK_ABSL_STATUSOR(statusor)       \
    do {                                    \
        auto&& _statusor = (statusor);      \
        if (!_statusor.ok()) {              \
            return _statusor.status();      \
        }                                   \
    } while (false)

// Evaluates `expr` exactly once; the result must provide `isOK()` and `GetMsg()`.
// If the result is not OK, ends the current function by returning
// `absl::InternalError(<result>.GetMsg())`; otherwise execution continues after the macro.
//
// The body is wrapped in `do { ... } while (false)` so the macro expands to a single
// statement and can be used safely in an unbraced if/else. A trailing ';' is required.
#define CHECK_STATUS_TO_ABSL(expr)                            \
    do {                                                      \
        auto _status = (expr);                                \
        if (!_status.isOK()) {                                \
            return absl::InternalError(_status.GetMsg());     \
        }                                                     \
    } while (false)

#define CHECK_STATUS(call, ...) \
while (true) { \
auto _status = (call); \
Expand Down
54 changes: 37 additions & 17 deletions hybridse/include/codec/fe_row_codec.h
Original file line number Diff line number Diff line change
Expand Up @@ -20,11 +20,10 @@
#include <map>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>

#include "absl/status/statusor.h"
#include "base/raw_buffer.h"
#include "butil/iobuf.h"
#include "gflags/gflags.h"
#include "proto/fe_type.pb.h"

namespace hybridse {
Expand All @@ -42,9 +41,18 @@ static constexpr uint32_t UINT24_MAX = (1 << 24) - 1;
const std::string NONETOKEN = "!N@U#L$L%"; // NOLINT
const std::string EMPTY_STRING = "!@#$%"; // NOLINT

// TODO(chendihao): Change to inline function if do not depend on gflags
const std::unordered_map<::hybridse::type::Type, uint8_t>& GetTypeSizeMap();

// Returns true if the column is considered a base type in the row codec.
// date & timestamp are considered base types since they have a single field in the corresponding llvm struct,
// while string, map and array are considered complex types.
//
// For base types, the column is written into the row ptr by just writing the value of the primitive type;
// for complex types, the write is done in a string (or string-like) manner: str size + str data.
// map, array, or any other complex type takes an extra encoding step from its struct value into str data.
bool IsCodecBaseType(const type::ColumnSchema& sc);
bool IsCodecStrLikeType(const type::ColumnSchema& sc);

inline uint8_t GetAddrLength(uint32_t size) {
if (size <= UINT8_MAX) {
return 1;
Expand Down Expand Up @@ -180,26 +188,38 @@ class RowView {
};

struct ColInfo {
::hybridse::type::Type type;
// type is still used in some legacy UDF contexts;
// use with caution for non-base types
::hybridse::type::Type type() const {
if (!schema.has_base_type()) {
return type::kNull;
}
return schema.base_type();
}

uint32_t idx;
uint32_t offset;
std::string name;
type::ColumnSchema schema;

ColInfo() {}
ColInfo(const std::string& name, ::hybridse::type::Type type, uint32_t idx,
uint32_t offset)
: type(type), idx(idx), offset(offset), name(name) {}
ColInfo(const std::string& name, ::hybridse::type::Type type, uint32_t idx, uint32_t offset)
: idx(idx), offset(offset), name(name) {
schema.set_base_type(type);
}

ColInfo(const std::string& name, const type::ColumnSchema& sc, uint32_t idx, uint32_t offset)
: idx(idx), offset(offset), name(name), schema(sc) {}
};

struct StringColInfo : public ColInfo {
uint32_t str_next_offset;
uint32_t str_start_offset;

StringColInfo() {}
StringColInfo(const std::string& name, ::hybridse::type::Type type,
StringColInfo(const std::string& name, ::hybridse::type::ColumnSchema sc,
uint32_t idx, uint32_t offset, uint32_t str_next_offset,
uint32_t str_start_offset)
: ColInfo(name, type, idx, offset),
: ColInfo(name, sc, idx, offset),
str_next_offset(str_next_offset),
str_start_offset(str_start_offset) {}
};
Expand All @@ -209,7 +229,7 @@ class SliceFormat {
explicit SliceFormat(const hybridse::codec::Schema* schema);
virtual ~SliceFormat() {}

bool GetStringColumnInfo(size_t idx, StringColInfo* res) const;
absl::StatusOr<StringColInfo> GetStringColumnInfo(size_t idx) const;

const ColInfo* GetColumnInfo(size_t idx) const;

Expand All @@ -224,7 +244,7 @@ class SliceFormat {
class RowFormat {
public:
virtual ~RowFormat() {}
virtual bool GetStringColumnInfo(size_t schema_idx, size_t idx, StringColInfo* res) const = 0;
virtual absl::StatusOr<StringColInfo> GetStringColumnInfo(size_t schema_idx, size_t idx) const = 0;
virtual const ColInfo* GetColumnInfo(size_t schema_idx, size_t idx) const = 0;
virtual size_t GetSliceId(size_t schema_idx) const = 0;
};
Expand All @@ -245,8 +265,8 @@ class MultiSlicesRowFormat : public RowFormat {
slice_formats_.clear();
}

bool GetStringColumnInfo(size_t schema_idx, size_t idx, StringColInfo* res) const override {
return slice_formats_[schema_idx].GetStringColumnInfo(idx, res);
absl::StatusOr<StringColInfo> GetStringColumnInfo(size_t schema_idx, size_t idx) const override {
return slice_formats_[schema_idx].GetStringColumnInfo(idx);
}

const ColInfo* GetColumnInfo(size_t schema_idx, size_t idx) const override {
Expand Down Expand Up @@ -287,8 +307,8 @@ class SingleSliceRowFormat : public RowFormat {
}
}

bool GetStringColumnInfo(size_t schema_idx, size_t idx, StringColInfo* res) const override {
return slice_format_->GetStringColumnInfo(offsets_[schema_idx] + idx, res);
absl::StatusOr<StringColInfo> GetStringColumnInfo(size_t schema_idx, size_t idx) const override {
return slice_format_->GetStringColumnInfo(offsets_[schema_idx] + idx);
}

const ColInfo* GetColumnInfo(size_t schema_idx, size_t idx) const override {
Expand Down
3 changes: 3 additions & 0 deletions hybridse/include/codec/type_codec.h
Original file line number Diff line number Diff line change
Expand Up @@ -120,6 +120,9 @@ int32_t AppendString(int8_t* buf_ptr, uint32_t buf_size, uint32_t col_idx,
uint32_t str_start_offset, uint32_t str_field_offset,
uint32_t str_addr_space, uint32_t str_body_offset);

// write `str_offset` in address `str_offset_ptr`, actual written bytes determined by `str_addr_space`
void EncodeStrOffset(int8_t* str_offset_ptr, int32_t str_offset, int32_t str_addr_space);

inline int8_t GetAddrSpace(uint32_t size) {
if (size <= UINT8_MAX) {
return 1;
Expand Down
8 changes: 5 additions & 3 deletions hybridse/src/benchmark/udf_bm_case.cc
Original file line number Diff line number Diff line change
Expand Up @@ -183,8 +183,10 @@ void SumArrayListCol(benchmark::State* state, MODE mode, int64_t data_size,
schemas_context.GetRowFormat(schema_idx)->GetColumnInfo(col_idx);

codegen::MemoryWindowDecodeIRBuilder builder(&schemas_context, nullptr);
node::TypeNode type;
codegen::SchemaType2DataType(info->type, &type);
node::NodeManager nm;
auto rs = codegen::ColumnSchema2Type(info->schema, &nm);
ASSERT_TRUE(rs.ok());
auto* type = rs.value();

uint32_t col_size;
ASSERT_TRUE(codegen::GetLlvmColumnSize(&type, &col_size));
Expand All @@ -193,7 +195,7 @@ void SumArrayListCol(benchmark::State* state, MODE mode, int64_t data_size,

ASSERT_EQ(0, ::hybridse::codec::v1::GetCol(
reinterpret_cast<int8_t*>(&list_table_ref), 0, info->idx,
info->offset, info->type, buf));
info->offset, info->type(), buf));

{
switch (mode) {
Expand Down
1 change: 1 addition & 0 deletions hybridse/src/case/sql_case.cc
Original file line number Diff line number Diff line change
Expand Up @@ -295,6 +295,7 @@ bool SqlCase::ExtractSchema(const std::vector<std::string>& columns,
}
column->set_type(type);
column->set_is_not_null(false);
column->mutable_schema()->set_base_type(column->type());
}
} catch (const std::exception& ex) {
LOG(WARNING) << "Fail to ExtractSchema: " << ex.what();
Expand Down
Loading

0 comments on commit 7f1bd83

Please sign in to comment.