Skip to content

Commit

Permalink
[#1127] YSQL: Collation Support (part 3)
Browse files Browse the repository at this point in the history
Summary:
So far YSQL has added collation support by always performing collation encoding
on any non-C collation string constant value that is sent to docdb. The
collation encoded result can be memcmp'ed by docdb to achieve the same
comparison semantics: for two strings s1 and s2 whose collation
encoded results are e(s1) and e(s2), we have

  strcoll(s1, s2) == memcmp(e(s1), e(s2))

Docdb is implemented as rocksdb which only performs memcmp on keys, not values.
If a postgres table column is part of a primary key, it will be stored in the
key part of the rocksdbs of the tablets for the table. If a postgres table
column appears in an index, it will also be stored in the key part of the
rocksdbs of the tablets for the index. In both cases, because they are stored in
the key part of a rocksdb, collation-encoding is needed to ensure correct
comparison semantics.

However, if a postgres table column is neither part of a primary key nor
used to build any index, then it is a non-key column and will be stored in
the value part of rocksdb. Rocksdb does not perform memcmp on its value part,
therefore performing collation-encoding is not needed. For space efficiency, we
should store the original string value by removing the sortkey from the
collation-encoded string. This diff implements this space optimization via the
following steps:

(1) Added PgDml::GetColumnInfo, which returns information about a given column
so that we can tell whether the column is a primary key column. Note that for
both a YB base table and a YB index table, a column is either a primary key
column (one that composes the primary key) or a value column.
(2) At each bind point, for a value column, use InvalidOid as the encoding
collation id. This has the effect of disabling collation encoding so that
the original PG character string value will be passed to docdb.

Test Plan:
1. Run regression tests with collation disabled (default build).
2. Run regression tests with collation enabled and default database collation is
still "C" (FLAGS_TEST_pg_collation_enabled=true)
3. Run regression tests with collation enabled and set default database
collation to "en_US.UTF-8" (FLAGS_TEST_pg_collation_enabled=true and
kTestOnlyUseOSDefaultCollation=true).

Reviewers: mihnea, dmitry

Reviewed By: dmitry

Subscribers: yql

Differential Revision: https://phabricator.dev.yugabyte.com/D12962
  • Loading branch information
myang2021 committed Sep 21, 2021
1 parent 44a6214 commit ac8c924
Show file tree
Hide file tree
Showing 16 changed files with 84 additions and 33 deletions.
9 changes: 6 additions & 3 deletions src/postgres/src/backend/access/ybc/ybcam.c
Original file line number Diff line number Diff line change
Expand Up @@ -163,7 +163,8 @@ static Oid ybc_get_atttypid(TupleDesc bind_desc, AttrNumber attnum)
static void ybcBindColumn(YbScanDesc ybScan, TupleDesc bind_desc, AttrNumber attnum, Datum value, bool is_null)
{
Oid atttypid = ybc_get_atttypid(bind_desc, attnum);
Oid attcollation = ybc_get_attcollation(bind_desc, attnum);
Oid attcollation = YBEncodingCollation(ybScan->handle, attnum,
ybc_get_attcollation(bind_desc, attnum));

YBCPgExpr ybc_expr = YBCNewConstant(ybScan->handle, atttypid, attcollation, value, is_null);

Expand All @@ -174,7 +175,8 @@ static void ybcBindColumnCondBetween(YbScanDesc ybScan, TupleDesc bind_desc, Att
bool start_valid, Datum value, bool end_valid, Datum value_end)
{
Oid atttypid = ybc_get_atttypid(bind_desc, attnum);
Oid attcollation = ybc_get_attcollation(bind_desc, attnum);
Oid attcollation = YBEncodingCollation(ybScan->handle, attnum,
ybc_get_attcollation(bind_desc, attnum));

YBCPgExpr ybc_expr = start_valid ? YBCNewConstant(ybScan->handle,
atttypid,
Expand All @@ -199,7 +201,8 @@ static void ybcBindColumnCondIn(YbScanDesc ybScan, TupleDesc bind_desc, AttrNumb
int nvalues, Datum *values)
{
Oid atttypid = ybc_get_atttypid(bind_desc, attnum);
Oid attcollation = ybc_get_attcollation(bind_desc, attnum);
Oid attcollation = YBEncodingCollation(ybScan->handle, attnum,
ybc_get_attcollation(bind_desc, attnum));

YBCPgExpr ybc_exprs[nvalues]; /* VLA - scratch space */
for (int i = 0; i < nvalues; i++) {
Expand Down
6 changes: 3 additions & 3 deletions src/postgres/src/backend/commands/ybccmds.c
Original file line number Diff line number Diff line change
Expand Up @@ -333,7 +333,7 @@ YBTransformPartitionSplitPoints(YBCPgStatement yb_stmt,
/* Create MAXVALUE in YugaByte format */
Form_pg_attribute attr = attrs[idx];
exprs[idx] = YBCNewConstantVirtual(yb_stmt, attr->atttypid,
attr->attcollation, YB_YQL_DATUM_LIMIT_MAX);
YB_YQL_DATUM_LIMIT_MAX);
break;
}

Expand All @@ -342,7 +342,7 @@ YBTransformPartitionSplitPoints(YBCPgStatement yb_stmt,
/* Create MINVALUE in YugaByte format */
Form_pg_attribute attr = attrs[idx];
exprs[idx] = YBCNewConstantVirtual(yb_stmt, attr->atttypid,
attr->attcollation, YB_YQL_DATUM_LIMIT_MIN);
YB_YQL_DATUM_LIMIT_MIN);
break;
}
}
Expand All @@ -352,7 +352,7 @@ YBTransformPartitionSplitPoints(YBCPgStatement yb_stmt,
for (; idx < attr_count; idx++) {
Form_pg_attribute attr = attrs[idx];
exprs[idx] = YBCNewConstantVirtual(yb_stmt, attr->atttypid,
attr->attcollation, YB_YQL_DATUM_LIMIT_MIN);
YB_YQL_DATUM_LIMIT_MIN);
}

/* Add the split boundary to CREATE statement */
Expand Down
10 changes: 2 additions & 8 deletions src/postgres/src/backend/executor/ybcExpr.c
Original file line number Diff line number Diff line change
Expand Up @@ -68,16 +68,10 @@ YBCPgExpr YBCNewConstant(YBCPgStatement ybc_stmt, Oid type_id, Oid collation_id,
return expr;
}

YBCPgExpr YBCNewConstantVirtual(YBCPgStatement ybc_stmt, Oid type_id,
Oid collation_id, YBCPgDatumKind kind) {
YBCPgExpr YBCNewConstantVirtual(YBCPgStatement ybc_stmt, Oid type_id, YBCPgDatumKind kind) {
YBCPgExpr expr = NULL;
const YBCPgTypeEntity *type_entity = YBCDataTypeFromOidMod(InvalidAttrNumber, type_id);
YBCPgCollationInfo collation_info;
YBGetCollationInfo(collation_id, type_entity, 0 /* datum */, true /* is_null */,
&collation_info);
HandleYBStatus(YBCPgNewConstantVirtual(ybc_stmt, type_entity,
collation_info.collate_is_valid_non_c,
kind, &expr));
HandleYBStatus(YBCPgNewConstantVirtual(ybc_stmt, type_entity, kind, &expr));
return expr;
}

Expand Down
16 changes: 11 additions & 5 deletions src/postgres/src/backend/executor/ybcModifyTable.c
Original file line number Diff line number Diff line change
Expand Up @@ -285,7 +285,8 @@ static Oid YBCExecuteInsertInternal(Oid dboid,
* Postgres could have also converted both collations to the column
* collation but it appears that collation is not part of a type.
*/
Oid collation_id = ybc_get_attcollation(RelationGetDescr(rel), attnum);
Oid collation_id = YBEncodingCollation(insert_stmt, attnum,
ybc_get_attcollation(RelationGetDescr(rel), attnum));
Datum datum = heap_getattr(tuple, attnum, tupleDesc, &is_null);

/* Check not-null constraint on primary key early */
Expand Down Expand Up @@ -356,7 +357,8 @@ static void PrepareIndexWriteStmt(YBCPgStatement stmt,
for (AttrNumber attnum = 1; attnum <= natts; ++attnum)
{
Oid type_id = GetTypeId(attnum, tupdesc);
Oid collation_id = ybc_get_attcollation(tupdesc, attnum);
Oid collation_id = YBEncodingCollation(stmt, attnum,
ybc_get_attcollation(tupdesc, attnum));
Datum value = values[attnum - 1];
bool is_null = isnull[attnum - 1];
has_null_attr = has_null_attr || is_null;
Expand Down Expand Up @@ -862,7 +864,7 @@ bool YBCExecuteUpdate(Relation rel,
{
bool is_null = false;
Datum d = heap_getattr(tuple, attnum, tupleDesc, &is_null);
int32_t collation_id = att_desc->attcollation;
Oid collation_id = YBEncodingCollation(update_stmt, attnum, att_desc->attcollation);
YBCPgExpr ybc_expr = YBCNewConstant(update_stmt, type_id, collation_id, d, is_null);

HandleYBStatus(YBCPgDmlAssignColumn(update_stmt, attnum, ybc_expr));
Expand Down Expand Up @@ -1014,9 +1016,13 @@ void YBCUpdateSysCatalogTupleForDb(Oid dboid, Relation rel, HeapTuple oldtuple,

bool is_null = false;
Datum d = heap_getattr(tuple, attnum, tupleDesc, &is_null);
/*
* Since we are assigning values to non-primary-key columns, pass InvalidOid as
* collation_id to skip computing collation sortkeys.
*/
YBCPgExpr ybc_expr = YBCNewConstant(
update_stmt, TupleDescAttr(tupleDesc, idx)->atttypid,
TupleDescAttr(tupleDesc, idx)->attcollation, d, is_null);
update_stmt, TupleDescAttr(tupleDesc, idx)->atttypid, InvalidOid /* collation_id */,
d, is_null);
HandleYBStatus(YBCPgDmlAssignColumn(update_stmt, attnum, ybc_expr));
}

Expand Down
8 changes: 8 additions & 0 deletions src/postgres/src/backend/utils/misc/pg_yb_utils.c
Original file line number Diff line number Diff line change
Expand Up @@ -1786,3 +1786,11 @@ bool YBIsCollationValidNonC(Oid collation_id) {
is_valid_non_c = true;
return is_valid_non_c;
}

/*
 * Pick the collation id to use when collation-encoding a value bound to
 * column 'attr_num' of the statement 'handle'.
 *
 * Returns 'attcollation' unchanged when the column is reported as a primary
 * key column, and InvalidOid otherwise. An InvalidOid input is returned as-is
 * without querying the column info.
 */
Oid YBEncodingCollation(YBCPgStatement handle, int attr_num, Oid attcollation) {
	YBCPgColumnInfo info = {false, false};

	/* Nothing to encode with, so there is no need to look up the column. */
	if (attcollation == InvalidOid)
		return InvalidOid;

	HandleYBStatus(YBCPgDmlGetColumnInfo(handle, attr_num, &info));

	/* Only primary key columns keep their collation for encoding. */
	if (!info.is_primary)
		return InvalidOid;
	return attcollation;
}
2 changes: 1 addition & 1 deletion src/postgres/src/include/executor/ybcExpr.h
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ extern YBCPgExpr YBCNewConstant(YBCPgStatement ybc_stmt, Oid type_id,

// Construct virtual constant expression using the given datatype "type_id" and virtual "datum".
extern YBCPgExpr YBCNewConstantVirtual(YBCPgStatement ybc_stmt, Oid type_id,
Oid collation_id, YBCPgDatumKind kind);
YBCPgDatumKind kind);

// Construct a generic eval_expr call for given a PG Expr and its expected type and attno.
extern YBCPgExpr YBCNewEvalSingleParamExprCall(YBCPgStatement ybc_stmt,
Expand Down
9 changes: 9 additions & 0 deletions src/postgres/src/include/pg_yb_utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -483,4 +483,13 @@ void YBSetupAttrCollationInfo(YBCPgAttrValueDescriptor *attr);
*/
bool YBIsCollationValidNonC(Oid collation_id);

/*
* For the column 'attr_num' and its collation id, return the collation id that
* will be used to do collation encoding. For example, if the column 'attr_num'
* represents a non-key column, we do not need to store the collation key and
* this function will return InvalidOid which will disable collation encoding
* for the column string value.
*/
Oid YBEncodingCollation(YBCPgStatement handle, int attr_num, Oid attcollation);

#endif /* PG_YB_UTILS_H */
4 changes: 3 additions & 1 deletion src/yb/docdb/primitive_value.cc
Original file line number Diff line number Diff line change
Expand Up @@ -468,7 +468,9 @@ string PrimitiveValue::ToValue() const {
case ValueType::kRedisSet: return result;

case ValueType::kCollStringDescending: FALLTHROUGH_INTENDED;
case ValueType::kCollString: FALLTHROUGH_INTENDED;
case ValueType::kCollString:
LOG(DFATAL) << "collation encoded string found for docdb value";
FALLTHROUGH_INTENDED;
case ValueType::kStringDescending: FALLTHROUGH_INTENDED;
case ValueType::kString:
// No zero encoding necessary when storing the string in a value.
Expand Down
10 changes: 10 additions & 0 deletions src/yb/yql/pggate/pg_dml.cc
Original file line number Diff line number Diff line change
Expand Up @@ -382,5 +382,15 @@ bool PgDml::has_aggregate_targets() {
return num_aggregate_targets > 0;
}

// Returns the is_primary / is_hash flags for column 'attr_num'.
// When a secondary index query is attached, the lookup is delegated to it;
// otherwise the flags are filled in by bind_->GetColumnInfo. Any non-OK status
// from that call is propagated to the caller.
Result<YBCPgColumnInfo> PgDml::GetColumnInfo(int attr_num) const {
  if (!secondary_index_query_) {
    YBCPgColumnInfo info = {false, false};
    RETURN_NOT_OK(bind_->GetColumnInfo(attr_num, &info.is_primary, &info.is_hash));
    return info;
  }
  return secondary_index_query_->GetColumnInfo(attr_num);
}

} // namespace pggate
} // namespace yb
4 changes: 4 additions & 0 deletions src/yb/yql/pggate/pg_dml.h
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,10 @@ class PgDml : public PgStatement {

virtual void SetCatalogCacheVersion(uint64_t catalog_cache_version) = 0;

// Get column info on whether the column 'attr_num' is a hash key, a range
// key, or neither.
Result<YBCPgColumnInfo> GetColumnInfo(int attr_num) const;

bool has_aggregate_targets();

bool has_doc_op() {
Expand Down
9 changes: 4 additions & 5 deletions src/yb/yql/pggate/pg_expr.cc
Original file line number Diff line number Diff line change
Expand Up @@ -243,11 +243,10 @@ void PgExpr::TranslateCollateText(

DCHECK(text_len >= 0 && text[text_len] == '\0')
<< "Data received from DocDB does not have expected format";
int8_t first_byte = text[0];
if (first_byte != '\0') {
// We may get the original value directly from DocDB. Remove this FATAL
// when DocDB can do that.
LOG(FATAL) << "String is not collation encoded: " << text;
const bool is_original_value = (text_len == 0 || text[0] != '\0');
if (is_original_value) {
// This means that we have done storage space optimization to only store the
// original value for non-key columns.
pg_tuple->WriteDatum(index, type_entity->yb_to_datum(text, text_len, type_attrs));
} else {
// This is a collation encoded string, we need to fetch the original value.
Expand Down
8 changes: 6 additions & 2 deletions src/yb/yql/pggate/pggate.cc
Original file line number Diff line number Diff line change
Expand Up @@ -893,6 +893,10 @@ Status PgApiImpl::DmlBindTable(PgStatement *handle) {
return down_cast<PgDml*>(handle)->BindTable();
}

// Looks up the column info for 'attr_num' on the given statement handle by
// treating the handle as a PgDml and forwarding to PgDml::GetColumnInfo.
Result<YBCPgColumnInfo> PgApiImpl::DmlGetColumnInfo(YBCPgStatement handle, int attr_num) {
  PgDml* dml_stmt = down_cast<PgDml*>(handle);
  return dml_stmt->GetColumnInfo(attr_num);
}

CHECKED_STATUS PgApiImpl::DmlAssignColumn(PgStatement *handle, int attr_num, PgExpr *attr_value) {
return down_cast<PgDml*>(handle)->AssignColumn(attr_num, attr_value);
}
Expand Down Expand Up @@ -1258,14 +1262,14 @@ Status PgApiImpl::NewConstant(
}

Status PgApiImpl::NewConstantVirtual(
YBCPgStatement stmt, const YBCPgTypeEntity *type_entity, bool collate_is_valid_non_c,
YBCPgStatement stmt, const YBCPgTypeEntity *type_entity,
YBCPgDatumKind datum_kind, YBCPgExpr *expr_handle) {
if (!stmt) {
// Invalid handle.
return STATUS(InvalidArgument, "Invalid statement handle");
}
PgExpr::SharedPtr pg_const =
make_shared<PgConstant>(type_entity, collate_is_valid_non_c, datum_kind);
make_shared<PgConstant>(type_entity, false /* collate_is_valid_non_c */, datum_kind);
stmt->AddExpr(pg_const);

*expr_handle = pg_const.get();
Expand Down
5 changes: 4 additions & 1 deletion src/yb/yql/pggate/pggate.h
Original file line number Diff line number Diff line change
Expand Up @@ -362,6 +362,9 @@ class PgApiImpl {
// Binding Tables: Bind the whole table in a statement. Do not use with BindColumn.
CHECKED_STATUS DmlBindTable(YBCPgStatement handle);

// Utility method to get the info for column 'attr_num'.
Result<YBCPgColumnInfo> DmlGetColumnInfo(YBCPgStatement handle, int attr_num);

// API for SET clause.
CHECKED_STATUS DmlAssignColumn(YBCPgStatement handle, int attr_num, YBCPgExpr attr_value);

Expand Down Expand Up @@ -491,7 +494,7 @@ class PgApiImpl {
YBCPgStatement stmt, const YBCPgTypeEntity *type_entity, bool collate_is_valid_non_c,
const char *collation_sortkey, uint64_t datum, bool is_null, YBCPgExpr *expr_handle);
CHECKED_STATUS NewConstantVirtual(
YBCPgStatement stmt, const YBCPgTypeEntity *type_entity, bool collate_is_valid_non_c,
YBCPgStatement stmt, const YBCPgTypeEntity *type_entity,
YBCPgDatumKind datum_kind, YBCPgExpr *expr_handle);
CHECKED_STATUS NewConstantOp(
YBCPgStatement stmt, const YBCPgTypeEntity *type_entity, bool collate_is_valid_non_c,
Expand Down
5 changes: 5 additions & 0 deletions src/yb/yql/pggate/ybc_pg_typedefs.h
Original file line number Diff line number Diff line change
Expand Up @@ -348,6 +348,11 @@ typedef struct PgServerDescriptor {
uint16_t pgPort;
} YBCServerDescriptor;

// Key-related flags for a single column, filled in by YBCPgDmlGetColumnInfo.
// Callers in this change initialize it positionally as {false, false}, so the
// field order matters.
typedef struct PgColumnInfo {
// True when the column is a primary key column.
bool is_primary;
// Presumably true when the column is a hash key column -- confirm against
// the bind-side GetColumnInfo implementation.
bool is_hash;
} YBCPgColumnInfo;

#ifdef __cplusplus
} // extern "C"
#endif // __cplusplus
Expand Down
9 changes: 6 additions & 3 deletions src/yb/yql/pggate/ybc_pggate.cc
Original file line number Diff line number Diff line change
Expand Up @@ -591,6 +591,10 @@ YBCStatus YBCPgDmlBindTable(YBCPgStatement handle) {
return ToYBCStatus(pgapi->DmlBindTable(handle));
}

// Fetches the column info for 'attr_num' of statement 'handle' via
// pgapi->DmlGetColumnInfo and converts the Result into a YBCStatus.
// NOTE(review): ExtractValueFromResult presumably writes the value into
// *column_info on success and returns the error status otherwise -- confirm.
YBCStatus YBCPgDmlGetColumnInfo(YBCPgStatement handle, int attr_num, YBCPgColumnInfo* column_info) {
return ExtractValueFromResult(pgapi->DmlGetColumnInfo(handle, attr_num), column_info);
}

YBCStatus YBCPgDmlAssignColumn(YBCPgStatement handle,
int attr_num,
YBCPgExpr attr_value) {
Expand Down Expand Up @@ -766,10 +770,9 @@ YBCStatus YBCPgNewConstant(
}

YBCStatus YBCPgNewConstantVirtual(
YBCPgStatement stmt, const YBCPgTypeEntity *type_entity, bool collate_is_valid_non_c,
YBCPgStatement stmt, const YBCPgTypeEntity *type_entity,
YBCPgDatumKind datum_kind, YBCPgExpr *expr_handle) {
return ToYBCStatus(pgapi->NewConstantVirtual(
stmt, type_entity, collate_is_valid_non_c, datum_kind, expr_handle));
return ToYBCStatus(pgapi->NewConstantVirtual(stmt, type_entity, datum_kind, expr_handle));
}

YBCStatus YBCPgNewConstantOp(
Expand Down
3 changes: 2 additions & 1 deletion src/yb/yql/pggate/ybc_pggate.h
Original file line number Diff line number Diff line change
Expand Up @@ -323,6 +323,7 @@ YBCStatus YBCPgDmlBindColumnCondBetween(YBCPgStatement handle, int attr_num, YBC
YBCPgExpr attr_value_end);
YBCStatus YBCPgDmlBindColumnCondIn(YBCPgStatement handle, int attr_num, int n_attr_values,
YBCPgExpr *attr_values);
YBCStatus YBCPgDmlGetColumnInfo(YBCPgStatement handle, int attr_num, YBCPgColumnInfo* info);

// Binding Tables: Bind the whole table in a statement. Do not use with BindColumn.
YBCStatus YBCPgDmlBindTable(YBCPgStatement handle);
Expand Down Expand Up @@ -456,7 +457,7 @@ YBCStatus YBCPgNewConstant(
const char *collation_sortkey, uint64_t datum, bool is_null, YBCPgExpr *expr_handle);
// Construct a virtual constant value.
YBCStatus YBCPgNewConstantVirtual(
YBCPgStatement stmt, const YBCPgTypeEntity *type_entity, bool collate_is_valid_non_c,
YBCPgStatement stmt, const YBCPgTypeEntity *type_entity,
YBCPgDatumKind datum_kind, YBCPgExpr *expr_handle);
// Construct an operator expression on a constant.
YBCStatus YBCPgNewConstantOp(
Expand Down

0 comments on commit ac8c924

Please sign in to comment.