Skip to content

Commit

Permalink
GH-39463: [C++] Support cast kernel from large string, (large) binary…
Browse files Browse the repository at this point in the history
… to dictionary (#40017)

### Rationale for this change

Support the `cast` kernel from large string (`large_utf8()`) and (large) binary (`binary()`, `large_binary()`) to `dictionary`.

### What changes are included in this PR?

- Support `cast` kernel 
  - from large string(`large_utf8()`) to `dictionary`
  - from binary(`binary()`) to `dictionary`
  - from large binary(`large_binary()`) to `dictionary`

### Are these changes tested?

Yes. The changes are covered by the existing test cases, which now also exercise the newly supported input types.

### Are there any user-facing changes?

No.

* Closes: #39463

Authored-by: Hyunseok Seo <[email protected]>
Signed-off-by: Felipe Oliveira Carvalho <[email protected]>
  • Loading branch information
llama90 authored Feb 14, 2024
1 parent 91bf1c9 commit 2422994
Show file tree
Hide file tree
Showing 2 changed files with 38 additions and 32 deletions.
14 changes: 9 additions & 5 deletions cpp/src/arrow/compute/kernels/scalar_cast_dictionary.cc
Original file line number Diff line number Diff line change
Expand Up @@ -45,11 +45,12 @@ Status CastToDictionary(KernelContext* ctx, const ExecSpan& batch, ExecResult* o
return Status::OK();
}

// If the input type is STRING, it is first encoded as a dictionary to facilitate
// processing. This approach allows the subsequent code to uniformly handle STRING
// inputs as if they were originally provided in dictionary format. Encoding as a
// dictionary helps in reusing the same logic for dictionary operations.
if (batch[0].type()->id() == Type::STRING) {
// If the input type is string or binary-like, it is first encoded as a dictionary to
// facilitate processing. This approach allows the subsequent code to uniformly handle
// string or binary-like inputs as if they were originally provided in dictionary
// format. Encoding as a dictionary helps in reusing the same logic for dictionary
// operations.
if (is_base_binary_like(in_array->type->id())) {
in_array = DictionaryEncode(in_array)->array();
}
const auto& in_type = checked_cast<const DictionaryType&>(*in_array->type);
Expand Down Expand Up @@ -98,6 +99,9 @@ std::vector<std::shared_ptr<CastFunction>> GetDictionaryCasts() {
AddCommonCasts(Type::DICTIONARY, kOutputTargetType, cast_dict.get());
AddDictionaryCast<DictionaryType>(cast_dict.get());
AddDictionaryCast<StringType>(cast_dict.get());
AddDictionaryCast<LargeStringType>(cast_dict.get());
AddDictionaryCast<BinaryType>(cast_dict.get());
AddDictionaryCast<LargeBinaryType>(cast_dict.get());

return {cast_dict};
}
Expand Down
56 changes: 29 additions & 27 deletions cpp/src/arrow/scalar_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1482,33 +1482,35 @@ TEST(TestDictionaryScalar, ValidateErrors) {

TEST(TestDictionaryScalar, Cast) {
for (auto index_ty : all_dictionary_index_types()) {
auto ty = dictionary(index_ty, utf8());
auto dict = checked_pointer_cast<StringArray>(
ArrayFromJSON(utf8(), R"(["alpha", null, "gamma"])"));

for (int64_t i = 0; i < dict->length(); ++i) {
auto alpha =
dict->IsValid(i) ? MakeScalar(dict->GetString(i)) : MakeNullScalar(utf8());
// Cast string to dict(..., string)
ASSERT_OK_AND_ASSIGN(auto cast_alpha_datum, Cast(alpha, ty));
const auto& cast_alpha = cast_alpha_datum.scalar();
ASSERT_OK(cast_alpha->ValidateFull());
ASSERT_OK_AND_ASSIGN(
auto roundtripped_alpha,
checked_cast<const DictionaryScalar&>(*cast_alpha).GetEncodedValue());

ASSERT_OK_AND_ASSIGN(auto i_scalar, MakeScalar(index_ty, i));
auto alpha_dict = DictionaryScalar({i_scalar, dict}, ty);
ASSERT_OK(alpha_dict.ValidateFull());
ASSERT_OK_AND_ASSIGN(
auto encoded_alpha,
checked_cast<const DictionaryScalar&>(alpha_dict).GetEncodedValue());

AssertScalarsEqual(*alpha, *roundtripped_alpha);
AssertScalarsEqual(*encoded_alpha, *roundtripped_alpha);

// dictionaries differ, though encoded values are identical
ASSERT_FALSE(alpha_dict.Equals(*cast_alpha));
for (auto value_ty : {utf8(), large_utf8(), binary(), large_binary()}) {
auto ty = dictionary(index_ty, value_ty);
auto dict = ArrayFromJSON(value_ty, R"(["alpha", null, "gamma"])");
ASSERT_OK(dict->ValidateFull());

for (int64_t i = 0; i < dict->length(); ++i) {
ASSERT_OK_AND_ASSIGN(auto alpha, dict->GetScalar(i));

// Cast a value_ty scalar to dict(index_ty, value_ty)
ASSERT_OK_AND_ASSIGN(auto cast_alpha_datum, Cast(alpha, ty));
const auto& cast_alpha = cast_alpha_datum.scalar();
ASSERT_OK(cast_alpha->ValidateFull());
ASSERT_OK_AND_ASSIGN(
auto roundtripped_alpha,
checked_cast<const DictionaryScalar&>(*cast_alpha).GetEncodedValue());

ASSERT_OK_AND_ASSIGN(auto i_scalar, MakeScalar(index_ty, i));
auto alpha_dict = DictionaryScalar({i_scalar, dict}, ty);
ASSERT_OK(alpha_dict.ValidateFull());
ASSERT_OK_AND_ASSIGN(
auto encoded_alpha,
checked_cast<const DictionaryScalar&>(alpha_dict).GetEncodedValue());

AssertScalarsEqual(*alpha, *roundtripped_alpha);
AssertScalarsEqual(*encoded_alpha, *roundtripped_alpha);

// dictionaries differ, though encoded values are identical
ASSERT_FALSE(alpha_dict.Equals(*cast_alpha));
}
}
}
}
Expand Down

0 comments on commit 2422994

Please sign in to comment.