From 2422994de04cf4f5a989fec0f00fabccad15b03f Mon Sep 17 00:00:00 2001 From: Hyunseok Seo Date: Thu, 15 Feb 2024 01:16:16 +0900 Subject: [PATCH] GH-39463: [C++] Support cast kernel from large string, (large) binary to dictionary (#40017) ### Rationale for this change Support the `cast` kernel from large string (`large_utf8()`) and (large) binary (`binary()`, `large_binary()`) to `dictionary`. ### What changes are included in this PR? - Support `cast` kernel - from large string(`large_utf8()`) to `dictionary` - from binary(`binary()`) to `dictionary` - from large binary(`large_binary()`) to `dictionary` ### Are these changes tested? Yes. It passes the existing test cases. ### Are there any user-facing changes? No. * Closes: #39463 Authored-by: Hyunseok Seo Signed-off-by: Felipe Oliveira Carvalho --- .../compute/kernels/scalar_cast_dictionary.cc | 14 +++-- cpp/src/arrow/scalar_test.cc | 56 ++++++++++--------- 2 files changed, 38 insertions(+), 32 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_dictionary.cc b/cpp/src/arrow/compute/kernels/scalar_cast_dictionary.cc index f13aa26d969c1..ae88ef1cb7534 100644 --- a/cpp/src/arrow/compute/kernels/scalar_cast_dictionary.cc +++ b/cpp/src/arrow/compute/kernels/scalar_cast_dictionary.cc @@ -45,11 +45,12 @@ Status CastToDictionary(KernelContext* ctx, const ExecSpan& batch, ExecResult* o return Status::OK(); } - // If the input type is STRING, it is first encoded as a dictionary to facilitate - // processing. This approach allows the subsequent code to uniformly handle STRING - // inputs as if they were originally provided in dictionary format. Encoding as a - // dictionary helps in reusing the same logic for dictionary operations. - if (batch[0].type()->id() == Type::STRING) { + // If the input type is string or binary-like, it is first encoded as a dictionary to + // facilitate processing. 
This approach allows the subsequent code to uniformly handle + // string or binary-like inputs as if they were originally provided in dictionary + // format. Encoding as a dictionary helps in reusing the same logic for dictionary + // operations. + if (is_base_binary_like(in_array->type->id())) { in_array = DictionaryEncode(in_array)->array(); } const auto& in_type = checked_cast(*in_array->type); @@ -98,6 +99,9 @@ std::vector> GetDictionaryCasts() { AddCommonCasts(Type::DICTIONARY, kOutputTargetType, cast_dict.get()); AddDictionaryCast(cast_dict.get()); AddDictionaryCast(cast_dict.get()); + AddDictionaryCast(cast_dict.get()); + AddDictionaryCast(cast_dict.get()); + AddDictionaryCast(cast_dict.get()); return {cast_dict}; } diff --git a/cpp/src/arrow/scalar_test.cc b/cpp/src/arrow/scalar_test.cc index d9fb3feaeea6e..09dfde3227109 100644 --- a/cpp/src/arrow/scalar_test.cc +++ b/cpp/src/arrow/scalar_test.cc @@ -1482,33 +1482,35 @@ TEST(TestDictionaryScalar, ValidateErrors) { TEST(TestDictionaryScalar, Cast) { for (auto index_ty : all_dictionary_index_types()) { - auto ty = dictionary(index_ty, utf8()); - auto dict = checked_pointer_cast( - ArrayFromJSON(utf8(), R"(["alpha", null, "gamma"])")); - - for (int64_t i = 0; i < dict->length(); ++i) { - auto alpha = - dict->IsValid(i) ? 
MakeScalar(dict->GetString(i)) : MakeNullScalar(utf8()); - // Cast string to dict(..., string) - ASSERT_OK_AND_ASSIGN(auto cast_alpha_datum, Cast(alpha, ty)); - const auto& cast_alpha = cast_alpha_datum.scalar(); - ASSERT_OK(cast_alpha->ValidateFull()); - ASSERT_OK_AND_ASSIGN( - auto roundtripped_alpha, - checked_cast(*cast_alpha).GetEncodedValue()); - - ASSERT_OK_AND_ASSIGN(auto i_scalar, MakeScalar(index_ty, i)); - auto alpha_dict = DictionaryScalar({i_scalar, dict}, ty); - ASSERT_OK(alpha_dict.ValidateFull()); - ASSERT_OK_AND_ASSIGN( - auto encoded_alpha, - checked_cast(alpha_dict).GetEncodedValue()); - - AssertScalarsEqual(*alpha, *roundtripped_alpha); - AssertScalarsEqual(*encoded_alpha, *roundtripped_alpha); - - // dictionaries differ, though encoded values are identical - ASSERT_FALSE(alpha_dict.Equals(*cast_alpha)); + for (auto value_ty : {utf8(), large_utf8(), binary(), large_binary()}) { + auto ty = dictionary(index_ty, value_ty); + auto dict = ArrayFromJSON(value_ty, R"(["alpha", null, "gamma"])"); + ASSERT_OK(dict->ValidateFull()); + + for (int64_t i = 0; i < dict->length(); ++i) { + ASSERT_OK_AND_ASSIGN(auto alpha, dict->GetScalar(i)); + + // Cast string to dict(..., string) + ASSERT_OK_AND_ASSIGN(auto cast_alpha_datum, Cast(alpha, ty)); + const auto& cast_alpha = cast_alpha_datum.scalar(); + ASSERT_OK(cast_alpha->ValidateFull()); + ASSERT_OK_AND_ASSIGN( + auto roundtripped_alpha, + checked_cast(*cast_alpha).GetEncodedValue()); + + ASSERT_OK_AND_ASSIGN(auto i_scalar, MakeScalar(index_ty, i)); + auto alpha_dict = DictionaryScalar({i_scalar, dict}, ty); + ASSERT_OK(alpha_dict.ValidateFull()); + ASSERT_OK_AND_ASSIGN( + auto encoded_alpha, + checked_cast(alpha_dict).GetEncodedValue()); + + AssertScalarsEqual(*alpha, *roundtripped_alpha); + AssertScalarsEqual(*encoded_alpha, *roundtripped_alpha); + + // dictionaries differ, though encoded values are identical + ASSERT_FALSE(alpha_dict.Equals(*cast_alpha)); + } } } }