From ebf079a7f637f93f6ca6cdde3588cc82d6e57254 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Tue, 28 Jun 2022 14:01:12 -0700 Subject: [PATCH 01/40] Add test for Spark list-of-list hashing. --- cpp/tests/hashing/hash_test.cpp | 97 +++++++++++++++++++++++++++++++++ 1 file changed, 97 insertions(+) diff --git a/cpp/tests/hashing/hash_test.cpp b/cpp/tests/hashing/hash_test.cpp index 1f3e6567253..43bd70a24f2 100644 --- a/cpp/tests/hashing/hash_test.cpp +++ b/cpp/tests/hashing/hash_test.cpp @@ -736,6 +736,103 @@ TEST_F(SparkMurmurHash3Test, StringsWithSeed) CUDF_TEST_EXPECT_COLUMNS_EQUAL(*hash_strings, hash_strings_expected_seed_314, verbosity); } +TEST_F(SparkMurmurHash3Test, ListValues) +{ + /* + import org.apache.spark.sql.functions._ + import org.apache.spark.sql.types.{ArrayType, IntegerType, StructType} + import org.apache.spark.sql.Row + + val schema = new StructType() + .add("lists",ArrayType(ArrayType(IntegerType))) + + val data = Seq( + Row(null), + Row(List(null)), + Row(List(List())), + Row(List(List(1))), + Row(List(List(1, 2))), + Row(List(List(1, 2, 3))), + Row(List(List(1, 2), List(3))), + Row(List(List(1), List(2, 3))), + Row(List(List(1), List(null, 2, 3))), + Row(List(List(1, 2), List(3), List(null))), + Row(List(List(1, 2), null, List(3))), + ) + + val df = spark.createDataFrame( + spark.sparkContext.parallelize(data), schema) + + val df2 = df.selectExpr("lists", "hash(lists) as hash") + df2.printSchema() + df2.show(false) + */ + + /* + child data: 1, 1, 2, 1, 2, 3, 1, 2, 3, 1, 2, 3, 1, null, 2, 3, 1, 2, 3, null, 1, 2, 3 + 23 items + validity: i != 13, i != 19 + + parent validity: i != 0 && i != 15 + offsets: 0, 0, 1, 3, 6, 8, 9, 10, 12, 13, 16, 18, 19, 20, 22, 23 + 16 items + + row validity: i != 0 + row offsets: 0, 0, 0, 1, 2, 3, 4, 6, 8, 10, 13, 16 + 11 items + */ + + auto const null = -1; + auto child_validity = + cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i != 0; }); + auto parent_validity = + cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i != 0 && i != 15; }); + auto row_validity = + cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i != 0; }); + auto list1 = cudf::test::lists_column_wrapper({{}, + {1}, + {1, 2}, + {1, 2, 3}, + {1, 2}, + {3}, + {1}, + {2, 3}, + {1}, + {{null, 2, 3}, inner_validity}, + {1, 2}, + {3}, + {{null}, inner_validity}, + {1, 2}, + {}, + {3}}, + parent_validity); + auto offsets = + cudf::test::fixed_width_column_wrapper{0, 0, 0, 1, 2, 3, 4, 6, 8, 10, 13, 16}; + auto list_validity = std::vector{0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; + auto list_validity_buffer = + cudf::test::detail::make_null_mask(list_validity.begin(), list_validity.end()); + auto list_column = cudf::make_lists_column(11, + offsets.release(), + list1.release(), + cudf::UNKNOWN_NULL_COUNT, + std::move(list_validity_buffer)); + + auto expect = cudf::test::fixed_width_column_wrapper{42, + 42, + 42, + -559580957, + -222940379, + -912918097, + -912918097, + -912918097, + -912918097, + -912918097, + -912918097}; + + auto output = cudf::hash(cudf::table_view({*list_column})); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expect, output->view(), verbosity); +} + TEST_F(SparkMurmurHash3Test, ListThrows) { lists_column_wrapper strings_list_col({{""}, {"abc"}, {"123"}}); From 5632e7cbb1d190e0112df394e07b105582a20707 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Thu, 7 Jul 2022 08:53:40 -0700 Subject: [PATCH 02/40] Improve test. --- cpp/tests/hashing/hash_test.cpp | 52 ++++++++++++++++----------------- 1 file changed, 25 insertions(+), 27 deletions(-) diff --git a/cpp/tests/hashing/hash_test.cpp b/cpp/tests/hashing/hash_test.cpp index 43bd70a24f2..1c8d7ae2df5 100644 --- a/cpp/tests/hashing/hash_test.cpp +++ b/cpp/tests/hashing/hash_test.cpp @@ -773,7 +773,7 @@ TEST_F(SparkMurmurHash3Test, ListValues) 23 items validity: i != 13, i != 19 - parent validity: i != 0 && i != 15 + nested validity: i != 0 && i != 15 offsets: 0, 0, 1, 3, 6, 8, 9, 10, 12, 13, 16, 18, 19, 20, 22, 23 16 items @@ -785,35 +785,33 @@ TEST_F(SparkMurmurHash3Test, ListValues) auto const null = -1; auto child_validity = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i != 0; }); - auto parent_validity = + auto nested_validity = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i != 0 && i != 15; }); - auto row_validity = - cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i != 0; }); - auto list1 = cudf::test::lists_column_wrapper({{}, - {1}, - {1, 2}, - {1, 2, 3}, - {1, 2}, - {3}, - {1}, - {2, 3}, - {1}, - {{null, 2, 3}, inner_validity}, - {1, 2}, - {3}, - {{null}, inner_validity}, - {1, 2}, - {}, - {3}}, - parent_validity); + auto nested_list = cudf::test::lists_column_wrapper({{}, + {1}, + {1, 2}, + {1, 2, 3}, + {1, 2}, + {3}, + {1}, + {2, 3}, + {1}, + {{null, 2, 3}, child_validity}, + {1, 2}, + {3}, + {{null}, child_validity}, + {1, 2}, + {}, + {3}}, + nested_validity); auto offsets = cudf::test::fixed_width_column_wrapper{0, 0, 0, 1, 2, 3, 4, 6, 8, 10, 13, 16}; - auto list_validity = std::vector{0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; - auto list_validity_buffer = - cudf::test::detail::make_null_mask(list_validity.begin(), list_validity.end()); - auto list_column = cudf::make_lists_column(11, + auto list_validity = + cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i != 0; }); + auto list_validity_buffer = cudf::test::detail::make_null_mask(list_validity, list_validity + 11); + auto list_column = cudf::make_lists_column(11, offsets.release(), - list1.release(), + nested_list.release(), cudf::UNKNOWN_NULL_COUNT, std::move(list_validity_buffer)); @@ -829,7 +827,7 @@ TEST_F(SparkMurmurHash3Test, ListValues) -912918097, -912918097}; - auto output = cudf::hash(cudf::table_view({*list_column})); + auto output = cudf::hash(cudf::table_view({*list_column}), cudf::hash_id::HASH_SPARK_MURMUR3); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expect, output->view(), verbosity); } From 6e55198ed18a70d14f478855512af156e09d0c46 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Mon, 18 Jul 2022 22:04:43 -0700 Subject: [PATCH 03/40] Copy experimental row hasher for modification. --- cpp/CMakeLists.txt | 1 + cpp/include/cudf/detail/hashing.hpp | 6 + cpp/src/hash/hashing.cu | 3 +- cpp/src/hash/spark_murmur_hash.cu | 287 ++++++++++++++++++++++++++++ 4 files changed, 295 insertions(+), 2 deletions(-) create mode 100644 cpp/src/hash/spark_murmur_hash.cu diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 86bfdc1444b..55ef34d07fc 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -299,6 +299,7 @@ add_library( src/hash/hashing.cu src/hash/md5_hash.cu src/hash/murmur_hash.cu + src/hash/spark_murmur_hash.cu src/interop/dlpack.cpp src/interop/from_arrow.cu src/interop/to_arrow.cu diff --git a/cpp/include/cudf/detail/hashing.hpp b/cpp/include/cudf/detail/hashing.hpp index 29522764dad..8b0ea42658f 100644 --- a/cpp/include/cudf/detail/hashing.hpp +++ b/cpp/include/cudf/detail/hashing.hpp @@ -44,6 +44,12 @@ std::unique_ptr murmur_hash3_32( rmm::cuda_stream_view stream = cudf::default_stream_value, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::unique_ptr spark_murmur_hash3_32( + table_view const& input, + uint32_t seed = cudf::DEFAULT_HASH_SEED, + rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + template