From 63449de4c078b24082a2dc75733a52f420c417d9 Mon Sep 17 00:00:00 2001 From: Matthew Murray Date: Wed, 5 Jun 2024 08:34:55 -0700 Subject: [PATCH 1/7] Migrate lists combine to pylibcudf --- python/cudf/cudf/_lib/lists.pyx | 20 +++----------- python/cudf/cudf/_lib/pylibcudf/lists.pxd | 3 +++ python/cudf/cudf/_lib/pylibcudf/lists.pyx | 26 +++++++++++++++++++ .../cudf/cudf/pylibcudf_tests/test_lists.py | 19 ++++++++++++++ 4 files changed, 52 insertions(+), 16 deletions(-) create mode 100644 python/cudf/cudf/pylibcudf_tests/test_lists.py diff --git a/python/cudf/cudf/_lib/lists.pyx b/python/cudf/cudf/_lib/lists.pyx index 656d92c1a4b..a424a6221ed 100644 --- a/python/cudf/cudf/_lib/lists.pyx +++ b/python/cudf/cudf/_lib/lists.pyx @@ -12,7 +12,6 @@ from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view from cudf._lib.pylibcudf.libcudf.lists.combine cimport ( concatenate_list_elements as cpp_concatenate_list_elements, concatenate_null_policy, - concatenate_rows as cpp_concatenate_rows, ) from cudf._lib.pylibcudf.libcudf.lists.contains cimport ( contains, @@ -32,7 +31,6 @@ from cudf._lib.pylibcudf.libcudf.lists.stream_compaction cimport ( distinct as cpp_distinct, ) from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport scalar -from cudf._lib.pylibcudf.libcudf.table.table_view cimport table_view from cudf._lib.pylibcudf.libcudf.types cimport ( nan_equality, null_equality, @@ -41,10 +39,7 @@ from cudf._lib.pylibcudf.libcudf.types cimport ( size_type, ) from cudf._lib.scalar cimport DeviceScalar -from cudf._lib.utils cimport ( - columns_from_pylibcudf_table, - table_view_from_columns, -) +from cudf._lib.utils cimport columns_from_pylibcudf_table from cudf._lib import pylibcudf @@ -223,16 +218,9 @@ def index_of_column(Column col, Column search_keys): @acquire_spill_lock() def concatenate_rows(list source_columns): - cdef unique_ptr[column] c_result - - cdef table_view c_table_view = table_view_from_columns(source_columns) - - with nogil: - c_result = 
move(cpp_concatenate_rows( - c_table_view, - )) - - return Column.from_unique_ptr(move(c_result)) + return pylibcudf.lists.concatenate_rows( + pylibcudf.Table([c.to_pylibcudf(mode="read") for c in source_columns]) + ) @acquire_spill_lock() diff --git a/python/cudf/cudf/_lib/pylibcudf/lists.pxd b/python/cudf/cudf/_lib/pylibcudf/lists.pxd index b780d299977..6de327f36a9 100644 --- a/python/cudf/cudf/_lib/pylibcudf/lists.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/lists.pxd @@ -2,7 +2,10 @@ from cudf._lib.pylibcudf.libcudf.types cimport size_type +from .column cimport Column from .table cimport Table cpdef Table explode_outer(Table, size_type explode_column_idx) + +cpdef Column concatenate_rows(Table) diff --git a/python/cudf/cudf/_lib/pylibcudf/lists.pyx b/python/cudf/cudf/_lib/pylibcudf/lists.pyx index 654f39742b6..ff65c391c01 100644 --- a/python/cudf/cudf/_lib/pylibcudf/lists.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/lists.pyx @@ -3,10 +3,15 @@ from libcpp.memory cimport unique_ptr from libcpp.utility cimport move +from cudf._lib.pylibcudf.libcudf.column.column cimport column from cudf._lib.pylibcudf.libcudf.lists cimport explode as cpp_explode +from cudf._lib.pylibcudf.libcudf.lists.combine cimport ( + concatenate_rows as cpp_concatenate_rows, +) from cudf._lib.pylibcudf.libcudf.table.table cimport table from cudf._lib.pylibcudf.libcudf.types cimport size_type +from .column cimport Column from .table cimport Table @@ -33,3 +38,24 @@ cpdef Table explode_outer(Table input, size_type explode_column_idx): c_result = move(cpp_explode.explode_outer(input.view(), explode_column_idx)) return Table.from_libcudf(move(c_result)) + + +cpdef Column concatenate_rows(Table input): + """Row-wise concatenating multiple lists columns into a single lists column. 
+ + Parameters + ---------- + input : Table + The input table + + Returns + ------- + Column + A new Column of concatenated rows + """ + cdef unique_ptr[column] c_result + + with nogil: + c_result = move(cpp_concatenate_rows(input.view())) + + return Column.from_libcudf(move(c_result)) diff --git a/python/cudf/cudf/pylibcudf_tests/test_lists.py b/python/cudf/cudf/pylibcudf_tests/test_lists.py new file mode 100644 index 00000000000..59bd9854c73 --- /dev/null +++ b/python/cudf/cudf/pylibcudf_tests/test_lists.py @@ -0,0 +1,19 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +import pyarrow as pa +from utils import assert_column_eq + +from cudf._lib import pylibcudf as plc + + +def test_concatenate_rows(): + test_data = [[[0, 1], [2], [5], [6, 7]], [[8], [9], [], [13, 14, 15]]] + + arrow_tbl = pa.Table.from_arrays(test_data, names=["a", "b"]) + plc_tbl = plc.interop.from_arrow(arrow_tbl) + + res = plc.lists.concatenate_rows(plc_tbl) + + expect = pa.array([pair[0] + pair[1] for pair in zip(*test_data)]) + + assert_column_eq(res, expect) From f6c831b0f45f1f90c64a6413aba9b7335e47852f Mon Sep 17 00:00:00 2001 From: Matthew Murray Date: Wed, 5 Jun 2024 12:18:28 -0700 Subject: [PATCH 2/7] Add concatenate_list_elements --- python/cudf/cudf/_lib/lists.pyx | 30 +++++++----------- python/cudf/cudf/_lib/pylibcudf/lists.pxd | 5 +++ python/cudf/cudf/_lib/pylibcudf/lists.pyx | 31 +++++++++++++++++++ .../cudf/cudf/pylibcudf_tests/test_lists.py | 13 ++++++++ 4 files changed, 60 insertions(+), 19 deletions(-) diff --git a/python/cudf/cudf/_lib/lists.pyx b/python/cudf/cudf/_lib/lists.pyx index a424a6221ed..5d406f5c85f 100644 --- a/python/cudf/cudf/_lib/lists.pyx +++ b/python/cudf/cudf/_lib/lists.pyx @@ -9,10 +9,6 @@ from libcpp.utility cimport move from cudf._lib.column cimport Column from cudf._lib.pylibcudf.libcudf.column.column cimport column from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view -from cudf._lib.pylibcudf.libcudf.lists.combine cimport ( -
concatenate_list_elements as cpp_concatenate_list_elements, - concatenate_null_policy, -) from cudf._lib.pylibcudf.libcudf.lists.contains cimport ( contains, index_of as cpp_index_of, @@ -218,24 +214,20 @@ def index_of_column(Column col, Column search_keys): @acquire_spill_lock() def concatenate_rows(list source_columns): - return pylibcudf.lists.concatenate_rows( - pylibcudf.Table([c.to_pylibcudf(mode="read") for c in source_columns]) + return Column.from_pylibcudf( + pylibcudf.lists.concatenate_rows( + pylibcudf.Table([ + c.to_pylibcudf(mode="read") for c in source_columns + ]) + ) ) @acquire_spill_lock() def concatenate_list_elements(Column input_column, dropna=False): - cdef concatenate_null_policy policy = ( - concatenate_null_policy.IGNORE if dropna - else concatenate_null_policy.NULLIFY_OUTPUT_ROW + return Column.from_pylibcudf( + pylibcudf.lists.concatenate_list_elements( + input_column.to_pylibcudf(mode="read"), + dropna, + ) ) - cdef column_view c_input = input_column.view() - cdef unique_ptr[column] c_result - - with nogil: - c_result = move(cpp_concatenate_list_elements( - c_input, - policy - )) - - return Column.from_unique_ptr(move(c_result)) diff --git a/python/cudf/cudf/_lib/pylibcudf/lists.pxd b/python/cudf/cudf/_lib/pylibcudf/lists.pxd index 6de327f36a9..7787bce376f 100644 --- a/python/cudf/cudf/_lib/pylibcudf/lists.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/lists.pxd @@ -1,5 +1,8 @@ # Copyright (c) 2024, NVIDIA CORPORATION. 
+from libcpp cimport bool + +from cudf._lib.pylibcudf.libcudf.lists.combine cimport concatenate_null_policy from cudf._lib.pylibcudf.libcudf.types cimport size_type from .column cimport Column @@ -9,3 +12,5 @@ from .table cimport Table cpdef Table explode_outer(Table, size_type explode_column_idx) cpdef Column concatenate_rows(Table) + +cpdef Column concatenate_list_elements(Column, bool dropna) diff --git a/python/cudf/cudf/_lib/pylibcudf/lists.pyx b/python/cudf/cudf/_lib/pylibcudf/lists.pyx index ff65c391c01..618e001b9f3 100644 --- a/python/cudf/cudf/_lib/pylibcudf/lists.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/lists.pyx @@ -1,11 +1,14 @@ # Copyright (c) 2024, NVIDIA CORPORATION. +from libcpp cimport bool from libcpp.memory cimport unique_ptr from libcpp.utility cimport move from cudf._lib.pylibcudf.libcudf.column.column cimport column from cudf._lib.pylibcudf.libcudf.lists cimport explode as cpp_explode from cudf._lib.pylibcudf.libcudf.lists.combine cimport ( + concatenate_list_elements as cpp_concatenate_list_elements, + concatenate_null_policy, concatenate_rows as cpp_concatenate_rows, ) from cudf._lib.pylibcudf.libcudf.table.table cimport table @@ -59,3 +62,31 @@ cpdef Column concatenate_rows(Table input): c_result = move(cpp_concatenate_rows(input.view())) return Column.from_libcudf(move(c_result)) + + +cpdef Column concatenate_list_elements(Column input, bool dropna): + """Concatenating multiple lists on the same row into a single list. 
+ + Parameters + ---------- + input : Column + The input column + + Returns + ------- + Column + A new Column of concatenated list elements + """ + cdef concatenate_null_policy null_policy = ( + concatenate_null_policy.IGNORE if dropna + else concatenate_null_policy.NULLIFY_OUTPUT_ROW + ) + cdef unique_ptr[column] c_result + + with nogil: + c_result = move(cpp_concatenate_list_elements( + input.view(), + null_policy, + )) + + return Column.from_libcudf(move(c_result)) diff --git a/python/cudf/cudf/pylibcudf_tests/test_lists.py b/python/cudf/cudf/pylibcudf_tests/test_lists.py index 59bd9854c73..69fc489e444 100644 --- a/python/cudf/cudf/pylibcudf_tests/test_lists.py +++ b/python/cudf/cudf/pylibcudf_tests/test_lists.py @@ -17,3 +17,16 @@ def test_concatenate_rows(): expect = pa.array([pair[0] + pair[1] for pair in zip(*test_data)]) assert_column_eq(res, expect) + + +def test_concatenate_list_elements(): + test_data = [[[1, 2], [3, 4], [5]], [[6], [], [7, 8, 9]]] + + arr = pa.array(test_data) + plc_column = plc.interop.from_arrow(arr) + + res = plc.lists.concatenate_list_elements(plc_column, False) + + expect = pa.array([[1, 2, 3, 4, 5], [6, 7, 8, 9]]) + + assert_column_eq(res, expect) From 322dd54c1bcc28728a3171cb01a7da45379cb999 Mon Sep 17 00:00:00 2001 From: Matthew Murray Date: Wed, 5 Jun 2024 13:16:19 -0700 Subject: [PATCH 3/7] Add test for null_policy --- .../cudf/cudf/pylibcudf_tests/test_lists.py | 24 +++++++++++++++---- 1 file changed, 19 insertions(+), 5 deletions(-) diff --git a/python/cudf/cudf/pylibcudf_tests/test_lists.py b/python/cudf/cudf/pylibcudf_tests/test_lists.py index 69fc489e444..37fe41dee2c 100644 --- a/python/cudf/cudf/pylibcudf_tests/test_lists.py +++ b/python/cudf/cudf/pylibcudf_tests/test_lists.py @@ -1,6 +1,7 @@ # Copyright (c) 2024, NVIDIA CORPORATION. 
import pyarrow as pa +import pytest from utils import assert_column_eq from cudf._lib import pylibcudf as plc @@ -19,14 +20,27 @@ def test_concatenate_rows(): assert_column_eq(res, expect) -def test_concatenate_list_elements(): - test_data = [[[1, 2], [3, 4], [5]], [[6], [], [7, 8, 9]]] - +@pytest.mark.parametrize( + "test_data, drop_na, expected", + [ + ( + [[[1, 2], [3, 4], [5]], [[6], None, [7, 8, 9]]], + False, + [[1, 2, 3, 4, 5], None], + ), + ( + [[[1, 2], [3, 4], [5, None]], [[6], [None], [7, 8, 9]]], + True, + [[1, 2, 3, 4, 5, None], [6, None, 7, 8, 9]], + ), + ], +) +def test_concatenate_list_elements(test_data, drop_na, expected): arr = pa.array(test_data) plc_column = plc.interop.from_arrow(arr) - res = plc.lists.concatenate_list_elements(plc_column, False) + res = plc.lists.concatenate_list_elements(plc_column, drop_na) - expect = pa.array([[1, 2, 3, 4, 5], [6, 7, 8, 9]]) + expect = pa.array(expected) assert_column_eq(res, expect) From bdea0bfbaf0d79c2fad922215161bc2d1bdf780c Mon Sep 17 00:00:00 2001 From: Matthew Murray Date: Thu, 6 Jun 2024 12:01:24 -0700 Subject: [PATCH 4/7] Address comments --- python/cudf/cudf/_lib/pylibcudf/lists.pxd | 1 - python/cudf/cudf/_lib/pylibcudf/lists.pyx | 3 +++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/python/cudf/cudf/_lib/pylibcudf/lists.pxd b/python/cudf/cudf/_lib/pylibcudf/lists.pxd index 7787bce376f..2d2a5b2a9ea 100644 --- a/python/cudf/cudf/_lib/pylibcudf/lists.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/lists.pxd @@ -2,7 +2,6 @@ from libcpp cimport bool -from cudf._lib.pylibcudf.libcudf.lists.combine cimport concatenate_null_policy from cudf._lib.pylibcudf.libcudf.types cimport size_type from .column cimport Column diff --git a/python/cudf/cudf/_lib/pylibcudf/lists.pyx b/python/cudf/cudf/_lib/pylibcudf/lists.pyx index 618e001b9f3..e46f76ab5c5 100644 --- a/python/cudf/cudf/_lib/pylibcudf/lists.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/lists.pyx @@ -76,6 +76,9 @@ cpdef Column 
concatenate_list_elements(Column input, bool dropna): ------- Column A new Column of concatenated list elements + dropna : bool + If true, null list elements will be ignored + from concatenation. Otherwise the row will be nulled out. """ cdef concatenate_null_policy null_policy = ( concatenate_null_policy.IGNORE if dropna From 554f4fe709f6cf4d35b56e9eda7485298c677155 Mon Sep 17 00:00:00 2001 From: Matthew Murray Date: Tue, 11 Jun 2024 14:22:49 -0700 Subject: [PATCH 5/7] Address comments --- python/cudf/cudf/_lib/pylibcudf/lists.pyx | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/python/cudf/cudf/_lib/pylibcudf/lists.pyx b/python/cudf/cudf/_lib/pylibcudf/lists.pyx index e46f76ab5c5..069c9da31c2 100644 --- a/python/cudf/cudf/_lib/pylibcudf/lists.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/lists.pyx @@ -44,7 +44,7 @@ cpdef Table explode_outer(Table input, size_type explode_column_idx): cpdef Column concatenate_rows(Table input): - """Row-wise concatenating multiple lists columns into a single lists column. + """Concatenate multiple lists columns into a single lists column row-wise. Parameters ---------- @@ -65,7 +65,7 @@ cpdef Column concatenate_rows(Table input): cpdef Column concatenate_list_elements(Column input, bool dropna): - """Concatenating multiple lists on the same row into a single list. + """Concatenate multiple lists on the same row into a single list. Parameters ---------- @@ -78,7 +78,8 @@ cpdef Column concatenate_list_elements(Column input, bool dropna): A new Column of concatenated list elements dropna : bool If true, null list elements will be ignored - from concatenation. Otherwise the row will be nulled out. + from concatenation. Otherwise any input null values will result in + the corresponding output row being set to null. 
""" cdef concatenate_null_policy null_policy = ( concatenate_null_policy.IGNORE if dropna From 4b9d91ad3f2d405c0779b93c753b29b78f44bd2d Mon Sep 17 00:00:00 2001 From: Matthew Murray Date: Tue, 11 Jun 2024 14:43:49 -0700 Subject: [PATCH 6/7] address comment --- python/cudf/cudf/pylibcudf_tests/test_lists.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/pylibcudf_tests/test_lists.py b/python/cudf/cudf/pylibcudf_tests/test_lists.py index 37fe41dee2c..cd393c0a1a9 100644 --- a/python/cudf/cudf/pylibcudf_tests/test_lists.py +++ b/python/cudf/cudf/pylibcudf_tests/test_lists.py @@ -17,7 +17,7 @@ def test_concatenate_rows(): expect = pa.array([pair[0] + pair[1] for pair in zip(*test_data)]) - assert_column_eq(res, expect) + assert_column_eq(expect, res) @pytest.mark.parametrize( @@ -43,4 +43,4 @@ def test_concatenate_list_elements(test_data, drop_na, expected): expect = pa.array(expected) - assert_column_eq(res, expect) + assert_column_eq(expect, res) From 22ee36aa9a2920551294ea0424100805b2a1d574 Mon Sep 17 00:00:00 2001 From: Matthew Murray Date: Tue, 11 Jun 2024 15:22:26 -0700 Subject: [PATCH 7/7] dropna --- python/cudf/cudf/pylibcudf_tests/test_lists.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/python/cudf/cudf/pylibcudf_tests/test_lists.py b/python/cudf/cudf/pylibcudf_tests/test_lists.py index cd393c0a1a9..b21af8ea11c 100644 --- a/python/cudf/cudf/pylibcudf_tests/test_lists.py +++ b/python/cudf/cudf/pylibcudf_tests/test_lists.py @@ -21,7 +21,7 @@ def test_concatenate_rows(): @pytest.mark.parametrize( - "test_data, drop_na, expected", + "test_data, dropna, expected", [ ( [[[1, 2], [3, 4], [5]], [[6], None, [7, 8, 9]]], @@ -35,11 +35,11 @@ def test_concatenate_rows(): ), ], ) -def test_concatenate_list_elements(test_data, drop_na, expected): +def test_concatenate_list_elements(test_data, dropna, expected): arr = pa.array(test_data) plc_column = plc.interop.from_arrow(arr) - res = 
plc.lists.concatenate_list_elements(plc_column, drop_na) + res = plc.lists.concatenate_list_elements(plc_column, dropna) expect = pa.array(expected)