From f30ea0a7d12625a755bb5726e7514dfdf12094d6 Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Mon, 3 Jun 2024 17:37:56 -0400 Subject: [PATCH 01/48] Use offsetalator in strings shift functor (#15870) Replaces hardcoded `size_type` used for offset values in the `shift_chars_fn` functor with offsetalator. Follow on to #15630 Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Paul Mattione (https://github.com/pmattione-nvidia) - Yunsong Wang (https://github.com/PointKernel) URL: https://github.com/rapidsai/cudf/pull/15870 --- cpp/src/strings/copying/shift.cu | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/cpp/src/strings/copying/shift.cu b/cpp/src/strings/copying/shift.cu index 5bba4855390..b386c0860d1 100644 --- a/cpp/src/strings/copying/shift.cu +++ b/cpp/src/strings/copying/shift.cu @@ -67,9 +67,9 @@ struct shift_chars_fn { if (offset < 0) { auto const last_index = -offset; if (idx < last_index) { - auto const first_index = - offset + d_column.child(strings_column_view::offsets_column_index) - .element(d_column.offset() + d_column.size()); + auto const offsets = d_column.child(strings_column_view::offsets_column_index); + auto const off_itr = cudf::detail::input_offsetalator(offsets.head(), offsets.type()); + auto const first_index = offset + off_itr[d_column.offset() + d_column.size()]; return d_column.head()[idx + first_index]; } else { auto const char_index = idx - last_index; @@ -79,9 +79,9 @@ struct shift_chars_fn { if (idx < offset) { return d_filler.data()[idx % d_filler.size_bytes()]; } else { - return d_column.head()[idx - offset + - d_column.child(strings_column_view::offsets_column_index) - .element(d_column.offset())]; + auto const offsets = d_column.child(strings_column_view::offsets_column_index); + auto const off_itr = cudf::detail::input_offsetalator(offsets.head(), offsets.type()); + return d_column.head()[idx - offset + off_itr[d_column.offset()]]; } } } From 90b3094f8a5a12b029a156cf484b673b589d2fec Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Mon, 3 Jun 2024 14:52:46 -0700 Subject: [PATCH 02/48] Clean up pylibcudf test assertations (#15892) Swap the order of result,expected to expected, result for assert_table_eq too Fix a few places where result,expected was swapped for assert_column_eq Authors: - Thomas Li (https://github.com/lithomas1) Approvers: - Matthew Roeschke (https://github.com/mroeschke) - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/15892 --- python/cudf/cudf/pylibcudf_tests/common/utils.py | 2 +- python/cudf/cudf/pylibcudf_tests/test_copying.py | 14 +++++++------- python/cudf/cudf/pylibcudf_tests/test_reshape.py | 4 ++-- .../cudf/pylibcudf_tests/test_string_capitalize.py | 6 +++--- 4 files changed, 13 insertions(+), 13 deletions(-) diff --git a/python/cudf/cudf/pylibcudf_tests/common/utils.py b/python/cudf/cudf/pylibcudf_tests/common/utils.py index 0befb3bb3e8..e00053529a8 100644 --- a/python/cudf/cudf/pylibcudf_tests/common/utils.py +++ b/python/cudf/cudf/pylibcudf_tests/common/utils.py @@ -54,7 +54,7 @@ def assert_column_eq( assert lhs.equals(rhs) -def assert_table_eq(plc_table: plc.Table, pa_table: pa.Table) -> None: +def assert_table_eq(pa_table: pa.Table, plc_table: plc.Table) -> None: """Verify that a pylibcudf table and PyArrow table are equal.""" plc_shape = (plc_table.num_rows(), plc_table.num_columns()) assert plc_shape == pa_table.shape diff --git 
a/python/cudf/cudf/pylibcudf_tests/test_copying.py b/python/cudf/cudf/pylibcudf_tests/test_copying.py index ef70869a145..cd70ce4abf5 100644 --- a/python/cudf/cudf/pylibcudf_tests/test_copying.py +++ b/python/cudf/cudf/pylibcudf_tests/test_copying.py @@ -138,7 +138,7 @@ def test_gather(target_table, pa_target_table, index_column, pa_index_column): plc.copying.OutOfBoundsPolicy.DONT_CHECK, ) expected = pa_target_table.take(pa_index_column) - assert_table_eq(result, expected) + assert_table_eq(expected, result) def test_gather_map_has_nulls(target_table): @@ -240,7 +240,7 @@ def test_scatter_table( pa_target_table, ) - assert_table_eq(result, expected) + assert_table_eq(expected, result) def test_scatter_table_num_col_mismatch( @@ -315,7 +315,7 @@ def test_scatter_scalars( pa_target_table, ) - assert_table_eq(result, expected) + assert_table_eq(expected, result) def test_scatter_scalars_num_scalars_mismatch( @@ -574,7 +574,7 @@ def test_slice_table(target_table, pa_target_table): lower_bounds = bounds[::2] result = plc.copying.slice(target_table, bounds) for lb, ub, slice_ in zip(lower_bounds, upper_bounds, result): - assert_table_eq(slice_, pa_target_table[lb:ub]) + assert_table_eq(pa_target_table[lb:ub], slice_) def test_split_column(target_column, pa_target_column): @@ -600,7 +600,7 @@ def test_split_table(target_table, pa_target_table): lower_bounds = [0] + upper_bounds[:-1] result = plc.copying.split(target_table, upper_bounds) for lb, ub, split in zip(lower_bounds, upper_bounds, result): - assert_table_eq(split, pa_target_table[lb:ub]) + assert_table_eq(pa_target_table[lb:ub], split) def test_copy_if_else_column_column( @@ -753,7 +753,7 @@ def test_boolean_mask_scatter_from_table( pa_source_table, pa_mask, pa_target_table ) - assert_table_eq(result, expected) + assert_table_eq(expected, result) def test_boolean_mask_scatter_from_wrong_num_cols(source_table, target_table): @@ -828,7 +828,7 @@ def test_boolean_mask_scatter_from_scalars( pa_target_table, ) - assert_table_eq(result, expected) + assert_table_eq(expected, result) def test_get_element(input_column, pa_input_column): diff --git a/python/cudf/cudf/pylibcudf_tests/test_reshape.py b/python/cudf/cudf/pylibcudf_tests/test_reshape.py index b8b914f3f09..32d79257f4f 100644 --- a/python/cudf/cudf/pylibcudf_tests/test_reshape.py +++ b/python/cudf/cudf/pylibcudf_tests/test_reshape.py @@ -27,7 +27,7 @@ def test_interleave_columns(reshape_data, reshape_plc_tbl): expect = pa.concat_arrays(interleaved_data) - assert_column_eq(res, expect) + assert_column_eq(expect, res) @pytest.mark.parametrize("cnt", [0, 1, 3]) @@ -40,4 +40,4 @@ def test_tile(reshape_data, reshape_plc_tbl, cnt): tiled_data, schema=plc.interop.to_arrow(reshape_plc_tbl).schema ) - assert_table_eq(res, expect) + assert_table_eq(expect, res) diff --git a/python/cudf/cudf/pylibcudf_tests/test_string_capitalize.py b/python/cudf/cudf/pylibcudf_tests/test_string_capitalize.py index dd7e96e871b..818d6e6e72a 100644 --- a/python/cudf/cudf/pylibcudf_tests/test_string_capitalize.py +++ b/python/cudf/cudf/pylibcudf_tests/test_string_capitalize.py @@ -37,7 +37,7 @@ def plc_data(pa_data): def test_capitalize(plc_data, pa_data): got = plc.strings.capitalize.capitalize(plc_data) expected = pa.compute.utf8_capitalize(pa_data) - assert_column_eq(got, expected) + assert_column_eq(expected, got) def test_title(plc_data, pa_data): @@ -45,10 +45,10 @@ def test_title(plc_data, pa_data): plc_data, plc.strings.char_types.StringCharacterTypes.CASE_TYPES ) expected = pa.compute.utf8_title(pa_data) - 
assert_column_eq(got, expected) + assert_column_eq(expected, got) def test_is_title(plc_data, pa_data): got = plc.strings.capitalize.is_title(plc_data) expected = pa.compute.utf8_is_title(pa_data) - assert_column_eq(got, expected) + assert_column_eq(expected, got) From 6176776e1f88718d802b317f506e2b56635fa31a Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Mon, 3 Jun 2024 15:06:39 -0700 Subject: [PATCH 03/48] Improve options docs (#15888) Recently I have answered a few user questions about how to use cudf options for display. We were missing documentation that explained that display options are inherited from pandas. I also found a broken link in the docs. This PR fixes both of those doc-related issues. Authors: - Bradley Dice (https://github.com/bdice) Approvers: - Matthew Roeschke (https://github.com/mroeschke) URL: https://github.com/rapidsai/cudf/pull/15888 --- docs/cudf/source/cudf_pandas/how-it-works.md | 5 ++--- docs/cudf/source/user_guide/api_docs/options.rst | 13 +++++++++++++ docs/cudf/source/user_guide/options.md | 2 +- 3 files changed, 16 insertions(+), 4 deletions(-) diff --git a/docs/cudf/source/cudf_pandas/how-it-works.md b/docs/cudf/source/cudf_pandas/how-it-works.md index ee856c84b78..75f57742ac9 100644 --- a/docs/cudf/source/cudf_pandas/how-it-works.md +++ b/docs/cudf/source/cudf_pandas/how-it-works.md @@ -34,6 +34,5 @@ correct result. Data is automatically transferred from host to device transfers. When using `cudf.pandas`, cuDF's [pandas compatibility -mode](https://docs.rapids.ai/api/cudf/stable/api_docs/options/#available-options) -is automatically enabled, ensuring consistency with pandas-specific -semantics like default sort ordering. +mode](api.options) is automatically enabled, ensuring consistency with +pandas-specific semantics like default sort ordering. diff --git a/docs/cudf/source/user_guide/api_docs/options.rst b/docs/cudf/source/user_guide/api_docs/options.rst index b3a4004e2d9..4c0f6684b76 100644 --- a/docs/cudf/source/user_guide/api_docs/options.rst +++ b/docs/cudf/source/user_guide/api_docs/options.rst @@ -12,6 +12,19 @@ Options and settings cudf.describe_option cudf.option_context +Display options are controlled by pandas +---------------------------------------- + +Options for display are inherited from pandas. This includes commonly accessed options such as: + +- ``display.max_columns`` +- ``display.max_info_rows`` +- ``display.max_rows`` +- ``display.max_seq_items`` + +For example, to show all rows of a DataFrame or Series in a Jupyter notebook, call ``pandas.set_option("display.max_rows", None)``. + +See also the :ref:`full list of pandas display options `. Available options ----------------- diff --git a/docs/cudf/source/user_guide/options.md b/docs/cudf/source/user_guide/options.md index 245d3fd1974..997681212fb 100644 --- a/docs/cudf/source/user_guide/options.md +++ b/docs/cudf/source/user_guide/options.md @@ -11,4 +11,4 @@ When no argument is provided, all options are printed. To set value to a option, use {py:func}`cudf.set_option`. -See the [API reference](api.options) for more details. +See the [options API reference](api.options) for descriptions of the available options. From 4a0b59133ed56c043fc73d24785f24be0b4fbe69 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Mon, 3 Jun 2024 15:08:31 -0700 Subject: [PATCH 04/48] Update Python labels and remove unnecessary ones (#15893) This PR leverages some of the new labels we have for organizing our issues and removes labels that aren't really used at the moment. 
If reviewers feel strongly I can keep the ci label, but AFAICT that doesn't really get used for anything at the moment and we'll benefit more from leveraging future labels to help direct tasks to the build/infra team vs cudf devs. Authors: - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Bradley Dice (https://github.com/bdice) - GALI PREM SAGAR (https://github.com/galipremsagar) URL: https://github.com/rapidsai/cudf/pull/15893 --- .github/labeler.yml | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/.github/labeler.yml b/.github/labeler.yml index d14344384d1..48967417af3 100644 --- a/.github/labeler.yml +++ b/.github/labeler.yml @@ -1,9 +1,19 @@ # Documentation for config - https://github.com/actions/labeler#common-examples -cuDF (Python): +Python: - 'python/**' - 'notebooks/**' +cudf.pandas: + - 'python/cudf/cudf/pandas/**' + - 'python/cudf/cudf_pandas_tests/**' + +cudf.polars: + - 'python/cudf_polars/**' + +pylibcudf: + - 'python/cudf/cudf/_lib/pylibcudf/**' + libcudf: - 'cpp/**' @@ -12,11 +22,5 @@ CMake: - '**/cmake/**' - '**/*.cmake' -cuDF (Java): +Java: - 'java/**' - -ci: - - 'ci/**' - -conda: - - 'conda/**' From 382de32e8137a3a59a0800f46ef8a1de62b1a6e5 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Mon, 3 Jun 2024 15:14:52 -0700 Subject: [PATCH 05/48] Add support for additional metaclasses of proxies and use for ExcelWriter (#15399) The ExcelWriter supports the abstract os.PathLike interface, but we would also like that support to be reflected in the class's MRO. Doing so is slightly complicated because os.PathLike is an ABC, and as such has a different metaclass. Therefore, in order to add os.PathLike as a base class, we must also generate a suitable combined metaclass for our ExcelWriter wrapper. This change ensures the `isinstance(pd.ExcelWriter(...), os.PathLike)` returns `True` when using cudf.pandas. Authors: - Vyas Ramasubramani (https://github.com/vyasr) - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) URL: https://github.com/rapidsai/cudf/pull/15399 --- python/cudf/cudf/pandas/_wrappers/pandas.py | 11 +++++-- python/cudf/cudf/pandas/fast_slow_proxy.py | 30 +++++++------------ .../cudf_pandas_tests/test_cudf_pandas.py | 5 ++++ 3 files changed, 25 insertions(+), 21 deletions(-) diff --git a/python/cudf/cudf/pandas/_wrappers/pandas.py b/python/cudf/cudf/pandas/_wrappers/pandas.py index 2e3880e14f6..698dd946022 100644 --- a/python/cudf/cudf/pandas/_wrappers/pandas.py +++ b/python/cudf/cudf/pandas/_wrappers/pandas.py @@ -1,8 +1,10 @@ # SPDX-FileCopyrightText: Copyright (c) 2023-2024, NVIDIA CORPORATION & AFFILIATES. # All rights reserved. 
# SPDX-License-Identifier: Apache-2.0 +import abc import copyreg import importlib +import os import pickle import sys @@ -857,7 +859,12 @@ def Index__new__(cls, *args, **kwargs): pd.ExcelWriter, fast_to_slow=_Unusable(), slow_to_fast=_Unusable(), - additional_attributes={"__hash__": _FastSlowAttribute("__hash__")}, + additional_attributes={ + "__hash__": _FastSlowAttribute("__hash__"), + "__fspath__": _FastSlowAttribute("__fspath__"), + }, + bases=(os.PathLike,), + metaclasses=(abc.ABCMeta,), ) try: @@ -1032,7 +1039,7 @@ def holiday_calendar_factory_wrapper(*args, **kwargs): fast_to_slow=_Unusable(), slow_to_fast=_Unusable(), additional_attributes={"__hash__": _FastSlowAttribute("__hash__")}, - meta_class=pd_HolidayCalendarMetaClass, + metaclasses=(pd_HolidayCalendarMetaClass,), ) Holiday = make_final_proxy_type( diff --git a/python/cudf/cudf/pandas/fast_slow_proxy.py b/python/cudf/cudf/pandas/fast_slow_proxy.py index 94caec1ce6c..169dd80e132 100644 --- a/python/cudf/cudf/pandas/fast_slow_proxy.py +++ b/python/cudf/cudf/pandas/fast_slow_proxy.py @@ -106,19 +106,6 @@ def __call__(self): _DELETE = object() -def create_composite_metaclass(base_meta, additional_meta): - """ - Dynamically creates a composite metaclass that inherits from both provided metaclasses. - This ensures that the metaclass behaviors of both base_meta and additional_meta are preserved. - """ - - class CompositeMeta(base_meta, additional_meta): - def __new__(cls, name, bases, namespace): - return super().__new__(cls, name, bases, namespace) - - return CompositeMeta - - def make_final_proxy_type( name: str, fast_type: type, @@ -130,7 +117,7 @@ def make_final_proxy_type( additional_attributes: Mapping[str, Any] | None = None, postprocess: Callable[[_FinalProxy, Any, Any], Any] | None = None, bases: Tuple = (), - meta_class=None, + metaclasses: Tuple = (), ) -> Type[_FinalProxy]: """ Defines a fast-slow proxy type for a pair of "final" fast and slow @@ -161,6 +148,8 @@ def make_final_proxy_type( construct said unwrapped object. See also `_maybe_wrap_result`. bases Optional tuple of base classes to insert into the mro. + metaclasses + Optional tuple of metaclasses to unify with the base proxy metaclass. 
Notes ----- @@ -241,15 +230,18 @@ def _fsproxy_state(self) -> _State: cls_dict[slow_name] = _FastSlowAttribute( slow_name, private=slow_name.startswith("_") ) - if meta_class is None: - meta_class = _FastSlowProxyMeta - else: - meta_class = create_composite_metaclass(_FastSlowProxyMeta, meta_class) + metaclass = _FastSlowProxyMeta + if metaclasses: + metaclass = types.new_class( # type: ignore + f"{name}_Meta", + metaclasses + (_FastSlowProxyMeta,), + {}, + ) cls = types.new_class( name, (*bases, _FinalProxy), - {"metaclass": meta_class}, + {"metaclass": metaclass}, lambda ns: ns.update(cls_dict), ) functools.update_wrapper( diff --git a/python/cudf/cudf_pandas_tests/test_cudf_pandas.py b/python/cudf/cudf_pandas_tests/test_cudf_pandas.py index 75bceea3034..fef829b17fc 100644 --- a/python/cudf/cudf_pandas_tests/test_cudf_pandas.py +++ b/python/cudf/cudf_pandas_tests/test_cudf_pandas.py @@ -6,6 +6,7 @@ import copy import datetime import operator +import os import pathlib import pickle import tempfile @@ -1421,3 +1422,7 @@ def test_holidays_within_dates(holiday, start, expected): utc.localize(xpd.Timestamp(start)), ) ) == [utc.localize(dt) for dt in expected] + + +def test_excelwriter_pathlike(): + assert isinstance(pd.ExcelWriter("foo.xlsx"), os.PathLike) From eb460169786665b1624cb6c4f9b502b800810b37 Mon Sep 17 00:00:00 2001 From: brandon-b-miller <53796099+brandon-b-miller@users.noreply.github.com> Date: Tue, 4 Jun 2024 06:32:49 -0500 Subject: [PATCH 06/48] Migrate column factories to pylibcudf (#15257) This PR implements `column_factories.hpp` using `pylibcudf` and migrates the cuDF cython to use them cc @vyasr Authors: - https://github.com/brandon-b-miller - Lawrence Mitchell (https://github.com/wence-) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) - Lawrence Mitchell (https://github.com/wence-) URL: https://github.com/rapidsai/cudf/pull/15257 --- cpp/src/column/column_factories.cpp | 17 +- cpp/tests/column/factories_test.cpp | 4 +- cpp/tests/fixed_point/fixed_point_tests.cpp | 2 +- .../api_docs/pylibcudf/column_factories.rst | 6 + .../user_guide/api_docs/pylibcudf/index.rst | 1 + python/cudf/cudf/_lib/column.pyx | 21 +- .../cudf/cudf/_lib/pylibcudf/CMakeLists.txt | 1 + python/cudf/cudf/_lib/pylibcudf/__init__.pxd | 2 + python/cudf/cudf/_lib/pylibcudf/__init__.py | 4 +- .../cudf/_lib/pylibcudf/column_factories.pxd | 52 ++++ .../cudf/_lib/pylibcudf/column_factories.pyx | 205 ++++++++++++++ python/cudf/cudf/_lib/pylibcudf/interop.pyx | 82 ++++++ .../libcudf/column/column_factories.pxd | 73 ++++- python/cudf/cudf/_lib/pylibcudf/types.pxd | 1 + python/cudf/cudf/_lib/pylibcudf/types.pyx | 3 +- .../pylibcudf_tests/test_column_factories.py | 253 ++++++++++++++++++ .../cudf/cudf/pylibcudf_tests/test_interop.py | 69 +++++ 17 files changed, 767 insertions(+), 29 deletions(-) create mode 100644 docs/cudf/source/user_guide/api_docs/pylibcudf/column_factories.rst create mode 100644 python/cudf/cudf/_lib/pylibcudf/column_factories.pxd create mode 100644 python/cudf/cudf/_lib/pylibcudf/column_factories.pyx create mode 100644 python/cudf/cudf/pylibcudf_tests/test_column_factories.py create mode 100644 python/cudf/cudf/pylibcudf_tests/test_interop.py diff --git a/cpp/src/column/column_factories.cpp b/cpp/src/column/column_factories.cpp index e40056fc8a1..0260068d4db 100644 --- a/cpp/src/column/column_factories.cpp +++ b/cpp/src/column/column_factories.cpp @@ -65,7 +65,8 @@ std::size_t size_of(data_type element_type) std::unique_ptr make_empty_column(data_type type) { CUDF_EXPECTS(type.id() == 
type_id::EMPTY || !cudf::is_nested(type), - "make_empty_column is invalid to call on nested types"); + "make_empty_column is invalid to call on nested types", + cudf::data_type_error); return std::make_unique(type, 0, rmm::device_buffer{}, rmm::device_buffer{}, 0); } @@ -80,7 +81,9 @@ std::unique_ptr make_numeric_column(data_type type, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - CUDF_EXPECTS(is_numeric(type), "Invalid, non-numeric type."); + CUDF_EXPECTS(type.id() != type_id::EMPTY && is_numeric(type), + "Invalid, non-numeric type.", + cudf::data_type_error); CUDF_EXPECTS(size >= 0, "Column size cannot be negative."); return std::make_unique( @@ -100,7 +103,7 @@ std::unique_ptr make_fixed_point_column(data_type type, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - CUDF_EXPECTS(is_fixed_point(type), "Invalid, non-fixed_point type."); + CUDF_EXPECTS(is_fixed_point(type), "Invalid, non-fixed_point type.", cudf::data_type_error); CUDF_EXPECTS(size >= 0, "Column size cannot be negative."); return std::make_unique( @@ -120,7 +123,7 @@ std::unique_ptr make_timestamp_column(data_type type, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - CUDF_EXPECTS(is_timestamp(type), "Invalid, non-timestamp type."); + CUDF_EXPECTS(is_timestamp(type), "Invalid, non-timestamp type.", cudf::data_type_error); CUDF_EXPECTS(size >= 0, "Column size cannot be negative."); return std::make_unique( @@ -140,7 +143,7 @@ std::unique_ptr make_duration_column(data_type type, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - CUDF_EXPECTS(is_duration(type), "Invalid, non-duration type."); + CUDF_EXPECTS(is_duration(type), "Invalid, non-duration type.", cudf::data_type_error); CUDF_EXPECTS(size >= 0, "Column size cannot be negative."); return std::make_unique( @@ -160,7 +163,9 @@ std::unique_ptr make_fixed_width_column(data_type type, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - CUDF_EXPECTS(is_fixed_width(type), "Invalid, non-fixed-width type."); + CUDF_EXPECTS(type.id() != type_id::EMPTY && is_fixed_width(type), + "Invalid, non-fixed-width type.", + cudf::data_type_error); // clang-format off if (is_timestamp (type)) return make_timestamp_column (type, size, state, stream, mr); diff --git a/cpp/tests/column/factories_test.cpp b/cpp/tests/column/factories_test.cpp index afebc91dd73..dca36eaa4e7 100644 --- a/cpp/tests/column/factories_test.cpp +++ b/cpp/tests/column/factories_test.cpp @@ -164,7 +164,7 @@ TEST_P(NonNumericFactoryTest, NonNumericThrow) auto column = cudf::make_numeric_column( cudf::data_type{GetParam()}, this->size(), cudf::mask_state::UNALLOCATED); }; - EXPECT_THROW(construct(), cudf::logic_error); + EXPECT_THROW(construct(), cudf::data_type_error); } INSTANTIATE_TEST_CASE_P(NonNumeric, @@ -307,7 +307,7 @@ TEST_P(NonFixedWidthFactoryTest, NonFixedWidthThrow) auto column = cudf::make_fixed_width_column( cudf::data_type{GetParam()}, this->size(), cudf::mask_state::UNALLOCATED); }; - EXPECT_THROW(construct(), cudf::logic_error); + EXPECT_THROW(construct(), cudf::data_type_error); } INSTANTIATE_TEST_CASE_P(NonFixedWidth, diff --git a/cpp/tests/fixed_point/fixed_point_tests.cpp b/cpp/tests/fixed_point/fixed_point_tests.cpp index 73de1fbaa68..ab7984d4b03 100644 --- a/cpp/tests/fixed_point/fixed_point_tests.cpp +++ b/cpp/tests/fixed_point/fixed_point_tests.cpp @@ -498,7 +498,7 @@ TYPED_TEST(FixedPointTestAllReps, FixedPointColumnWrapper) TYPED_TEST(FixedPointTestAllReps, NoScaleOrWrongTypeID) { 
EXPECT_THROW(cudf::make_fixed_point_column(cudf::data_type{cudf::type_id::INT32}, 0), - cudf::logic_error); + cudf::data_type_error); } TYPED_TEST(FixedPointTestAllReps, SimpleFixedPointColumnWrapper) diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/column_factories.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/column_factories.rst new file mode 100644 index 00000000000..c858135b6ce --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/column_factories.rst @@ -0,0 +1,6 @@ +================ +column_factories +================ + +.. automodule:: cudf._lib.pylibcudf.column_factories + :members: diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst index 26875ce7d12..58fea77adaa 100644 --- a/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst @@ -11,6 +11,7 @@ This page provides API documentation for pylibcudf. aggregation binaryop column + column_factories concatenate copying filling diff --git a/python/cudf/cudf/_lib/column.pyx b/python/cudf/cudf/_lib/column.pyx index f33e121241d..7155017b7af 100644 --- a/python/cudf/cudf/_lib/column.pyx +++ b/python/cudf/cudf/_lib/column.pyx @@ -39,14 +39,10 @@ from cudf._lib.types cimport ( from cudf._lib.null_mask import bitmask_allocation_size_bytes from cudf._lib.types import dtype_from_pylibcudf_column -# TODO: We currently need this for "casting" empty pylibcudf columns in -# from_pylibcudf by instead creating an empty numeric column. We will be able -# to remove this once column factories are exposed to pylibcudf. cimport cudf._lib.pylibcudf.libcudf.copying as cpp_copying cimport cudf._lib.pylibcudf.libcudf.types as libcudf_types cimport cudf._lib.pylibcudf.libcudf.unary as libcudf_unary -from cudf._lib.pylibcudf cimport Column as plc_Column from cudf._lib.pylibcudf.libcudf.column.column cimport column, column_contents from cudf._lib.pylibcudf.libcudf.column.column_factories cimport ( make_column_from_scalar as cpp_make_column_from_scalar, @@ -623,22 +619,17 @@ cdef class Column: pylibcudf.Column A new pylibcudf.Column referencing the same data. """ - cdef libcudf_types.data_type new_dtype if col.type().id() == pylibcudf.TypeId.TIMESTAMP_DAYS: col = pylibcudf.unary.cast( col, pylibcudf.DataType(pylibcudf.TypeId.TIMESTAMP_SECONDS) ) elif col.type().id() == pylibcudf.TypeId.EMPTY: - new_dtype = libcudf_types.data_type(libcudf_types.type_id.INT8) - # TODO: This function call is what requires cimporting pylibcudf. - # We can remove the cimport once we can directly do - # pylibcudf.column_factories.make_numeric_column or equivalent. 
- col = plc_Column.from_libcudf( - move( - make_numeric_column( - new_dtype, col.size(), libcudf_types.mask_state.ALL_NULL - ) - ) + new_dtype = pylibcudf.DataType(pylibcudf.TypeId.INT8) + + col = pylibcudf.column_factories.make_numeric_column( + new_dtype, + col.size(), + pylibcudf.column_factories.MaskState.ALL_NULL ) dtype = dtype_from_pylibcudf_column(col) diff --git a/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt b/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt index eff14ad549b..7d0676f6def 100644 --- a/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt +++ b/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt @@ -16,6 +16,7 @@ set(cython_sources aggregation.pyx binaryop.pyx column.pyx + column_factories.pyx concatenate.pyx copying.pyx filling.pyx diff --git a/python/cudf/cudf/_lib/pylibcudf/__init__.pxd b/python/cudf/cudf/_lib/pylibcudf/__init__.pxd index 4f77f8cbaef..b289d112a90 100644 --- a/python/cudf/cudf/_lib/pylibcudf/__init__.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/__init__.pxd @@ -4,6 +4,7 @@ from . cimport ( aggregation, binaryop, + column_factories, concatenate, copying, filling, @@ -40,6 +41,7 @@ __all__ = [ "binaryop", "concatenate", "copying", + "column_factories", "filling", "gpumemoryview", "groupby", diff --git a/python/cudf/cudf/_lib/pylibcudf/__init__.py b/python/cudf/cudf/_lib/pylibcudf/__init__.py index 048b62b6013..2565332f3ed 100644 --- a/python/cudf/cudf/_lib/pylibcudf/__init__.py +++ b/python/cudf/cudf/_lib/pylibcudf/__init__.py @@ -3,6 +3,7 @@ from . import ( aggregation, binaryop, + column_factories, concatenate, copying, filling, @@ -27,7 +28,7 @@ from .gpumemoryview import gpumemoryview from .scalar import Scalar from .table import Table -from .types import DataType, TypeId +from .types import DataType, MaskState, TypeId __all__ = [ "Column", @@ -39,6 +40,7 @@ "binaryop", "concatenate", "copying", + "column_factories", "filling", "gpumemoryview", "groupby", diff --git a/python/cudf/cudf/_lib/pylibcudf/column_factories.pxd b/python/cudf/cudf/_lib/pylibcudf/column_factories.pxd new file mode 100644 index 00000000000..9dbd74ab16c --- /dev/null +++ b/python/cudf/cudf/_lib/pylibcudf/column_factories.pxd @@ -0,0 +1,52 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +from libcpp.memory cimport unique_ptr +from libcpp.utility cimport move + +from cudf._lib.pylibcudf.libcudf.types cimport mask_state, size_type + +from .column cimport Column +from .types cimport DataType, size_type, type_id + +ctypedef fused MakeEmptyColumnOperand: + DataType + type_id + object + +ctypedef fused MaskArg: + mask_state + object + + +cpdef Column make_empty_column( + MakeEmptyColumnOperand type_or_id +) + +cpdef Column make_numeric_column( + DataType type_, + size_type size, + MaskArg mask, +) + +cpdef Column make_fixed_point_column( + DataType type_, + size_type size, + MaskArg mask, +) + +cpdef Column make_timestamp_column( + DataType type_, + size_type size, + MaskArg mask, +) + +cpdef Column make_duration_column( + DataType type_, + size_type size, + MaskArg mask, +) + +cpdef Column make_fixed_width_column( + DataType type_, + size_type size, + MaskArg mask, +) diff --git a/python/cudf/cudf/_lib/pylibcudf/column_factories.pyx b/python/cudf/cudf/_lib/pylibcudf/column_factories.pyx new file mode 100644 index 00000000000..ef7f512f0e5 --- /dev/null +++ b/python/cudf/cudf/_lib/pylibcudf/column_factories.pyx @@ -0,0 +1,205 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+from libcpp.memory cimport unique_ptr +from libcpp.utility cimport move + +from cudf._lib.pylibcudf.libcudf.column.column cimport column +from cudf._lib.pylibcudf.libcudf.column.column_factories cimport ( + make_duration_column as cpp_make_duration_column, + make_empty_column as cpp_make_empty_column, + make_fixed_point_column as cpp_make_fixed_point_column, + make_fixed_width_column as cpp_make_fixed_width_column, + make_numeric_column as cpp_make_numeric_column, + make_timestamp_column as cpp_make_timestamp_column, +) +from cudf._lib.pylibcudf.libcudf.types cimport mask_state, size_type + +from .types cimport DataType, type_id + +from .types import MaskState, TypeId + + +cpdef Column make_empty_column(MakeEmptyColumnOperand type_or_id): + cdef unique_ptr[column] result + cdef type_id id + + if MakeEmptyColumnOperand is object: + if isinstance(type_or_id, TypeId): + id = type_or_id + with nogil: + result = move( + cpp_make_empty_column( + id + ) + ) + else: + raise TypeError( + "Must pass a TypeId or DataType" + ) + elif MakeEmptyColumnOperand is DataType: + with nogil: + result = move( + cpp_make_empty_column( + type_or_id.c_obj + ) + ) + elif MakeEmptyColumnOperand is type_id: + with nogil: + result = move( + cpp_make_empty_column( + type_or_id + ) + ) + else: + raise TypeError( + "Must pass a TypeId or DataType" + ) + return Column.from_libcudf(move(result)) + + +cpdef Column make_numeric_column( + DataType type_, + size_type size, + MaskArg mstate +): + + cdef unique_ptr[column] result + cdef mask_state state + + if MaskArg is object: + if isinstance(mstate, MaskState): + state = mstate + else: + raise TypeError("Invalid mask argument") + elif MaskArg is mask_state: + state = mstate + else: + raise TypeError("Invalid mask argument") + with nogil: + result = move( + cpp_make_numeric_column( + type_.c_obj, + size, + state + ) + ) + + return Column.from_libcudf(move(result)) + +cpdef Column make_fixed_point_column( + DataType type_, + size_type size, + MaskArg mstate +): + + cdef unique_ptr[column] result + cdef mask_state state + + if MaskArg is object: + if isinstance(mstate, MaskState): + state = mstate + else: + raise TypeError("Invalid mask argument") + elif MaskArg is mask_state: + state = mstate + else: + raise TypeError("Invalid mask argument") + with nogil: + result = move( + cpp_make_fixed_point_column( + type_.c_obj, + size, + state + ) + ) + + return Column.from_libcudf(move(result)) + + +cpdef Column make_timestamp_column( + DataType type_, + size_type size, + MaskArg mstate +): + + cdef unique_ptr[column] result + cdef mask_state state + + if MaskArg is object: + if isinstance(mstate, MaskState): + state = mstate + else: + raise TypeError("Invalid mask argument") + elif MaskArg is mask_state: + state = mstate + else: + raise TypeError("Invalid mask argument") + with nogil: + result = move( + cpp_make_timestamp_column( + type_.c_obj, + size, + state + ) + ) + + return Column.from_libcudf(move(result)) + + +cpdef Column make_duration_column( + DataType type_, + size_type size, + MaskArg mstate +): + + cdef unique_ptr[column] result + cdef mask_state state + + if MaskArg is object: + if isinstance(mstate, MaskState): + state = mstate + else: + raise TypeError("Invalid mask argument") + elif MaskArg is mask_state: + state = mstate + else: + raise TypeError("Invalid mask argument") + with nogil: + result = move( + cpp_make_duration_column( + type_.c_obj, + size, + state + ) + ) + + return Column.from_libcudf(move(result)) + + +cpdef Column make_fixed_width_column( + DataType 
type_, + size_type size, + MaskArg mstate +): + + cdef unique_ptr[column] result + cdef mask_state state + + if MaskArg is object: + if isinstance(mstate, MaskState): + state = mstate + else: + raise TypeError("Invalid mask argument") + elif MaskArg is mask_state: + state = mstate + else: + raise TypeError("Invalid mask argument") + with nogil: + result = move( + cpp_make_fixed_width_column( + type_.c_obj, + size, + state + ) + ) + + return Column.from_libcudf(move(result)) diff --git a/python/cudf/cudf/_lib/pylibcudf/interop.pyx b/python/cudf/cudf/_lib/pylibcudf/interop.pyx index f172080cece..1e4102e4b64 100644 --- a/python/cudf/cudf/_lib/pylibcudf/interop.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/interop.pyx @@ -33,6 +33,33 @@ from .scalar cimport Scalar from .table cimport Table from .types cimport DataType, type_id +ARROW_TO_PYLIBCUDF_TYPES = { + pa.int8(): type_id.INT8, + pa.int16(): type_id.INT16, + pa.int32(): type_id.INT32, + pa.int64(): type_id.INT64, + pa.uint8(): type_id.UINT8, + pa.uint16(): type_id.UINT16, + pa.uint32(): type_id.UINT32, + pa.uint64(): type_id.UINT64, + pa.float32(): type_id.FLOAT32, + pa.float64(): type_id.FLOAT64, + pa.bool_(): type_id.BOOL8, + pa.string(): type_id.STRING, + pa.duration('s'): type_id.DURATION_SECONDS, + pa.duration('ms'): type_id.DURATION_MILLISECONDS, + pa.duration('us'): type_id.DURATION_MICROSECONDS, + pa.duration('ns'): type_id.DURATION_NANOSECONDS, + pa.timestamp('s'): type_id.TIMESTAMP_SECONDS, + pa.timestamp('ms'): type_id.TIMESTAMP_MILLISECONDS, + pa.timestamp('us'): type_id.TIMESTAMP_MICROSECONDS, + pa.timestamp('ns'): type_id.TIMESTAMP_NANOSECONDS, + pa.date32(): type_id.TIMESTAMP_DAYS, +} + +LIBCUDF_TO_ARROW_TYPES = { + v: k for k, v in ARROW_TO_PYLIBCUDF_TYPES.items() +} cdef column_metadata _metadata_to_libcudf(metadata): """Convert a ColumnMetadata object to C++ column_metadata. @@ -77,6 +104,21 @@ def from_arrow(pyarrow_object, *, DataType data_type=None): raise TypeError("from_arrow only accepts Table and Scalar objects") +@from_arrow.register(pa.DataType) +def _from_arrow_datatype(pyarrow_object): + if isinstance(pyarrow_object, pa.Decimal128Type): + return DataType(type_id.DECIMAL128, scale=-pyarrow_object.scale) + elif isinstance(pyarrow_object, pa.StructType): + return DataType(type_id.STRUCT) + elif isinstance(pyarrow_object, pa.ListType): + return DataType(type_id.LIST) + else: + try: + return DataType(ARROW_TO_PYLIBCUDF_TYPES[pyarrow_object]) + except KeyError: + raise TypeError(f"Unable to convert {pyarrow_object} to cudf datatype") + + @from_arrow.register(pa.Table) def _from_arrow_table(pyarrow_object, *, DataType data_type=None): if data_type is not None: @@ -170,6 +212,46 @@ def to_arrow(cudf_object, metadata=None): raise TypeError("to_arrow only accepts Table and Scalar objects") +@to_arrow.register(DataType) +def _to_arrow_datatype(cudf_object, **kwargs): + """ + Convert a datatype to arrow. + + Translation of some types requires extra information as a keyword + argument. 
Specifically: + + - When translating a decimal type, provide ``precision`` + - When translating a struct type, provide ``fields`` + - When translating a list type, provide the wrapped ``value_type`` + """ + if cudf_object.id() in {type_id.DECIMAL32, type_id.DECIMAL64, type_id.DECIMAL128}: + if not (precision := kwargs.get("precision")): + raise ValueError( + "Precision must be provided for decimal types" + ) + # no pa.decimal32 or pa.decimal64 + return pa.decimal128(precision, -cudf_object.scale()) + elif cudf_object.id() == type_id.STRUCT: + if not (fields := kwargs.get("fields")): + raise ValueError( + "Fields must be provided for struct types" + ) + return pa.struct(fields) + elif cudf_object.id() == type_id.LIST: + if not (value_type := kwargs.get("value_type")): + raise ValueError( + "Value type must be provided for list types" + ) + return pa.list_(value_type) + else: + try: + return ARROW_TO_PYLIBCUDF_TYPES[cudf_object.id()] + except KeyError: + raise TypeError( + f"Unable to convert {cudf_object.id()} to arrow datatype" + ) + + @to_arrow.register(Table) def _to_arrow_table(cudf_object, metadata=None): if metadata is None: diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/column/column_factories.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/column/column_factories.pxd index fd22d92cb30..2faff21a77b 100644 --- a/python/cudf/cudf/_lib/pylibcudf/libcudf/column/column_factories.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/column/column_factories.pxd @@ -2,9 +2,17 @@ from libcpp.memory cimport unique_ptr +from rmm._lib.device_buffer cimport device_buffer + from cudf._lib.pylibcudf.libcudf.column.column cimport column from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport scalar -from cudf._lib.pylibcudf.libcudf.types cimport data_type, mask_state, size_type +from cudf._lib.pylibcudf.libcudf.types cimport ( + bitmask_type, + data_type, + mask_state, + size_type, + type_id, +) cdef extern from "cudf/column/column_factories.hpp" namespace "cudf" nogil: @@ -12,5 +20,64 @@ cdef extern from "cudf/column/column_factories.hpp" namespace "cudf" nogil: size_type size, mask_state state) except + - cdef unique_ptr[column] make_column_from_scalar (const scalar & s, - size_type size) except + + cdef unique_ptr[column] make_numeric_column(data_type type, + size_type size, + device_buffer mask, + size_type null_count) except + + + cdef unique_ptr[column] make_fixed_point_column( + data_type type, + size_type size, + mask_state state) except + + + cdef unique_ptr[column] make_fixed_point_column( + data_type type, + size_type size, + device_buffer mask, + size_type null_count) except + + + cdef unique_ptr[column] make_timestamp_column( + data_type type, + size_type size, + mask_state state) except + + + cdef unique_ptr[column] make_timestamp_column( + data_type type, + size_type size, + device_buffer mask, + size_type null_count) except + + + cdef unique_ptr[column] make_duration_column( + data_type type, + size_type size, + mask_state state) except + + + cdef unique_ptr[column] make_duration_column( + data_type type, + size_type size, + device_buffer mask, + size_type null_count) except + + + cdef unique_ptr[column] make_fixed_width_column( + data_type type, + size_type size, + mask_state state) except + + + cdef unique_ptr[column] make_fixed_width_column( + data_type type, + size_type size, + device_buffer mask, + size_type null_count) except + + + cdef unique_ptr[column] make_column_from_scalar(const scalar& s, + size_type size) except + + + cdef unique_ptr[column] 
make_dictionary_from_scalar(const scalar& s, + size_type size) except + + + cdef unique_ptr[column] make_empty_column(type_id id) except + + cdef unique_ptr[column] make_empty_column(data_type type_) except + + + cdef unique_ptr[column] make_dictionary_column( + unique_ptr[column] keys_column, + unique_ptr[column] indices_column) except + diff --git a/python/cudf/cudf/_lib/pylibcudf/types.pxd b/python/cudf/cudf/_lib/pylibcudf/types.pxd index e54a259819e..7d3ddca14a1 100644 --- a/python/cudf/cudf/_lib/pylibcudf/types.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/types.pxd @@ -13,6 +13,7 @@ from cudf._lib.pylibcudf.libcudf.types cimport ( null_order, null_policy, order, + size_type, sorted, type_id, ) diff --git a/python/cudf/cudf/_lib/pylibcudf/types.pyx b/python/cudf/cudf/_lib/pylibcudf/types.pyx index a5248ad0a1f..6dbb287f3c4 100644 --- a/python/cudf/cudf/_lib/pylibcudf/types.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/types.pyx @@ -8,6 +8,7 @@ from cudf._lib.pylibcudf.libcudf.types import type_id as TypeId # no-cython-lin from cudf._lib.pylibcudf.libcudf.types import nan_policy as NanPolicy # no-cython-lint, isort:skip from cudf._lib.pylibcudf.libcudf.types import null_policy as NullPolicy # no-cython-lint, isort:skip from cudf._lib.pylibcudf.libcudf.types import interpolation as Interpolation # no-cython-lint, isort:skip +from cudf._lib.pylibcudf.libcudf.types import mask_state as MaskState # no-cython-lint, isort:skip from cudf._lib.pylibcudf.libcudf.types import nan_equality as NanEquality # no-cython-lint, isort:skip from cudf._lib.pylibcudf.libcudf.types import null_equality as NullEquality # no-cython-lint, isort:skip from cudf._lib.pylibcudf.libcudf.types import null_order as NullOrder # no-cython-lint, isort:skip @@ -22,7 +23,7 @@ cdef class DataType: Parameters ---------- - id : TypeId + id : type_id The type's identifier scale : int The scale associated with the data. Only used for decimal data types. diff --git a/python/cudf/cudf/pylibcudf_tests/test_column_factories.py b/python/cudf/cudf/pylibcudf_tests/test_column_factories.py new file mode 100644 index 00000000000..4c05770a41f --- /dev/null +++ b/python/cudf/cudf/pylibcudf_tests/test_column_factories.py @@ -0,0 +1,253 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+ +import pyarrow as pa +import pytest +from utils import DEFAULT_STRUCT_TESTING_TYPE, assert_column_eq + +from cudf._lib import pylibcudf as plc + +EMPTY_COL_SIZE = 3 + +NUMERIC_TYPES = [ + pa.uint8(), + pa.uint16(), + pa.uint32(), + pa.uint64(), + pa.int8(), + pa.int16(), + pa.int32(), + pa.int64(), + pa.float32(), + pa.float64(), + pa.bool_(), +] + +TIMESTAMP_TYPES = [ + pa.timestamp("s"), + pa.timestamp("ms"), + pa.timestamp("us"), + pa.timestamp("ns"), +] + +DURATION_TYPES = [ + pa.duration("s"), + pa.duration("ms"), + pa.duration("us"), + pa.duration("ns"), +] + +DECIMAL_TYPES = [pa.decimal128(38, 2)] + +STRING_TYPES = [pa.string()] +STRUCT_TYPES = [DEFAULT_STRUCT_TESTING_TYPE] +LIST_TYPES = [pa.list_(pa.int64())] + +ALL_TYPES = ( + NUMERIC_TYPES + + TIMESTAMP_TYPES + + DURATION_TYPES + + STRING_TYPES + + DECIMAL_TYPES + + STRUCT_TYPES + + LIST_TYPES +) + + +@pytest.fixture(scope="module", params=NUMERIC_TYPES, ids=repr) +def numeric_pa_type(request): + return request.param + + +@pytest.fixture( + scope="module", + params=DECIMAL_TYPES, + ids=repr, +) +def fixed_point_pa_type(request): + return request.param + + +@pytest.fixture( + scope="module", + params=TIMESTAMP_TYPES, + ids=repr, +) +def timestamp_pa_type(request): + return request.param + + +@pytest.fixture( + scope="module", + params=DURATION_TYPES, + ids=repr, +) +def duration_pa_type(request): + return request.param + + +@pytest.fixture( + scope="module", + params=[ + plc.MaskState.UNALLOCATED, + plc.MaskState.ALL_VALID, + plc.MaskState.ALL_NULL, + plc.MaskState.UNINITIALIZED, + ], + ids=["unallocated", "all_valid", "all_null", "uninitialized"], +) +def mask_state(request): + return request.param + + +def test_make_empty_column_dtype(pa_type): + pa_col = pa.array([], type=pa_type) + + plc_type = plc.interop.from_arrow(pa_col).type() + + if isinstance(pa_type, (pa.ListType, pa.StructType)): + with pytest.raises(ValueError): + plc.column_factories.make_empty_column(plc_type) + return + + cudf_col = plc.column_factories.make_empty_column(plc_type) + assert_column_eq(cudf_col, pa_col) + + +def test_make_empty_column_typeid(pa_type): + pa_col = pa.array([], type=pa_type) + + tid = plc.interop.from_arrow(pa_col).type().id() + + if isinstance(pa_type, (pa.ListType, pa.StructType)): + with pytest.raises(ValueError): + plc.column_factories.make_empty_column(tid) + return + + cudf_col = plc.column_factories.make_empty_column(tid) + assert_column_eq(cudf_col, pa_col) + + +def validate_empty_column(col, mask_state, dtype): + assert col.size() == EMPTY_COL_SIZE + + if mask_state == plc.types.MaskState.UNALLOCATED: + assert col.null_count() == 0 + elif mask_state == plc.types.MaskState.ALL_VALID: + assert col.null_count() == 0 + elif mask_state == plc.types.MaskState.ALL_NULL: + assert col.null_count() == EMPTY_COL_SIZE + + assert plc.interop.to_arrow(col).type == dtype + + +def test_make_numeric_column(numeric_pa_type, mask_state): + plc_type = plc.interop.from_arrow(numeric_pa_type) + + got = plc.column_factories.make_numeric_column( + plc_type, EMPTY_COL_SIZE, mask_state + ) + validate_empty_column(got, mask_state, numeric_pa_type) + + +@pytest.mark.parametrize( + "non_numeric_pa_type", [t for t in ALL_TYPES if t not in NUMERIC_TYPES] +) +def test_make_numeric_column_dtype_err(non_numeric_pa_type): + plc_type = plc.interop.from_arrow(non_numeric_pa_type) + with pytest.raises(ValueError): + plc.column_factories.make_numeric_column( + plc_type, 3, plc.types.MaskState.UNALLOCATED + ) + + +def 
test_make_numeric_column_negative_size_err(numeric_pa_type): + plc_type = plc.interop.from_arrow(numeric_pa_type) + with pytest.raises(RuntimeError): + plc.column_factories.make_numeric_column( + plc_type, -1, plc.types.MaskState.UNALLOCATED + ) + + +def test_make_fixed_point_column(fixed_point_pa_type, mask_state): + plc_type = plc.interop.from_arrow(fixed_point_pa_type) + + got = plc.column_factories.make_fixed_point_column( + plc_type, EMPTY_COL_SIZE, mask_state + ) + + validate_empty_column(got, mask_state, fixed_point_pa_type) + + +@pytest.mark.parametrize( + "non_fixed_point_pa_type", [t for t in ALL_TYPES if t not in DECIMAL_TYPES] +) +def test_make_fixed_point_column_dtype_err(non_fixed_point_pa_type): + plc_type = plc.interop.from_arrow(non_fixed_point_pa_type) + with pytest.raises(ValueError): + plc.column_factories.make_fixed_point_column( + plc_type, 3, plc.types.MaskState.UNALLOCATED + ) + + +def test_make_fixed_point_column_negative_size_err(fixed_point_pa_type): + plc_type = plc.interop.from_arrow(fixed_point_pa_type) + with pytest.raises(RuntimeError): + plc.column_factories.make_fixed_point_column( + plc_type, -1, plc.types.MaskState.UNALLOCATED + ) + + +def test_make_timestamp_column(timestamp_pa_type, mask_state): + plc_type = plc.interop.from_arrow(timestamp_pa_type) + + got = plc.column_factories.make_timestamp_column( + plc_type, EMPTY_COL_SIZE, mask_state + ) + validate_empty_column(got, mask_state, timestamp_pa_type) + + +@pytest.mark.parametrize( + "non_timestamp_pa_type", [t for t in ALL_TYPES if t not in TIMESTAMP_TYPES] +) +def test_make_timestamp_column_dtype_err(non_timestamp_pa_type): + plc_type = plc.interop.from_arrow(non_timestamp_pa_type) + with pytest.raises(ValueError): + plc.column_factories.make_timestamp_column( + plc_type, 3, plc.types.MaskState.UNALLOCATED + ) + + +def test_make_timestamp_column_negative_size_err(timestamp_pa_type): + plc_type = plc.interop.from_arrow(timestamp_pa_type) + with pytest.raises(RuntimeError): + plc.column_factories.make_timestamp_column( + plc_type, -1, plc.types.MaskState.UNALLOCATED + ) + + +def test_make_duration_column(duration_pa_type, mask_state): + plc_type = plc.interop.from_arrow(duration_pa_type) + + got = plc.column_factories.make_duration_column( + plc_type, EMPTY_COL_SIZE, mask_state + ) + validate_empty_column(got, mask_state, duration_pa_type) + + +@pytest.mark.parametrize( + "non_duration_pa_type", [t for t in ALL_TYPES if t not in DURATION_TYPES] +) +def test_make_duration_column_dtype_err(non_duration_pa_type): + plc_type = plc.interop.from_arrow(non_duration_pa_type) + with pytest.raises(ValueError): + plc.column_factories.make_duration_column( + plc_type, 3, plc.types.MaskState.UNALLOCATED + ) + + +def test_make_duration_column_negative_size_err(duration_pa_type): + plc_type = plc.interop.from_arrow(duration_pa_type) + with pytest.raises(RuntimeError): + plc.column_factories.make_duration_column( + plc_type, -1, plc.types.MaskState.UNALLOCATED + ) diff --git a/python/cudf/cudf/pylibcudf_tests/test_interop.py b/python/cudf/cudf/pylibcudf_tests/test_interop.py new file mode 100644 index 00000000000..5c05f460e28 --- /dev/null +++ b/python/cudf/cudf/pylibcudf_tests/test_interop.py @@ -0,0 +1,69 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+ +import pyarrow as pa +import pytest + +import cudf._lib.pylibcudf as plc + + +def test_list_dtype_roundtrip(): + list_type = pa.list_(pa.int32()) + plc_type = plc.interop.from_arrow(list_type) + + assert plc_type == plc.types.DataType(plc.types.TypeId.LIST) + + with pytest.raises(ValueError): + plc.interop.to_arrow(plc_type) + + arrow_type = plc.interop.to_arrow( + plc_type, value_type=list_type.value_type + ) + assert arrow_type == list_type + + +def test_struct_dtype_roundtrip(): + struct_type = pa.struct([("a", pa.int32()), ("b", pa.string())]) + plc_type = plc.interop.from_arrow(struct_type) + + assert plc_type == plc.types.DataType(plc.types.TypeId.STRUCT) + + with pytest.raises(ValueError): + plc.interop.to_arrow(plc_type) + + arrow_type = plc.interop.to_arrow( + plc_type, + fields=[struct_type.field(i) for i in range(struct_type.num_fields)], + ) + assert arrow_type == struct_type + + +def test_decimal128_roundtrip(): + decimal_type = pa.decimal128(10, 2) + plc_type = plc.interop.from_arrow(decimal_type) + + assert plc_type.id() == plc.types.TypeId.DECIMAL128 + + with pytest.raises(ValueError): + plc.interop.to_arrow(plc_type) + + arrow_type = plc.interop.to_arrow( + plc_type, precision=decimal_type.precision + ) + assert arrow_type == decimal_type + + +@pytest.mark.parametrize( + "data_type", + [ + plc.types.DataType(plc.types.TypeId.DECIMAL32), + plc.types.DataType(plc.types.TypeId.DECIMAL64), + ], +) +def test_decimal_other(data_type): + precision = 3 + + with pytest.raises(ValueError): + plc.interop.to_arrow(data_type) + + arrow_type = plc.interop.to_arrow(data_type, precision=precision) + assert arrow_type == pa.decimal128(precision, 0) From fc31aa3c4f99d6348e7c32a3e3c52c68b26ca700 Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Tue, 4 Jun 2024 10:19:30 -0400 Subject: [PATCH 07/48] Add overflow check when converting large strings to lists columns (#15887) Fixes a couple places where strings columns are converted to lists column as binary -- chars are represented as INT8. Since lists columns only support `size_type` offsets type, this change will throw an error if the size of the chars exceeds max `size_type` values. 
Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Mike Wilson (https://github.com/hyperbolic2346) - MithunR (https://github.com/mythrocks) URL: https://github.com/rapidsai/cudf/pull/15887 --- cpp/src/io/utilities/column_buffer.cpp | 4 ++++ cpp/src/reshape/byte_cast.cu | 11 ++++++++--- 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/cpp/src/io/utilities/column_buffer.cpp b/cpp/src/io/utilities/column_buffer.cpp index e5d4e1a360f..27fc53fbc9e 100644 --- a/cpp/src/io/utilities/column_buffer.cpp +++ b/cpp/src/io/utilities/column_buffer.cpp @@ -191,6 +191,10 @@ std::unique_ptr make_column(column_buffer_base& buffer, auto data = col_content.data.release(); auto char_size = data->size(); + CUDF_EXPECTS(char_size < static_cast(std::numeric_limits::max()), + "Cannot convert strings column to lists column due to size_type limit", + std::overflow_error); + auto uint8_col = std::make_unique( data_type{type_id::UINT8}, char_size, std::move(*data), rmm::device_buffer{}, 0); diff --git a/cpp/src/reshape/byte_cast.cu b/cpp/src/reshape/byte_cast.cu index 1b05a9744fa..3dfa0b65814 100644 --- a/cpp/src/reshape/byte_cast.cu +++ b/cpp/src/reshape/byte_cast.cu @@ -135,9 +135,14 @@ struct byte_list_conversion_fn(input, stream, mr)->release(); - auto const num_chars = col_content.data->size(); - auto uint8_col = std::make_unique( + auto const num_chars = strings_column_view(input).chars_size(stream); + CUDF_EXPECTS(num_chars < static_cast(std::numeric_limits::max()), + "Cannot convert strings column to lists column due to size_type limit", + std::overflow_error); + + auto col_content = std::make_unique(input, stream, mr)->release(); + + auto uint8_col = std::make_unique( output_type, num_chars, std::move(*(col_content.data)), rmm::device_buffer{}, 0); auto result = make_lists_column( From 54d49fcea4e7ad73df21f0dbfe99097c635b1023 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Tue, 4 Jun 2024 16:17:25 +0100 Subject: [PATCH 08/48] Ensure literals have correct dtype (#15890) The polars schema tells us the dtype for any literals, but previously we were relying on pyarrow inference. Add pylibcudf to pyarrow datatype conversion utilities and use the resulting datatypes explicitly. 
Authors: - Lawrence Mitchell (https://github.com/wence-) Approvers: - https://github.com/brandon-b-miller - James Lamb (https://github.com/jameslamb) URL: https://github.com/rapidsai/cudf/pull/15890 --- python/cudf/cudf/_lib/pylibcudf/interop.pyx | 3 +- python/cudf_polars/cudf_polars/dsl/expr.py | 46 +++++++++++++------ python/cudf_polars/cudf_polars/dsl/ir.py | 10 ++-- .../cudf_polars/cudf_polars/dsl/translate.py | 9 ++-- .../cudf_polars/cudf_polars/utils/dtypes.py | 3 ++ python/cudf_polars/pyproject.toml | 2 +- python/cudf_polars/tests/__init__.py | 6 +++ .../cudf_polars/tests/expressions/__init__.py | 6 +++ .../cudf_polars/tests/expressions/test_agg.py | 2 +- .../tests/expressions/test_distinct.py | 36 +++++++++++++++ python/cudf_polars/tests/test_scan.py | 12 +---- 11 files changed, 102 insertions(+), 33 deletions(-) create mode 100644 python/cudf_polars/tests/__init__.py create mode 100644 python/cudf_polars/tests/expressions/__init__.py create mode 100644 python/cudf_polars/tests/expressions/test_distinct.py diff --git a/python/cudf/cudf/_lib/pylibcudf/interop.pyx b/python/cudf/cudf/_lib/pylibcudf/interop.pyx index 1e4102e4b64..07e9d1ead11 100644 --- a/python/cudf/cudf/_lib/pylibcudf/interop.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/interop.pyx @@ -55,6 +55,7 @@ ARROW_TO_PYLIBCUDF_TYPES = { pa.timestamp('us'): type_id.TIMESTAMP_MICROSECONDS, pa.timestamp('ns'): type_id.TIMESTAMP_NANOSECONDS, pa.date32(): type_id.TIMESTAMP_DAYS, + pa.null(): type_id.EMPTY, } LIBCUDF_TO_ARROW_TYPES = { @@ -245,7 +246,7 @@ def _to_arrow_datatype(cudf_object, **kwargs): return pa.list_(value_type) else: try: - return ARROW_TO_PYLIBCUDF_TYPES[cudf_object.id()] + return LIBCUDF_TO_ARROW_TYPES[cudf_object.id()] except KeyError: raise TypeError( f"Unable to convert {cudf_object.id()} to arrow datatype" diff --git a/python/cudf_polars/cudf_polars/dsl/expr.py b/python/cudf_polars/cudf_polars/dsl/expr.py index 249cc3775f7..7187a36f21c 100644 --- a/python/cudf_polars/cudf_polars/dsl/expr.py +++ b/python/cudf_polars/cudf_polars/dsl/expr.py @@ -484,32 +484,48 @@ def do_evaluate( return self._distinct( column, keep=plc.stream_compaction.DuplicateKeepOption.KEEP_FIRST, - source_value=plc.interop.from_arrow(pa.scalar(True)), # noqa: FBT003 - target_value=plc.interop.from_arrow(pa.scalar(False)), # noqa: FBT003 + source_value=plc.interop.from_arrow( + pa.scalar(value=True, type=plc.interop.to_arrow(self.dtype)) + ), + target_value=plc.interop.from_arrow( + pa.scalar(value=False, type=plc.interop.to_arrow(self.dtype)) + ), ) elif self.name == pl_expr.BooleanFunction.IsLastDistinct: (column,) = columns return self._distinct( column, keep=plc.stream_compaction.DuplicateKeepOption.KEEP_LAST, - source_value=plc.interop.from_arrow(pa.scalar(True)), # noqa: FBT003 - target_value=plc.interop.from_arrow(pa.scalar(False)), # noqa: FBT003 + source_value=plc.interop.from_arrow( + pa.scalar(value=True, type=plc.interop.to_arrow(self.dtype)) + ), + target_value=plc.interop.from_arrow( + pa.scalar(value=False, type=plc.interop.to_arrow(self.dtype)) + ), ) elif self.name == pl_expr.BooleanFunction.IsUnique: (column,) = columns return self._distinct( column, keep=plc.stream_compaction.DuplicateKeepOption.KEEP_NONE, - source_value=plc.interop.from_arrow(pa.scalar(True)), # noqa: FBT003 - target_value=plc.interop.from_arrow(pa.scalar(False)), # noqa: FBT003 + source_value=plc.interop.from_arrow( + pa.scalar(value=True, type=plc.interop.to_arrow(self.dtype)) + ), + target_value=plc.interop.from_arrow( + pa.scalar(value=False, 
type=plc.interop.to_arrow(self.dtype)) + ), ) elif self.name == pl_expr.BooleanFunction.IsDuplicated: (column,) = columns return self._distinct( column, keep=plc.stream_compaction.DuplicateKeepOption.KEEP_NONE, - source_value=plc.interop.from_arrow(pa.scalar(False)), # noqa: FBT003 - target_value=plc.interop.from_arrow(pa.scalar(True)), # noqa: FBT003 + source_value=plc.interop.from_arrow( + pa.scalar(value=False, type=plc.interop.to_arrow(self.dtype)) + ), + target_value=plc.interop.from_arrow( + pa.scalar(value=True, type=plc.interop.to_arrow(self.dtype)) + ), ) elif self.name == pl_expr.BooleanFunction.AllHorizontal: name = columns[0].name @@ -717,7 +733,9 @@ def do_evaluate( bounds_policy = plc.copying.OutOfBoundsPolicy.NULLIFY obj = plc.replace.replace_nulls( indices.obj, - plc.interop.from_arrow(pa.scalar(n), data_type=indices.obj.data_type()), + plc.interop.from_arrow( + pa.scalar(n, type=plc.interop.to_arrow(indices.obj.data_type())) + ), ) else: bounds_policy = plc.copying.OutOfBoundsPolicy.DONT_CHECK @@ -893,11 +911,13 @@ def _reduce( ) def _count(self, column: Column) -> Column: - # TODO: dtype handling return Column( plc.Column.from_scalar( plc.interop.from_arrow( - pa.scalar(column.obj.size() - column.obj.null_count()), + pa.scalar( + column.obj.size() - column.obj.null_count(), + type=plc.interop.to_arrow(self.dtype), + ), ), 1, ), @@ -909,7 +929,7 @@ def _min(self, column: Column, *, propagate_nans: bool) -> Column: return Column( plc.Column.from_scalar( plc.interop.from_arrow( - pa.scalar(float("nan")), data_type=self.dtype + pa.scalar(float("nan"), type=plc.interop.to_arrow(self.dtype)) ), 1, ), @@ -924,7 +944,7 @@ def _max(self, column: Column, *, propagate_nans: bool) -> Column: return Column( plc.Column.from_scalar( plc.interop.from_arrow( - pa.scalar(float("nan")), data_type=self.dtype + pa.scalar(float("nan"), type=plc.interop.to_arrow(self.dtype)) ), 1, ), diff --git a/python/cudf_polars/cudf_polars/dsl/ir.py b/python/cudf_polars/cudf_polars/dsl/ir.py index d630b40f600..f8441b793b5 100644 --- a/python/cudf_polars/cudf_polars/dsl/ir.py +++ b/python/cudf_polars/cudf_polars/dsl/ir.py @@ -146,9 +146,13 @@ def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: assert_never(self.typ) if row_index is not None: name, offset = row_index - # TODO: dtype - step = plc.interop.from_arrow(pa.scalar(1)) - init = plc.interop.from_arrow(pa.scalar(offset)) + dtype = self.schema[name] + step = plc.interop.from_arrow( + pa.scalar(1, type=plc.interop.to_arrow(dtype)) + ) + init = plc.interop.from_arrow( + pa.scalar(offset, type=plc.interop.to_arrow(dtype)) + ) index = Column( plc.filling.sequence(df.num_rows, init, step), name ).set_sorted( diff --git a/python/cudf_polars/cudf_polars/dsl/translate.py b/python/cudf_polars/cudf_polars/dsl/translate.py index b3d0edf183f..9a301164beb 100644 --- a/python/cudf_polars/cudf_polars/dsl/translate.py +++ b/python/cudf_polars/cudf_polars/dsl/translate.py @@ -9,9 +9,11 @@ from functools import singledispatch from typing import Any +import pyarrow as pa + from polars.polars import _expr_nodes as pl_expr, _ir_nodes as pl_ir -import cudf._lib.pylibcudf as plc # noqa: TCH002, singledispatch register needs this name defined. 
+import cudf._lib.pylibcudf as plc from cudf_polars.dsl import expr, ir from cudf_polars.utils import dtypes @@ -295,7 +297,8 @@ def _(node: pl_expr.Window, visitor: Any, dtype: plc.DataType) -> expr.Expr: @_translate_expr.register def _(node: pl_expr.Literal, visitor: Any, dtype: plc.DataType) -> expr.Expr: - return expr.Literal(dtype, node.value) + value = pa.scalar(node.value, type=plc.interop.to_arrow(dtype)) + return expr.Literal(dtype, value) @_translate_expr.register @@ -337,7 +340,7 @@ def _(node: pl_expr.Cast, visitor: Any, dtype: plc.DataType) -> expr.Expr: inner = translate_expr(visitor, n=node.expr) # Push casts into literals so we can handle Cast(Literal(Null)) if isinstance(inner, expr.Literal): - return expr.Literal(dtype, inner.value) + return expr.Literal(dtype, inner.value.cast(plc.interop.to_arrow(dtype))) else: return expr.Cast(dtype, inner) diff --git a/python/cudf_polars/cudf_polars/utils/dtypes.py b/python/cudf_polars/cudf_polars/utils/dtypes.py index 51379433c03..bede0de3c9f 100644 --- a/python/cudf_polars/cudf_polars/utils/dtypes.py +++ b/python/cudf_polars/cudf_polars/utils/dtypes.py @@ -13,6 +13,8 @@ import cudf._lib.pylibcudf as plc +__all__ = ["from_polars"] + @cache def from_polars(dtype: pl.DataType) -> plc.DataType: @@ -84,6 +86,7 @@ def from_polars(dtype: pl.DataType) -> plc.DataType: # TODO: Hopefully return plc.DataType(plc.TypeId.EMPTY) elif isinstance(dtype, pl.List): + # TODO: This doesn't consider the value type. return plc.DataType(plc.TypeId.LIST) else: raise NotImplementedError(f"{dtype=} conversion not supported") diff --git a/python/cudf_polars/pyproject.toml b/python/cudf_polars/pyproject.toml index 49ecd7080b9..e50ee76a9b9 100644 --- a/python/cudf_polars/pyproject.toml +++ b/python/cudf_polars/pyproject.toml @@ -134,7 +134,7 @@ ignore = [ fixable = ["ALL"] [tool.ruff.lint.per-file-ignores] -"**/tests/**/test_*.py" = ["D", "INP"] +"**/tests/**/*.py" = ["D"] [tool.ruff.lint.flake8-pytest-style] # https://docs.astral.sh/ruff/settings/#lintflake8-pytest-style diff --git a/python/cudf_polars/tests/__init__.py b/python/cudf_polars/tests/__init__.py new file mode 100644 index 00000000000..4611d642f14 --- /dev/null +++ b/python/cudf_polars/tests/__init__.py @@ -0,0 +1,6 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +# SPDX-License-Identifier: Apache-2.0 + +from __future__ import annotations + +__all__: list[str] = [] diff --git a/python/cudf_polars/tests/expressions/__init__.py b/python/cudf_polars/tests/expressions/__init__.py new file mode 100644 index 00000000000..4611d642f14 --- /dev/null +++ b/python/cudf_polars/tests/expressions/__init__.py @@ -0,0 +1,6 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. 
+# SPDX-License-Identifier: Apache-2.0 + +from __future__ import annotations + +__all__: list[str] = [] diff --git a/python/cudf_polars/tests/expressions/test_agg.py b/python/cudf_polars/tests/expressions/test_agg.py index c792ae64f74..645dbd26140 100644 --- a/python/cudf_polars/tests/expressions/test_agg.py +++ b/python/cudf_polars/tests/expressions/test_agg.py @@ -56,7 +56,7 @@ def test_agg(df, agg): q = df.select(expr) # https://github.com/rapidsai/cudf/issues/15852 - check_dtype = agg not in {"count", "n_unique", "median"} + check_dtype = agg not in {"n_unique", "median"} if not check_dtype and q.schema["a"] != pl.Float64: with pytest.raises(AssertionError): assert_gpu_result_equal(q) diff --git a/python/cudf_polars/tests/expressions/test_distinct.py b/python/cudf_polars/tests/expressions/test_distinct.py new file mode 100644 index 00000000000..22865a7ce22 --- /dev/null +++ b/python/cudf_polars/tests/expressions/test_distinct.py @@ -0,0 +1,36 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +# SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + +import pytest + +import polars as pl + +from cudf_polars.testing.asserts import assert_gpu_result_equal + + +@pytest.fixture(params=[False, True], ids=["no-nulls", "nulls"]) +def nullable(request): + return request.param + + +@pytest.fixture( + params=["is_first_distinct", "is_last_distinct", "is_unique", "is_duplicated"] +) +def op(request): + return request.param + + +@pytest.fixture +def df(nullable): + values: list[int | None] = [1, 2, 3, 1, 1, 7, 3, 2, 7, 8, 1] + if nullable: + values[1] = None + values[4] = None + return pl.LazyFrame({"a": values}) + + +def test_expr_distinct(df, op): + expr = getattr(pl.col("a"), op)() + query = df.select(expr) + assert_gpu_result_equal(query) diff --git a/python/cudf_polars/tests/test_scan.py b/python/cudf_polars/tests/test_scan.py index b75e1bdef10..b2443e357e2 100644 --- a/python/cudf_polars/tests/test_scan.py +++ b/python/cudf_polars/tests/test_scan.py @@ -10,17 +10,7 @@ @pytest.fixture( - params=[ - (None, None), - pytest.param( - ("row-index", 0), - marks=pytest.mark.xfail(reason="Incorrect dtype for row index"), - ), - pytest.param( - ("index", 10), - marks=pytest.mark.xfail(reason="Incorrect dtype for row index"), - ), - ], + params=[(None, None), ("row-index", 0), ("index", 10)], ids=["no-row-index", "zero-offset-row-index", "offset-row-index"], ) def row_index(request): From faf39299ebf178ee10971e4222c534f00d035b6d Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Tue, 4 Jun 2024 08:52:51 -1000 Subject: [PATCH 09/48] Make Frame.astype return Self instead of a ColumnAccessor (#15861) Allows simplification for its subclasses (`IndexedFrame.astype`, `Index.astype`) Also minor cleanups in the `equals` method Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/15861 --- python/cudf/cudf/core/_base_index.py | 2 +- python/cudf/cudf/core/dataframe.py | 2 +- python/cudf/cudf/core/frame.py | 23 ++++++----------------- python/cudf/cudf/core/index.py | 22 ++++++++++++++-------- python/cudf/cudf/core/indexed_frame.py | 14 +++++--------- 5 files changed, 27 insertions(+), 36 deletions(-) diff --git a/python/cudf/cudf/core/_base_index.py b/python/cudf/cudf/core/_base_index.py index e6868ae3431..baca7b19e58 100644 --- a/python/cudf/cudf/core/_base_index.py +++ b/python/cudf/cudf/core/_base_index.py @@
-149,7 +149,7 @@ def ndim(self) -> int: # noqa: D401 """Number of dimensions of the underlying data, by definition 1.""" return 1 - def equals(self, other): + def equals(self, other) -> bool: """ Determine if two Index objects contain the same elements. diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index acfc2d781a7..0fc36fa80e4 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -2590,7 +2590,7 @@ def items(self): yield (k, self[k]) @_cudf_nvtx_annotate - def equals(self, other): + def equals(self, other) -> bool: ret = super().equals(other) # If all other checks matched, validate names. if ret: diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index d60c206ac24..7326696c994 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -273,20 +273,13 @@ def __len__(self) -> int: return self._num_rows @_cudf_nvtx_annotate - def astype(self, dtype, copy: bool = False): - result_data = { - col_name: col.astype(dtype.get(col_name, col.dtype), copy=copy) + def astype(self, dtype: dict[Any, Dtype], copy: bool = False) -> Self: + casted = ( + col.astype(dtype.get(col_name, col.dtype), copy=copy) for col_name, col in self._data.items() - } - - return ColumnAccessor( - data=result_data, - multiindex=self._data.multiindex, - level_names=self._data.level_names, - rangeindex=self._data.rangeindex, - label_dtype=self._data.label_dtype, - verify=False, ) + ca = self._data._from_columns_like_self(casted, verify=False) + return self._from_data_like_self(ca) @_cudf_nvtx_annotate def equals(self, other) -> bool: @@ -349,11 +342,7 @@ def equals(self, other) -> bool: """ if self is other: return True - if ( - other is None - or not isinstance(other, type(self)) - or len(self) != len(other) - ): + if not isinstance(other, type(self)) or len(self) != len(other): return False return all( diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 2a75b374a1e..9b4c5473438 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -445,7 +445,7 @@ def __getitem__(self, index): return self._as_int_index()[index] @_cudf_nvtx_annotate - def equals(self, other): + def equals(self, other) -> bool: if isinstance(other, RangeIndex): return self._range == other._range return self._as_int_index().equals(other) @@ -1058,6 +1058,16 @@ def _from_data(cls, data: MutableMapping, name: Any = no_default) -> Self: out.name = name return out + @classmethod + @_cudf_nvtx_annotate + def _from_data_like_self( + cls, data: MutableMapping, name: Any = no_default + ) -> Self: + out = _index_from_data(data, name) + if name is not no_default: + out.name = name + return out + @classmethod @_cudf_nvtx_annotate def from_arrow(cls, obj): @@ -1180,12 +1190,8 @@ def is_unique(self): return self._column.is_unique @_cudf_nvtx_annotate - def equals(self, other): - if ( - other is None - or not isinstance(other, BaseIndex) - or len(self) != len(other) - ): + def equals(self, other) -> bool: + if not isinstance(other, BaseIndex) or len(self) != len(other): return False check_dtypes = False @@ -1231,7 +1237,7 @@ def copy(self, name=None, deep=False): @_cudf_nvtx_annotate def astype(self, dtype, copy: bool = True): - return _index_from_data(super().astype({self.name: dtype}, copy)) + return super().astype({self.name: dtype}, copy) @_cudf_nvtx_annotate def get_indexer(self, target, method=None, limit=None, tolerance=None): diff --git a/python/cudf/cudf/core/indexed_frame.py 
b/python/cudf/cudf/core/indexed_frame.py index a31430e1571..5a466f20f8c 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -625,10 +625,8 @@ def copy(self, deep: bool = True) -> Self: ) @_cudf_nvtx_annotate - def equals(self, other): # noqa: D102 - if not super().equals(other): - return False - return self.index.equals(other.index) + def equals(self, other) -> bool: # noqa: D102 + return super().equals(other) and self.index.equals(other.index) @property def index(self): @@ -4896,10 +4894,10 @@ def repeat(self, repeats, axis=None): def astype( self, - dtype, + dtype: dict[Any, Dtype], copy: bool = False, errors: Literal["raise", "ignore"] = "raise", - ): + ) -> Self: """Cast the object to the given dtype. Parameters @@ -5010,14 +5008,12 @@ def astype( raise ValueError("invalid error value specified") try: - data = super().astype(dtype, copy) + return super().astype(dtype, copy) except Exception as e: if errors == "raise": raise e return self - return self._from_data(data, index=self.index) - @_cudf_nvtx_annotate def drop( self, From fe7412915a289e7a9469040ada1dcf74cda2c4d6 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Tue, 4 Jun 2024 08:56:25 -1000 Subject: [PATCH 10/48] Make Column.to_pandas return Index instead of Series (#15833) Column.to_pandas backs `Index.to_pandas`/`Series.to_pandas`/`DataFrame.to_pandas` and returned a `pandas.Series`; however, the `index` of this `pandas.Series` was not strictly necessary for `Index.to_pandas` and `DataFrame.to_pandas`. Additionally, `pandas.Index` is 1D-like like `Column` and provides a better mental model to `to_pandas` conversion. Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/15833 --- python/cudf/cudf/core/column/categorical.py | 7 ++- python/cudf/cudf/core/column/column.py | 13 ++---- python/cudf/cudf/core/column/datetime.py | 20 ++------- python/cudf/cudf/core/column/interval.py | 15 ++----- python/cudf/cudf/core/column/lists.py | 20 ++------- python/cudf/cudf/core/column/numerical.py | 17 +++---- python/cudf/cudf/core/column/string.py | 17 ++----- python/cudf/cudf/core/column/struct.py | 19 ++------ python/cudf/cudf/core/dataframe.py | 4 +- python/cudf/cudf/core/index.py | 45 ++++--------------- python/cudf/cudf/core/series.py | 8 ++-- .../cudf/tests/test_cuda_array_interface.py | 4 +- 12 files changed, 46 insertions(+), 143 deletions(-) diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py index 0ff8209dcd4..1828c5ce97b 100644 --- a/python/cudf/cudf/core/column/categorical.py +++ b/python/cudf/cudf/core/column/categorical.py @@ -789,12 +789,11 @@ def __cuda_array_interface__(self) -> Mapping[str, Any]: def to_pandas( self, *, - index: Optional[pd.Index] = None, nullable: bool = False, arrow_type: bool = False, - ) -> pd.Series: + ) -> pd.Index: if nullable: - raise NotImplementedError(f"{nullable=} is not implemented.") + return super().to_pandas(nullable=nullable, arrow_type=arrow_type) elif arrow_type: raise NotImplementedError(f"{arrow_type=} is not implemented.") @@ -828,7 +827,7 @@ def to_pandas( data = pd.Categorical.from_codes( codes, categories=cats.to_pandas(), ordered=col.ordered ) - return pd.Series(data, index=index) + return pd.Index(data) def to_arrow(self) -> pa.Array: """Convert to PyArrow Array.""" diff --git a/python/cudf/cudf/core/column/column.py 
b/python/cudf/cudf/core/column/column.py index 59bae179497..68079371b85 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -190,10 +190,9 @@ def __repr__(self): def to_pandas( self, *, - index: Optional[pd.Index] = None, nullable: bool = False, arrow_type: bool = False, - ) -> pd.Series: + ) -> pd.Index: """Convert object to pandas type. The default implementation falls back to PyArrow for the conversion. @@ -208,15 +207,9 @@ def to_pandas( raise NotImplementedError(f"{nullable=} is not implemented.") pa_array = self.to_arrow() if arrow_type: - return pd.Series( - pd.arrays.ArrowExtensionArray(pa_array), index=index - ) + return pd.Index(pd.arrays.ArrowExtensionArray(pa_array)) else: - pd_series = pa_array.to_pandas() - - if index is not None: - pd_series.index = index - return pd_series + return pd.Index(pa_array.to_pandas()) @property def values_host(self) -> "np.ndarray": diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py index 27f31c8f500..057169aa7e1 100644 --- a/python/cudf/cudf/core/column/datetime.py +++ b/python/cudf/cudf/core/column/datetime.py @@ -840,27 +840,15 @@ def __init__( def to_pandas( self, *, - index: Optional[pd.Index] = None, nullable: bool = False, arrow_type: bool = False, - ) -> pd.Series: - if arrow_type and nullable: - raise ValueError( - f"{arrow_type=} and {nullable=} cannot both be set." - ) - elif nullable: - raise NotImplementedError(f"{nullable=} is not implemented.") - elif arrow_type: - return pd.Series( - pd.arrays.ArrowExtensionArray(self.to_arrow()), index=index - ) + ) -> pd.Index: + if arrow_type or nullable: + return super().to_pandas(nullable=nullable, arrow_type=arrow_type) else: - series = self._local_time.to_pandas().dt.tz_localize( + return self._local_time.to_pandas().tz_localize( self.dtype.tz, ambiguous="NaT", nonexistent="NaT" ) - if index is not None: - series.index = index - return series def to_arrow(self): return pa.compute.assume_timezone( diff --git a/python/cudf/cudf/core/column/interval.py b/python/cudf/cudf/core/column/interval.py index 7bd693966dc..f24ca3fdad1 100644 --- a/python/cudf/cudf/core/column/interval.py +++ b/python/cudf/cudf/core/column/interval.py @@ -1,6 +1,4 @@ # Copyright (c) 2018-2024, NVIDIA CORPORATION. -from typing import Optional - import pandas as pd import pyarrow as pa @@ -109,28 +107,21 @@ def as_interval_column(self, dtype): def to_pandas( self, *, - index: Optional[pd.Index] = None, nullable: bool = False, arrow_type: bool = False, - ) -> pd.Series: + ) -> pd.Index: # Note: This does not handle null values in the interval column. # However, this exact sequence (calling __from_arrow__ on the output of # self.to_arrow) is currently the best known way to convert interval # types into pandas (trying to convert the underlying numerical columns # directly is problematic), so we're stuck with this for now. - if arrow_type and nullable: - raise ValueError( - f"{arrow_type=} and {nullable=} cannot both be set." 
- ) if nullable: - raise NotImplementedError(f"{nullable=} is not implemented.") + return super().to_pandas(nullable=nullable, arrow_type=arrow_type) elif arrow_type: raise NotImplementedError(f"{arrow_type=} is not implemented.") pd_type = self.dtype.to_pandas() - return pd.Series( - pd_type.__from_arrow__(self.to_arrow()), index=index, dtype=pd_type - ) + return pd.Index(pd_type.__from_arrow__(self.to_arrow()), dtype=pd_type) def element_indexing(self, index: int): result = super().element_indexing(index) diff --git a/python/cudf/cudf/core/column/lists.py b/python/cudf/cudf/core/column/lists.py index 1c2bcbef2ec..8f8ee46c796 100644 --- a/python/cudf/cudf/core/column/lists.py +++ b/python/cudf/cudf/core/column/lists.py @@ -292,25 +292,13 @@ def _transform_leaves(self, func, *args, **kwargs) -> Self: def to_pandas( self, *, - index: Optional[pd.Index] = None, nullable: bool = False, arrow_type: bool = False, - ) -> pd.Series: - # Can't rely on Column.to_pandas implementation for lists. - # Need to perform `to_pylist` to preserve list types. - if arrow_type and nullable: - raise ValueError( - f"{arrow_type=} and {nullable=} cannot both be set." - ) - if nullable: - raise NotImplementedError(f"{nullable=} is not implemented.") - pa_array = self.to_arrow() - if arrow_type: - return pd.Series( - pd.arrays.ArrowExtensionArray(pa_array), index=index - ) + ) -> pd.Index: + if arrow_type or nullable: + return super().to_pandas(nullable=nullable, arrow_type=arrow_type) else: - return pd.Series(pa_array.tolist(), dtype="object", index=index) + return pd.Index(self.to_arrow().tolist(), dtype="object") class ListMethods(ColumnMethods): diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py index bab862f775f..fb413959eb9 100644 --- a/python/cudf/cudf/core/column/numerical.py +++ b/python/cudf/cudf/core/column/numerical.py @@ -674,18 +674,13 @@ def _with_type_metadata(self: ColumnBase, dtype: Dtype) -> ColumnBase: def to_pandas( self, *, - index: Optional[pd.Index] = None, nullable: bool = False, arrow_type: bool = False, - ) -> pd.Series: + ) -> pd.Index: if arrow_type and nullable: - raise ValueError( - f"{arrow_type=} and {nullable=} cannot both be set." 
- ) + return super().to_pandas(nullable=nullable, arrow_type=arrow_type) elif arrow_type: - return pd.Series( - pd.arrays.ArrowExtensionArray(self.to_arrow()), index=index - ) + return super().to_pandas(nullable=nullable, arrow_type=arrow_type) elif ( nullable and ( @@ -697,11 +692,11 @@ def to_pandas( ): arrow_array = self.to_arrow() pandas_array = pandas_nullable_dtype.__from_arrow__(arrow_array) # type: ignore[attr-defined] - return pd.Series(pandas_array, copy=False, index=index) + return pd.Index(pandas_array, copy=False) elif self.dtype.kind in set("iuf") and not self.has_nulls(): - return pd.Series(self.values_host, copy=False, index=index) + return pd.Index(self.values_host, copy=False) else: - return super().to_pandas(index=index, nullable=nullable) + return super().to_pandas(nullable=nullable, arrow_type=arrow_type) def _reduction_result_dtype(self, reduction_op: str) -> Dtype: col_dtype = self.dtype diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index 40e58e14612..fd98d0dc163 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -5783,23 +5783,14 @@ def values(self) -> cupy.ndarray: def to_pandas( self, *, - index: Optional[pd.Index] = None, nullable: bool = False, arrow_type: bool = False, - ) -> pd.Series: - if arrow_type and nullable: - raise ValueError( - f"{arrow_type=} and {nullable=} cannot both be set." - ) - if arrow_type: - return pd.Series( - pd.arrays.ArrowExtensionArray(self.to_arrow()), index=index - ) - elif nullable: + ) -> pd.Index: + if nullable and not arrow_type: pandas_array = pd.StringDtype().__from_arrow__(self.to_arrow()) - return pd.Series(pandas_array, copy=False, index=index) + return pd.Index(pandas_array, copy=False) else: - return super().to_pandas(index=index, nullable=nullable) + return super().to_pandas(nullable=nullable, arrow_type=arrow_type) def can_cast_safely(self, to_dtype: Dtype) -> bool: to_dtype = cudf.api.types.dtype(to_dtype) diff --git a/python/cudf/cudf/core/column/struct.py b/python/cudf/cudf/core/column/struct.py index 1b2ffcc2700..6dd35570b95 100644 --- a/python/cudf/cudf/core/column/struct.py +++ b/python/cudf/cudf/core/column/struct.py @@ -2,7 +2,6 @@ from __future__ import annotations from functools import cached_property -from typing import Optional import pandas as pd import pyarrow as pa @@ -60,25 +59,15 @@ def to_arrow(self): def to_pandas( self, *, - index: Optional[pd.Index] = None, nullable: bool = False, arrow_type: bool = False, - ) -> pd.Series: + ) -> pd.Index: # We cannot go via Arrow's `to_pandas` because of the following issue: # https://issues.apache.org/jira/browse/ARROW-12680 - if arrow_type and nullable: - raise ValueError( - f"{arrow_type=} and {nullable=} cannot both be set." 
- ) - elif nullable: - raise NotImplementedError(f"{nullable=} is not implemented.") - pa_array = self.to_arrow() - if arrow_type: - return pd.Series( - pd.arrays.ArrowExtensionArray(pa_array), index=index - ) + if arrow_type or nullable: + return super().to_pandas(nullable=nullable, arrow_type=arrow_type) else: - return pd.Series(pa_array.tolist(), dtype="object", index=index) + return pd.Index(self.to_arrow().tolist(), dtype="object") @cached_property def memory_usage(self): diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 0fc36fa80e4..4c55b5427de 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -5321,9 +5321,7 @@ def to_pandas( """ out_index = self.index.to_pandas() out_data = { - i: col.to_pandas( - index=out_index, nullable=nullable, arrow_type=arrow_type - ) + i: col.to_pandas(nullable=nullable, arrow_type=arrow_type) for i, col in enumerate(self._data.columns) } diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 9b4c5473438..4b09765fa46 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -1568,10 +1568,11 @@ def any(self): def to_pandas( self, *, nullable: bool = False, arrow_type: bool = False ) -> pd.Index: - return pd.Index( - self._values.to_pandas(nullable=nullable, arrow_type=arrow_type), - name=self.name, + result = self._column.to_pandas( + nullable=nullable, arrow_type=arrow_type ) + result.name = self.name + return result def append(self, other): if is_list_like(other): @@ -2191,23 +2192,10 @@ def isocalendar(self): def to_pandas( self, *, nullable: bool = False, arrow_type: bool = False ) -> pd.DatetimeIndex: - if arrow_type and nullable: - raise ValueError( - f"{arrow_type=} and {nullable=} cannot both be set." - ) - elif nullable: - raise NotImplementedError(f"{nullable=} is not implemented.") - - result = self._values.to_pandas(arrow_type=arrow_type) - if arrow_type: - return pd.Index(result, name=self.name) - else: - freq = ( - self._freq._maybe_as_fast_pandas_offset() - if self._freq is not None - else None - ) - return pd.DatetimeIndex(result, name=self.name, freq=freq) + result = super().to_pandas(nullable=nullable, arrow_type=arrow_type) + if not arrow_type and self._freq is not None: + result.freq = self._freq._maybe_as_fast_pandas_offset() + return result @_cudf_nvtx_annotate def _get_dt_field(self, field): @@ -2527,23 +2515,6 @@ def __getitem__(self, index): return pd.Timedelta(value) return value - @_cudf_nvtx_annotate - def to_pandas( - self, *, nullable: bool = False, arrow_type: bool = False - ) -> pd.TimedeltaIndex: - if arrow_type and nullable: - raise ValueError( - f"{arrow_type=} and {nullable=} cannot both be set." 
- ) - elif nullable: - raise NotImplementedError(f"{nullable=} is not implemented.") - - result = self._values.to_pandas(arrow_type=arrow_type) - if arrow_type: - return pd.Index(result, name=self.name) - else: - return pd.TimedeltaIndex(result, name=self.name) - @property # type: ignore @_cudf_nvtx_annotate def days(self): diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index a5b204ef346..169f7c11cf9 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -2022,11 +2022,11 @@ def to_pandas( index = self.index.to_pandas() else: index = None # type: ignore[assignment] - s = self._column.to_pandas( - index=index, nullable=nullable, arrow_type=arrow_type + return pd.Series( + self._column.to_pandas(nullable=nullable, arrow_type=arrow_type), + index=index, + name=self.name, ) - s.name = self.name - return s @property # type: ignore @_cudf_nvtx_annotate diff --git a/python/cudf/cudf/tests/test_cuda_array_interface.py b/python/cudf/cudf/tests/test_cuda_array_interface.py index f98c3ad0475..06d63561fc1 100644 --- a/python/cudf/cudf/tests/test_cuda_array_interface.py +++ b/python/cudf/cudf/tests/test_cuda_array_interface.py @@ -175,12 +175,12 @@ def test_column_from_ephemeral_cupy_try_lose_reference(): a = cudf.Series(cupy.asarray([1, 2, 3]))._column a = cudf.core.column.as_column(a) b = cupy.asarray([1, 1, 1]) # noqa: F841 - assert_eq(pd.Series([1, 2, 3]), a.to_pandas()) + assert_eq(pd.Index([1, 2, 3]), a.to_pandas()) a = cudf.Series(cupy.asarray([1, 2, 3]))._column a.name = "b" b = cupy.asarray([1, 1, 1]) # noqa: F841 - assert_eq(pd.Series([1, 2, 3]), a.to_pandas()) + assert_eq(pd.Index([1, 2, 3]), a.to_pandas()) @pytest.mark.xfail( From 22ef0634f07f7b40d718e80bed176e88ac734ebe Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Tue, 4 Jun 2024 14:58:11 -1000 Subject: [PATCH 11/48] Remove internal usage of core.index.as_index in favor of cudf.Index (#15851) `cudf.Index.__init__` essentially calls `as_index` immediately internally. 
To avoid both from potentially diverging, the public `cudf.Index` should be preferred to ensure the public behaviors are used internally Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/15851 --- python/cudf/cudf/core/algorithms.py | 4 +- python/cudf/cudf/core/column/methods.py | 4 +- python/cudf/cudf/core/column/string.py | 4 +- python/cudf/cudf/core/cut.py | 4 +- python/cudf/cudf/core/dataframe.py | 36 +++++++-------- python/cudf/cudf/core/dtypes.py | 4 +- python/cudf/cudf/core/groupby/groupby.py | 6 +-- python/cudf/cudf/core/index.py | 30 +++++-------- python/cudf/cudf/core/indexed_frame.py | 4 +- python/cudf/cudf/core/multiindex.py | 7 +-- python/cudf/cudf/core/series.py | 8 ++-- python/cudf/cudf/core/tools/datetimes.py | 5 +-- python/cudf/cudf/tests/test_array_function.py | 4 +- python/cudf/cudf/tests/test_binops.py | 31 +++++++------ python/cudf/cudf/tests/test_contains.py | 6 +-- python/cudf/cudf/tests/test_dlpack.py | 2 +- python/cudf/cudf/tests/test_index.py | 44 ++++++++----------- python/cudf/cudf/tests/test_multiindex.py | 7 +-- python/cudf/cudf/tests/test_string.py | 38 ++++++++-------- .../cudf/cudf/tests/text/test_text_methods.py | 8 ++-- 20 files changed, 116 insertions(+), 140 deletions(-) diff --git a/python/cudf/cudf/core/algorithms.py b/python/cudf/cudf/core/algorithms.py index 272abdece9e..51a32e29886 100644 --- a/python/cudf/cudf/core/algorithms.py +++ b/python/cudf/cudf/core/algorithms.py @@ -6,7 +6,7 @@ from cudf.core.column import as_column from cudf.core.copy_types import BooleanMask -from cudf.core.index import RangeIndex, as_index +from cudf.core.index import Index, RangeIndex from cudf.core.indexed_frame import IndexedFrame from cudf.core.scalar import Scalar from cudf.options import get_option @@ -107,7 +107,7 @@ def factorize(values, sort=False, use_na_sentinel=True, size_hint=None): dtype="int64" if get_option("mode.pandas_compatible") else None, ).values - return labels, cats.values if return_cupy_array else as_index(cats) + return labels, cats.values if return_cupy_array else Index(cats) def _linear_interpolation(column, index=None): diff --git a/python/cudf/cudf/core/column/methods.py b/python/cudf/cudf/core/column/methods.py index e827c7a3dd3..7f7355c571a 100644 --- a/python/cudf/cudf/core/column/methods.py +++ b/python/cudf/cudf/core/column/methods.py @@ -93,8 +93,6 @@ def _return_or_inplace( else: return cudf.Series(new_col, name=self._parent.name) elif isinstance(self._parent, cudf.BaseIndex): - return cudf.core.index.as_index( - new_col, name=self._parent.name - ) + return cudf.Index(new_col, name=self._parent.name) else: return self._parent._mimic_inplace(new_col, inplace=False) diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index fd98d0dc163..d12aa80e9a3 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -4391,7 +4391,7 @@ def code_points(self) -> SeriesOrIndex: if isinstance(self._parent, cudf.Series): return cudf.Series(new_col, name=self._parent.name) elif isinstance(self._parent, cudf.BaseIndex): - return cudf.core.index.as_index(new_col, name=self._parent.name) + return cudf.Index(new_col, name=self._parent.name) else: return new_col @@ -4706,7 +4706,7 @@ def character_tokenize(self) -> SeriesOrIndex: index = self._parent.index.repeat(lengths) return cudf.Series(result_col, name=self._parent.name, index=index) elif isinstance(self._parent, 
cudf.BaseIndex): - return cudf.core.index.as_index(result_col, name=self._parent.name) + return cudf.Index(result_col, name=self._parent.name) else: return result_col diff --git a/python/cudf/cudf/core/cut.py b/python/cudf/cudf/core/cut.py index ccf730c91fb..54c5e829e8a 100644 --- a/python/cudf/cudf/core/cut.py +++ b/python/cudf/cudf/core/cut.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021-2023, NVIDIA CORPORATION. +# Copyright (c) 2021-2024, NVIDIA CORPORATION. from collections import abc @@ -292,7 +292,7 @@ def cut( ) # we return a categorical index, as we don't have a Categorical method - categorical_index = cudf.core.index.as_index(col) + categorical_index = cudf.Index(col) if isinstance(orig_x, (pd.Series, cudf.Series)): # if we have a series input we return a series output diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 4c55b5427de..c8f1e872300 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -712,7 +712,7 @@ def __init__( data = data.reindex(index) index = data.index else: - index = as_index(index) + index = cudf.Index(index) else: index = data.index @@ -761,7 +761,7 @@ def __init__( if index is None: self._index = RangeIndex(0) else: - self._index = as_index(index) + self._index = cudf.Index(index) if columns is not None: rangeindex = isinstance( columns, (range, pd.RangeIndex, cudf.RangeIndex) @@ -875,7 +875,7 @@ def _init_from_series_list(self, data, columns, index): # When `index` is `None`, the final index of # resulting dataframe will be union of # all Series's names. - final_index = as_index(_get_union_of_series_names(data)) + final_index = cudf.Index(_get_union_of_series_names(data)) else: # When an `index` is passed, the final index of # resulting dataframe will be whatever @@ -919,7 +919,7 @@ def _init_from_series_list(self, data, columns, index): f"not match length of index ({index_length})" ) - final_index = as_index(index) + final_index = cudf.Index(index) series_lengths = list(map(len, data)) data = numeric_normalize_types(*data) @@ -943,7 +943,7 @@ def _init_from_series_list(self, data, columns, index): # Setting `final_columns` to self._index so # that the resulting `transpose` will be have # columns set to `final_columns` - self._index = as_index(final_columns) + self._index = cudf.Index(final_columns) transpose = self.T else: @@ -987,9 +987,9 @@ def _init_from_list_like(self, data, index=None, columns=None): if index is None: index = RangeIndex(start=0, stop=len(data)) else: - index = as_index(index) + index = cudf.Index(index) - self._index = as_index(index) + self._index = cudf.Index(index) # list-of-dicts case if len(data) > 0 and isinstance(data[0], dict): data = DataFrame.from_pandas(pd.DataFrame(data)) @@ -1095,7 +1095,7 @@ def _init_from_dict_like( self._index = RangeIndex(0, num_rows) else: - self._index = as_index(index) + self._index = cudf.Index(index) if len(data): self._data.multiindex = True @@ -1410,7 +1410,7 @@ def __setitem__(self, arg, value): new_columns, verify=False ) if isinstance(value, (pd.Series, Series)): - self._index = as_index(value.index) + self._index = cudf.Index(value.index) elif len(value) > 0: self._index = RangeIndex(length) return @@ -1728,7 +1728,7 @@ def _concat( for cols in columns: table_index = None if 1 == first_data_column_position: - table_index = cudf.core.index.as_index(cols[0]) + table_index = cudf.Index(cols[0]) elif first_data_column_position > 1: table_index = DataFrame._from_data( data=dict( @@ -1780,9 +1780,7 @@ def _concat( if not 
isinstance(out.index, MultiIndex) and isinstance( out.index.dtype, cudf.CategoricalDtype ): - out = out.set_index( - cudf.core.index.as_index(out.index._values) - ) + out = out.set_index(cudf.Index(out.index._values)) for name, col in out._data.items(): out._data[name] = col._with_type_metadata( tables[0]._data[name].dtype @@ -2828,7 +2826,7 @@ def reindex( if columns is None: df = self else: - columns = as_index(columns) + columns = cudf.Index(columns) intersection = self._data.to_pandas_index().intersection( columns.to_pandas() ) @@ -3245,7 +3243,7 @@ def _insert(self, loc, name, value, nan_as_null=None, ignore_index=True): if len(self) == 0: if isinstance(value, (pd.Series, Series)): if not ignore_index: - self.index = as_index(value.index) + self.index = cudf.Index(value.index) elif (length := len(value)) > 0: if num_cols != 0: ca = self._data._from_columns_like_self( @@ -5654,7 +5652,7 @@ def from_records(cls, data, index=None, columns=None, nan_as_null=False): } if not is_scalar(index): - new_index = as_index(index) + new_index = cudf.Index(index) else: new_index = None @@ -5738,7 +5736,7 @@ def _from_arrays(cls, data, index=None, columns=None, nan_as_null=False): } if index is not None: - index = as_index(index) + index = cudf.Index(index) if isinstance(columns, (pd.Index, cudf.Index)): level_names = tuple(columns.names) @@ -6171,7 +6169,7 @@ def count(self, axis=0, numeric_only=False): for col in self._data.names ] }, - as_index(self._data.names), + cudf.Index(self._data.names), ) _SUPPORT_AXIS_LOOKUP = { @@ -6298,7 +6296,7 @@ def _reduce( source._data.names, names=source._data.level_names ) else: - idx = as_index(source._data.names) + idx = cudf.Index(source._data.names) return Series._from_data({None: as_column(result)}, idx) elif axis == 1: return source._apply_cupy_method_axis_1(op, **kwargs) diff --git a/python/cudf/cudf/core/dtypes.py b/python/cudf/cudf/core/dtypes.py index 9bb1995b836..4729233ee6e 100644 --- a/python/cudf/cudf/core/dtypes.py +++ b/python/cudf/cudf/core/dtypes.py @@ -186,10 +186,10 @@ def categories(self) -> "cudf.core.index.Index": Index(['b', 'a'], dtype='object') """ if self._categories is None: - return cudf.core.index.as_index( + return cudf.Index( cudf.core.column.column_empty(0, dtype="object", masked=False) ) - return cudf.core.index.as_index(self._categories, copy=False) + return cudf.Index(self._categories, copy=False) @property def type(self): diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index 3e7a1ee6026..ac8b381cbec 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -2800,15 +2800,13 @@ def keys(self): nkeys = len(self._key_columns) if nkeys == 0: - return cudf.core.index.as_index([], name=None) + return cudf.Index([], name=None) elif nkeys > 1: return cudf.MultiIndex._from_data( dict(zip(range(nkeys), self._key_columns)) )._set_names(self.names) else: - return cudf.core.index.as_index( - self._key_columns[0], name=self.names[0] - ) + return cudf.Index(self._key_columns[0], name=self.names[0]) @property def values(self) -> cudf.core.frame.Frame: diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 4b09765fa46..7297ac4e929 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -1169,7 +1169,7 @@ def _concat(cls, objs): result = _concat_range_index(non_empties) else: data = concat_columns([o._values for o in non_empties]) - result = as_index(data) + result = Index(data) names = {obj.name 
for obj in objs} if len(names) == 1: @@ -1437,7 +1437,7 @@ def __repr__(self): def __getitem__(self, index): res = self._get_elements_from_column(index) if isinstance(res, ColumnBase): - res = as_index(res, name=self.name) + res = Index(res, name=self.name) return res @property # type: ignore @@ -1958,7 +1958,7 @@ def microsecond(self): >>> datetime_index.microsecond Index([0, 1, 2], dtype='int32') """ # noqa: E501 - return as_index( + return Index( ( # Need to manually promote column to int32 because # pandas-matching binop behaviour requires that this @@ -2209,7 +2209,7 @@ def _get_dt_field(self, field): mask=out_column.base_mask, offset=out_column.offset, ) - return as_index(out_column, name=self.name) + return Index(out_column, name=self.name) def _is_boolean(self): return False @@ -2522,9 +2522,7 @@ def days(self): Number of days for each element. """ # Need to specifically return `int64` to avoid overflow. - return as_index( - arbitrary=self._values.days, name=self.name, dtype="int64" - ) + return Index(self._values.days, name=self.name, dtype="int64") @property # type: ignore @_cudf_nvtx_annotate @@ -2532,9 +2530,7 @@ def seconds(self): """ Number of seconds (>= 0 and less than 1 day) for each element. """ - return as_index( - arbitrary=self._values.seconds, name=self.name, dtype="int32" - ) + return Index(self._values.seconds, name=self.name, dtype="int32") @property # type: ignore @_cudf_nvtx_annotate @@ -2542,9 +2538,7 @@ def microseconds(self): """ Number of microseconds (>= 0 and less than 1 second) for each element. """ - return as_index( - arbitrary=self._values.microseconds, name=self.name, dtype="int32" - ) + return Index(self._values.microseconds, name=self.name, dtype="int32") @property # type: ignore @_cudf_nvtx_annotate @@ -2553,9 +2547,7 @@ def nanoseconds(self): Number of nanoseconds (>= 0 and less than 1 microsecond) for each element. """ - return as_index( - arbitrary=self._values.nanoseconds, name=self.name, dtype="int32" - ) + return Index(self._values.nanoseconds, name=self.name, dtype="int32") @property # type: ignore @_cudf_nvtx_annotate @@ -2693,7 +2685,7 @@ def codes(self): """ The category codes of this categorical. 
""" - return as_index(self._values.codes) + return Index(self._values.codes) @property # type: ignore @_cudf_nvtx_annotate @@ -3137,7 +3129,7 @@ def _concat_range_index(indexes: List[RangeIndex]) -> BaseIndex: elif step is None: # First non-empty index had only one element if obj.start == start: - result = as_index(concat_columns([x._values for x in indexes])) + result = Index(concat_columns([x._values for x in indexes])) return result step = obj.start - start @@ -3145,7 +3137,7 @@ def _concat_range_index(indexes: List[RangeIndex]) -> BaseIndex: next_ is not None and obj.start != next_ ) if non_consecutive: - result = as_index(concat_columns([x._values for x in indexes])) + result = Index(concat_columns([x._values for x in indexes])) return result if step is not None: next_ = obj[-1] + step diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 5a466f20f8c..688b268d478 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -3638,7 +3638,7 @@ def _align_to_index( sort: bool = True, allow_non_unique: bool = False, ) -> Self: - index = cudf.core.index.as_index(index) + index = cudf.Index(index) if self.index.equals(index): return self @@ -3713,7 +3713,7 @@ def _reindex( raise ValueError( "cannot reindex on an axis with duplicate labels" ) - index = cudf.core.index.as_index( + index = cudf.Index( index, name=getattr(index, "name", self.index.name) ) diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index 049fac45ba8..11b4b9154a2 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -30,7 +30,6 @@ BaseIndex, _get_indexer_basic, _lexsorted_equal_range, - as_index, ) from cudf.core.join._join_helpers import _match_join_keys from cudf.utils.dtypes import is_column_like @@ -824,7 +823,7 @@ def _index_and_downcast(self, result, index, index_key): # it into an Index and name the final index values according # to that column's name. 
*_, last_column = index._data.columns - out_index = as_index(last_column) + out_index = cudf.Index(last_column) out_index.name = index.names[-1] index = out_index elif out_index._num_columns > 1: @@ -1082,7 +1081,9 @@ def get_level_values(self, level): raise KeyError(f"Level not found: '{level}'") else: level_idx = colnames.index(level) - level_values = as_index(self._data[level], name=self.names[level_idx]) + level_values = cudf.Index( + self._data[level], name=self.names[level_idx] + ) return level_values def _is_numeric(self): diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 169f7c11cf9..a52b583d3b4 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -596,7 +596,7 @@ def __init__( name_from_data = data.name column = as_column(data, nan_as_null=nan_as_null, dtype=dtype) if isinstance(data, pd.Series): - index_from_data = as_index(data.index) + index_from_data = cudf.Index(data.index) elif isinstance(data, Series): index_from_data = data.index elif isinstance(data, ColumnAccessor): @@ -612,7 +612,7 @@ def __init__( column = as_column( list(data.values()), nan_as_null=nan_as_null, dtype=dtype ) - index_from_data = as_index(list(data.keys())) + index_from_data = cudf.Index(list(data.keys())) else: # Using `getattr_static` to check if # `data` is on device memory and perform @@ -649,7 +649,7 @@ def __init__( name = name_from_data if index is not None: - index = as_index(index) + index = cudf.Index(index) if index_from_data is not None: first_index = index_from_data @@ -5241,7 +5241,7 @@ def isclose(a, b, rtol=1e-05, atol=1e-08, equal_nan=False): if isinstance(a, cudf.Series) and isinstance(b, cudf.Series): b = b.reindex(a.index) - index = as_index(a.index) + index = cudf.Index(a.index) a_col = as_column(a) a_array = cupy.asarray(a_col.data_array_view(mode="read")) diff --git a/python/cudf/cudf/core/tools/datetimes.py b/python/cudf/cudf/core/tools/datetimes.py index 12a1ecc68e0..f002a838fa9 100644 --- a/python/cudf/cudf/core/tools/datetimes.py +++ b/python/cudf/cudf/core/tools/datetimes.py @@ -18,7 +18,6 @@ ) from cudf.api.types import is_integer, is_scalar from cudf.core import column -from cudf.core.index import as_index # https://github.com/pandas-dev/pandas/blob/2.2.x/pandas/core/tools/datetimes.py#L1112 _unit_map = { @@ -287,13 +286,13 @@ def to_datetime( utc=utc, ) if isinstance(arg, (cudf.BaseIndex, pd.Index)): - return as_index(col, name=arg.name) + return cudf.Index(col, name=arg.name) elif isinstance(arg, (cudf.Series, pd.Series)): return cudf.Series(col, index=arg.index, name=arg.name) elif is_scalar(arg): return col.element_indexing(0) else: - return as_index(col) + return cudf.Index(col) except Exception as e: if errors == "raise": raise e diff --git a/python/cudf/cudf/tests/test_array_function.py b/python/cudf/cudf/tests/test_array_function.py index 58939f0ddd9..e6b89e2c5fa 100644 --- a/python/cudf/cudf/tests/test_array_function.py +++ b/python/cudf/cudf/tests/test_array_function.py @@ -108,7 +108,7 @@ def test_array_func_missing_cudf_dataframe(pd_df, func): ], ) def test_array_func_cudf_index(np_ar, func): - cudf_index = cudf.core.index.as_index(cudf.Series(np_ar)) + cudf_index = cudf.Index(cudf.Series(np_ar)) expect = func(np_ar) got = func(cudf_index) if np.isscalar(expect): @@ -128,7 +128,7 @@ def test_array_func_cudf_index(np_ar, func): ], ) def test_array_func_missing_cudf_index(np_ar, func): - cudf_index = cudf.core.index.as_index(cudf.Series(np_ar)) + cudf_index = cudf.Index(cudf.Series(np_ar)) with 
pytest.raises(TypeError): func(cudf_index) diff --git a/python/cudf/cudf/tests/test_binops.py b/python/cudf/cudf/tests/test_binops.py index 5d0c403daa2..fa371914c3e 100644 --- a/python/cudf/cudf/tests/test_binops.py +++ b/python/cudf/cudf/tests/test_binops.py @@ -12,10 +12,9 @@ import pytest import cudf -from cudf import Series +from cudf import Index, Series from cudf.core._compat import PANDAS_CURRENT_SUPPORTED_VERSION, PANDAS_VERSION from cudf.core.buffer.spill_manager import get_global_manager -from cudf.core.index import as_index from cudf.testing import _utils as utils from cudf.utils.dtypes import ( BOOL_TYPES, @@ -186,8 +185,8 @@ def test_series_binop(binop, obj_class): sr2 = Series(arr2) if obj_class == "Index": - sr1 = as_index(sr1) - sr2 = as_index(sr2) + sr1 = Index(sr1) + sr2 = Index(sr2) result = binop(sr1, sr2) expect = binop(pd.Series(arr1), pd.Series(arr2)) @@ -225,7 +224,7 @@ def test_series_binop_scalar(nelem, binop, obj_class, use_cudf_scalar): sr = Series(arr) if obj_class == "Index": - sr = as_index(sr) + sr = Index(sr) if use_cudf_scalar: result = binop(sr, rhs) @@ -251,8 +250,8 @@ def test_series_bitwise_binop(binop, obj_class, lhs_dtype, rhs_dtype): sr2 = Series(arr2) if obj_class == "Index": - sr1 = as_index(sr1) - sr2 = as_index(sr2) + sr1 = Index(sr1) + sr2 = Index(sr2) result = binop(sr1, sr2) @@ -274,8 +273,8 @@ def test_series_compare(cmpop, obj_class, dtype): sr2 = Series(arr2) if obj_class == "Index": - sr1 = as_index(sr1) - sr2 = as_index(sr2) + sr1 = Index(sr1) + sr2 = Index(sr2) result1 = cmpop(sr1, sr1) result2 = cmpop(sr2, sr2) @@ -402,7 +401,7 @@ def test_series_compare_scalar( rhs = cudf.Scalar(rhs) if obj_class == "Index": - sr1 = as_index(sr1) + sr1 = Index(sr1) result1 = cmpop(sr1, rhs) result2 = cmpop(rhs, sr1) @@ -488,8 +487,8 @@ def test_series_binop_mixed_dtype(binop, lhs_dtype, rhs_dtype, obj_class): sr2 = Series(rhs) if obj_class == "Index": - sr1 = as_index(sr1) - sr2 = as_index(sr2) + sr1 = Index(sr1) + sr2 = Index(sr2) result = binop(Series(sr1), Series(sr2)) @@ -513,8 +512,8 @@ def test_series_cmpop_mixed_dtype(cmpop, lhs_dtype, rhs_dtype, obj_class): sr2 = Series(rhs) if obj_class == "Index": - sr1 = as_index(sr1) - sr2 = as_index(sr2) + sr1 = Index(sr1) + sr2 = Index(sr2) result = cmpop(Series(sr1), Series(sr2)) @@ -538,7 +537,7 @@ def test_series_reflected_ops_scalar(func, dtype, obj_class): # class typing if obj_class == "Index": - gs = as_index(gs) + gs = Index(gs) gs_result = func(gs) @@ -588,7 +587,7 @@ def test_series_reflected_ops_cudf_scalar(funcs, dtype, obj_class): # class typing if obj_class == "Index": - gs = as_index(gs) + gs = Index(gs) gs_result = gpu_func(gs) diff --git a/python/cudf/cudf/tests/test_contains.py b/python/cudf/cudf/tests/test_contains.py index 15dfa111860..a65ab1780b6 100644 --- a/python/cudf/cudf/tests/test_contains.py +++ b/python/cudf/cudf/tests/test_contains.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2022, NVIDIA CORPORATION. +# Copyright (c) 2019-2024, NVIDIA CORPORATION. 
import datetime @@ -8,7 +8,7 @@ import cudf from cudf import Series -from cudf.core.index import RangeIndex, as_index +from cudf.core.index import Index, RangeIndex from cudf.testing._utils import ( DATETIME_TYPES, NUMERIC_TYPES, @@ -74,7 +74,7 @@ def test_series_contains(values, item, expected): @pytest.mark.parametrize("values, item, expected", testdata_all) def test_index_contains(values, item, expected): - index = as_index(values) + index = Index(values) assert_eq(expected, item in index) diff --git a/python/cudf/cudf/tests/test_dlpack.py b/python/cudf/cudf/tests/test_dlpack.py index aafe920d3a1..7ea3979b0f1 100644 --- a/python/cudf/cudf/tests/test_dlpack.py +++ b/python/cudf/cudf/tests/test_dlpack.py @@ -101,7 +101,7 @@ def test_to_dlpack_index(data_1d): with expectation: if np.isnan(data_1d).any(): pytest.skip("Nulls not allowed in Index") - gi = cudf.core.index.as_index(data_1d) + gi = cudf.Index(data_1d) dlt = gi.to_dlpack() # PyCapsules are a C-API thing so couldn't come up with a better way diff --git a/python/cudf/cudf/tests/test_index.py b/python/cudf/cudf/tests/test_index.py index b92ae1b3364..3d6c71ebc1b 100644 --- a/python/cudf/cudf/tests/test_index.py +++ b/python/cudf/cudf/tests/test_index.py @@ -17,13 +17,7 @@ import cudf from cudf.api.extensions import no_default from cudf.api.types import is_bool_dtype -from cudf.core.index import ( - CategoricalIndex, - DatetimeIndex, - Index, - RangeIndex, - as_index, -) +from cudf.core.index import CategoricalIndex, DatetimeIndex, Index, RangeIndex from cudf.testing._utils import ( ALL_TYPES, FLOAT_TYPES, @@ -200,11 +194,11 @@ def test_pandas_as_index(): pdf_category_index = pd.CategoricalIndex(["a", "b", "c", "b", "a"]) # Define cudf Indexes - gdf_int_index = as_index(pdf_int_index) - gdf_uint_index = as_index(pdf_uint_index) - gdf_float_index = as_index(pdf_float_index) - gdf_datetime_index = as_index(pdf_datetime_index) - gdf_category_index = as_index(pdf_category_index) + gdf_int_index = Index(pdf_int_index) + gdf_uint_index = Index(pdf_uint_index) + gdf_float_index = Index(pdf_float_index) + gdf_datetime_index = Index(pdf_datetime_index) + gdf_category_index = Index(pdf_category_index) # Check instance types assert isinstance(gdf_int_index, Index) @@ -232,7 +226,7 @@ def test_pandas_as_index(): @pytest.mark.parametrize("name", SERIES_OR_INDEX_NAMES) def test_index_rename(initial_name, name): pds = pd.Index([1, 2, 3], name=initial_name) - gds = as_index(pds) + gds = Index(pds) assert_eq(pds, gds) @@ -245,18 +239,18 @@ def test_index_rename(initial_name, name): and if name is being handles in recursive creation. 
""" pds = pd.Index(expect) - gds = as_index(got) + gds = Index(got) assert_eq(pds, gds) pds = pd.Index(pds, name="abc") - gds = as_index(gds, name="abc") + gds = Index(gds, name="abc") assert_eq(pds, gds) def test_index_rename_inplace(): pds = pd.Index([1, 2, 3], name="asdf") - gds = as_index(pds) + gds = Index(pds) # inplace=False should yield a deep copy gds_renamed_deep = gds.rename("new_name", inplace=False) @@ -280,7 +274,7 @@ def test_index_rename_preserves_arg(): assert idx1.name == "orig_name" # a new object but referencing the same data - idx3 = as_index(idx1, name="last_name") + idx3 = Index(idx1, name="last_name") assert idx3.name == "last_name" assert idx1.name == "orig_name" @@ -456,7 +450,7 @@ def test_from_pandas_gen(): def test_index_names(): - idx = cudf.core.index.as_index([1, 2, 3], name="idx") + idx = Index([1, 2, 3], name="idx") assert idx.names == ("idx",) @@ -874,8 +868,8 @@ def test_index_equals(data, other): pd_data = pd.Index(data) pd_other = pd.Index(other) - gd_data = cudf.core.index.as_index(data) - gd_other = cudf.core.index.as_index(other) + gd_data = Index(data) + gd_other = Index(other) expected = pd_data.equals(pd_other) actual = gd_data.equals(gd_other) @@ -920,8 +914,8 @@ def test_index_categories_equal(data, other): pd_data = pd.Index(data).astype("category") pd_other = pd.Index(other) - gd_data = cudf.core.index.as_index(data).astype("category") - gd_other = cudf.core.index.as_index(other) + gd_data = Index(data).astype("category") + gd_other = Index(other) expected = pd_data.equals(pd_other) actual = gd_data.equals(gd_other) @@ -970,7 +964,7 @@ def test_index_equal_misc(data, other): pd_data = pd.Index(data) pd_other = other - gd_data = cudf.core.index.as_index(data) + gd_data = Index(data) gd_other = other expected = pd_data.equals(pd_other) @@ -1089,8 +1083,8 @@ def test_index_empty_append_name_conflict(): ], ) def test_index_append_error(data, other): - gd_data = cudf.core.index.as_index(data) - gd_other = cudf.core.index.as_index(other) + gd_data = Index(data) + gd_other = Index(other) got_dtype = ( gd_other.dtype diff --git a/python/cudf/cudf/tests/test_multiindex.py b/python/cudf/cudf/tests/test_multiindex.py index dd731fab8f3..f143112a45f 100644 --- a/python/cudf/cudf/tests/test_multiindex.py +++ b/python/cudf/cudf/tests/test_multiindex.py @@ -21,7 +21,6 @@ import cudf from cudf.api.extensions import no_default from cudf.core.column import as_column -from cudf.core.index import as_index from cudf.testing._utils import ( assert_eq, assert_exceptions_equal, @@ -158,8 +157,6 @@ def test_multiindex_swaplevel(): def test_string_index(): - from cudf.core.index import Index - pdf = pd.DataFrame(np.random.rand(5, 5)) gdf = cudf.from_pandas(pdf) stringIndex = ["a", "b", "c", "d", "e"] @@ -170,11 +167,11 @@ def test_string_index(): pdf.index = stringIndex gdf.index = stringIndex assert_eq(pdf, gdf) - stringIndex = Index(["a", "b", "c", "d", "e"], name="name") + stringIndex = cudf.Index(["a", "b", "c", "d", "e"], name="name") pdf.index = stringIndex.to_pandas() gdf.index = stringIndex assert_eq(pdf, gdf) - stringIndex = as_index(as_column(["a", "b", "c", "d", "e"]), name="name") + stringIndex = cudf.Index(as_column(["a", "b", "c", "d", "e"]), name="name") pdf.index = stringIndex.to_pandas() gdf.index = stringIndex assert_eq(pdf, gdf) diff --git a/python/cudf/cudf/tests/test_string.py b/python/cudf/cudf/tests/test_string.py index de771a56e77..801c530da43 100644 --- a/python/cudf/cudf/tests/test_string.py +++ b/python/cudf/cudf/tests/test_string.py @@ -16,7 
+16,7 @@ import cudf from cudf import concat from cudf.core.column.string import StringColumn -from cudf.core.index import Index, as_index +from cudf.core.index import Index from cudf.testing._utils import ( DATETIME_TYPES, NUMERIC_TYPES, @@ -1500,7 +1500,7 @@ def test_strings_partition(data): assert_eq(ps.str.partition(","), gs.str.partition(",")) assert_eq(ps.str.partition("-"), gs.str.partition("-")) - gi = as_index(data, name="new name") + gi = cudf.Index(data, name="new name") pi = pd.Index(data, name="new name") assert_eq(pi.str.partition(), gi.str.partition()) assert_eq(pi.str.partition(","), gi.str.partition(",")) @@ -1639,7 +1639,7 @@ def test_strings_strip_tests(data, to_strip): ps.str.lstrip(to_strip=to_strip), gs.str.lstrip(to_strip=to_strip) ) - gi = as_index(data) + gi = cudf.Index(data) pi = pd.Index(data) assert_eq(pi.str.strip(to_strip=to_strip), gi.str.strip(to_strip=to_strip)) @@ -1696,7 +1696,7 @@ def test_strings_filling_tests(data, width, fillchar): gs.str.rjust(width=width, fillchar=fillchar), ) - gi = as_index(data) + gi = cudf.Index(data) pi = pd.Index(data) assert_eq( @@ -1731,7 +1731,7 @@ def test_strings_zfill_tests(data, width): assert_eq(ps.str.zfill(width=width), gs.str.zfill(width=width)) - gi = as_index(data) + gi = cudf.Index(data) pi = pd.Index(data) assert_eq(pi.str.zfill(width=width), gi.str.zfill(width=width)) @@ -1763,7 +1763,7 @@ def test_strings_pad_tests(data, width, side, fillchar): gs.str.pad(width=width, side=side, fillchar=fillchar), ) - gi = as_index(data) + gi = cudf.Index(data) pi = pd.Index(data) assert_eq( @@ -1807,7 +1807,7 @@ def test_string_wrap(data, width): ), ) - gi = as_index(data) + gi = cudf.Index(data) pi = pd.Index(data) assert_eq( @@ -1941,7 +1941,7 @@ def test_string_replace_with_backrefs(find, replace): expected = ps.str.replace(find, replace, regex=True) assert_eq(got, expected) - got = as_index(gs).str.replace_with_backrefs(find, replace) + got = cudf.Index(gs).str.replace_with_backrefs(find, replace) expected = pd.Index(ps).str.replace(find, replace, regex=True) assert_eq(got, expected) @@ -2227,7 +2227,7 @@ def test_string_str_rindex(data, sub, er): assert_eq(ps.str.rindex(sub), gs.str.rindex(sub), check_dtype=False) assert_eq( pd.Index(ps).str.rindex(sub), - as_index(gs).str.rindex(sub), + cudf.Index(gs).str.rindex(sub), exact=False, ) @@ -2336,7 +2336,7 @@ def test_string_str_match(data, pat): assert_eq(ps.str.match(pat), gs.str.match(pat)) assert_eq( - pd.Index(pd.Index(ps).str.match(pat)), as_index(gs).str.match(pat) + pd.Index(pd.Index(ps).str.match(pat)), cudf.Index(gs).str.match(pat) ) @@ -2363,7 +2363,7 @@ def test_string_str_translate(data): ) assert_eq( pd.Index(ps).str.translate(str.maketrans({"a": "z"})), - as_index(gs).str.translate(str.maketrans({"a": "z"})), + cudf.Index(gs).str.translate(str.maketrans({"a": "z"})), ) assert_eq( ps.str.translate(str.maketrans({"a": "z", "i": "$", "z": "1"})), @@ -2373,7 +2373,7 @@ def test_string_str_translate(data): pd.Index(ps).str.translate( str.maketrans({"a": "z", "i": "$", "z": "1"}) ), - as_index(gs).str.translate( + cudf.Index(gs).str.translate( str.maketrans({"a": "z", "i": "$", "z": "1"}) ), ) @@ -2389,7 +2389,7 @@ def test_string_str_translate(data): pd.Index(ps).str.translate( str.maketrans({"+": "-", "-": "$", "?": "!", "B": "."}) ), - as_index(gs).str.translate( + cudf.Index(gs).str.translate( str.maketrans({"+": "-", "-": "$", "?": "!", "B": "."}) ), ) @@ -2779,8 +2779,8 @@ def test_string_str_byte_count(data, expected): actual = sr.str.byte_count() 
assert_eq(expected, actual) - si = as_index(data) - expected = as_index(expected, dtype="int32") + si = cudf.Index(data) + expected = cudf.Index(expected, dtype="int32") actual = si.str.byte_count() assert_eq(expected, actual) @@ -2828,8 +2828,8 @@ def test_str_isinteger(data, expected): actual = sr.str.isinteger() assert_eq(expected, actual) - sr = as_index(data) - expected = as_index(expected) + sr = cudf.Index(data) + expected = cudf.Index(expected) actual = sr.str.isinteger() assert_eq(expected, actual) @@ -2884,8 +2884,8 @@ def test_str_isfloat(data, expected): actual = sr.str.isfloat() assert_eq(expected, actual) - sr = as_index(data) - expected = as_index(expected) + sr = cudf.Index(data) + expected = cudf.Index(expected) actual = sr.str.isfloat() assert_eq(expected, actual) diff --git a/python/cudf/cudf/tests/text/test_text_methods.py b/python/cudf/cudf/tests/text/test_text_methods.py index 6ecead862bb..6bd3b99bae1 100644 --- a/python/cudf/cudf/tests/text/test_text_methods.py +++ b/python/cudf/cudf/tests/text/test_text_methods.py @@ -539,7 +539,7 @@ def test_character_tokenize_series(): def test_character_tokenize_index(): - sr = cudf.core.index.as_index( + sr = cudf.Index( [ "hello world", "sdf", @@ -550,7 +550,7 @@ def test_character_tokenize_index(): ), ] ) - expected = cudf.core.index.as_index( + expected = cudf.Index( [ "h", "e", @@ -648,8 +648,8 @@ def test_character_tokenize_index(): actual = sr.str.character_tokenize() assert_eq(expected, actual) - sr = cudf.core.index.as_index(["a"]) - expected = cudf.core.index.as_index(["a"]) + sr = cudf.Index(["a"]) + expected = cudf.Index(["a"]) actual = sr.str.character_tokenize() assert_eq(expected, actual) From dc829b8372487615b74494a19c63d43cdbdb0d79 Mon Sep 17 00:00:00 2001 From: Ray Douglass Date: Wed, 5 Jun 2024 10:10:11 -0400 Subject: [PATCH 12/48] Update Changelog [skip ci] --- CHANGELOG.md | 306 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 306 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 7ecad2c9c39..a5efe4eb9e5 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,309 @@ +# cudf 24.06.00 (5 Jun 2024) + +## 🚨 Breaking Changes + +- Deprecate `Groupby.collect` ([#15808](https://github.com/rapidsai/cudf/pull/15808)) [@galipremsagar](https://github.com/galipremsagar) +- Raise FileNotFoundError when a literal JSON string that looks like a json filename is passed ([#15806](https://github.com/rapidsai/cudf/pull/15806)) [@lithomas1](https://github.com/lithomas1) +- Support filtered I/O in `chunked_parquet_reader` and simplify the use of `parquet_reader_options` ([#15764](https://github.com/rapidsai/cudf/pull/15764)) [@mhaseeb123](https://github.com/mhaseeb123) +- Raise errors for unsupported operations on certain types ([#15712](https://github.com/rapidsai/cudf/pull/15712)) [@galipremsagar](https://github.com/galipremsagar) +- Support `DurationType` in cudf parquet reader via `arrow:schema` ([#15617](https://github.com/rapidsai/cudf/pull/15617)) [@mhaseeb123](https://github.com/mhaseeb123) +- Remove protobuf and use parsed ORC statistics from libcudf ([#15564](https://github.com/rapidsai/cudf/pull/15564)) [@bdice](https://github.com/bdice) +- Remove legacy JSON reader from Python ([#15538](https://github.com/rapidsai/cudf/pull/15538)) [@bdice](https://github.com/bdice) +- Removing all batching code from parquet writer ([#15528](https://github.com/rapidsai/cudf/pull/15528)) [@mhaseeb123](https://github.com/mhaseeb123) +- Convert libcudf resource parameters to rmm::device_async_resource_ref 
([#15507](https://github.com/rapidsai/cudf/pull/15507)) [@harrism](https://github.com/harrism) +- Remove deprecated strings offsets_begin ([#15454](https://github.com/rapidsai/cudf/pull/15454)) [@davidwendt](https://github.com/davidwendt) +- Floating <--> fixed-point conversion must now be called explicitly ([#15438](https://github.com/rapidsai/cudf/pull/15438)) [@pmattione-nvidia](https://github.com/pmattione-nvidia) +- Bind `read_parquet_metadata` API to libcudf instead of pyarrow and extract `RowGroup` information ([#15398](https://github.com/rapidsai/cudf/pull/15398)) [@mhaseeb123](https://github.com/mhaseeb123) +- Remove deprecated hash() and spark_murmurhash3_x86_32() ([#15375](https://github.com/rapidsai/cudf/pull/15375)) [@davidwendt](https://github.com/davidwendt) +- Remove empty elements from exploded character-ngrams output ([#15371](https://github.com/rapidsai/cudf/pull/15371)) [@davidwendt](https://github.com/davidwendt) +- [FEA] Performance improvement for mixed left semi/anti join ([#15288](https://github.com/rapidsai/cudf/pull/15288)) [@tgujar](https://github.com/tgujar) +- Align date_range defaults with pandas, support tz ([#15139](https://github.com/rapidsai/cudf/pull/15139)) [@mroeschke](https://github.com/mroeschke) + +## 🐛 Bug Fixes + +- Revert "Fix docs for IO readers and strings_convert" ([#15872](https://github.com/rapidsai/cudf/pull/15872)) [@vyasr](https://github.com/vyasr) +- Remove problematic call of index setter to unblock dask-cuda CI ([#15844](https://github.com/rapidsai/cudf/pull/15844)) [@charlesbluca](https://github.com/charlesbluca) +- Use rapids_cpm_nvtx3 to get same nvtx3 target state as rmm ([#15840](https://github.com/rapidsai/cudf/pull/15840)) [@robertmaynard](https://github.com/robertmaynard) +- Return boolean from config_host_memory_resource instead of throwing ([#15815](https://github.com/rapidsai/cudf/pull/15815)) [@abellina](https://github.com/abellina) +- Add temporary dask-cudf workaround for categorical sorting ([#15801](https://github.com/rapidsai/cudf/pull/15801)) [@rjzamora](https://github.com/rjzamora) +- Fix row group alignment in ORC writer ([#15789](https://github.com/rapidsai/cudf/pull/15789)) [@vuule](https://github.com/vuule) +- Raise error when sorting by categorical column in dask-cudf ([#15788](https://github.com/rapidsai/cudf/pull/15788)) [@rjzamora](https://github.com/rjzamora) +- Upgrade `arrow` to 16.1 ([#15787](https://github.com/rapidsai/cudf/pull/15787)) [@galipremsagar](https://github.com/galipremsagar) +- Add support for `PandasArray` for `pandas<2.1.0` ([#15786](https://github.com/rapidsai/cudf/pull/15786)) [@galipremsagar](https://github.com/galipremsagar) +- Limit runtime dependency to `libarrow>=16.0.0,<16.1.0a0` ([#15782](https://github.com/rapidsai/cudf/pull/15782)) [@pentschev](https://github.com/pentschev) +- Fix cat.as_ordered not propogating correct size ([#15780](https://github.com/rapidsai/cudf/pull/15780)) [@mroeschke](https://github.com/mroeschke) +- Handle mixed-like homogeneous types in `isin` ([#15771](https://github.com/rapidsai/cudf/pull/15771)) [@galipremsagar](https://github.com/galipremsagar) +- Fix id_vars and value_vars not accepting string scalars in melt ([#15765](https://github.com/rapidsai/cudf/pull/15765)) [@mroeschke](https://github.com/mroeschke) +- Fix `DatetimeIndex.loc` for all types of ordering cases ([#15761](https://github.com/rapidsai/cudf/pull/15761)) [@galipremsagar](https://github.com/galipremsagar) +- Fix arrow versioning logic 
([#15755](https://github.com/rapidsai/cudf/pull/15755)) [@vyasr](https://github.com/vyasr) +- Avoid running sanitizer on Java test designed to cause an error ([#15753](https://github.com/rapidsai/cudf/pull/15753)) [@jlowe](https://github.com/jlowe) +- Handle empty dataframe object with index present in setitem of `loc` ([#15752](https://github.com/rapidsai/cudf/pull/15752)) [@galipremsagar](https://github.com/galipremsagar) +- Eliminate circular reference in DataFrame/Series.iloc/loc ([#15749](https://github.com/rapidsai/cudf/pull/15749)) [@mroeschke](https://github.com/mroeschke) +- Cap the absolute row index per pass in parquet chunked reader. ([#15735](https://github.com/rapidsai/cudf/pull/15735)) [@nvdbaranec](https://github.com/nvdbaranec) +- Fix `Index.repeat` for `datetime64` types ([#15722](https://github.com/rapidsai/cudf/pull/15722)) [@galipremsagar](https://github.com/galipremsagar) +- Fix multibyte check for case convert for large strings ([#15721](https://github.com/rapidsai/cudf/pull/15721)) [@davidwendt](https://github.com/davidwendt) +- Fix `get_loc` to properly fetch results from an index that is in decreasing order ([#15719](https://github.com/rapidsai/cudf/pull/15719)) [@galipremsagar](https://github.com/galipremsagar) +- Return same type as the original index for `.loc` operations ([#15717](https://github.com/rapidsai/cudf/pull/15717)) [@galipremsagar](https://github.com/galipremsagar) +- Correct static builds + static arrow ([#15715](https://github.com/rapidsai/cudf/pull/15715)) [@robertmaynard](https://github.com/robertmaynard) +- Raise errors for unsupported operations on certain types ([#15712](https://github.com/rapidsai/cudf/pull/15712)) [@galipremsagar](https://github.com/galipremsagar) +- Fix ColumnAccessor caching of nrows if empty previously ([#15710](https://github.com/rapidsai/cudf/pull/15710)) [@mroeschke](https://github.com/mroeschke) +- Allow `None` when `nan_as_null=False` in column constructor ([#15709](https://github.com/rapidsai/cudf/pull/15709)) [@galipremsagar](https://github.com/galipremsagar) +- Refine `CudaTest.testCudaException` in case throwing wrong type of CudaError under aarch64 ([#15706](https://github.com/rapidsai/cudf/pull/15706)) [@sperlingxx](https://github.com/sperlingxx) +- Fix maxima of categorical column ([#15701](https://github.com/rapidsai/cudf/pull/15701)) [@rjzamora](https://github.com/rjzamora) +- Add proxy for inplace operations in `cudf.pandas` ([#15695](https://github.com/rapidsai/cudf/pull/15695)) [@galipremsagar](https://github.com/galipremsagar) +- Make `nan_as_null` behavior consistent across all APIs ([#15692](https://github.com/rapidsai/cudf/pull/15692)) [@galipremsagar](https://github.com/galipremsagar) +- Fix CI s3 api command to fetch latest results ([#15687](https://github.com/rapidsai/cudf/pull/15687)) [@galipremsagar](https://github.com/galipremsagar) +- Add `NumpyExtensionArray` proxy type in `cudf.pandas` ([#15686](https://github.com/rapidsai/cudf/pull/15686)) [@galipremsagar](https://github.com/galipremsagar) +- Properly implement binaryops for proxy types ([#15684](https://github.com/rapidsai/cudf/pull/15684)) [@galipremsagar](https://github.com/galipremsagar) +- Fix copy assignment and the comparison operator of `rmm_host_allocator` ([#15677](https://github.com/rapidsai/cudf/pull/15677)) [@vuule](https://github.com/vuule) +- Fix multi-source reading in JSON byte range reader ([#15671](https://github.com/rapidsai/cudf/pull/15671)) [@shrshi](https://github.com/shrshi) +- Return `int64` when pandas compatible 
mode is turned on for `get_indexer` ([#15659](https://github.com/rapidsai/cudf/pull/15659)) [@galipremsagar](https://github.com/galipremsagar) +- Fix Index contains for error validations and float vs int comparisons ([#15657](https://github.com/rapidsai/cudf/pull/15657)) [@galipremsagar](https://github.com/galipremsagar) +- Preserve sub-second data for time scalars in column construction ([#15655](https://github.com/rapidsai/cudf/pull/15655)) [@galipremsagar](https://github.com/galipremsagar) +- Check row limit size in cudf::strings::join_strings ([#15643](https://github.com/rapidsai/cudf/pull/15643)) [@davidwendt](https://github.com/davidwendt) +- Enable sorting on column with nulls using query-planning ([#15639](https://github.com/rapidsai/cudf/pull/15639)) [@rjzamora](https://github.com/rjzamora) +- Fix operator precedence problem in Parquet reader ([#15638](https://github.com/rapidsai/cudf/pull/15638)) [@etseidl](https://github.com/etseidl) +- Fix decoding of dictionary encoded FIXED_LEN_BYTE_ARRAY data in Parquet reader ([#15601](https://github.com/rapidsai/cudf/pull/15601)) [@etseidl](https://github.com/etseidl) +- Fix debug warnings/errors in from_arrow_device_test.cpp ([#15596](https://github.com/rapidsai/cudf/pull/15596)) [@davidwendt](https://github.com/davidwendt) +- Add "collect" aggregation support to dask-cudf ([#15593](https://github.com/rapidsai/cudf/pull/15593)) [@rjzamora](https://github.com/rjzamora) +- Fix categorical-accessor support and testing in dask-cudf ([#15591](https://github.com/rapidsai/cudf/pull/15591)) [@rjzamora](https://github.com/rjzamora) +- Disable compute-sanitizer usage in CI tests with CUDA<11.6 ([#15584](https://github.com/rapidsai/cudf/pull/15584)) [@davidwendt](https://github.com/davidwendt) +- Preserve RangeIndex.step in to_arrow/from_arrow ([#15581](https://github.com/rapidsai/cudf/pull/15581)) [@mroeschke](https://github.com/mroeschke) +- Ignore new cupy warning ([#15574](https://github.com/rapidsai/cudf/pull/15574)) [@vyasr](https://github.com/vyasr) +- Add cuda-sanitizer-api dependency for test-cpp matrix 11.4 ([#15573](https://github.com/rapidsai/cudf/pull/15573)) [@davidwendt](https://github.com/davidwendt) +- Allow apply udf to reference global modules in cudf.pandas ([#15569](https://github.com/rapidsai/cudf/pull/15569)) [@mroeschke](https://github.com/mroeschke) +- Fix deprecation warnings for json legacy reader ([#15563](https://github.com/rapidsai/cudf/pull/15563)) [@davidwendt](https://github.com/davidwendt) +- Fix millisecond resampling in cudf Python ([#15560](https://github.com/rapidsai/cudf/pull/15560)) [@mroeschke](https://github.com/mroeschke) +- Rename JSON_READER_OPTION to JSON_READER_OPTION_NVBENCH. ([#15553](https://github.com/rapidsai/cudf/pull/15553)) [@bdice](https://github.com/bdice) +- Fix a JNI bug in JSON parsing fixup ([#15550](https://github.com/rapidsai/cudf/pull/15550)) [@revans2](https://github.com/revans2) +- Remove conda channel setup from wheel CI image script. 
([#15539](https://github.com/rapidsai/cudf/pull/15539)) [@bdice](https://github.com/bdice) +- cudf.pandas: Series dt accessor is CombinedDatetimelikeProperties ([#15523](https://github.com/rapidsai/cudf/pull/15523)) [@wence-](https://github.com/wence-) +- Fix for some compiler warnings in parquet/page_decode.cuh ([#15518](https://github.com/rapidsai/cudf/pull/15518)) [@etseidl](https://github.com/etseidl) +- Fix exponent overflow in strings-to-double conversion ([#15517](https://github.com/rapidsai/cudf/pull/15517)) [@davidwendt](https://github.com/davidwendt) +- nanoarrow uses package override for proper pinned versions generation ([#15515](https://github.com/rapidsai/cudf/pull/15515)) [@robertmaynard](https://github.com/robertmaynard) +- Remove index name overrides in dask-cudf pyarrow table dispatch ([#15514](https://github.com/rapidsai/cudf/pull/15514)) [@charlesbluca](https://github.com/charlesbluca) +- Fix async synchronization issues in json_column.cu ([#15497](https://github.com/rapidsai/cudf/pull/15497)) [@karthikeyann](https://github.com/karthikeyann) +- Add new patch to hide more CCCL APIs ([#15493](https://github.com/rapidsai/cudf/pull/15493)) [@vyasr](https://github.com/vyasr) +- Make improvements in pandas-test reporting ([#15485](https://github.com/rapidsai/cudf/pull/15485)) [@galipremsagar](https://github.com/galipremsagar) +- Fixed page data truncation in parquet writer under certain conditions. ([#15474](https://github.com/rapidsai/cudf/pull/15474)) [@nvdbaranec](https://github.com/nvdbaranec) +- Only use data_type constructor with scale for decimal types ([#15472](https://github.com/rapidsai/cudf/pull/15472)) [@wence-](https://github.com/wence-) +- Avoid "p2p" shuffle as a default when `dask_cudf` is imported ([#15469](https://github.com/rapidsai/cudf/pull/15469)) [@rjzamora](https://github.com/rjzamora) +- Fix debug build errors from to_arrow_device_test.cpp ([#15463](https://github.com/rapidsai/cudf/pull/15463)) [@davidwendt](https://github.com/davidwendt) +- Fix base_normalator::integer_sizeof_fn integer dispatch ([#15457](https://github.com/rapidsai/cudf/pull/15457)) [@davidwendt](https://github.com/davidwendt) +- Allow consumers of static builds to find nanoarrow ([#15456](https://github.com/rapidsai/cudf/pull/15456)) [@robertmaynard](https://github.com/robertmaynard) +- Allow jit compilation when using a splayed CUDA toolkit ([#15451](https://github.com/rapidsai/cudf/pull/15451)) [@robertmaynard](https://github.com/robertmaynard) +- Handle case of scan aggregation in groupby-transform ([#15450](https://github.com/rapidsai/cudf/pull/15450)) [@wence-](https://github.com/wence-) +- Test static builds in CI and fix nanoarrow configure ([#15437](https://github.com/rapidsai/cudf/pull/15437)) [@vyasr](https://github.com/vyasr) +- Fixes potential race in JSON parser when parsing JSON lines format and when recovering from invalid lines ([#15419](https://github.com/rapidsai/cudf/pull/15419)) [@elstehle](https://github.com/elstehle) +- Fix errors in chunked ORC writer when no tables were (successfully) written ([#15393](https://github.com/rapidsai/cudf/pull/15393)) [@vuule](https://github.com/vuule) +- Support implicit array conversion with query-planning enabled ([#15378](https://github.com/rapidsai/cudf/pull/15378)) [@rjzamora](https://github.com/rjzamora) +- Fix arrow-based round trip of empty dataframes ([#15373](https://github.com/rapidsai/cudf/pull/15373)) [@wence-](https://github.com/wence-) +- Remove empty elements from exploded character-ngrams output 
([#15371](https://github.com/rapidsai/cudf/pull/15371)) [@davidwendt](https://github.com/davidwendt) +- Remove boundscheck=False setting in cython files ([#15362](https://github.com/rapidsai/cudf/pull/15362)) [@wence-](https://github.com/wence-) +- Patch dask-expr `var` logic in dask-cudf ([#15347](https://github.com/rapidsai/cudf/pull/15347)) [@rjzamora](https://github.com/rjzamora) +- Fix for logical and syntactical errors in libcudf c++ examples ([#15346](https://github.com/rapidsai/cudf/pull/15346)) [@mhaseeb123](https://github.com/mhaseeb123) +- Disable dask-expr in docs builds. ([#15343](https://github.com/rapidsai/cudf/pull/15343)) [@bdice](https://github.com/bdice) +- Apply the cuFile error work around to data_sink as well ([#15335](https://github.com/rapidsai/cudf/pull/15335)) [@vuule](https://github.com/vuule) +- Fix parquet predicate filtering with column projection ([#15113](https://github.com/rapidsai/cudf/pull/15113)) [@karthikeyann](https://github.com/karthikeyann) +- Check column type equality, handling nested types correctly. ([#14531](https://github.com/rapidsai/cudf/pull/14531)) [@bdice](https://github.com/bdice) + +## 📖 Documentation + +- Fix docs for IO readers and strings_convert ([#15842](https://github.com/rapidsai/cudf/pull/15842)) [@bdice](https://github.com/bdice) +- Update cudf.pandas docs for GA ([#15744](https://github.com/rapidsai/cudf/pull/15744)) [@beckernick](https://github.com/beckernick) +- Add contributing warning about circular imports ([#15691](https://github.com/rapidsai/cudf/pull/15691)) [@er-eis](https://github.com/er-eis) +- Update libcudf developer guide for strings offsets column ([#15661](https://github.com/rapidsai/cudf/pull/15661)) [@davidwendt](https://github.com/davidwendt) +- Update developer guide with device_async_resource_ref guidelines ([#15562](https://github.com/rapidsai/cudf/pull/15562)) [@harrism](https://github.com/harrism) +- DOC: add pandas intersphinx mapping ([#15531](https://github.com/rapidsai/cudf/pull/15531)) [@raybellwaves](https://github.com/raybellwaves) +- rm-dup-doc in frame.py ([#15530](https://github.com/rapidsai/cudf/pull/15530)) [@raybellwaves](https://github.com/raybellwaves) +- Update CONTRIBUTING.md to use latest cuda env ([#15467](https://github.com/rapidsai/cudf/pull/15467)) [@raybellwaves](https://github.com/raybellwaves) +- Doc: interleave columns pandas compat ([#15383](https://github.com/rapidsai/cudf/pull/15383)) [@raybellwaves](https://github.com/raybellwaves) +- Simplified README Examples ([#15338](https://github.com/rapidsai/cudf/pull/15338)) [@wkaisertexas](https://github.com/wkaisertexas) +- Add debug tips section to libcudf developer guide ([#15329](https://github.com/rapidsai/cudf/pull/15329)) [@davidwendt](https://github.com/davidwendt) +- Fix and clarify notes on result ordering ([#13255](https://github.com/rapidsai/cudf/pull/13255)) [@shwina](https://github.com/shwina) + +## 🚀 New Features + +- Add JNI bindings for zstd compression of NVCOMP. 
([#15729](https://github.com/rapidsai/cudf/pull/15729)) [@firestarman](https://github.com/firestarman) +- Fix spaces around CSV quoted strings ([#15727](https://github.com/rapidsai/cudf/pull/15727)) [@thabetx](https://github.com/thabetx) +- Add default pinned pool that falls back to new pinned allocations ([#15665](https://github.com/rapidsai/cudf/pull/15665)) [@vuule](https://github.com/vuule) +- Overhaul ops-codeowners coverage ([#15660](https://github.com/rapidsai/cudf/pull/15660)) [@raydouglass](https://github.com/raydouglass) +- Concatenate dictionary of objects along axis=1 ([#15623](https://github.com/rapidsai/cudf/pull/15623)) [@er-eis](https://github.com/er-eis) +- Construct `pylibcudf` columns from objects supporting `__cuda_array_interface__` ([#15615](https://github.com/rapidsai/cudf/pull/15615)) [@brandon-b-miller](https://github.com/brandon-b-miller) +- Expose some Parquet per-column configuration options via the python API ([#15613](https://github.com/rapidsai/cudf/pull/15613)) [@etseidl](https://github.com/etseidl) +- Migrate string `find` operations to `pylibcudf` ([#15604](https://github.com/rapidsai/cudf/pull/15604)) [@brandon-b-miller](https://github.com/brandon-b-miller) +- Round trip FIXED_LEN_BYTE_ARRAY data properly in Parquet writer ([#15600](https://github.com/rapidsai/cudf/pull/15600)) [@etseidl](https://github.com/etseidl) +- Reading multi-line JSON in string columns using runtime configurable delimiter ([#15556](https://github.com/rapidsai/cudf/pull/15556)) [@shrshi](https://github.com/shrshi) +- Remove public gtest dependency from libcudf conda package ([#15534](https://github.com/rapidsai/cudf/pull/15534)) [@robertmaynard](https://github.com/robertmaynard) +- Fea/move to latest nanoarrow ([#15526](https://github.com/rapidsai/cudf/pull/15526)) [@robertmaynard](https://github.com/robertmaynard) +- Migrate string `case` operations to `pylibcudf` ([#15489](https://github.com/rapidsai/cudf/pull/15489)) [@brandon-b-miller](https://github.com/brandon-b-miller) +- Add Parquet encoding statistics to column chunk metadata ([#15452](https://github.com/rapidsai/cudf/pull/15452)) [@etseidl](https://github.com/etseidl) +- Implement JNI for chunked ORC reader ([#15446](https://github.com/rapidsai/cudf/pull/15446)) [@ttnghia](https://github.com/ttnghia) +- Add some missing optional fields to the Parquet RowGroup metadata ([#15421](https://github.com/rapidsai/cudf/pull/15421)) [@etseidl](https://github.com/etseidl) +- Adding parquet transcoding example ([#15420](https://github.com/rapidsai/cudf/pull/15420)) [@mhaseeb123](https://github.com/mhaseeb123) +- Add fields to Parquet Statistics structure that were added in parquet-format 2.10 ([#15412](https://github.com/rapidsai/cudf/pull/15412)) [@etseidl](https://github.com/etseidl) +- Add option to Parquet writer to skip compressing individual columns ([#15411](https://github.com/rapidsai/cudf/pull/15411)) [@etseidl](https://github.com/etseidl) +- Add BYTE_STREAM_SPLIT support to Parquet ([#15311](https://github.com/rapidsai/cudf/pull/15311)) [@etseidl](https://github.com/etseidl) +- Introduce benchmark suite for JSON reader options ([#15124](https://github.com/rapidsai/cudf/pull/15124)) [@shrshi](https://github.com/shrshi) +- Implement ORC chunked reader ([#15094](https://github.com/rapidsai/cudf/pull/15094)) [@ttnghia](https://github.com/ttnghia) +- Extend cudf devcontainers to specify jitify2 kernel cache ([#15068](https://github.com/rapidsai/cudf/pull/15068)) [@robertmaynard](https://github.com/robertmaynard) +- Add 
`to_arrow_device` function to cudf interop using nanoarrow ([#15047](https://github.com/rapidsai/cudf/pull/15047)) [@zeroshade](https://github.com/zeroshade) +- Add JSON option to prune columns ([#14996](https://github.com/rapidsai/cudf/pull/14996)) [@karthikeyann](https://github.com/karthikeyann) + +## 🛠️ Improvements + +- Deprecate `Groupby.collect` ([#15808](https://github.com/rapidsai/cudf/pull/15808)) [@galipremsagar](https://github.com/galipremsagar) +- Raise FileNotFoundError when a literal JSON string that looks like a json filename is passed ([#15806](https://github.com/rapidsai/cudf/pull/15806)) [@lithomas1](https://github.com/lithomas1) +- Deprecate `divisions='quantile'` support in `set_index` ([#15804](https://github.com/rapidsai/cudf/pull/15804)) [@rjzamora](https://github.com/rjzamora) +- Improve performance of Series.to_numpy/to_cupy ([#15792](https://github.com/rapidsai/cudf/pull/15792)) [@mroeschke](https://github.com/mroeschke) +- Access `self.index` instead of `self._index` where possible ([#15781](https://github.com/rapidsai/cudf/pull/15781)) [@mroeschke](https://github.com/mroeschke) +- Support filtered I/O in `chunked_parquet_reader` and simplify the use of `parquet_reader_options` ([#15764](https://github.com/rapidsai/cudf/pull/15764)) [@mhaseeb123](https://github.com/mhaseeb123) +- Avoid index-to-column conversion in some DataFrame ops ([#15763](https://github.com/rapidsai/cudf/pull/15763)) [@mroeschke](https://github.com/mroeschke) +- Fix `chunked_parquet_reader` behavior when input has no more rows to read ([#15757](https://github.com/rapidsai/cudf/pull/15757)) [@mhaseeb123](https://github.com/mhaseeb123) +- [JNI] Expose java API for cudf::io::config_host_memory_resource ([#15745](https://github.com/rapidsai/cudf/pull/15745)) [@abellina](https://github.com/abellina) +- Migrate all cpp pxd files into pylibcudf ([#15740](https://github.com/rapidsai/cudf/pull/15740)) [@vyasr](https://github.com/vyasr) +- Validate and materialize iterators earlier in as_column ([#15739](https://github.com/rapidsai/cudf/pull/15739)) [@mroeschke](https://github.com/mroeschke) +- Push some as_column arrow logic to ColumnBase.from_arrow ([#15738](https://github.com/rapidsai/cudf/pull/15738)) [@mroeschke](https://github.com/mroeschke) +- Expose stream parameter in public reduction APIs ([#15737](https://github.com/rapidsai/cudf/pull/15737)) [@srinivasyadav18](https://github.com/srinivasyadav18) +- remove unnecessary 'setuptools' host dependency, simplify dependencies.yaml ([#15736](https://github.com/rapidsai/cudf/pull/15736)) [@jameslamb](https://github.com/jameslamb) +- Defer to C++ equality and hashing for pylibcudf DataType and Aggregation objects ([#15732](https://github.com/rapidsai/cudf/pull/15732)) [@wence-](https://github.com/wence-) +- Implement null-aware NOT_EQUALS binop ([#15731](https://github.com/rapidsai/cudf/pull/15731)) [@wence-](https://github.com/wence-) +- Fix split-record result list column offset type ([#15707](https://github.com/rapidsai/cudf/pull/15707)) [@davidwendt](https://github.com/davidwendt) +- Upgrade `arrow` to `16` ([#15703](https://github.com/rapidsai/cudf/pull/15703)) [@galipremsagar](https://github.com/galipremsagar) +- Remove experimental namespace from make_strings_children ([#15702](https://github.com/rapidsai/cudf/pull/15702)) [@davidwendt](https://github.com/davidwendt) +- Rework get_json_object benchmark to use nvbench ([#15698](https://github.com/rapidsai/cudf/pull/15698)) [@davidwendt](https://github.com/davidwendt) +- Rework some python tests 
of Parquet delta encodings ([#15693](https://github.com/rapidsai/cudf/pull/15693)) [@etseidl](https://github.com/etseidl) +- Skeleton cudf polars package ([#15688](https://github.com/rapidsai/cudf/pull/15688)) [@wence-](https://github.com/wence-) +- Upgrade pre commit hooks ([#15685](https://github.com/rapidsai/cudf/pull/15685)) [@wence-](https://github.com/wence-) +- Allow `fillna` to validate for `CategoricalColumn.fillna` ([#15683](https://github.com/rapidsai/cudf/pull/15683)) [@galipremsagar](https://github.com/galipremsagar) +- Misc Column cleanups ([#15682](https://github.com/rapidsai/cudf/pull/15682)) [@mroeschke](https://github.com/mroeschke) +- Reducing runtime of JSON reader options benchmark ([#15681](https://github.com/rapidsai/cudf/pull/15681)) [@shrshi](https://github.com/shrshi) +- Add `Timestamp` and `Timedelta` proxy types ([#15680](https://github.com/rapidsai/cudf/pull/15680)) [@galipremsagar](https://github.com/galipremsagar) +- Remove host_parse_nested_json. ([#15674](https://github.com/rapidsai/cudf/pull/15674)) [@bdice](https://github.com/bdice) +- Reduce runtime for ParquetChunkedReaderInputLimitTest gtests ([#15672](https://github.com/rapidsai/cudf/pull/15672)) [@davidwendt](https://github.com/davidwendt) +- Add large-strings gtest for cudf::interleave_columns ([#15669](https://github.com/rapidsai/cudf/pull/15669)) [@davidwendt](https://github.com/davidwendt) +- Use experimental make_strings_children for multi-replace_re ([#15667](https://github.com/rapidsai/cudf/pull/15667)) [@davidwendt](https://github.com/davidwendt) +- Enabled `Holiday` types in `cudf.pandas` ([#15664](https://github.com/rapidsai/cudf/pull/15664)) [@galipremsagar](https://github.com/galipremsagar) +- Remove obsolete `XFAIL` markers for query-planning ([#15662](https://github.com/rapidsai/cudf/pull/15662)) [@rjzamora](https://github.com/rjzamora) +- Clean up join benchmarks ([#15644](https://github.com/rapidsai/cudf/pull/15644)) [@PointKernel](https://github.com/PointKernel) +- Enable warnings as errors in custreamz ([#15642](https://github.com/rapidsai/cudf/pull/15642)) [@mroeschke](https://github.com/mroeschke) +- Improve distinct join with set `retrieve` ([#15636](https://github.com/rapidsai/cudf/pull/15636)) [@PointKernel](https://github.com/PointKernel) +- Fix -Werror=type-limits. ([#15635](https://github.com/rapidsai/cudf/pull/15635)) [@bdice](https://github.com/bdice) +- Enable FutureWarnings/DeprecationWarnings as errors for dask_cudf ([#15634](https://github.com/rapidsai/cudf/pull/15634)) [@mroeschke](https://github.com/mroeschke) +- Remove NVBench SHA override. 
([#15633](https://github.com/rapidsai/cudf/pull/15633)) [@alliepiper](https://github.com/alliepiper) +- Add support for large string columns to Parquet reader and writer ([#15632](https://github.com/rapidsai/cudf/pull/15632)) [@etseidl](https://github.com/etseidl) +- Large strings support in MD5 and SHA hashers ([#15631](https://github.com/rapidsai/cudf/pull/15631)) [@davidwendt](https://github.com/davidwendt) +- Fix make_offsets_child_column usage in cudf::strings::detail::shift ([#15630](https://github.com/rapidsai/cudf/pull/15630)) [@davidwendt](https://github.com/davidwendt) +- Use experimental make_strings_children for strings convert ([#15629](https://github.com/rapidsai/cudf/pull/15629)) [@davidwendt](https://github.com/davidwendt) +- Forward-merge branch-24.04 to branch-24.06 ([#15627](https://github.com/rapidsai/cudf/pull/15627)) [@bdice](https://github.com/bdice) +- Avoid accessing attributes via `_column` if not needed ([#15624](https://github.com/rapidsai/cudf/pull/15624)) [@mroeschke](https://github.com/mroeschke) +- Make ColumnBase.__cuda_array_interface__ opt out instead of opt in ([#15622](https://github.com/rapidsai/cudf/pull/15622)) [@mroeschke](https://github.com/mroeschke) +- Large strings support for cudf::gather ([#15621](https://github.com/rapidsai/cudf/pull/15621)) [@davidwendt](https://github.com/davidwendt) +- Remove jni-docker-build workflow ([#15619](https://github.com/rapidsai/cudf/pull/15619)) [@bdice](https://github.com/bdice) +- Support `DurationType` in cudf parquet reader via `arrow:schema` ([#15617](https://github.com/rapidsai/cudf/pull/15617)) [@mhaseeb123](https://github.com/mhaseeb123) +- Drop Centos7 support ([#15608](https://github.com/rapidsai/cudf/pull/15608)) [@NvTimLiu](https://github.com/NvTimLiu) +- Use experimental make_strings_children for json/csv writers ([#15599](https://github.com/rapidsai/cudf/pull/15599)) [@davidwendt](https://github.com/davidwendt) +- Use experimental make_strings_children for strings join/url_encode/slice ([#15598](https://github.com/rapidsai/cudf/pull/15598)) [@davidwendt](https://github.com/davidwendt) +- Use experimental make_strings_children in nvtext APIs ([#15595](https://github.com/rapidsai/cudf/pull/15595)) [@davidwendt](https://github.com/davidwendt) +- Migrate to `{{ stdlib("c") }}` ([#15594](https://github.com/rapidsai/cudf/pull/15594)) [@hcho3](https://github.com/hcho3) +- Deprecate `to/from_dask_dataframe` APIs in dask-cudf ([#15592](https://github.com/rapidsai/cudf/pull/15592)) [@rjzamora](https://github.com/rjzamora) +- Minor fixups for future NumPy 2 compatibility ([#15590](https://github.com/rapidsai/cudf/pull/15590)) [@seberg](https://github.com/seberg) +- Delay materializing RangeIndex in .reset_index ([#15588](https://github.com/rapidsai/cudf/pull/15588)) [@mroeschke](https://github.com/mroeschke) +- Use experimental make_strings_children for capitalize/case/pad functions ([#15587](https://github.com/rapidsai/cudf/pull/15587)) [@davidwendt](https://github.com/davidwendt) +- Use experimental make_strings_children for strings replace/filter/translate ([#15586](https://github.com/rapidsai/cudf/pull/15586)) [@davidwendt](https://github.com/davidwendt) +- Add multithreaded parquet reader benchmarks. 
([#15585](https://github.com/rapidsai/cudf/pull/15585)) [@nvdbaranec](https://github.com/nvdbaranec) +- Don't materialize column during RangeIndex methods ([#15582](https://github.com/rapidsai/cudf/pull/15582)) [@mroeschke](https://github.com/mroeschke) +- Improve performance for cudf::strings::count_re ([#15578](https://github.com/rapidsai/cudf/pull/15578)) [@davidwendt](https://github.com/davidwendt) +- Replace RangeIndex._start/_stop/_step with _range ([#15576](https://github.com/rapidsai/cudf/pull/15576)) [@mroeschke](https://github.com/mroeschke) +- add --rm and --name to devcontainer run args ([#15572](https://github.com/rapidsai/cudf/pull/15572)) [@trxcllnt](https://github.com/trxcllnt) +- Change the default dictionary policy in Parquet writer from `ALWAYS` to `ADAPTIVE` ([#15570](https://github.com/rapidsai/cudf/pull/15570)) [@mhaseeb123](https://github.com/mhaseeb123) +- Rename experimental JSON tests. ([#15568](https://github.com/rapidsai/cudf/pull/15568)) [@bdice](https://github.com/bdice) +- Refactor JNI native dependency loading to allow returning of library path ([#15566](https://github.com/rapidsai/cudf/pull/15566)) [@jlowe](https://github.com/jlowe) +- Remove protobuf and use parsed ORC statistics from libcudf ([#15564](https://github.com/rapidsai/cudf/pull/15564)) [@bdice](https://github.com/bdice) +- Deprecate legacy JSON reader options. ([#15558](https://github.com/rapidsai/cudf/pull/15558)) [@bdice](https://github.com/bdice) +- Use same .clang-format in cuDF JNI ([#15557](https://github.com/rapidsai/cudf/pull/15557)) [@bdice](https://github.com/bdice) +- Large strings support for cudf::fill ([#15555](https://github.com/rapidsai/cudf/pull/15555)) [@davidwendt](https://github.com/davidwendt) +- Upgrade upper bound pinning to `pandas-2.2.2` ([#15554](https://github.com/rapidsai/cudf/pull/15554)) [@galipremsagar](https://github.com/galipremsagar) +- Work around issues with cccl main ([#15552](https://github.com/rapidsai/cudf/pull/15552)) [@miscco](https://github.com/miscco) +- Enable pandas plotting unit tests for cudf.pandas ([#15547](https://github.com/rapidsai/cudf/pull/15547)) [@mroeschke](https://github.com/mroeschke) +- Move timezone conversion logic to `DatetimeColumn` ([#15545](https://github.com/rapidsai/cudf/pull/15545)) [@mroeschke](https://github.com/mroeschke) +- Large strings support for cudf::interleave_columns ([#15544](https://github.com/rapidsai/cudf/pull/15544)) [@davidwendt](https://github.com/davidwendt) +- [skip ci] Switch back to 24.06 branch for pandas tests ([#15543](https://github.com/rapidsai/cudf/pull/15543)) [@galipremsagar](https://github.com/galipremsagar) +- Remove checks dependency from static-configure test job. 
([#15542](https://github.com/rapidsai/cudf/pull/15542)) [@bdice](https://github.com/bdice) +- Remove legacy JSON reader from Python ([#15538](https://github.com/rapidsai/cudf/pull/15538)) [@bdice](https://github.com/bdice) +- Enable more ignored pandas unit tests for cudf.pandas ([#15535](https://github.com/rapidsai/cudf/pull/15535)) [@mroeschke](https://github.com/mroeschke) +- Large strings support for cudf::clamp ([#15533](https://github.com/rapidsai/cudf/pull/15533)) [@davidwendt](https://github.com/davidwendt) +- Remove version hard-coding ([#15529](https://github.com/rapidsai/cudf/pull/15529)) [@galipremsagar](https://github.com/galipremsagar) +- Removing all batching code from parquet writer ([#15528](https://github.com/rapidsai/cudf/pull/15528)) [@mhaseeb123](https://github.com/mhaseeb123) +- Make some private class properties not settable ([#15527](https://github.com/rapidsai/cudf/pull/15527)) [@mroeschke](https://github.com/mroeschke) +- Large strings support in regex replace APIs ([#15524](https://github.com/rapidsai/cudf/pull/15524)) [@davidwendt](https://github.com/davidwendt) +- Skip pandas unit tests that crash pytest workers in `cudf.pandas` ([#15521](https://github.com/rapidsai/cudf/pull/15521)) [@mroeschke](https://github.com/mroeschke) +- Preserve column metadata during more DataFrame operations ([#15519](https://github.com/rapidsai/cudf/pull/15519)) [@mroeschke](https://github.com/mroeschke) +- Move to pandas-tests to a dedicated workflow file and trigger it from branch.yaml ([#15516](https://github.com/rapidsai/cudf/pull/15516)) [@galipremsagar](https://github.com/galipremsagar) +- Large strings gtest fixture and utilities ([#15513](https://github.com/rapidsai/cudf/pull/15513)) [@davidwendt](https://github.com/davidwendt) +- Convert libcudf resource parameters to rmm::device_async_resource_ref ([#15507](https://github.com/rapidsai/cudf/pull/15507)) [@harrism](https://github.com/harrism) +- Relax protobuf lower bound to 3.20. ([#15506](https://github.com/rapidsai/cudf/pull/15506)) [@bdice](https://github.com/bdice) +- Clean up index methods ([#15496](https://github.com/rapidsai/cudf/pull/15496)) [@mroeschke](https://github.com/mroeschke) +- Update strings contains benchmarks to nvbench ([#15495](https://github.com/rapidsai/cudf/pull/15495)) [@davidwendt](https://github.com/davidwendt) +- Update NVBench fixture to use new hooks, fix pinned memory segfault. 
([#15492](https://github.com/rapidsai/cudf/pull/15492)) [@alliepiper](https://github.com/alliepiper) +- Enable tests/scalar and test/series in cudf.pandas tests ([#15486](https://github.com/rapidsai/cudf/pull/15486)) [@mroeschke](https://github.com/mroeschke) +- Clean up __cuda_array_interface__ handling in as_column ([#15477](https://github.com/rapidsai/cudf/pull/15477)) [@mroeschke](https://github.com/mroeschke) +- Avoid .ordered and .categories from being settable in CategoricalColumn and CategoricalDtype ([#15475](https://github.com/rapidsai/cudf/pull/15475)) [@mroeschke](https://github.com/mroeschke) +- Ignore pandas tests for cudf.pandas that need motoserver ([#15468](https://github.com/rapidsai/cudf/pull/15468)) [@mroeschke](https://github.com/mroeschke) +- Use cached_property for NumericColumn.nan_count instead of ._nan_count variable ([#15466](https://github.com/rapidsai/cudf/pull/15466)) [@mroeschke](https://github.com/mroeschke) +- Add to_arrow_device() functions that accept views ([#15465](https://github.com/rapidsai/cudf/pull/15465)) [@davidwendt](https://github.com/davidwendt) +- Add custom status check workflow ([#15464](https://github.com/rapidsai/cudf/pull/15464)) [@galipremsagar](https://github.com/galipremsagar) +- Disable pandas 2.x clipboard tests in cudf.pandas tests ([#15462](https://github.com/rapidsai/cudf/pull/15462)) [@mroeschke](https://github.com/mroeschke) +- Enable tests/strings/test_api.py and tests/io/pytables in cudf.pandas tests ([#15461](https://github.com/rapidsai/cudf/pull/15461)) [@mroeschke](https://github.com/mroeschke) +- Enable test_parsing in cudf.pandas tests ([#15460](https://github.com/rapidsai/cudf/pull/15460)) [@mroeschke](https://github.com/mroeschke) +- Add `from_arrow_device` function to cudf interop using nanoarrow ([#15458](https://github.com/rapidsai/cudf/pull/15458)) [@zeroshade](https://github.com/zeroshade) +- Remove deprecated strings offsets_begin ([#15454](https://github.com/rapidsai/cudf/pull/15454)) [@davidwendt](https://github.com/davidwendt) +- Enable tests/windows/ in cudf.pandas tests ([#15444](https://github.com/rapidsai/cudf/pull/15444)) [@mroeschke](https://github.com/mroeschke) +- Enable tests/interchange/test_impl.py in cudf.pandas tests ([#15443](https://github.com/rapidsai/cudf/pull/15443)) [@mroeschke](https://github.com/mroeschke) +- Enable tests/io/test_user_agent.py in cudf pandas tests ([#15442](https://github.com/rapidsai/cudf/pull/15442)) [@mroeschke](https://github.com/mroeschke) +- Performance improvement in libcudf case conversion for long strings ([#15441](https://github.com/rapidsai/cudf/pull/15441)) [@davidwendt](https://github.com/davidwendt) +- Remove prior test skipping in run-pandas-tests with testing 2.2.1 ([#15440](https://github.com/rapidsai/cudf/pull/15440)) [@mroeschke](https://github.com/mroeschke) +- Support orc and text IO with dask-expr using legacy conversion ([#15439](https://github.com/rapidsai/cudf/pull/15439)) [@rjzamora](https://github.com/rjzamora) +- Floating <--> fixed-point conversion must now be called explicitly ([#15438](https://github.com/rapidsai/cudf/pull/15438)) [@pmattione-nvidia](https://github.com/pmattione-nvidia) +- Unify Copy-On-Write and Spilling ([#15436](https://github.com/rapidsai/cudf/pull/15436)) [@madsbk](https://github.com/madsbk) +- Enable ``dask_cudf`` json and s3 tests with query-planning on ([#15408](https://github.com/rapidsai/cudf/pull/15408)) [@rjzamora](https://github.com/rjzamora) +- Bump ruff and codespell pre-commit checks 
([#15407](https://github.com/rapidsai/cudf/pull/15407)) [@mroeschke](https://github.com/mroeschke) +- Enable all tests for `arm` arch ([#15402](https://github.com/rapidsai/cudf/pull/15402)) [@galipremsagar](https://github.com/galipremsagar) +- Bind `read_parquet_metadata` API to libcudf instead of pyarrow and extract `RowGroup` information ([#15398](https://github.com/rapidsai/cudf/pull/15398)) [@mhaseeb123](https://github.com/mhaseeb123) +- Optimizing multi-source byte range reading in JSON reader ([#15396](https://github.com/rapidsai/cudf/pull/15396)) [@shrshi](https://github.com/shrshi) +- add correct labels to pandas_function_request.md ([#15381](https://github.com/rapidsai/cudf/pull/15381)) [@raybellwaves](https://github.com/raybellwaves) +- Remove deprecated hash() and spark_murmurhash3_x86_32() ([#15375](https://github.com/rapidsai/cudf/pull/15375)) [@davidwendt](https://github.com/davidwendt) +- Large strings support in cudf::merge ([#15374](https://github.com/rapidsai/cudf/pull/15374)) [@davidwendt](https://github.com/davidwendt) +- Enable test-reporting for pandas pytests in CI ([#15369](https://github.com/rapidsai/cudf/pull/15369)) [@galipremsagar](https://github.com/galipremsagar) +- Use logical types in Parquet reader ([#15365](https://github.com/rapidsai/cudf/pull/15365)) [@etseidl](https://github.com/etseidl) +- Add experimental make_strings_children utility ([#15363](https://github.com/rapidsai/cudf/pull/15363)) [@davidwendt](https://github.com/davidwendt) +- Forward-merge branch-24.04 to branch-24.06 ([#15349](https://github.com/rapidsai/cudf/pull/15349)) [@bdice](https://github.com/bdice) +- Fix CMake files in libcudf C++ examples to use existing libcudf build if present ([#15348](https://github.com/rapidsai/cudf/pull/15348)) [@mhaseeb123](https://github.com/mhaseeb123) +- Use ruff pydocstyle over pydocstyle pre-commit hook ([#15345](https://github.com/rapidsai/cudf/pull/15345)) [@mroeschke](https://github.com/mroeschke) +- Refactor stream mode setup for gtests ([#15337](https://github.com/rapidsai/cudf/pull/15337)) [@davidwendt](https://github.com/davidwendt) +- Benchmark decimal <--> floating conversions. ([#15334](https://github.com/rapidsai/cudf/pull/15334)) [@pmattione-nvidia](https://github.com/pmattione-nvidia) +- Avoid duplicate dask-cudf testing ([#15333](https://github.com/rapidsai/cudf/pull/15333)) [@rjzamora](https://github.com/rjzamora) +- Skip decode steps in Parquet reader when nullable columns have no nulls ([#15332](https://github.com/rapidsai/cudf/pull/15332)) [@etseidl](https://github.com/etseidl) +- Update udf_cpp to use rapids_cpm_cccl. ([#15331](https://github.com/rapidsai/cudf/pull/15331)) [@bdice](https://github.com/bdice) +- Forward-merge branch-24.04 into branch-24.06 [skip ci] ([#15330](https://github.com/rapidsai/cudf/pull/15330)) [@rapids-bot[bot]](https://github.com/rapids-bot[bot]) +- Allow ``numeric_only=True`` for simple groupby reductions ([#15326](https://github.com/rapidsai/cudf/pull/15326)) [@rjzamora](https://github.com/rjzamora) +- Drop CentOS 7 support. 
([#15323](https://github.com/rapidsai/cudf/pull/15323)) [@bdice](https://github.com/bdice) +- Rework cudf::find_and_replace_all to use gather-based make_strings_column ([#15305](https://github.com/rapidsai/cudf/pull/15305)) [@davidwendt](https://github.com/davidwendt) +- First pass at adding testing for pylibcudf ([#15300](https://github.com/rapidsai/cudf/pull/15300)) [@vyasr](https://github.com/vyasr) +- [FEA] Performance improvement for mixed left semi/anti join ([#15288](https://github.com/rapidsai/cudf/pull/15288)) [@tgujar](https://github.com/tgujar) +- Rework cudf::replace_nulls to use strings::detail::copy_if_else ([#15286](https://github.com/rapidsai/cudf/pull/15286)) [@davidwendt](https://github.com/davidwendt) +- Clean up special casing in `as_column` for non-typed input ([#15276](https://github.com/rapidsai/cudf/pull/15276)) [@mroeschke](https://github.com/mroeschke) +- Large strings support in cudf::concatenate ([#15195](https://github.com/rapidsai/cudf/pull/15195)) [@davidwendt](https://github.com/davidwendt) +- Use less _is_categorical_dtype ([#15148](https://github.com/rapidsai/cudf/pull/15148)) [@mroeschke](https://github.com/mroeschke) +- Align date_range defaults with pandas, support tz ([#15139](https://github.com/rapidsai/cudf/pull/15139)) [@mroeschke](https://github.com/mroeschke) +- `ModuleAccelerator` performance: cache the result of checking if a caller is in the denylist ([#15056](https://github.com/rapidsai/cudf/pull/15056)) [@shwina](https://github.com/shwina) +- Use offsetalator in cudf::strings::replace functions ([#14824](https://github.com/rapidsai/cudf/pull/14824)) [@davidwendt](https://github.com/davidwendt) +- Cleanup some timedelta/datetime column logic ([#14715](https://github.com/rapidsai/cudf/pull/14715)) [@mroeschke](https://github.com/mroeschke) +- Refactor numpy array input in as_column ([#14651](https://github.com/rapidsai/cudf/pull/14651)) [@mroeschke](https://github.com/mroeschke) +- Refactor joins for conditional semis and antis ([#14646](https://github.com/rapidsai/cudf/pull/14646)) [@DanialJavady96](https://github.com/DanialJavady96) +- Eagerly populate the class dict for cudf.pandas proxy types ([#14534](https://github.com/rapidsai/cudf/pull/14534)) [@shwina](https://github.com/shwina) +- Some additional kernel thread index refactoring. ([#14107](https://github.com/rapidsai/cudf/pull/14107)) [@bdice](https://github.com/bdice) + # cuDF 24.04.00 (10 Apr 2024) ## 🚨 Breaking Changes From db1b36592ba5d76158d1c6e1a3c6440c25a382e7 Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Wed, 5 Jun 2024 09:48:20 -0700 Subject: [PATCH 13/48] Migrate string replace.pxd to pylibcudf (#15839) xref #15162 Change replace.pxd to use pylibcudf APIs. 
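After this change, string replacement is exposed directly through pylibcudf, and cudf's Cython layer simply forwards to it. A minimal sketch of the new call path follows (the input data is illustrative; the API usage mirrors the pylibcudf tests added in this patch):

    import pyarrow as pa
    import cudf._lib.pylibcudf as plc

    # Build pylibcudf objects from arrow (illustrative inputs).
    col = plc.interop.from_arrow(pa.array(["aa", "bc", None], type=pa.string()))
    target = plc.interop.from_arrow(pa.scalar("a", type=pa.string()))
    repl = plc.interop.from_arrow(pa.scalar("z", type=pa.string()))

    # Replace every occurrence of target with repl (-1 means no limit).
    result = plc.strings.replace.replace(col, target, repl, -1)
    # result is a new pylibcudf Column holding ["zz", "bc", None].
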
Authors: - Thomas Li (https://github.com/lithomas1) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/15839 --- .../user_guide/api_docs/pylibcudf/index.rst | 8 +- .../api_docs/pylibcudf/strings/index.rst | 7 + .../api_docs/pylibcudf/strings/replace.rst | 6 + .../_lib/pylibcudf/strings/CMakeLists.txt | 4 +- .../cudf/_lib/pylibcudf/strings/__init__.pxd | 2 +- .../cudf/_lib/pylibcudf/strings/__init__.py | 2 +- .../cudf/_lib/pylibcudf/strings/replace.pxd | 25 +++ .../cudf/_lib/pylibcudf/strings/replace.pyx | 162 ++++++++++++++++++ python/cudf/cudf/_lib/strings/replace.pyx | 99 +++-------- .../pylibcudf_tests/test_string_replace.py | 126 ++++++++++++++ 10 files changed, 362 insertions(+), 79 deletions(-) create mode 100644 docs/cudf/source/user_guide/api_docs/pylibcudf/strings/index.rst create mode 100644 docs/cudf/source/user_guide/api_docs/pylibcudf/strings/replace.rst create mode 100644 python/cudf/cudf/_lib/pylibcudf/strings/replace.pxd create mode 100644 python/cudf/cudf/_lib/pylibcudf/strings/replace.pyx create mode 100644 python/cudf/cudf/pylibcudf_tests/test_string_replace.py diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst index 58fea77adaa..b6ad1157511 100644 --- a/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst @@ -6,7 +6,7 @@ This page provides API documentation for pylibcudf. .. toctree:: :maxdepth: 1 - :caption: API Documentation + :caption: Top-level modules aggregation binaryop @@ -32,3 +32,9 @@ This page provides API documentation for pylibcudf. table types unary + +.. toctree:: + :maxdepth: 2 + :caption: Subpackages + + strings/index.rst diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/index.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/index.rst new file mode 100644 index 00000000000..8970fc80c0b --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/index.rst @@ -0,0 +1,7 @@ +strings +======= + +.. toctree:: + :maxdepth: 1 + + replace diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/replace.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/replace.rst new file mode 100644 index 00000000000..9575ec226a7 --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/replace.rst @@ -0,0 +1,6 @@ +======= +replace +======= + +.. automodule:: cudf._lib.pylibcudf.strings.replace + :members: diff --git a/python/cudf/cudf/_lib/pylibcudf/strings/CMakeLists.txt b/python/cudf/cudf/_lib/pylibcudf/strings/CMakeLists.txt index 0e9c1c916f0..c9a983e24f4 100644 --- a/python/cudf/cudf/_lib/pylibcudf/strings/CMakeLists.txt +++ b/python/cudf/cudf/_lib/pylibcudf/strings/CMakeLists.txt @@ -12,11 +12,11 @@ # the License. 
# ============================================================================= -set(cython_sources capitalize.pyx case.pyx char_types.pyx find.pyx) +set(cython_sources capitalize.pyx case.pyx char_types.pyx find.pyx replace.pyx) set(linked_libraries cudf::cudf) rapids_cython_create_modules( CXX SOURCE_FILES "${cython_sources}" - LINKED_LIBRARIES "${linked_libraries}" MODULE_PREFIX pylibcudf_ ASSOCIATED_TARGETS cudf + LINKED_LIBRARIES "${linked_libraries}" MODULE_PREFIX pylibcudf_strings_ ASSOCIATED_TARGETS cudf ) diff --git a/python/cudf/cudf/_lib/pylibcudf/strings/__init__.pxd b/python/cudf/cudf/_lib/pylibcudf/strings/__init__.pxd index ec3dbc150b5..7563df8a107 100644 --- a/python/cudf/cudf/_lib/pylibcudf/strings/__init__.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/strings/__init__.pxd @@ -1,3 +1,3 @@ # Copyright (c) 2024, NVIDIA CORPORATION. -from . cimport capitalize, case, char_types, find +from . cimport capitalize, case, char_types, find, replace diff --git a/python/cudf/cudf/_lib/pylibcudf/strings/__init__.py b/python/cudf/cudf/_lib/pylibcudf/strings/__init__.py index 3793bda0aa4..cb4f0e38f97 100644 --- a/python/cudf/cudf/_lib/pylibcudf/strings/__init__.py +++ b/python/cudf/cudf/_lib/pylibcudf/strings/__init__.py @@ -1,3 +1,3 @@ # Copyright (c) 2024, NVIDIA CORPORATION. -from . import capitalize, case, char_types, find +from . import capitalize, case, char_types, find, replace diff --git a/python/cudf/cudf/_lib/pylibcudf/strings/replace.pxd b/python/cudf/cudf/_lib/pylibcudf/strings/replace.pxd new file mode 100644 index 00000000000..52e2dc3c738 --- /dev/null +++ b/python/cudf/cudf/_lib/pylibcudf/strings/replace.pxd @@ -0,0 +1,25 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from cudf._lib.pylibcudf.column cimport Column +from cudf._lib.pylibcudf.libcudf.types cimport size_type +from cudf._lib.pylibcudf.scalar cimport Scalar + + +cpdef Column replace( + Column input, + Scalar target, + Scalar repl, + size_type maxrepl = * +) +cpdef Column replace_multiple( + Column input, + Column target, + Column repl, + size_type maxrepl = * +) +cpdef Column replace_slice( + Column input, + Scalar repl = *, + size_type start = *, + size_type stop = * +) diff --git a/python/cudf/cudf/_lib/pylibcudf/strings/replace.pyx b/python/cudf/cudf/_lib/pylibcudf/strings/replace.pyx new file mode 100644 index 00000000000..c757150a600 --- /dev/null +++ b/python/cudf/cudf/_lib/pylibcudf/strings/replace.pyx @@ -0,0 +1,162 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from libcpp.memory cimport unique_ptr +from libcpp.utility cimport move + +from cudf._lib.pylibcudf.column cimport Column +from cudf._lib.pylibcudf.libcudf.column.column cimport column +from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport string_scalar +from cudf._lib.pylibcudf.libcudf.scalar.scalar_factories cimport ( + make_string_scalar as cpp_make_string_scalar, +) +from cudf._lib.pylibcudf.libcudf.strings.replace cimport ( + replace as cpp_replace, + replace_multiple as cpp_replace_multiple, + replace_slice as cpp_replace_slice, +) +from cudf._lib.pylibcudf.libcudf.types cimport size_type +from cudf._lib.pylibcudf.scalar cimport Scalar + + +cpdef Column replace( + Column input, + Scalar target, + Scalar repl, + size_type maxrepl = -1 +): + """Replaces target string within each string with the specified replacement string. + + Null string entries will return null output string entries. + + For details, see :cpp:func:`replace`. 
+ + Parameters + ---------- + input : Column + The input strings + target : Scalar + String to search for in each string. + repl : Scalar + String to replace target with. + maxrepl : size_type, default -1 + Maximum times to replace if target appears multiple times in the input string. + Default of -1 specifies to replace all occurrences of target in each string. + + Returns + ------- + pylibcudf.Column + New string column with target replaced. + """ + cdef: + unique_ptr[column] c_result + const string_scalar* target_str + const string_scalar* repl_str + + target_str = (target.c_obj.get()) + repl_str = (repl.c_obj.get()) + + with nogil: + c_result = move(cpp_replace( + input.view(), + target_str[0], + repl_str[0], + maxrepl, + )) + + return Column.from_libcudf(move(c_result)) + + +cpdef Column replace_multiple( + Column input, + Column target, + Column repl, + size_type maxrepl = -1 +): + """Replaces target string within each string with the specified replacement string. + + Null string entries will return null output string entries. + + For details, see :cpp:func:`replace_multiple`. + + Parameters + ---------- + input : Column + The input strings + target : Column + Column containing strings to search for in the input column. + repl : Column + Column containing strings to replace target with. + Each target, when found, will be replaced by the value at the + corresponding index in the repl Column. + + Must be of the same length as target. + + Returns + ------- + pylibcudf.Column + New string column with target replaced. + """ + cdef unique_ptr[column] c_result + + with nogil: + c_result = move(cpp_replace_multiple( + input.view(), + target.view(), + repl.view(), + )) + + return Column.from_libcudf(move(c_result)) + + +cpdef Column replace_slice( + Column input, + # TODO: default scalar values + # https://github.com/rapidsai/cudf/issues/15505 + Scalar repl = None, + size_type start = 0, + size_type stop = -1 +): + """Replaces each string in the column with the provided repl string + within the [start,stop) character position range. + + Null string entries will return null output string entries. + This function can be used to insert a string into specific position + by specifying the same position value for start and stop. + The repl string can be appended to each string by specifying -1 + for both start and stop. + + For details, see :cpp:func:`replace_slice`. + + Parameters + ---------- + input : Column + The input strings + repl : Scalar, default "" + String scalar to replace target with. + start : size_type, default 0 + Start position where repl will be added. + stop : size_type, default -1 + End position (exclusive) to use for replacement. + Returns + ------- + pylibcudf.Column + New string column + """ + cdef unique_ptr[column] c_result + + if repl is None: + repl = Scalar.from_libcudf( + cpp_make_string_scalar("".encode()) + ) + + cdef const string_scalar* scalar_str = (repl.c_obj.get()) + + with nogil: + c_result = move(cpp_replace_slice( + input.view(), + scalar_str[0], + start, + stop + )) + + return Column.from_libcudf(move(c_result)) diff --git a/python/cudf/cudf/_lib/strings/replace.pyx b/python/cudf/cudf/_lib/strings/replace.pyx index 2d9330a8a24..374831f1833 100644 --- a/python/cudf/cudf/_lib/strings/replace.pyx +++ b/python/cudf/cudf/_lib/strings/replace.pyx @@ -1,23 +1,15 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. 
from libc.stdint cimport int32_t -from libcpp.memory cimport unique_ptr -from libcpp.utility cimport move from cudf.core.buffer import acquire_spill_lock from cudf._lib.column cimport Column -from cudf._lib.pylibcudf.libcudf.column.column cimport column -from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view -from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport string_scalar -from cudf._lib.pylibcudf.libcudf.strings.replace cimport ( - replace as cpp_replace, - replace_multiple as cpp_replace_multiple, - replace_slice as cpp_replace_slice, -) from cudf._lib.pylibcudf.libcudf.types cimport size_type from cudf._lib.scalar cimport DeviceScalar +import cudf._lib.pylibcudf as plc + @acquire_spill_lock() def slice_replace(Column source_strings, @@ -32,22 +24,12 @@ def slice_replace(Column source_strings, cdef DeviceScalar repl = py_repl.device_value - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - - cdef const string_scalar* scalar_str = ( - repl.get_raw_ptr() - ) - - with nogil: - c_result = move(cpp_replace_slice( - source_view, - scalar_str[0], - start, - stop - )) - - return Column.from_unique_ptr(move(c_result)) + return Column.from_pylibcudf(plc.strings.replace.replace_slice( + source_strings.to_pylibcudf(mode="read"), + repl.c_value, + start, + stop + )) @acquire_spill_lock() @@ -61,22 +43,12 @@ def insert(Column source_strings, cdef DeviceScalar repl = py_repl.device_value - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - - cdef const string_scalar* scalar_str = ( - repl.get_raw_ptr() - ) - - with nogil: - c_result = move(cpp_replace_slice( - source_view, - scalar_str[0], - start, - start - )) - - return Column.from_unique_ptr(move(c_result)) + return Column.from_pylibcudf(plc.strings.replace.replace_slice( + source_strings.to_pylibcudf(mode="read"), + repl.c_value, + start, + start, + )) @acquire_spill_lock() @@ -92,25 +64,12 @@ def replace(Column source_strings, cdef DeviceScalar target = py_target.device_value cdef DeviceScalar repl = py_repl.device_value - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - - cdef const string_scalar* scalar_target = ( - target.get_raw_ptr() - ) - cdef const string_scalar* scalar_repl = ( - repl.get_raw_ptr() - ) - - with nogil: - c_result = move(cpp_replace( - source_view, - scalar_target[0], - scalar_repl[0], - maxrepl - )) - - return Column.from_unique_ptr(move(c_result)) + return Column.from_pylibcudf(plc.strings.replace.replace( + source_strings.to_pylibcudf(mode="read"), + target.c_value, + repl.c_value, + maxrepl + )) @acquire_spill_lock() @@ -121,16 +80,8 @@ def replace_multi(Column source_strings, Returns a Column after replacing occurrences of patterns `target_strings` with `repl_strings` in `source_strings`. 
""" - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - cdef column_view target_view = target_strings.view() - cdef column_view repl_view = repl_strings.view() - - with nogil: - c_result = move(cpp_replace_multiple( - source_view, - target_view, - repl_view - )) - - return Column.from_unique_ptr(move(c_result)) + return Column.from_pylibcudf(plc.strings.replace.replace_multiple( + source_strings.to_pylibcudf(mode="read"), + target_strings.to_pylibcudf(mode="read"), + repl_strings.to_pylibcudf(mode="read"), + )) diff --git a/python/cudf/cudf/pylibcudf_tests/test_string_replace.py b/python/cudf/cudf/pylibcudf_tests/test_string_replace.py new file mode 100644 index 00000000000..f20edf6a506 --- /dev/null +++ b/python/cudf/cudf/pylibcudf_tests/test_string_replace.py @@ -0,0 +1,126 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +import pyarrow as pa +import pytest +from utils import assert_column_eq + +import cudf._lib.pylibcudf as plc + + +@pytest.fixture(scope="module") +def data_col(): + pa_data_col = pa.array( + ["a", "c", "A", "aa", None, "aaaaaaaaa", "AAAA", "ÁÁÁÁ"], + type=pa.string(), + ) + return pa_data_col, plc.interop.from_arrow(pa_data_col) + + +@pytest.fixture(scope="module", params=["a", "c", "A", "Á", "aa", "ÁÁÁ"]) +def scalar_repl_target(request): + pa_target = pa.scalar(request.param, type=pa.string()) + return request.param, plc.interop.from_arrow(pa_target) + + +@pytest.fixture(scope="module", params=["b", "B", "", "B́"]) +def scalar_repl(request): + pa_repl = pa.scalar(request.param, type=pa.string()) + return request.param, plc.interop.from_arrow(pa_repl) + + +@pytest.fixture( + scope="module", + params=[ + ["a", "c", "A", "ÁÁÁÁ"], + ], +) +def col_repl_target(request): + pa_target = pa.array(request.param, type=pa.string()) + return (pa_target, plc.interop.from_arrow(pa_target)) + + +@pytest.fixture( + scope="module", + params=[ + [ + "", + "z", + "XX", + "blahblah", + ] + ], +) +def col_repl(request): + pa_repl = pa.array(request.param, type=pa.string()) + return (pa_repl, plc.interop.from_arrow(pa_repl)) + + +@pytest.mark.parametrize("maxrepl", [-1, 1, 2, 10]) +def test_replace(data_col, scalar_repl_target, scalar_repl, maxrepl): + pa_data_col, plc_data_col = data_col + pa_target, plc_target = scalar_repl_target + pa_repl, plc_repl = scalar_repl + got = plc.strings.replace.replace( + plc_data_col, plc_target, plc_repl, maxrepl + ) + + expected = pa.compute.replace_substring( + pa_data_col, + pattern=pa_target, + replacement=pa_repl, + max_replacements=maxrepl, + ) + + assert_column_eq(expected, got) + + +@pytest.mark.parametrize("startstop", [(0, -1), (0, 0), (1, 3)]) +def test_replace_slice(data_col, scalar_repl, startstop): + pa_data_col, plc_data_col = data_col + pa_repl, plc_repl = scalar_repl + start, stop = startstop + got = plc.strings.replace.replace_slice( + plc_data_col, plc_repl, start, stop + ) + + if stop == -1: + # pyarrow doesn't support -1 as stop, so just set to really big number + + # TODO: once libcudf's count_characters() is migrated, we can call + # count_characters on the input, take the max and set stop to that + stop = 1000 + + expected = pa.compute.utf8_replace_slice(pa_data_col, start, stop, pa_repl) + + assert_column_eq(expected, got) + + +def test_replace_col(data_col, col_repl_target, col_repl): + pa_data_col, plc_data_col = data_col + pa_target, plc_target = col_repl_target + pa_repl, plc_repl = col_repl + got = plc.strings.replace.replace_multiple( + plc_data_col, plc_target, plc_repl + ) + + # 
There's nothing in pyarrow that does string replace with columns + # for targets/repls, so let's implement our own in python + + def replace_list(elem, targets, repls): + for target, repl in zip(targets, repls): + res = elem.replace(target, repl) + if res != elem: + return res + + targets = pa_target.to_pylist() + repls = pa_repl.to_pylist() + + expected = pa.array( + [ + replace_list(elem, targets, repls) if elem is not None else None + for elem in pa_data_col.to_pylist() + ], + type=pa.string(), + ) + + assert_column_eq(expected, got) From 57aeeb78d85e169ac18b82f51d2b1cbd01b0608d Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Wed, 5 Jun 2024 06:49:57 -1000 Subject: [PATCH 14/48] Make Frame._dtype an iterator instead of a dict (#15920) A lot of the usages of `Frame._dtype` didn't require the previous `dict` return type since that was just re-iterated over anyways. Also removed a redundant `tuple` call in `Frame._column_names` and `Frame._columns` Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Lawrence Mitchell (https://github.com/wence-) URL: https://github.com/rapidsai/cudf/pull/15920 --- python/cudf/cudf/core/dataframe.py | 4 ++-- python/cudf/cudf/core/frame.py | 16 +++++++--------- python/cudf/cudf/core/groupby/groupby.py | 16 +++------------- python/cudf/cudf/core/indexed_frame.py | 10 +++++----- python/cudf/cudf/io/csv.py | 5 ++--- python/cudf/cudf/io/json.py | 5 ++--- 6 files changed, 21 insertions(+), 35 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index c8f1e872300..9307267b227 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -1231,7 +1231,7 @@ def dtypes(self): string object dtype: object """ - return pd.Series(self._dtypes, dtype="object") + return pd.Series(dict(self._dtypes), dtype="object") @property def ndim(self) -> int: @@ -2834,7 +2834,7 @@ def reindex( return df._reindex( column_names=columns, - dtypes=self._dtypes, + dtypes=dict(self._dtypes), deep=copy, index=index, inplace=False, diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 7326696c994..af8886a44a6 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -79,18 +79,16 @@ def _num_rows(self) -> int: return self._data.nrows @property - def _column_names(self) -> Tuple[Any, ...]: # TODO: Tuple[str]? - return tuple(self._data.names) + def _column_names(self) -> Tuple[Any, ...]: + return self._data.names @property - def _columns(self) -> Tuple[Any, ...]: # TODO: Tuple[Column]? 
- return tuple(self._data.columns) + def _columns(self) -> Tuple[ColumnBase, ...]: + return self._data.columns @property - def _dtypes(self): - return dict( - zip(self._data.names, (col.dtype for col in self._data.columns)) - ) + def _dtypes(self) -> abc.Iterator: + return zip(self._data.names, (col.dtype for col in self._data.columns)) @property def ndim(self) -> int: @@ -1969,7 +1967,7 @@ def __dask_tokenize__(self): return [ type(self), - str(self._dtypes), + str(dict(self._dtypes)), normalize_token(self.to_pandas()), ] diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index ac8b381cbec..aa96051ea51 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -22,12 +22,7 @@ from cudf._lib.types import size_type_dtype from cudf._typing import AggType, DataFrameOrSeries, MultiColumnAggType from cudf.api.extensions import no_default -from cudf.api.types import ( - is_bool_dtype, - is_float_dtype, - is_list_like, - is_numeric_dtype, -) +from cudf.api.types import is_bool_dtype, is_list_like, is_numeric_dtype from cudf.core._compat import PANDAS_LT_300 from cudf.core.abc import Serializable from cudf.core.column.column import ColumnBase, StructDtype, as_column @@ -335,12 +330,8 @@ def dtypes(self): FutureWarning, ) index = self.grouping.keys.unique().sort_values().to_pandas() - obj_dtypes = self.obj._dtypes return pd.DataFrame( - { - name: [obj_dtypes[name]] * len(index) - for name in self.obj._data.names - }, + {name: [dtype] * len(index) for name, dtype in self.obj._dtypes}, index=index, ) @@ -499,8 +490,7 @@ def rank( # treats NaNs the way we treat nulls. if cudf.get_option("mode.pandas_compatible"): if any( - is_float_dtype(typ) - for typ in self.grouping.values._dtypes.values() + col.dtype.kind == "f" for col in self.grouping.values._columns ): raise NotImplementedError( "NaNs are not supported in groupby.rank." diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 688b268d478..ecfcec15337 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -891,7 +891,7 @@ def replace( ) = _get_replacement_values_for_columns( to_replace=to_replace, value=value, - columns_dtype_map=self._dtypes, + columns_dtype_map=dict(self._dtypes), ) for name, col in self._data.items(): @@ -6313,11 +6313,11 @@ def __dask_tokenize__(self): return [ type(self), - str(self._dtypes), + str(dict(self._dtypes)), *[ - normalize_token(cat.categories) - for cat in self._dtypes.values() - if cat == "category" + normalize_token(col.dtype.categories) + for col in self._columns + if col.dtype == "category" ], normalize_token(self.index), normalize_token(self.hash_values().values_host), diff --git a/python/cudf/cudf/io/csv.py b/python/cudf/cudf/io/csv.py index 3eeeac405b3..f07764e2ce4 100644 --- a/python/cudf/cudf/io/csv.py +++ b/python/cudf/cudf/io/csv.py @@ -132,10 +132,9 @@ def read_csv( # There exists some dtypes in the result columns that is inferred. # Find them and map them to the default dtypes. 
specified_dtypes = {} if dtype is None else dtype - df_dtypes = df._dtypes unspecified_dtypes = { - name: df_dtypes[name] - for name in df._column_names + name: dtype + for name, dtype in df._dtypes if name not in specified_dtypes } default_dtypes = {} diff --git a/python/cudf/cudf/io/json.py b/python/cudf/cudf/io/json.py index dd4a0d9eb07..fc3387d5117 100644 --- a/python/cudf/cudf/io/json.py +++ b/python/cudf/cudf/io/json.py @@ -147,10 +147,9 @@ def read_json( # There exists some dtypes in the result columns that is inferred. # Find them and map them to the default dtypes. specified_dtypes = {} if dtype is True else dtype - df_dtypes = df._dtypes unspecified_dtypes = { - name: df_dtypes[name] - for name in df._column_names + name: dtype + for name, dtype in df._dtypes if name not in specified_dtypes } default_dtypes = {} From 20aa4442d27ca858796c7890ad0542dbaee542e1 Mon Sep 17 00:00:00 2001 From: Matthew Murray <41342305+Matt711@users.noreply.github.com> Date: Wed, 5 Jun 2024 15:25:51 -0400 Subject: [PATCH 15/48] DOC: Add documentation for cudf.pandas in the Developer Guide (#15889) This PR provides documentation for cudf.pandas in the Developer Guide. It will describe the fast-slow proxy wrapping scheme as well as document the `CUDF_PANDAS_DEBUGGING` environment variable created in PR #15837 for issue #14975. Authors: - Matthew Murray (https://github.com/Matt711) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) - Lawrence Mitchell (https://github.com/wence-) URL: https://github.com/rapidsai/cudf/pull/15889 --- .../source/developer_guide/cudf_pandas.md | 121 ++++++++++++++++++ docs/cudf/source/developer_guide/index.md | 1 + 2 files changed, 122 insertions(+) create mode 100644 docs/cudf/source/developer_guide/cudf_pandas.md diff --git a/docs/cudf/source/developer_guide/cudf_pandas.md b/docs/cudf/source/developer_guide/cudf_pandas.md new file mode 100644 index 00000000000..aeb43f66b2d --- /dev/null +++ b/docs/cudf/source/developer_guide/cudf_pandas.md @@ -0,0 +1,121 @@ +# cudf.pandas +The use of the cuDF pandas accelerator mode (`cudf.pandas`) is explained [in the user guide](../cudf_pandas/index.rst). +The purpose of this document is to explain how the fast-slow proxy mechanism works and document internal environment variables that can be used to debug `cudf.pandas` itself. + +## fast-slow proxy mechanism +`cudf.pandas` works by wrapping each Pandas type and its corresponding cuDF type in a new proxy type also known as a fast-slow proxy type. +The purpose of proxy types is to attempt computations on the fast (cuDF) object first, and then fall back to running on the slow (Pandas) object if the fast version fails. + +### Types: +#### Wrapped Types and Proxy Types +The "wrapped" types/classes are the Pandas and cuDF specific types that have been wrapped into proxy types. +Wrapped objects and proxy objects are instances of wrapped types and proxy types, respectively. +In the snippet below `s1` and `s2` are wrapped objects and `s3` is a fast-slow proxy object. +Also note that the module `xpd` is a wrapped module and contains cuDF and Pandas modules as attributes. + ```python + import cudf.pandas + cudf.pandas.install() + import pandas as xpd + + cudf = xpd._fsproxy_fast + pd = xpd._fsproxy_slow + + s1 = cudf.Series([1,2]) + s2 = pd.Series([1,2]) + s3 = xpd.Series([1,2]) + ``` + +```{note} +Note that users should never have to interact with the wrapped objects directly in this way. +This code is purely for demonstrative purposes. 
+``` + +#### The Different Kinds of Proxy Types +In `cudf.pandas`, there are two main kinds of proxy types: final types and intermediate types. + +##### Final and Intermediate Proxy Types +Final types are types for which known operations exist for converting an object of a "fast" type to a "slow" type and vice versa. +For example, `cudf.DataFrame` can be converted to Pandas using the method `to_pandas`, and `pd.DataFrame` can be converted to cuDF using the function `cudf.from_pandas`. +Intermediate types are the types of the results of operations invoked on final types. +For example, `xpd.DataFrameGroupBy` is an intermediate type that will be created during a groupby operation on the final type `xpd.DataFrame`. + +##### Attributes and Callable Proxy Types +Final proxy types are typically classes or modules, both of which have attributes. +Classes also have methods. +These attributes and methods must be wrapped as well to support the fast-slow proxy scheme. + +#### Creating New Proxy Types +`_FinalProxy` and `_IntermediateProxy` types are created using the functions `make_final_proxy_type` and `make_intermediate_proxy` type, respectively. +Creating a new final type looks like this. + +```python +DataFrame = make_final_proxy_type( + "DataFrame", + cudf.DataFrame, + pd.DataFrame, + fast_to_slow=lambda fast: fast.to_pandas(), + slow_to_fast=cudf.from_pandas, +) +``` + +### The Fallback Mechanism +Proxied calls are implemented with fallback via [`_fast_slow_function_call`](https://github.com/rapidsai/cudf/blob/57aeeb78d85e169ac18b82f51d2b1cbd01b0608d/python/cudf/cudf/pandas/fast_slow_proxy.py#L869). This implements the mechanism by which we attempt operations the fast way (using cuDF) and then fall back to the slow way (using Pandas) on failure. +The function looks like this: +```python +def _fast_slow_function_call(func: Callable, *args, **kwargs): + try: + ... + fast_args, fast_kwargs = _fast_arg(args), _fast_arg(kwargs) + result = func(*fast_args, **fast_kwargs) + ... + except Exception: + ... + slow_args, slow_kwargs = _slow_arg(args), _slow_arg(kwargs) + result = func(*slow_args, **slow_kwargs) + ... + return _maybe_wrap_result(result, func, *args, **kwargs), fast +``` +As we can see the function attempts to call `func` the fast way using cuDF and if any `Exception` occurs, it calls the function using Pandas. +In essence, this `try-except` is what allows `cudf.pandas` to support the bulk of the Pandas API. + +At the end, the function wraps the result from either path in a fast-slow proxy object, if necessary. + +#### Converting Proxy Objects +Note that before the `func` is called, the proxy object and its attributes need to be converted to either their cuDF or Pandas implementations. +This conversion is handled in the function `_transform_arg` which both `_fast_arg` and `_slow_arg` call. + +`_transform_arg` is a recursive function that will call itself depending on the type or argument passed to it (eg. `_transform_arg` is called for each element in a list of arguments). + +### Using Metaclasses +`cudf.pandas` uses a [metaclass](https://docs.python.org/3/glossary.html#term-metaclass) called (`_FastSlowProxyMeta`) to find class attributes and classmethods of fast-slow proxy types. +For example, in the snippet below, the `xpd.Series` type is an instance of `_FastSlowProxyMeta`. +Therefore we can access the property `_fsproxy_fast` defined in the metaclass. 
+```python +import cudf.pandas +cudf.pandas.install() +import pandas as xpd + +print(xpd.Series._fsproxy_fast) # output is cudf.core.series.Series +``` + +## debugging `cudf.pandas` +Several environment variables are available for debugging purposes. + +Setting the environment variable `CUDF_PANDAS_DEBUGGING` produces a warning when the results from cuDF and Pandas differ from one another. +For example, the snippet below produces the warning below. +```python +import cudf.pandas +cudf.pandas.install() +import pandas as pd +import numpy as np + +setattr(pd.Series.mean, "_fsproxy_slow", lambda self, *args, **kwargs: np.float64(1)) +s = pd.Series([1,2,3]) +s.mean() +``` +``` +UserWarning: The results from cudf and pandas were different. The exception was +Arrays are not almost equal to 7 decimals + ACTUAL: 1.0 + DESIRED: 2.0. +``` diff --git a/docs/cudf/source/developer_guide/index.md b/docs/cudf/source/developer_guide/index.md index 5cafa8f784c..5e099631fc5 100644 --- a/docs/cudf/source/developer_guide/index.md +++ b/docs/cudf/source/developer_guide/index.md @@ -27,4 +27,5 @@ testing benchmarking options pylibcudf +cudf_pandas ``` From d91380ef393e9156c34a078998041a6affca7923 Mon Sep 17 00:00:00 2001 From: Robert Maynard Date: Wed, 5 Jun 2024 21:16:29 -0400 Subject: [PATCH 16/48] Allow tests to be built when stream util is disabled (#15933) Allows cudf to be built with `BUILD_SHARED_LIBS=OFF`, `CUDA_STATIC_RUNTIME=ON` and tests enabled Authors: - Robert Maynard (https://github.com/robertmaynard) Approvers: - Bradley Dice (https://github.com/bdice) - Nghia Truong (https://github.com/ttnghia) - Gera Shegalov (https://github.com/gerashegalov) - Mike Wilson (https://github.com/hyperbolic2346) URL: https://github.com/rapidsai/cudf/pull/15933 --- cpp/tests/CMakeLists.txt | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index 2f2c12f265c..a0d9083c4a4 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -68,12 +68,14 @@ function(ConfigureTest CMAKE_TEST_NAME) INSTALL_COMPONENT_SET testing ) - set_tests_properties( - ${CMAKE_TEST_NAME} - PROPERTIES - ENVIRONMENT - "GTEST_CUDF_STREAM_MODE=new_${_CUDF_TEST_STREAM_MODE}_default;LD_PRELOAD=$" - ) + if(CUDF_BUILD_STREAMS_TEST_UTIL) + set_tests_properties( + ${CMAKE_TEST_NAME} + PROPERTIES + ENVIRONMENT + "GTEST_CUDF_STREAM_MODE=new_${_CUDF_TEST_STREAM_MODE}_default;LD_PRELOAD=$" + ) + endif() endfunction() # ################################################################################################## @@ -401,14 +403,10 @@ ConfigureTest(SPAN_TEST utilities_tests/span_tests.cu) ConfigureTest(SPAN_TEST_DEVICE_VECTOR utilities_tests/span_tests.cu) # Overwrite the environments set by ConfigureTest -set_tests_properties( - SPAN_TEST - PROPERTIES - ENVIRONMENT - "GTEST_FILTER=-${_allowlist_filter};GTEST_CUDF_STREAM_MODE=new_cudf_default;LD_PRELOAD=$" -) -set_tests_properties( - SPAN_TEST_DEVICE_VECTOR PROPERTIES ENVIRONMENT "GTEST_FILTER=${_allowlist_filter}" +set_property( + TEST SPAN_TEST SPAN_TEST_DEVICE_VECTOR + APPEND + PROPERTY ENVIRONMENT "GTEST_FILTER=-${_allowlist_filter}" ) # ################################################################################################## @@ -671,9 +669,11 @@ target_include_directories(JIT_PARSER_TEST PRIVATE "$ Date: Wed, 5 Jun 2024 20:48:10 -0500 Subject: [PATCH 17/48] Migrate strings `contains` operations to `pylibcudf` (#15880) This PR creates pylibcudf strings `contains` APIs and 
migrates the cuDF cython to leverage them. Part of https://github.com/rapidsai/cudf/issues/15162. Authors: - https://github.com/brandon-b-miller Approvers: - Lawrence Mitchell (https://github.com/wence-) URL: https://github.com/rapidsai/cudf/pull/15880 --- .../api_docs/pylibcudf/strings/contains.rst | 6 ++ .../api_docs/pylibcudf/strings/index.rst | 1 + .../pylibcudf/libcudf/strings/CMakeLists.txt | 2 +- .../pylibcudf/libcudf/strings/regex_flags.pxd | 13 +++-- .../pylibcudf/libcudf/strings/regex_flags.pyx | 0 .../_lib/pylibcudf/strings/CMakeLists.txt | 4 +- .../cudf/_lib/pylibcudf/strings/__init__.pxd | 11 +++- .../cudf/_lib/pylibcudf/strings/__init__.py | 11 +++- .../cudf/_lib/pylibcudf/strings/contains.pxd | 7 +++ .../cudf/_lib/pylibcudf/strings/contains.pyx | 41 ++++++++++++++ .../_lib/pylibcudf/strings/regex_flags.pxd | 2 + .../_lib/pylibcudf/strings/regex_flags.pyx | 4 ++ .../_lib/pylibcudf/strings/regex_program.pxd | 10 ++++ .../_lib/pylibcudf/strings/regex_program.pyx | 37 +++++++++++++ python/cudf/cudf/_lib/strings/contains.pyx | 23 +++----- .../pylibcudf_tests/test_regex_program.py | 13 +++++ .../pylibcudf_tests/test_string_contains.py | 55 +++++++++++++++++++ 17 files changed, 215 insertions(+), 25 deletions(-) create mode 100644 docs/cudf/source/user_guide/api_docs/pylibcudf/strings/contains.rst create mode 100644 python/cudf/cudf/_lib/pylibcudf/libcudf/strings/regex_flags.pyx create mode 100644 python/cudf/cudf/_lib/pylibcudf/strings/contains.pxd create mode 100644 python/cudf/cudf/_lib/pylibcudf/strings/contains.pyx create mode 100644 python/cudf/cudf/_lib/pylibcudf/strings/regex_flags.pxd create mode 100644 python/cudf/cudf/_lib/pylibcudf/strings/regex_flags.pyx create mode 100644 python/cudf/cudf/_lib/pylibcudf/strings/regex_program.pxd create mode 100644 python/cudf/cudf/_lib/pylibcudf/strings/regex_program.pyx create mode 100644 python/cudf/cudf/pylibcudf_tests/test_regex_program.py create mode 100644 python/cudf/cudf/pylibcudf_tests/test_string_contains.py diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/contains.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/contains.rst new file mode 100644 index 00000000000..e5745331bc7 --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/contains.rst @@ -0,0 +1,6 @@ +======== +contains +======== + +.. automodule:: cudf._lib.pylibcudf.strings.contains + :members: diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/index.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/index.rst index 8970fc80c0b..bfaef732555 100644 --- a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/index.rst +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/index.rst @@ -4,4 +4,5 @@ strings .. toctree:: :maxdepth: 1 + contains replace diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/CMakeLists.txt b/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/CMakeLists.txt index 930c22781d0..bd6e2e0af02 100644 --- a/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/CMakeLists.txt +++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/CMakeLists.txt @@ -12,7 +12,7 @@ # the License. 
# ============================================================================= -set(cython_sources char_types.pyx) +set(cython_sources char_types.pyx regex_flags.pyx) set(linked_libraries cudf::cudf) diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/regex_flags.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/regex_flags.pxd index 2a5701fa6a3..41617f157b7 100644 --- a/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/regex_flags.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/regex_flags.pxd @@ -1,9 +1,12 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. +# Copyright (c) 2022-2024, NVIDIA CORPORATION. + +from libc.stdint cimport int32_t + cdef extern from "cudf/strings/regex/flags.hpp" \ namespace "cudf::strings" nogil: - ctypedef enum regex_flags: - DEFAULT 'cudf::strings::regex_flags::DEFAULT' - MULTILINE 'cudf::strings::regex_flags::MULTILINE' - DOTALL 'cudf::strings::regex_flags::DOTALL' + cpdef enum class regex_flags(int32_t): + DEFAULT + MULTILINE + DOTALL diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/regex_flags.pyx b/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/regex_flags.pyx new file mode 100644 index 00000000000..e69de29bb2d diff --git a/python/cudf/cudf/_lib/pylibcudf/strings/CMakeLists.txt b/python/cudf/cudf/_lib/pylibcudf/strings/CMakeLists.txt index c9a983e24f4..cb7f71b1912 100644 --- a/python/cudf/cudf/_lib/pylibcudf/strings/CMakeLists.txt +++ b/python/cudf/cudf/_lib/pylibcudf/strings/CMakeLists.txt @@ -12,7 +12,9 @@ # the License. # ============================================================================= -set(cython_sources capitalize.pyx case.pyx char_types.pyx find.pyx replace.pyx) +set(cython_sources capitalize.pyx case.pyx char_types.pyx contains.pyx find.pyx regex_flags.pyx + regex_program.pyx replace.pyx +) set(linked_libraries cudf::cudf) rapids_cython_create_modules( diff --git a/python/cudf/cudf/_lib/pylibcudf/strings/__init__.pxd b/python/cudf/cudf/_lib/pylibcudf/strings/__init__.pxd index 7563df8a107..959aa94737d 100644 --- a/python/cudf/cudf/_lib/pylibcudf/strings/__init__.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/strings/__init__.pxd @@ -1,3 +1,12 @@ # Copyright (c) 2024, NVIDIA CORPORATION. -from . cimport capitalize, case, char_types, find, replace +from . cimport ( + capitalize, + case, + char_types, + contains, + find, + regex_flags, + regex_program, + replace, +) diff --git a/python/cudf/cudf/_lib/pylibcudf/strings/__init__.py b/python/cudf/cudf/_lib/pylibcudf/strings/__init__.py index cb4f0e38f97..b7384913286 100644 --- a/python/cudf/cudf/_lib/pylibcudf/strings/__init__.py +++ b/python/cudf/cudf/_lib/pylibcudf/strings/__init__.py @@ -1,3 +1,12 @@ # Copyright (c) 2024, NVIDIA CORPORATION. -from . import capitalize, case, char_types, find, replace +from . import ( + capitalize, + case, + char_types, + contains, + find, + regex_flags, + regex_program, + replace, +) diff --git a/python/cudf/cudf/_lib/pylibcudf/strings/contains.pxd b/python/cudf/cudf/_lib/pylibcudf/strings/contains.pxd new file mode 100644 index 00000000000..275aa95d97e --- /dev/null +++ b/python/cudf/cudf/_lib/pylibcudf/strings/contains.pxd @@ -0,0 +1,7 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+ +from cudf._lib.pylibcudf.column cimport Column +from cudf._lib.pylibcudf.strings.regex_program cimport RegexProgram + + +cpdef Column contains_re(Column input, RegexProgram prog) diff --git a/python/cudf/cudf/_lib/pylibcudf/strings/contains.pyx b/python/cudf/cudf/_lib/pylibcudf/strings/contains.pyx new file mode 100644 index 00000000000..8c598b7c953 --- /dev/null +++ b/python/cudf/cudf/_lib/pylibcudf/strings/contains.pyx @@ -0,0 +1,41 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +from libcpp.memory cimport unique_ptr +from libcpp.utility cimport move + +from cudf._lib.pylibcudf.column cimport Column +from cudf._lib.pylibcudf.libcudf.column.column cimport column +from cudf._lib.pylibcudf.libcudf.strings cimport contains as cpp_contains +from cudf._lib.pylibcudf.strings.regex_program cimport RegexProgram + + +cpdef Column contains_re( + Column input, + RegexProgram prog +): + """Returns a boolean column identifying rows which match the given + regex_program object. + + For details, see :cpp:func:`cudf::strings::contains_re`. + + Parameters + ---------- + input : Column + The input strings + prog : RegexProgram + Regex program instance + + Returns + ------- + pylibcudf.Column + New column of boolean results for each string + """ + + cdef unique_ptr[column] result + + with nogil: + result = cpp_contains.contains_re( + input.view(), + prog.c_obj.get()[0] + ) + + return Column.from_libcudf(move(result)) diff --git a/python/cudf/cudf/_lib/pylibcudf/strings/regex_flags.pxd b/python/cudf/cudf/_lib/pylibcudf/strings/regex_flags.pxd new file mode 100644 index 00000000000..79937bf574a --- /dev/null +++ b/python/cudf/cudf/_lib/pylibcudf/strings/regex_flags.pxd @@ -0,0 +1,2 @@ +# Copyright (c) 2020-2024, NVIDIA CORPORATION. +from cudf._lib.pylibcudf.libcudf.strings.regex_flags cimport regex_flags diff --git a/python/cudf/cudf/_lib/pylibcudf/strings/regex_flags.pyx b/python/cudf/cudf/_lib/pylibcudf/strings/regex_flags.pyx new file mode 100644 index 00000000000..903c2ddd503 --- /dev/null +++ b/python/cudf/cudf/_lib/pylibcudf/strings/regex_flags.pyx @@ -0,0 +1,4 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from cudf._lib.pylibcudf.libcudf.strings.regex_flags import \ + regex_flags as RegexFlags # no-cython-lint diff --git a/python/cudf/cudf/_lib/pylibcudf/strings/regex_program.pxd b/python/cudf/cudf/_lib/pylibcudf/strings/regex_program.pxd new file mode 100644 index 00000000000..61ed268fb2d --- /dev/null +++ b/python/cudf/cudf/_lib/pylibcudf/strings/regex_program.pxd @@ -0,0 +1,10 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from libcpp.memory cimport unique_ptr +from libcpp.string cimport string + +from cudf._lib.pylibcudf.libcudf.strings.regex_program cimport regex_program + + +cdef class RegexProgram: + cdef unique_ptr[regex_program] c_obj diff --git a/python/cudf/cudf/_lib/pylibcudf/strings/regex_program.pyx b/python/cudf/cudf/_lib/pylibcudf/strings/regex_program.pyx new file mode 100644 index 00000000000..d605b0aba02 --- /dev/null +++ b/python/cudf/cudf/_lib/pylibcudf/strings/regex_program.pyx @@ -0,0 +1,37 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+ + +from libcpp.memory cimport unique_ptr +from libcpp.string cimport string +from libcpp.utility cimport move + +from cudf._lib.pylibcudf.libcudf.strings.regex_flags cimport regex_flags +from cudf._lib.pylibcudf.libcudf.strings.regex_program cimport regex_program + +from cudf._lib.pylibcudf.strings.regex_flags import RegexFlags +from cudf._lib.pylibcudf.strings.regex_flags cimport regex_flags + + +cdef class RegexProgram: + + def __init__(self, *args, **kwargs): + raise ValueError("Do not instantiate RegexProgram directly, use create") + + @staticmethod + def create(str pattern, int flags): + cdef unique_ptr[regex_program] c_prog + cdef regex_flags c_flags + cdef string c_pattern = pattern.encode() + + cdef RegexProgram ret = RegexProgram.__new__(RegexProgram) + if isinstance(flags, object): + if isinstance(flags, (int, RegexFlags)): + c_flags = flags + with nogil: + c_prog = regex_program.create(c_pattern, c_flags) + + ret.c_obj = move(c_prog) + else: + raise ValueError("flags must be of type RegexFlags") + + return ret diff --git a/python/cudf/cudf/_lib/strings/contains.pyx b/python/cudf/cudf/_lib/strings/contains.pyx index 087acd8062d..502a1d14696 100644 --- a/python/cudf/cudf/_lib/strings/contains.pyx +++ b/python/cudf/cudf/_lib/strings/contains.pyx @@ -14,7 +14,6 @@ from cudf._lib.pylibcudf.libcudf.column.column cimport column from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport string_scalar from cudf._lib.pylibcudf.libcudf.strings.contains cimport ( - contains_re as cpp_contains_re, count_re as cpp_count_re, like as cpp_like, matches_re as cpp_matches_re, @@ -23,6 +22,9 @@ from cudf._lib.pylibcudf.libcudf.strings.regex_flags cimport regex_flags from cudf._lib.pylibcudf.libcudf.strings.regex_program cimport regex_program from cudf._lib.scalar cimport DeviceScalar +from cudf._lib.pylibcudf.strings import contains +from cudf._lib.pylibcudf.strings.regex_program import RegexProgram + @acquire_spill_lock() def contains_re(Column source_strings, object reg_ex, uint32_t flags): @@ -30,21 +32,10 @@ def contains_re(Column source_strings, object reg_ex, uint32_t flags): Returns a Column of boolean values with True for `source_strings` that contain regular expression `reg_ex`. """ - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - - cdef string reg_ex_string = str(reg_ex).encode() - cdef regex_flags c_flags = flags - cdef unique_ptr[regex_program] c_prog - - with nogil: - c_prog = move(regex_program.create(reg_ex_string, c_flags)) - c_result = move(cpp_contains_re( - source_view, - dereference(c_prog) - )) - - return Column.from_unique_ptr(move(c_result)) + prog = RegexProgram.create(str(reg_ex), flags) + return Column.from_pylibcudf( + contains.contains_re(source_strings.to_pylibcudf(mode="read"), prog) + ) @acquire_spill_lock() diff --git a/python/cudf/cudf/pylibcudf_tests/test_regex_program.py b/python/cudf/cudf/pylibcudf_tests/test_regex_program.py new file mode 100644 index 00000000000..3a9bcec3616 --- /dev/null +++ b/python/cudf/cudf/pylibcudf_tests/test_regex_program.py @@ -0,0 +1,13 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+ +import pytest + +import cudf._lib.pylibcudf as plc + + +@pytest.mark.parametrize("pat", ["(", "*", "\\"]) +def test_regex_program_invalid(pat): + with pytest.raises(RuntimeError): + plc.strings.regex_program.RegexProgram.create( + pat, plc.strings.regex_flags.RegexFlags.DEFAULT + ) diff --git a/python/cudf/cudf/pylibcudf_tests/test_string_contains.py b/python/cudf/cudf/pylibcudf_tests/test_string_contains.py new file mode 100644 index 00000000000..8cdb6f7c521 --- /dev/null +++ b/python/cudf/cudf/pylibcudf_tests/test_string_contains.py @@ -0,0 +1,55 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +import pyarrow as pa +import pytest +from utils import assert_column_eq + +import cudf._lib.pylibcudf as plc + + +@pytest.fixture(scope="module") +def pa_target_col(): + return pa.array( + ["AbC", "de", "FGHI", "j", "kLm", "nOPq", None, "RsT", None, "uVw"] + ) + + +@pytest.fixture(scope="module") +def plc_target_col(pa_target_col): + return plc.interop.from_arrow(pa_target_col) + + +@pytest.fixture( + params=[ + "A", + "de", + ".*", + "^a", + "^A", + "[^a-z]", + "[a-z]{3,}", + "^[A-Z]{2,}", + "j|u", + ], + scope="module", +) +def pa_target_scalar(request): + return pa.scalar(request.param, type=pa.string()) + + +@pytest.fixture(scope="module") +def plc_target_pat(pa_target_scalar): + prog = plc.strings.regex_program.RegexProgram.create( + pa_target_scalar.as_py(), plc.strings.regex_flags.RegexFlags.DEFAULT + ) + return prog + + +def test_contains_re( + pa_target_col, plc_target_col, pa_target_scalar, plc_target_pat +): + got = plc.strings.contains.contains_re(plc_target_col, plc_target_pat) + expected = pa.compute.match_substring_regex( + pa_target_col, pa_target_scalar.as_py() + ) + assert_column_eq(got, expected) From 3b734ec2fd591f037fe1d8f8ce424c7049cb5a3e Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Thu, 6 Jun 2024 04:41:01 -0700 Subject: [PATCH 18/48] Start migrating I/O to pylibcudf (#15899) xref #15162 Starts migrating cudf I/O cython to use pylibcudf APIs, starting with avro. 
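For reference, a minimal sketch of how the new pylibcudf Avro reader introduced in this patch is exercised; it mirrors the `test_avro.py` test added below, and the fastavro-generated in-memory buffer, the single `x` field, and its values are illustrative assumptions rather than part of the patch itself.

```python
# Build a tiny Avro container in memory (illustrative data), then read it back
# through the new pylibcudf API added in this patch.
import io

import fastavro

import cudf._lib.pylibcudf as plc

schema = fastavro.parse_schema(
    {
        "type": "record",
        "name": "example",
        "fields": [{"name": "x", "type": "long"}],
    }
)
buffer = io.BytesIO()
fastavro.writer(buffer, schema, [{"x": 1}, {"x": 2}, {"x": 3}])
buffer.seek(0)

result = plc.io.avro.read_avro(
    plc.io.types.SourceInfo([buffer]),  # also accepts file paths or bytes
    columns=["x"],  # omitting columns (or passing an empty list) reads every column
    skip_rows=0,
    num_rows=-1,  # -1 reads all remaining rows
)
print(result.column_names)     # ['x']
print(result.tbl.num_rows())   # 3
```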
Authors: - Thomas Li (https://github.com/lithomas1) - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Lawrence Mitchell (https://github.com/wence-) URL: https://github.com/rapidsai/cudf/pull/15899 --- .../user_guide/api_docs/pylibcudf/index.rst | 1 + .../user_guide/api_docs/pylibcudf/io/avro.rst | 6 + .../api_docs/pylibcudf/io/index.rst | 18 +++ python/cudf/cudf/_lib/avro.pyx | 50 ++----- python/cudf/cudf/_lib/csv.pyx | 8 +- python/cudf/cudf/_lib/parquet.pyx | 2 +- .../cudf/cudf/_lib/pylibcudf/CMakeLists.txt | 1 + .../cudf/_lib/pylibcudf/io/CMakeLists.txt | 25 ++++ .../cudf/cudf/_lib/pylibcudf/io/__init__.pxd | 4 + .../cudf/cudf/_lib/pylibcudf/io/__init__.py | 4 + python/cudf/cudf/_lib/pylibcudf/io/avro.pxd | 12 ++ python/cudf/cudf/_lib/pylibcudf/io/avro.pyx | 58 +++++++++ python/cudf/cudf/_lib/pylibcudf/io/types.pxd | 29 +++++ python/cudf/cudf/_lib/pylibcudf/io/types.pyx | 110 ++++++++++++++++ .../cudf/_lib/pylibcudf/libcudf/io/orc.pxd | 6 +- .../cudf/_lib/pylibcudf/libcudf/io/types.pxd | 58 ++++----- python/cudf/cudf/_lib/utils.pxd | 1 + python/cudf/cudf/_lib/utils.pyx | 11 ++ .../cudf/cudf/pylibcudf_tests/common/utils.py | 17 +++ python/cudf/cudf/pylibcudf_tests/test_avro.py | 123 ++++++++++++++++++ .../cudf/pylibcudf_tests/test_source_info.py | 69 ++++++++++ 21 files changed, 541 insertions(+), 72 deletions(-) create mode 100644 docs/cudf/source/user_guide/api_docs/pylibcudf/io/avro.rst create mode 100644 docs/cudf/source/user_guide/api_docs/pylibcudf/io/index.rst create mode 100644 python/cudf/cudf/_lib/pylibcudf/io/CMakeLists.txt create mode 100644 python/cudf/cudf/_lib/pylibcudf/io/__init__.pxd create mode 100644 python/cudf/cudf/_lib/pylibcudf/io/__init__.py create mode 100644 python/cudf/cudf/_lib/pylibcudf/io/avro.pxd create mode 100644 python/cudf/cudf/_lib/pylibcudf/io/avro.pyx create mode 100644 python/cudf/cudf/_lib/pylibcudf/io/types.pxd create mode 100644 python/cudf/cudf/_lib/pylibcudf/io/types.pyx create mode 100644 python/cudf/cudf/pylibcudf_tests/test_avro.py create mode 100644 python/cudf/cudf/pylibcudf_tests/test_source_info.py diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst index b6ad1157511..870ed8856d1 100644 --- a/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst @@ -17,6 +17,7 @@ This page provides API documentation for pylibcudf. filling gpumemoryview groupby + io/index.rst join lists merge diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/io/avro.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/io/avro.rst new file mode 100644 index 00000000000..495bd505fdc --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/io/avro.rst @@ -0,0 +1,6 @@ +==== +Avro +==== + +.. automodule:: cudf._lib.pylibcudf.io.avro + :members: diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/io/index.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/io/index.rst new file mode 100644 index 00000000000..0d53ac92db9 --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/io/index.rst @@ -0,0 +1,18 @@ +=== +I/O +=== + +I/O Utility Classes +=================== + +.. automodule:: cudf._lib.pylibcudf.io.types + :members: + + +I/O Functions +============= + +.. 
toctree:: + :maxdepth: 1 + + avro diff --git a/python/cudf/cudf/_lib/avro.pyx b/python/cudf/cudf/_lib/avro.pyx index ae17a5f1ab6..3c132b22880 100644 --- a/python/cudf/cudf/_lib/avro.pyx +++ b/python/cudf/cudf/_lib/avro.pyx @@ -1,20 +1,12 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. -from libcpp.string cimport string -from libcpp.utility cimport move -from libcpp.vector cimport vector +from cudf._lib.utils cimport data_from_pylibcudf_io -from cudf._lib.io.utils cimport make_source_info -from cudf._lib.pylibcudf.libcudf.io.avro cimport ( - avro_reader_options, - read_avro as libcudf_read_avro, -) -from cudf._lib.pylibcudf.libcudf.io.types cimport table_with_metadata -from cudf._lib.pylibcudf.libcudf.types cimport size_type -from cudf._lib.utils cimport data_from_unique_ptr +import cudf._lib.pylibcudf as plc +from cudf._lib.pylibcudf.io.types import SourceInfo -cpdef read_avro(datasource, columns=None, skip_rows=-1, num_rows=-1): +cpdef read_avro(datasource, columns=None, skip_rows=0, num_rows=-1): """ Cython function to call libcudf read_avro, see `read_avro`. @@ -28,28 +20,14 @@ cpdef read_avro(datasource, columns=None, skip_rows=-1, num_rows=-1): if not isinstance(num_rows, int) or num_rows < -1: raise TypeError("num_rows must be an int >= -1") - if not isinstance(skip_rows, int) or skip_rows < -1: - raise TypeError("skip_rows must be an int >= -1") - - cdef vector[string] c_columns - if columns is not None and len(columns) > 0: - c_columns.reserve(len(columns)) - for col in columns: - c_columns.push_back(str(col).encode()) - - cdef avro_reader_options options = move( - avro_reader_options.builder(make_source_info([datasource])) - .columns(c_columns) - .skip_rows( skip_rows) - .num_rows( num_rows) - .build() + if not isinstance(skip_rows, int) or skip_rows < 0: + raise TypeError("skip_rows must be an int >= 0") + + return data_from_pylibcudf_io( + plc.io.avro.read_avro( + SourceInfo([datasource]), + columns, + skip_rows, + num_rows + ) ) - - cdef table_with_metadata c_result - - with nogil: - c_result = move(libcudf_read_avro(options)) - - names = [info.name.decode() for info in c_result.metadata.schema_info] - - return data_from_unique_ptr(move(c_result.tbl), column_names=names) diff --git a/python/cudf/cudf/_lib/csv.pyx b/python/cudf/cudf/_lib/csv.pyx index aa771295607..0b0bbdb2589 100644 --- a/python/cudf/cudf/_lib/csv.pyx +++ b/python/cudf/cudf/_lib/csv.pyx @@ -151,14 +151,14 @@ cdef csv_reader_options make_csv_reader_options( ) if quoting == 1: - c_quoting = quote_style.QUOTE_ALL + c_quoting = quote_style.ALL elif quoting == 2: - c_quoting = quote_style.QUOTE_NONNUMERIC + c_quoting = quote_style.NONNUMERIC elif quoting == 3: - c_quoting = quote_style.QUOTE_NONE + c_quoting = quote_style.NONE else: # Default value - c_quoting = quote_style.QUOTE_MINIMAL + c_quoting = quote_style.MINIMAL cdef csv_reader_options csv_reader_options_c = move( csv_reader_options.builder(c_source_info) diff --git a/python/cudf/cudf/_lib/parquet.pyx b/python/cudf/cudf/_lib/parquet.pyx index f0eef9be124..ac592cedaac 100644 --- a/python/cudf/cudf/_lib/parquet.pyx +++ b/python/cudf/cudf/_lib/parquet.pyx @@ -491,7 +491,7 @@ def write_parquet( "Valid values are '1.0' and '2.0'" ) - dict_policy = ( + cdef cudf_io_types.dictionary_policy dict_policy = ( cudf_io_types.dictionary_policy.ADAPTIVE if use_dictionary else cudf_io_types.dictionary_policy.NEVER diff --git a/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt b/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt index 7d0676f6def..6beb7b0f506 100644 --- 
a/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt +++ b/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt @@ -50,3 +50,4 @@ link_to_pyarrow_headers(pylibcudf_interop) add_subdirectory(libcudf) add_subdirectory(strings) +add_subdirectory(io) diff --git a/python/cudf/cudf/_lib/pylibcudf/io/CMakeLists.txt b/python/cudf/cudf/_lib/pylibcudf/io/CMakeLists.txt new file mode 100644 index 00000000000..2cfec101bab --- /dev/null +++ b/python/cudf/cudf/_lib/pylibcudf/io/CMakeLists.txt @@ -0,0 +1,25 @@ +# ============================================================================= +# Copyright (c) 2024, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. See the License for the specific language governing permissions and limitations under +# the License. +# ============================================================================= + +set(cython_sources avro.pyx types.pyx) + +set(linked_libraries cudf::cudf) +rapids_cython_create_modules( + CXX + SOURCE_FILES "${cython_sources}" + LINKED_LIBRARIES "${linked_libraries}" MODULE_PREFIX pylibcudf_io_ ASSOCIATED_TARGETS cudf +) + +set(targets_using_arrow_headers pylibcudf_io_avro pylibcudf_io_types) +link_to_pyarrow_headers("${targets_using_arrow_headers}") diff --git a/python/cudf/cudf/_lib/pylibcudf/io/__init__.pxd b/python/cudf/cudf/_lib/pylibcudf/io/__init__.pxd new file mode 100644 index 00000000000..250292746c1 --- /dev/null +++ b/python/cudf/cudf/_lib/pylibcudf/io/__init__.pxd @@ -0,0 +1,4 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from . cimport avro, types +from .types cimport SourceInfo, TableWithMetadata diff --git a/python/cudf/cudf/_lib/pylibcudf/io/__init__.py b/python/cudf/cudf/_lib/pylibcudf/io/__init__.py new file mode 100644 index 00000000000..5242c741911 --- /dev/null +++ b/python/cudf/cudf/_lib/pylibcudf/io/__init__.py @@ -0,0 +1,4 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from . import avro, types +from .types import SourceInfo, TableWithMetadata diff --git a/python/cudf/cudf/_lib/pylibcudf/io/avro.pxd b/python/cudf/cudf/_lib/pylibcudf/io/avro.pxd new file mode 100644 index 00000000000..3695f36a6e7 --- /dev/null +++ b/python/cudf/cudf/_lib/pylibcudf/io/avro.pxd @@ -0,0 +1,12 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +from cudf._lib.pylibcudf.io.types cimport SourceInfo, TableWithMetadata +from cudf._lib.pylibcudf.libcudf.io.avro cimport avro_reader_options +from cudf._lib.pylibcudf.libcudf.types cimport size_type + + +cpdef TableWithMetadata read_avro( + SourceInfo source_info, + list columns = *, + size_type skip_rows = *, + size_type num_rows = * +) diff --git a/python/cudf/cudf/_lib/pylibcudf/io/avro.pyx b/python/cudf/cudf/_lib/pylibcudf/io/avro.pyx new file mode 100644 index 00000000000..946e0896fc8 --- /dev/null +++ b/python/cudf/cudf/_lib/pylibcudf/io/avro.pyx @@ -0,0 +1,58 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+ +from libcpp.string cimport string +from libcpp.utility cimport move +from libcpp.vector cimport vector + +from cudf._lib.pylibcudf.io.types cimport SourceInfo, TableWithMetadata +from cudf._lib.pylibcudf.libcudf.io.avro cimport ( + avro_reader_options, + read_avro as cpp_read_avro, +) +from cudf._lib.pylibcudf.libcudf.types cimport size_type + + +cpdef TableWithMetadata read_avro( + SourceInfo source_info, + list columns = None, + size_type skip_rows = 0, + size_type num_rows = -1 +): + """ + Reads an Avro dataset into a set of columns. + + Parameters + ---------- + source_info: SourceInfo + The SourceInfo object to read the avro dataset from. + columns: list, default None + Optional columns to read, if not provided, reads all columns in the file. + skip_rows: size_type, default 0 + The number of rows to skip. + num_rows: size_type, default -1 + The number of rows to read, after skipping rows. + If -1 is passed, all rows will be read. + + Returns + ------- + TableWithMetadata + The Table and its corresponding metadata that was read in. + """ + cdef vector[string] c_columns + if columns is not None and len(columns) > 0: + c_columns.reserve(len(columns)) + for col in columns: + c_columns.push_back(str(col).encode()) + + cdef avro_reader_options avro_opts = move( + avro_reader_options.builder(source_info.c_obj) + .columns(c_columns) + .skip_rows(skip_rows) + .num_rows(num_rows) + .build() + ) + + with nogil: + c_result = move(cpp_read_avro(avro_opts)) + + return TableWithMetadata.from_libcudf(c_result) diff --git a/python/cudf/cudf/_lib/pylibcudf/io/types.pxd b/python/cudf/cudf/_lib/pylibcudf/io/types.pxd new file mode 100644 index 00000000000..aa846a47343 --- /dev/null +++ b/python/cudf/cudf/_lib/pylibcudf/io/types.pxd @@ -0,0 +1,29 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +from cudf._lib.pylibcudf.libcudf.io.types cimport ( + column_encoding, + column_in_metadata, + column_name_info, + compression_type, + dictionary_policy, + io_type, + partition_info, + quote_style, + sink_info, + source_info, + statistics_freq, + table_input_metadata, + table_metadata, + table_with_metadata, +) +from cudf._lib.pylibcudf.table cimport Table + + +cdef class TableWithMetadata: + cdef public Table tbl + cdef table_metadata metadata + + @staticmethod + cdef TableWithMetadata from_libcudf(table_with_metadata& tbl) + +cdef class SourceInfo: + cdef source_info c_obj diff --git a/python/cudf/cudf/_lib/pylibcudf/io/types.pyx b/python/cudf/cudf/_lib/pylibcudf/io/types.pyx new file mode 100644 index 00000000000..cd777232b33 --- /dev/null +++ b/python/cudf/cudf/_lib/pylibcudf/io/types.pyx @@ -0,0 +1,110 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from libcpp.string cimport string +from libcpp.utility cimport move +from libcpp.vector cimport vector + +from cudf._lib.pylibcudf.libcudf.io.types cimport ( + host_buffer, + source_info, + table_with_metadata, +) + +import errno +import io +import os + + +cdef class TableWithMetadata: + """A container holding a table and its associated metadata + (e.g. column names) + + For details, see :cpp:class:`cudf::io::table_with_metadata`. 
+ """ + + @property + def columns(self): + """ + Return a list containing the columns of the table + """ + return self.tbl.columns() + + @property + def column_names(self): + """ + Return a list containing the column names of the table + """ + cdef list names = [] + for col_info in self.metadata.schema_info: + # TODO: Handle nesting (columns with child columns) + assert col_info.children.size() == 0, "Child column names are not handled!" + names.append(col_info.name.decode()) + return names + + @staticmethod + cdef TableWithMetadata from_libcudf(table_with_metadata& tbl_with_meta): + """Create a Python TableWithMetadata from a libcudf table_with_metadata""" + cdef TableWithMetadata out = TableWithMetadata.__new__(TableWithMetadata) + out.tbl = Table.from_libcudf(move(tbl_with_meta.tbl)) + out.metadata = tbl_with_meta.metadata + return out + +cdef class SourceInfo: + """A class containing details on a source to read from. + + For details, see :cpp:class:`cudf::io::source_info`. + + Parameters + ---------- + sources : List[Union[str, os.PathLike, bytes, io.BytesIO]] + A homogeneous list of sources (this can be a string filename, + an os.PathLike, bytes, or an io.BytesIO) to read from. + + Mixing different types of sources will raise a `ValueError`. + """ + + def __init__(self, list sources): + if not sources: + raise ValueError("Need to pass at least one source") + + cdef vector[string] c_files + + if isinstance(sources[0], (os.PathLike, str)): + c_files.reserve(len(sources)) + + for src in sources: + if not isinstance(src, (os.PathLike, str)): + raise ValueError("All sources must be of the same type!") + if not os.path.isfile(src): + raise FileNotFoundError(errno.ENOENT, + os.strerror(errno.ENOENT), + src) + + c_files.push_back( str(src).encode()) + + self.c_obj = move(source_info(c_files)) + return + + # TODO: host_buffer is deprecated API, use host_span instead + cdef vector[host_buffer] c_host_buffers + cdef const unsigned char[::1] c_buffer + cdef bint empty_buffer = False + if isinstance(sources[0], bytes): + empty_buffer = True + for buffer in sources: + if not isinstance(buffer, bytes): + raise ValueError("All sources must be of the same type!") + if (len(buffer) > 0): + c_buffer = buffer + c_host_buffers.push_back(host_buffer(&c_buffer[0], + c_buffer.shape[0])) + empty_buffer = False + elif isinstance(sources[0], io.BytesIO): + for bio in sources: + if not isinstance(bio, io.BytesIO): + raise ValueError("All sources must be of the same type!") + c_buffer = bio.getbuffer() # check if empty? 
+ c_host_buffers.push_back(host_buffer(&c_buffer[0], + c_buffer.shape[0])) + + self.c_obj = source_info(c_host_buffers) diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/io/orc.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/io/orc.pxd index e553515dfdf..25f91849dea 100644 --- a/python/cudf/cudf/_lib/pylibcudf/libcudf/io/orc.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/io/orc.pxd @@ -94,7 +94,9 @@ cdef extern from "cudf/io/orc.hpp" \ orc_writer_options_builder& compression( cudf_io_types.compression_type comp ) except + - orc_writer_options_builder& enable_statistics(bool val) except + + orc_writer_options_builder& enable_statistics( + cudf_io_types.statistics_freq val + ) except + orc_writer_options_builder& stripe_size_bytes(size_t val) except + orc_writer_options_builder& stripe_size_rows(size_type val) except + orc_writer_options_builder& row_index_stride(size_type val) except + @@ -147,7 +149,7 @@ cdef extern from "cudf/io/orc.hpp" \ cudf_io_types.compression_type comp ) except + chunked_orc_writer_options_builder& enable_statistics( - bool val + cudf_io_types.statistics_freq val ) except + orc_writer_options_builder& stripe_size_bytes(size_t val) except + orc_writer_options_builder& stripe_size_rows(size_type val) except + diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/io/types.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/io/types.pxd index 38fae1df1e5..8d87deb1472 100644 --- a/python/cudf/cudf/_lib/pylibcudf/libcudf/io/types.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/io/types.pxd @@ -20,45 +20,45 @@ from cudf._lib.pylibcudf.libcudf.types cimport size_type cdef extern from "cudf/io/types.hpp" \ namespace "cudf::io" nogil: - ctypedef enum quote_style: - QUOTE_MINIMAL "cudf::io::quote_style::MINIMAL" - QUOTE_ALL "cudf::io::quote_style::ALL" - QUOTE_NONNUMERIC "cudf::io::quote_style::NONNUMERIC" - QUOTE_NONE "cudf::io::quote_style::NONE" - - ctypedef enum compression_type: - NONE "cudf::io::compression_type::NONE" - AUTO "cudf::io::compression_type::AUTO" - SNAPPY "cudf::io::compression_type::SNAPPY" - GZIP "cudf::io::compression_type::GZIP" - BZIP2 "cudf::io::compression_type::BZIP2" - BROTLI "cudf::io::compression_type::BROTLI" - ZIP "cudf::io::compression_type::ZIP" - XZ "cudf::io::compression_type::XZ" - ZLIB "cudf::io::compression_type::ZLIB" - LZ4 "cudf::io::compression_type::LZ4" - LZO "cudf::io::compression_type::LZO" - ZSTD "cudf::io::compression_type::ZSTD" - - ctypedef enum io_type: - FILEPATH "cudf::io::io_type::FILEPATH" - HOST_BUFFER "cudf::io::io_type::HOST_BUFFER" - VOID "cudf::io::io_type::VOID" - USER_IMPLEMENTED "cudf::io::io_type::USER_IMPLEMENTED" - - ctypedef enum statistics_freq: + cpdef enum class quote_style(int32_t): + MINIMAL + ALL + NONNUMERIC + NONE + + cpdef enum class compression_type(int32_t): + NONE + AUTO + SNAPPY + GZIP + BZIP2 + BROTLI + ZIP + XZ + ZLIB + LZ4 + LZO + ZSTD + + cpdef enum class io_type(int32_t): + FILEPATH + HOST_BUFFER + VOID + USER_IMPLEMENTED + + cpdef enum class statistics_freq(int32_t): STATISTICS_NONE = 0, STATISTICS_ROWGROUP = 1, STATISTICS_PAGE = 2, STATISTICS_COLUMN = 3, - ctypedef enum dictionary_policy: + cpdef enum class dictionary_policy(int32_t): NEVER = 0, ADAPTIVE = 1, ALWAYS = 2, cdef extern from "cudf/io/types.hpp" namespace "cudf::io" nogil: - cpdef enum class column_encoding: + cpdef enum class column_encoding(int32_t): USE_DEFAULT = -1 DICTIONARY = 0 PLAIN = 1 diff --git a/python/cudf/cudf/_lib/utils.pxd b/python/cudf/cudf/_lib/utils.pxd index c5a1e7552b9..99850d549a1 100644 --- 
a/python/cudf/cudf/_lib/utils.pxd +++ b/python/cudf/cudf/_lib/utils.pxd @@ -11,6 +11,7 @@ from cudf._lib.pylibcudf.libcudf.table.table cimport table, table_view cdef data_from_unique_ptr( unique_ptr[table] c_tbl, column_names, index_names=*) cdef data_from_pylibcudf_table(tbl, column_names, index_names=*) +cdef data_from_pylibcudf_io(tbl_with_meta) cdef data_from_table_view( table_view tv, object owner, object column_names, object index_names=*) cdef table_view table_view_from_columns(columns) except * diff --git a/python/cudf/cudf/_lib/utils.pyx b/python/cudf/cudf/_lib/utils.pyx index 4c4cd48d6ed..de6b9f690b6 100644 --- a/python/cudf/cudf/_lib/utils.pyx +++ b/python/cudf/cudf/_lib/utils.pyx @@ -315,6 +315,17 @@ cdef data_from_pylibcudf_table(tbl, column_names, index_names=None): index_names ) +cdef data_from_pylibcudf_io(tbl_with_meta): + """ + Unpacks the TableWithMetadata from libcudf I/O + into a dict of columns and an Index (cuDF format) + """ + return _data_from_columns( + columns=[Column.from_pylibcudf(plc) for plc in tbl_with_meta.columns], + column_names=tbl_with_meta.column_names, + index_names=None + ) + cdef columns_from_table_view( table_view tv, object owners, diff --git a/python/cudf/cudf/pylibcudf_tests/common/utils.py b/python/cudf/cudf/pylibcudf_tests/common/utils.py index e00053529a8..54d38f1a8cf 100644 --- a/python/cudf/cudf/pylibcudf_tests/common/utils.py +++ b/python/cudf/cudf/pylibcudf_tests/common/utils.py @@ -63,6 +63,23 @@ def assert_table_eq(pa_table: pa.Table, plc_table: plc.Table) -> None: assert_column_eq(pa_col, plc_col) +def assert_table_and_meta_eq( + plc_table_w_meta: plc.io.types.TableWithMetadata, pa_table: pa.Table +) -> None: + """Verify that the pylibcudf TableWithMetadata and PyArrow table are equal""" + + plc_table = plc_table_w_meta.tbl + + plc_shape = (plc_table.num_rows(), plc_table.num_columns()) + assert plc_shape == pa_table.shape + + for plc_col, pa_col in zip(plc_table.columns(), pa_table.columns): + assert_column_eq(plc_col, pa_col) + + # Check column name equality + assert plc_table_w_meta.column_names == pa_table.column_names + + def cudf_raises(expected_exception: BaseException, *args, **kwargs): # A simple wrapper around pytest.raises that defaults to looking for cudf exceptions match = kwargs.get("match", None) diff --git a/python/cudf/cudf/pylibcudf_tests/test_avro.py b/python/cudf/cudf/pylibcudf_tests/test_avro.py new file mode 100644 index 00000000000..d6cd86768cd --- /dev/null +++ b/python/cudf/cudf/pylibcudf_tests/test_avro.py @@ -0,0 +1,123 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+ +import io +import itertools + +import fastavro +import pyarrow as pa +import pytest +from utils import assert_table_and_meta_eq + +import cudf._lib.pylibcudf as plc + +avro_dtype_pairs = [ + ("boolean", pa.bool_()), + ("int", pa.int32()), + ("long", pa.int64()), + ("float", pa.float32()), + ("double", pa.float64()), + ("bytes", pa.string()), + ("string", pa.string()), +] + + +@pytest.fixture( + scope="module", params=itertools.combinations(avro_dtype_pairs, 2) +) +def avro_dtypes(request): + return request.param + + +@pytest.fixture +def avro_dtype_data(avro_dtypes): + (avro_type1, _), (avro_type2, _) = avro_dtypes + + def _get_data(avro_type): + if avro_type == "boolean": + return [True, False, True] + elif avro_type in {"int", "long"}: + return [1, 2, -1] + elif avro_type in {"float", "double"}: + return [1.0, 3.1415, -3.1415] + elif avro_type == "bytes": + return [b"a", b"b", b"c"] + elif avro_type == "string": + return ["Hello", "World!", ""] + + return _get_data(avro_type1), _get_data(avro_type2) + + +@pytest.fixture( + params=[ + (0, 0), + (0, -1), + (1, -1), + (3, -1), + ] +) +def row_opts(request): + """ + (skip_rows, num_rows) combos for the avro reader + """ + return request.param + + +@pytest.mark.parametrize("columns", [["prop1"], [], ["prop1", "prop2"]]) +@pytest.mark.parametrize("nullable", [True, False]) +def test_read_avro(avro_dtypes, avro_dtype_data, row_opts, columns, nullable): + (avro_type1, expected_type1), (avro_type2, expected_type2) = avro_dtypes + + avro_type1 = avro_type1 if not nullable else ["null", avro_type1] + avro_type2 = avro_type2 if not nullable else ["null", avro_type2] + + skip_rows, num_rows = row_opts + + schema = fastavro.parse_schema( + { + "type": "record", + "name": "test", + "fields": [ + {"name": "prop1", "type": avro_type1}, + {"name": "prop2", "type": avro_type2}, + ], + } + ) + + if nullable: + avro_dtype_data = ( + avro_dtype_data[0] + [None], + avro_dtype_data[1] + [None], + ) + + records = [ + {"prop1": val1, "prop2": val2} for val1, val2 in zip(*avro_dtype_data) + ] + + buffer = io.BytesIO() + fastavro.writer(buffer, schema, records) + buffer.seek(0) + + res = plc.io.avro.read_avro( + plc.io.types.SourceInfo([buffer]), + columns=columns, + skip_rows=skip_rows, + num_rows=num_rows, + ) + + expected = pa.Table.from_arrays( + [ + pa.array(avro_dtype_data[0], type=expected_type1), + pa.array(avro_dtype_data[1], type=expected_type2), + ], + names=["prop1", "prop2"], + ) + + # Adjust for skip_rows/num_rows in result + length = num_rows if num_rows != -1 else None + expected = expected.slice(skip_rows, length=length) + + # adjust for # of columns + if columns != []: + expected = expected.select(columns) + + assert_table_and_meta_eq(res, expected) diff --git a/python/cudf/cudf/pylibcudf_tests/test_source_info.py b/python/cudf/cudf/pylibcudf_tests/test_source_info.py new file mode 100644 index 00000000000..71a3ecbcc30 --- /dev/null +++ b/python/cudf/cudf/pylibcudf_tests/test_source_info.py @@ -0,0 +1,69 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+ +import io + +import pytest + +import cudf._lib.pylibcudf as plc + + +@pytest.mark.parametrize( + "source", ["a.txt", b"hello world", io.BytesIO(b"hello world")] +) +def test_source_info_ctor(source, tmp_path): + if isinstance(source, str): + file = tmp_path / source + file.write_bytes("hello world".encode("utf-8")) + source = str(file) + + plc.io.SourceInfo([source]) + + # TODO: test contents of source_info buffer is correct + # once buffers are exposed on python side + + +@pytest.mark.parametrize( + "sources", + [ + ["a.txt", "a.txt"], + [b"hello world", b"hello there"], + [io.BytesIO(b"hello world"), io.BytesIO(b"hello there")], + ], +) +def test_source_info_ctor_multiple(sources, tmp_path): + for i in range(len(sources)): + source = sources[i] + if isinstance(source, str): + file = tmp_path / source + file.write_bytes("hello world".encode("utf-8")) + sources[i] = str(file) + + plc.io.SourceInfo(sources) + + # TODO: test contents of source_info buffer is correct + # once buffers are exposed on python side + + +@pytest.mark.parametrize( + "sources", + [ + ["awef.txt", b"hello world", io.BytesIO(b"hello world")], + [b"hello world", b"hello there", "awef.txt"], + [ + io.BytesIO(b"hello world"), + io.BytesIO(b"hello there"), + b"hello world", + ], + ], +) +def test_source_info_ctor_mixing_invalid(sources, tmp_path): + # Unlike the previous test + # don't create files so that they are missing + for i in range(len(sources)): + source = sources[i] + if isinstance(source, str): + file = tmp_path / source + file.write_bytes("hello world".encode("utf-8")) + sources[i] = str(file) + with pytest.raises(ValueError): + plc.io.SourceInfo(sources) From d1e511edc88deb7604bed71b2689d72da0aed19a Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Thu, 6 Jun 2024 15:19:06 +0100 Subject: [PATCH 19/48] Introduce `NamedColumn` concept in cudf-polars (#15914) Simplify name tracking in expression evaluation by only requiring names for columns when putting them in to a `DataFrame`. At the same time, this allows us to have one place where we broadcast-expand `Scalar`s to the size of the `DataFrame`, so we can expunge tracking them in the `DataFrame` itself. Additionally, adapt to minor changes on the polars side in terms of translating the DSL: we no longer need to handle CSE expressions specially, and sorting by multiple keys takes a list of `descending` flags, rather than a single bool as previously. 
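
For illustration, a simplified sketch of the new flow (not the exact
code in this PR; see the `containers/column.py` and `dsl/expr.py` changes
below for the real definitions): expressions evaluate to unnamed
`Column`s, and a name is only attached when a `NamedExpr` places the
result into a `DataFrame`.

```python
# Simplified sketch; `some_expr` and `df` stand in for an already-translated
# expression and an input DataFrame (both assumed to exist).
from cudf_polars.containers import DataFrame, NamedColumn

unnamed = some_expr.evaluate(df)            # a plain Column, no name attached
named = NamedColumn(unnamed.obj, "result")  # name added only at this boundary
out = DataFrame([named])                    # DataFrame holds NamedColumns
```
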
Authors: - Lawrence Mitchell (https://github.com/wence-) Approvers: - Thomas Li (https://github.com/lithomas1) - Matthew Roeschke (https://github.com/mroeschke) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/15914 --- .../cudf_polars/containers/__init__.py | 4 +- .../cudf_polars/containers/column.py | 78 ++++-- .../cudf_polars/containers/dataframe.py | 59 ++--- python/cudf_polars/cudf_polars/dsl/expr.py | 239 +++++++++++------- python/cudf_polars/cudf_polars/dsl/ir.py | 176 ++++++++----- .../cudf_polars/cudf_polars/dsl/translate.py | 106 +++++--- .../cudf_polars/testing/asserts.py | 6 +- .../cudf_polars/cudf_polars/utils/dtypes.py | 3 +- .../cudf_polars/cudf_polars/utils/sorting.py | 12 +- python/cudf_polars/docs/overview.md | 101 +++++++- .../cudf_polars/tests/expressions/test_agg.py | 6 +- python/cudf_polars/tests/test_select.py | 21 ++ python/cudf_polars/tests/test_union.py | 5 - 13 files changed, 541 insertions(+), 275 deletions(-) diff --git a/python/cudf_polars/cudf_polars/containers/__init__.py b/python/cudf_polars/cudf_polars/containers/__init__.py index ef9d9ca61b6..ee69e748eb5 100644 --- a/python/cudf_polars/cudf_polars/containers/__init__.py +++ b/python/cudf_polars/cudf_polars/containers/__init__.py @@ -5,8 +5,8 @@ from __future__ import annotations -__all__: list[str] = ["DataFrame", "Column", "Scalar"] +__all__: list[str] = ["DataFrame", "Column", "NamedColumn", "Scalar"] -from cudf_polars.containers.column import Column +from cudf_polars.containers.column import Column, NamedColumn from cudf_polars.containers.dataframe import DataFrame from cudf_polars.containers.scalar import Scalar diff --git a/python/cudf_polars/cudf_polars/containers/column.py b/python/cudf_polars/cudf_polars/containers/column.py index 49034b5f5c8..575d15d3ece 100644 --- a/python/cudf_polars/cudf_polars/containers/column.py +++ b/python/cudf_polars/cudf_polars/containers/column.py @@ -13,24 +13,29 @@ if TYPE_CHECKING: from typing_extensions import Self -__all__: list[str] = ["Column"] +__all__: list[str] = ["Column", "NamedColumn"] class Column: - """A column, a name, and sortedness.""" + """A column with sortedness metadata.""" obj: plc.Column - name: str is_sorted: plc.types.Sorted order: plc.types.Order null_order: plc.types.NullOrder - def __init__(self, column: plc.Column, name: str): + def __init__( + self, + column: plc.Column, + *, + is_sorted: plc.types.Sorted = plc.types.Sorted.NO, + order: plc.types.Order = plc.types.Order.ASCENDING, + null_order: plc.types.NullOrder = plc.types.NullOrder.BEFORE, + ): self.obj = column - self.name = name - self.is_sorted = plc.types.Sorted.NO - self.order = plc.types.Order.ASCENDING - self.null_order = plc.types.NullOrder.BEFORE + self.is_sorted = is_sorted + self.order = order + self.null_order = null_order def sorted_like(self, like: Column, /) -> Self: """ @@ -81,22 +86,20 @@ def set_sorted( self.null_order = null_order return self - def copy(self, *, new_name: str | None = None) -> Self: + def copy(self) -> Self: """ - Return a shallow copy of the column. - - Parameters - ---------- - new_name - Optional new name for the copied column. + A shallow copy of the column. Returns ------- New column sharing data with self. 
""" return type(self)( - self.obj, self.name if new_name is None else new_name - ).sorted_like(self) + self.obj, + is_sorted=self.is_sorted, + order=self.order, + null_order=self.null_order, + ) def mask_nans(self) -> Self: """Return a copy of self with nans masked out.""" @@ -117,3 +120,44 @@ def nan_count(self) -> int: plc.DataType(plc.TypeId.INT32), ) ).as_py() + + +class NamedColumn(Column): + """A column with a name.""" + + name: str + + def __init__( + self, + column: plc.Column, + name: str, + *, + is_sorted: plc.types.Sorted = plc.types.Sorted.NO, + order: plc.types.Order = plc.types.Order.ASCENDING, + null_order: plc.types.NullOrder = plc.types.NullOrder.BEFORE, + ) -> None: + super().__init__( + column, is_sorted=is_sorted, order=order, null_order=null_order + ) + self.name = name + + def copy(self, *, new_name: str | None = None) -> Self: + """ + A shallow copy of the column. + + Parameters + ---------- + new_name + Optional new name for the copied column. + + Returns + ------- + New column sharing data with self. + """ + return type(self)( + self.obj, + self.name if new_name is None else new_name, + is_sorted=self.is_sorted, + order=self.order, + null_order=self.null_order, + ) diff --git a/python/cudf_polars/cudf_polars/containers/dataframe.py b/python/cudf_polars/cudf_polars/containers/dataframe.py index de21a280020..eeaf181be0c 100644 --- a/python/cudf_polars/cudf_polars/containers/dataframe.py +++ b/python/cudf_polars/cudf_polars/containers/dataframe.py @@ -12,7 +12,7 @@ import cudf._lib.pylibcudf as plc -from cudf_polars.containers.column import Column +from cudf_polars.containers.column import NamedColumn if TYPE_CHECKING: from collections.abc import Mapping, Sequence, Set @@ -21,7 +21,7 @@ import cudf - from cudf_polars.containers.scalar import Scalar + from cudf_polars.containers import Column __all__: list[str] = ["DataFrame"] @@ -30,26 +30,20 @@ class DataFrame: """A representation of a dataframe.""" - columns: list[Column] - scalars: list[Scalar] + columns: list[NamedColumn] table: plc.Table | None - def __init__(self, columns: Sequence[Column], scalars: Sequence[Scalar]) -> None: + def __init__(self, columns: Sequence[NamedColumn]) -> None: self.columns = list(columns) self._column_map = {c.name: c for c in self.columns} - self.scalars = list(scalars) - if len(scalars) == 0: - self.table = plc.Table([c.obj for c in columns]) - else: - self.table = None + self.table = plc.Table([c.obj for c in columns]) def copy(self) -> Self: """Return a shallow copy of self.""" - return type(self)(self.columns, self.scalars) + return type(self)(self.columns) def to_polars(self) -> pl.DataFrame: """Convert to a polars DataFrame.""" - assert len(self.scalars) == 0 return pl.from_arrow( plc.interop.to_arrow( self.table, @@ -83,8 +77,10 @@ def num_rows(self) -> int: def from_cudf(cls, df: cudf.DataFrame) -> Self: """Create from a cudf dataframe.""" return cls( - [Column(c.to_pylibcudf(mode="read"), name) for name, c in df._data.items()], - [], + [ + NamedColumn(c.to_pylibcudf(mode="read"), name) + for name, c in df._data.items() + ] ) @classmethod @@ -105,13 +101,16 @@ def from_table(cls, table: plc.Table, names: Sequence[str]) -> Self: Raises ------ - ValueError if the number of provided names does not match the - number of columns in the table. + ValueError + If the number of provided names does not match the + number of columns in the table. 
""" - # TODO: strict=True when we drop py39 if table.num_columns() != len(names): raise ValueError("Mismatching name and table length.") - return cls([Column(c, name) for c, name in zip(table.columns(), names)], []) + return cls( + # TODO: strict=True when we drop py39 + [NamedColumn(c, name) for c, name in zip(table.columns(), names)] + ) def sorted_like( self, like: DataFrame, /, *, subset: Set[str] | None = None @@ -132,18 +131,20 @@ def sorted_like( Raises ------ - ValueError if there is a name mismatch between self and like. + ValueError + If there is a name mismatch between self and like. """ if like.column_names != self.column_names: raise ValueError("Can only copy from identically named frame") subset = self.column_names_set if subset is None else subset self.columns = [ c.sorted_like(other) if c.name in subset else c + # TODO: strict=True when we drop py39 for c, other in zip(self.columns, like.columns) ] return self - def with_columns(self, columns: Sequence[Column]) -> Self: + def with_columns(self, columns: Sequence[NamedColumn]) -> Self: """ Return a new dataframe with extra columns. @@ -160,35 +161,31 @@ def with_columns(self, columns: Sequence[Column]) -> Self: ----- If column names overlap, newer names replace older ones. """ - return type(self)([*self.columns, *columns], self.scalars) + return type(self)([*self.columns, *columns]) def discard_columns(self, names: Set[str]) -> Self: """Drop columns by name.""" - return type(self)( - [c for c in self.columns if c.name not in names], self.scalars - ) + return type(self)([c for c in self.columns if c.name not in names]) def select(self, names: Sequence[str]) -> Self: """Select columns by name returning DataFrame.""" want = set(names) if not want.issubset(self.column_names_set): raise ValueError("Can't select missing names") - return type(self)([self._column_map[name] for name in names], self.scalars) + return type(self)([self._column_map[name] for name in names]) - def replace_columns(self, *columns: Column) -> Self: + def replace_columns(self, *columns: NamedColumn) -> Self: """Return a new dataframe with columns replaced by name.""" new = {c.name: c for c in columns} if not set(new).issubset(self.column_names_set): raise ValueError("Cannot replace with non-existing names") - return type(self)([new.get(c.name, c) for c in self.columns], self.scalars) + return type(self)([new.get(c.name, c) for c in self.columns]) def rename_columns(self, mapping: Mapping[str, str]) -> Self: """Rename some columns.""" - return type(self)( - [c.copy(new_name=mapping.get(c.name)) for c in self.columns], self.scalars - ) + return type(self)([c.copy(new_name=mapping.get(c.name)) for c in self.columns]) - def select_columns(self, names: Set[str]) -> list[Column]: + def select_columns(self, names: Set[str]) -> list[NamedColumn]: """Select columns by name.""" return [c for c in self.columns if c.name in names] diff --git a/python/cudf_polars/cudf_polars/dsl/expr.py b/python/cudf_polars/cudf_polars/dsl/expr.py index 7187a36f21c..c7c11cf6c68 100644 --- a/python/cudf_polars/cudf_polars/dsl/expr.py +++ b/python/cudf_polars/cudf_polars/dsl/expr.py @@ -26,11 +26,11 @@ import cudf._lib.pylibcudf as plc -from cudf_polars.containers import Column, Scalar +from cudf_polars.containers import Column, NamedColumn, Scalar from cudf_polars.utils import sorting if TYPE_CHECKING: - from collections.abc import Sequence + from collections.abc import Mapping, Sequence import polars.type_aliases as pl_types @@ -110,7 +110,7 @@ def get_hash(self) -> int: """ return 
hash((type(self), self._ctor_arguments(self.children))) - def __hash__(self): + def __hash__(self) -> int: """Hash of an expression with caching.""" try: return self._hash_value @@ -139,18 +139,18 @@ def is_equal(self, other: Any) -> bool: other.children ) - def __eq__(self, other): + def __eq__(self, other) -> bool: """Equality of expressions.""" if type(self) != type(other) or hash(self) != hash(other): return False else: return self.is_equal(other) - def __ne__(self, other): + def __ne__(self, other) -> bool: """Inequality of expressions.""" return not self.__eq__(other) - def __repr__(self): + def __repr__(self) -> str: """String representation of an expression with caching.""" try: return self._repr_value @@ -164,7 +164,7 @@ def do_evaluate( df: DataFrame, *, context: ExecutionContext = ExecutionContext.FRAME, - mapping: dict[Expr, Column] | None = None, + mapping: Mapping[Expr, Column] | None = None, ) -> Column: # TODO: return type is a lie for Literal """ Evaluate this expression given a dataframe for context. @@ -185,15 +185,6 @@ def do_evaluate( Do not call this function directly, but rather :meth:`evaluate` which handles the mapping lookups. - The typed return value of :class:`Column` is not true when - evaluating :class:`Literal` nodes (which instead produce - :class:`Scalar` objects). However, these duck-type to having a - pylibcudf container object inside them, and usually they end - up appearing in binary expressions which pylibcudf handles - appropriately since there are overloads for (column, scalar) - pairs. We don't have to handle (scalar, scalar) in binops - since the polars optimizer has a constant-folding pass. - Returns ------- Column representing the evaluation of the expression (or maybe @@ -201,9 +192,10 @@ def do_evaluate( Raises ------ - NotImplementedError if we couldn't evaluate the expression. - Ideally all these are returned during translation to the IR, - but for now we are not perfect. + NotImplementedError + If we couldn't evaluate the expression. Ideally all these + are returned during translation to the IR, but for now we + are not perfect. """ raise NotImplementedError(f"Evaluation of {type(self).__name__}") @@ -212,7 +204,7 @@ def evaluate( df: DataFrame, *, context: ExecutionContext = ExecutionContext.FRAME, - mapping: dict[Expr, Column] | None = None, + mapping: Mapping[Expr, Column] | None = None, ) -> Column: # TODO: return type is a lie for Literal """ Evaluate this expression given a dataframe for context. @@ -234,16 +226,26 @@ def evaluate( this method provides logic to handle lookups in the substitution mapping. + The typed return value of :class:`Column` is not true when + evaluating :class:`Literal` nodes (which instead produce + :class:`Scalar` objects). However, these duck-type to having a + pylibcudf container object inside them, and usually they end + up appearing in binary expressions which pylibcudf handles + appropriately since there are overloads for (column, scalar) + pairs. We don't have to handle (scalar, scalar) in binops + since the polars optimizer has a constant-folding pass. + Returns ------- Column representing the evaluation of the expression (or maybe - a scalar, annoying!). + a scalar). Raises ------ - NotImplementedError if we couldn't evaluate the expression. - Ideally all these are returned during translation to the IR, - but for now we are not perfect. + NotImplementedError + If we couldn't evaluate the expression. Ideally all these + are returned during translation to the IR, but for now we + are not perfect. 
""" if mapping is None: return self.do_evaluate(df, context=context, mapping=mapping) @@ -269,41 +271,74 @@ def collect_agg(self, *, depth: int) -> AggInfo: Raises ------ - NotImplementedError if we can't currently perform the - aggregation request (for example nested aggregations like - ``a.max().min()``). + NotImplementedError + If we can't currently perform the aggregation request, for + example nested aggregations like ``a.max().min()``. """ raise NotImplementedError( f"Collecting aggregation info for {type(self).__name__}" ) -class NamedExpr(Expr): - __slots__ = ("name", "children") - _non_child = ("dtype", "name") +class NamedExpr: + # NamedExpr does not inherit from Expr since it does not appear + # when evaluating expressions themselves, only when constructing + # named return values in dataframe (IR) nodes. + __slots__ = ("name", "value") - def __init__(self, dtype: plc.DataType, name: str, value: Expr) -> None: - super().__init__(dtype) + def __init__(self, name: str, value: Expr) -> None: self.name = name - self.children = (value,) + self.value = value + + def __hash__(self) -> int: + """Hash of the expression.""" + return hash((type(self), self.name, self.value)) + + def __repr__(self) -> str: + """Repr of the expression.""" + return f"NamedExpr({self.name}, {self.value}" + + def __eq__(self, other) -> bool: + """Equality of two expressions.""" + return ( + type(self) is type(other) + and self.name == other.name + and self.value == other.value + ) - def do_evaluate( + def __ne__(self, other) -> bool: + """Inequality of expressions.""" + return not self.__eq__(other) + + def evaluate( self, df: DataFrame, *, context: ExecutionContext = ExecutionContext.FRAME, - mapping: dict[Expr, Column] | None = None, - ) -> Column: + mapping: Mapping[Expr, Column] | None = None, + ) -> NamedColumn: """Evaluate this expression given a dataframe for context.""" - (child,) = self.children - return Column( - child.evaluate(df, context=context, mapping=mapping).obj, self.name - ) + obj = self.value.evaluate(df, context=context, mapping=mapping) + if isinstance(obj, Scalar): + return NamedColumn( + plc.Column.from_scalar(obj.obj, 1), + self.name, + is_sorted=plc.types.Sorted.YES, + order=plc.types.Order.ASCENDING, + null_order=plc.types.NullOrder.BEFORE, + ) + else: + return NamedColumn( + obj.obj, + self.name, + is_sorted=obj.is_sorted, + order=obj.order, + null_order=obj.null_order, + ) def collect_agg(self, *, depth: int) -> AggInfo: """Collect information about aggregations in groupbys.""" - (value,) = self.children - return value.collect_agg(depth=depth) + return self.value.collect_agg(depth=depth) class Literal(Expr): @@ -311,21 +346,21 @@ class Literal(Expr): _non_child = ("dtype", "value") value: pa.Scalar - def __init__(self, dtype: plc.DataType, value: Any) -> None: + def __init__(self, dtype: plc.DataType, value: pa.Scalar) -> None: super().__init__(dtype) - self.value = pa.scalar(value) + assert value.type == plc.interop.to_arrow(dtype) + self.value = value def do_evaluate( self, df: DataFrame, *, context: ExecutionContext = ExecutionContext.FRAME, - mapping: dict[Expr, Column] | None = None, + mapping: Mapping[Expr, Column] | None = None, ) -> Column: """Evaluate this expression given a dataframe for context.""" - # TODO: obey dtype - obj = plc.interop.from_arrow(self.value) - return Scalar(obj) # type: ignore + # datatype of pyarrow scalar is correct by construction. 
+ return Scalar(plc.interop.from_arrow(self.value)) # type: ignore class Col(Expr): @@ -342,7 +377,7 @@ def do_evaluate( df: DataFrame, *, context: ExecutionContext = ExecutionContext.FRAME, - mapping: dict[Expr, Column] | None = None, + mapping: Mapping[Expr, Column] | None = None, ) -> Column: """Evaluate this expression given a dataframe for context.""" return df._column_map[self.name] @@ -358,7 +393,7 @@ def do_evaluate( df: DataFrame, *, context: ExecutionContext = ExecutionContext.FRAME, - mapping: dict[Expr, Column] | None = None, + mapping: Mapping[Expr, Column] | None = None, ) -> Column: """Evaluate this expression given a dataframe for context.""" # TODO: type is wrong, and dtype @@ -415,8 +450,7 @@ def _distinct( [source_value], indices, plc.Table([plc.Column.from_scalar(target_value, table.num_rows())]), - ).columns()[0], - column.name, + ).columns()[0] ) _BETWEEN_OPS: ClassVar[ @@ -448,7 +482,7 @@ def do_evaluate( df: DataFrame, *, context: ExecutionContext = ExecutionContext.FRAME, - mapping: dict[Expr, Column] | None = None, + mapping: Mapping[Expr, Column] | None = None, ) -> Column: """Evaluate this expression given a dataframe for context.""" columns = [ @@ -467,18 +501,18 @@ def do_evaluate( ) if self.name == pl_expr.BooleanFunction.IsNull: (column,) = columns - return Column(plc.unary.is_null(column.obj), column.name) + return Column(plc.unary.is_null(column.obj)) elif self.name == pl_expr.BooleanFunction.IsNotNull: (column,) = columns - return Column(plc.unary.is_valid(column.obj), column.name) + return Column(plc.unary.is_valid(column.obj)) elif self.name == pl_expr.BooleanFunction.IsNan: # TODO: copy over null mask since is_nan(null) => null in polars (column,) = columns - return Column(plc.unary.is_nan(column.obj), column.name) + return Column(plc.unary.is_nan(column.obj)) elif self.name == pl_expr.BooleanFunction.IsNotNan: # TODO: copy over null mask since is_not_nan(null) => null in polars (column,) = columns - return Column(plc.unary.is_not_nan(column.obj), column.name) + return Column(plc.unary.is_not_nan(column.obj)) elif self.name == pl_expr.BooleanFunction.IsFirstDistinct: (column,) = columns return self._distinct( @@ -528,7 +562,6 @@ def do_evaluate( ), ) elif self.name == pl_expr.BooleanFunction.AllHorizontal: - name = columns[0].name if any(c.obj.null_count() > 0 for c in columns): raise NotImplementedError("Kleene logic for all_horizontal") return Column( @@ -539,11 +572,9 @@ def do_evaluate( output_type=self.dtype, ), (c.obj for c in columns), - ), - name, + ) ) elif self.name == pl_expr.BooleanFunction.AnyHorizontal: - name = columns[0].name if any(c.obj.null_count() > 0 for c in columns): raise NotImplementedError("Kleene logic for any_horizontal") return Column( @@ -554,8 +585,7 @@ def do_evaluate( output_type=self.dtype, ), (c.obj for c in columns), - ), - name, + ) ) elif self.name == pl_expr.BooleanFunction.IsBetween: column, lo, hi = columns @@ -571,8 +601,7 @@ def do_evaluate( ), plc.binaryop.BinaryOperator.LOGICAL_AND, self.dtype, - ), - column.name, + ) ) else: raise NotImplementedError(f"BooleanFunction {self.name}") @@ -606,7 +635,7 @@ def do_evaluate( df: DataFrame, *, context: ExecutionContext = ExecutionContext.FRAME, - mapping: dict[Expr, Column] | None = None, + mapping: Mapping[Expr, Column] | None = None, ) -> Column: """Evaluate this expression given a dataframe for context.""" columns = [ @@ -615,20 +644,16 @@ def do_evaluate( ] if self.name == pl_expr.StringFunction.Lowercase: (column,) = columns - return 
Column(plc.strings.case.to_lower(column.obj), column.name) + return Column(plc.strings.case.to_lower(column.obj)) elif self.name == pl_expr.StringFunction.Uppercase: (column,) = columns - return Column(plc.strings.case.to_upper(column.obj), column.name) + return Column(plc.strings.case.to_upper(column.obj)) elif self.name == pl_expr.StringFunction.EndsWith: column, suffix = columns - return Column( - plc.strings.find.ends_with(column.obj, suffix.obj), column.name - ) + return Column(plc.strings.find.ends_with(column.obj, suffix.obj)) elif self.name == pl_expr.StringFunction.StartsWith: column, suffix = columns - return Column( - plc.strings.find.starts_with(column.obj, suffix.obj), column.name - ) + return Column(plc.strings.find.starts_with(column.obj, suffix.obj)) else: raise NotImplementedError(f"StringFunction {self.name}") @@ -649,19 +674,22 @@ def do_evaluate( df: DataFrame, *, context: ExecutionContext = ExecutionContext.FRAME, - mapping: dict[Expr, Column] | None = None, + mapping: Mapping[Expr, Column] | None = None, ) -> Column: """Evaluate this expression given a dataframe for context.""" (child,) = self.children column = child.evaluate(df, context=context, mapping=mapping) (stable, nulls_last, descending) = self.options order, null_order = sorting.sort_order( - [descending], nulls_last=nulls_last, num_keys=1 + [descending], nulls_last=[nulls_last], num_keys=1 ) do_sort = plc.sorting.stable_sort if stable else plc.sorting.sort table = do_sort(plc.Table([column.obj]), order, null_order) - return Column(table.columns()[0], column.name).set_sorted( - is_sorted=plc.types.Sorted.YES, order=order[0], null_order=null_order[0] + return Column( + table.columns()[0], + is_sorted=plc.types.Sorted.YES, + order=order[0], + null_order=null_order[0], ) @@ -672,7 +700,7 @@ class SortBy(Expr): def __init__( self, dtype: plc.DataType, - options: tuple[bool, bool, tuple[bool]], + options: tuple[bool, tuple[bool], tuple[bool]], column: Expr, *by: Expr, ): @@ -685,7 +713,7 @@ def do_evaluate( df: DataFrame, *, context: ExecutionContext = ExecutionContext.FRAME, - mapping: dict[Expr, Column] | None = None, + mapping: Mapping[Expr, Column] | None = None, ) -> Column: """Evaluate this expression given a dataframe for context.""" column, *by = ( @@ -700,7 +728,7 @@ def do_evaluate( table = do_sort( plc.Table([column.obj]), plc.Table([c.obj for c in by]), order, null_order ) - return Column(table.columns()[0], column.name) + return Column(table.columns()[0]) class Gather(Expr): @@ -716,7 +744,7 @@ def do_evaluate( df: DataFrame, *, context: ExecutionContext = ExecutionContext.FRAME, - mapping: dict[Expr, Column] | None = None, + mapping: Mapping[Expr, Column] | None = None, ) -> Column: """Evaluate this expression given a dataframe for context.""" values, indices = ( @@ -741,7 +769,7 @@ def do_evaluate( bounds_policy = plc.copying.OutOfBoundsPolicy.DONT_CHECK obj = indices.obj table = plc.copying.gather(plc.Table([values.obj]), obj, bounds_policy) - return Column(table.columns()[0], values.name) + return Column(table.columns()[0]) class Filter(Expr): @@ -757,7 +785,7 @@ def do_evaluate( df: DataFrame, *, context: ExecutionContext = ExecutionContext.FRAME, - mapping: dict[Expr, Column] | None = None, + mapping: Mapping[Expr, Column] | None = None, ) -> Column: """Evaluate this expression given a dataframe for context.""" values, mask = ( @@ -767,7 +795,7 @@ def do_evaluate( table = plc.stream_compaction.apply_boolean_mask( plc.Table([values.obj]), mask.obj ) - return Column(table.columns()[0], 
values.name).sorted_like(values) + return Column(table.columns()[0]).sorted_like(values) class RollingWindow(Expr): @@ -803,14 +831,12 @@ def do_evaluate( df: DataFrame, *, context: ExecutionContext = ExecutionContext.FRAME, - mapping: dict[Expr, Column] | None = None, + mapping: Mapping[Expr, Column] | None = None, ) -> Column: """Evaluate this expression given a dataframe for context.""" (child,) = self.children column = child.evaluate(df, context=context, mapping=mapping) - return Column(plc.unary.cast(column.obj, self.dtype), column.name).sorted_like( - column - ) + return Column(plc.unary.cast(column.obj, self.dtype)).sorted_like(column) def collect_agg(self, *, depth: int) -> AggInfo: """Collect information about aggregations in groupbys.""" @@ -907,7 +933,9 @@ def _reduce( plc.reduce.reduce(column.obj, request, self.dtype), 1, ), - column.name, + is_sorted=plc.types.Sorted.YES, + order=plc.types.Order.ASCENDING, + null_order=plc.types.NullOrder.BEFORE, ) def _count(self, column: Column) -> Column: @@ -921,7 +949,9 @@ def _count(self, column: Column) -> Column: ), 1, ), - column.name, + is_sorted=plc.types.Sorted.YES, + order=plc.types.Order.ASCENDING, + null_order=plc.types.NullOrder.BEFORE, ) def _min(self, column: Column, *, propagate_nans: bool) -> Column: @@ -933,7 +963,9 @@ def _min(self, column: Column, *, propagate_nans: bool) -> Column: ), 1, ), - column.name, + is_sorted=plc.types.Sorted.YES, + order=plc.types.Order.ASCENDING, + null_order=plc.types.NullOrder.BEFORE, ) if column.nan_count > 0: column = column.mask_nans() @@ -948,25 +980,37 @@ def _max(self, column: Column, *, propagate_nans: bool) -> Column: ), 1, ), - column.name, + is_sorted=plc.types.Sorted.YES, + order=plc.types.Order.ASCENDING, + null_order=plc.types.NullOrder.BEFORE, ) if column.nan_count > 0: column = column.mask_nans() return self._reduce(column, request=plc.aggregation.max()) def _first(self, column: Column) -> Column: - return Column(plc.copying.slice(column.obj, [0, 1])[0], column.name) + return Column( + plc.copying.slice(column.obj, [0, 1])[0], + is_sorted=plc.types.Sorted.YES, + order=plc.types.Order.ASCENDING, + null_order=plc.types.NullOrder.BEFORE, + ) def _last(self, column: Column) -> Column: n = column.obj.size() - return Column(plc.copying.slice(column.obj, [n - 1, n])[0], column.name) + return Column( + plc.copying.slice(column.obj, [n - 1, n])[0], + is_sorted=plc.types.Sorted.YES, + order=plc.types.Order.ASCENDING, + null_order=plc.types.NullOrder.BEFORE, + ) def do_evaluate( self, df, *, context: ExecutionContext = ExecutionContext.FRAME, - mapping: dict[Expr, Column] | None = None, + mapping: Mapping[Expr, Column] | None = None, ) -> Column: """Evaluate this expression given a dataframe for context.""" if context is not ExecutionContext.FRAME: @@ -1018,7 +1062,7 @@ def do_evaluate( df: DataFrame, *, context: ExecutionContext = ExecutionContext.FRAME, - mapping: dict[Expr, Column] | None = None, + mapping: Mapping[Expr, Column] | None = None, ) -> Column: """Evaluate this expression given a dataframe for context.""" left, right = ( @@ -1027,7 +1071,6 @@ def do_evaluate( ) return Column( plc.binaryop.binary_operation(left.obj, right.obj, self.op, self.dtype), - "what", ) def collect_agg(self, *, depth: int) -> AggInfo: diff --git a/python/cudf_polars/cudf_polars/dsl/ir.py b/python/cudf_polars/cudf_polars/dsl/ir.py index f8441b793b5..0a72cbd9f83 100644 --- a/python/cudf_polars/cudf_polars/dsl/ir.py +++ b/python/cudf_polars/cudf_polars/dsl/ir.py @@ -30,7 +30,7 @@ import 
cudf._lib.pylibcudf as plc import cudf_polars.dsl.expr as expr -from cudf_polars.containers import Column, DataFrame +from cudf_polars.containers import DataFrame, NamedColumn from cudf_polars.utils import sorting if TYPE_CHECKING: @@ -59,6 +59,38 @@ ] +def broadcast( + *columns: NamedColumn, target_length: int | None = None +) -> list[NamedColumn]: + lengths = {column.obj.size() for column in columns} + if len(lengths - {1}) > 1: + raise RuntimeError("Mismatching column lengths") + if lengths == {1}: + if target_length is None: + return list(columns) + nrows = target_length + elif len(lengths) == 1: + if target_length is not None: + assert target_length in lengths + return list(columns) + else: + (nrows,) = lengths - {1} + if target_length is not None: + assert target_length == nrows + return [ + column + if column.obj.size() != 1 + else NamedColumn( + plc.Column.from_scalar(plc.copying.get_element(column.obj, 0), nrows), + column.name, + is_sorted=plc.types.Sorted.YES, + order=plc.types.Order.ASCENDING, + null_order=plc.types.NullOrder.BEFORE, + ) + for column in columns + ] + + @dataclass(slots=True) class IR: """Abstract plan node, representing an unevaluated dataframe.""" @@ -83,9 +115,10 @@ def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: Raises ------ - NotImplementedError if we couldn't evaluate things. Ideally - this should not occur, since the translation phase should pick - up things that we cannot handle. + NotImplementedError + If we couldn't evaluate things. Ideally this should not occur, + since the translation phase should pick up things that we + cannot handle. """ raise NotImplementedError @@ -96,7 +129,7 @@ class PythonScan(IR): options: Any """Arbitrary options.""" - predicate: expr.Expr | None + predicate: expr.NamedExpr | None """Filter to apply to the constructed dataframe before returning it.""" @@ -117,7 +150,7 @@ class Scan(IR): - ``row_index: tuple[name, offset] | None``: Add an integer index column with given name. """ - predicate: expr.Expr | None + predicate: expr.NamedExpr | None """Mask to apply to the read dataframe.""" def __post_init__(self): @@ -153,14 +186,14 @@ def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: init = plc.interop.from_arrow( pa.scalar(offset, type=plc.interop.to_arrow(dtype)) ) - index = Column( - plc.filling.sequence(df.num_rows, init, step), name - ).set_sorted( + index = NamedColumn( + plc.filling.sequence(df.num_rows, init, step), + name, is_sorted=plc.types.Sorted.YES, order=plc.types.Order.ASCENDING, null_order=plc.types.NullOrder.AFTER, ) - df = DataFrame([index, *df.columns], []) + df = DataFrame([index, *df.columns]) # TODO: should be true, but not the case until we get # cudf-classic out of the loop for IO since it converts date32 # to datetime. 
@@ -171,7 +204,7 @@ def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: if self.predicate is None: return df else: - mask = self.predicate.evaluate(df) + (mask,) = broadcast(self.predicate.evaluate(df), target_length=df.num_rows) return df.filter(mask) @@ -208,7 +241,7 @@ class DataFrameScan(IR): """Polars LazyFrame object.""" projection: list[str] """List of columns to project out.""" - predicate: expr.Expr | None + predicate: expr.NamedExpr | None """Mask to apply.""" def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: @@ -231,7 +264,7 @@ def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: c.obj.type() == dtype for c, dtype in zip(df.columns, self.schema.values()) ) if self.predicate is not None: - mask = self.predicate.evaluate(df) + (mask,) = broadcast(self.predicate.evaluate(df), target_length=df.num_rows) return df.filter(mask) else: return df @@ -243,20 +276,15 @@ class Select(IR): df: IR """Input dataframe.""" - cse: list[expr.Expr] - """ - List of common subexpressions that will appear in the selected expressions. - - These must be evaluated before the returned expressions. - """ - expr: list[expr.Expr] + expr: list[expr.NamedExpr] """List of expressions to evaluate to form the new dataframe.""" def evaluate(self, *, cache: dict[int, DataFrame]): """Evaluate and return a dataframe.""" df = self.df.evaluate(cache=cache) - df = df.with_columns([e.evaluate(df) for e in self.cse]) - return DataFrame([e.evaluate(df) for e in self.expr], []) + # Handle any broadcasting + columns = broadcast(*(e.evaluate(df) for e in self.expr)) + return DataFrame(columns) @dataclass(slots=True) @@ -269,13 +297,15 @@ class Reduce(IR): df: IR """Input dataframe.""" - expr: list[expr.Expr] + expr: list[expr.NamedExpr] """List of expressions to evaluate to form the new dataframe.""" def evaluate(self, *, cache: dict[int, DataFrame]): """Evaluate and return a dataframe.""" df = self.df.evaluate(cache=cache) - return DataFrame([e.evaluate(df) for e in self.expr], []) + columns = broadcast(*(e.evaluate(df) for e in self.expr)) + assert all(column.obj.size() == 1 for column in columns) + return DataFrame(columns) def placeholder_column(n: int): @@ -314,9 +344,9 @@ class GroupBy(IR): df: IR """Input dataframe.""" - agg_requests: list[expr.Expr] + agg_requests: list[expr.NamedExpr] """List of expressions to evaluate groupwise.""" - keys: list[expr.Expr] + keys: list[expr.NamedExpr] """List of expressions forming the keys.""" maintain_order: bool """Should the order of the input dataframe be maintained?""" @@ -339,9 +369,10 @@ def check_agg(agg: expr.Expr) -> int: Raises ------ - NotImplementedError for unsupported expression nodes. + NotImplementedError + For unsupported expression nodes. 
""" - if isinstance(agg, (expr.NamedExpr, expr.BinOp, expr.Cast)): + if isinstance(agg, (expr.BinOp, expr.Cast)): return max(GroupBy.check_agg(child) for child in agg.children) elif isinstance(agg, expr.Agg): if agg.name == "implode": @@ -358,14 +389,16 @@ def __post_init__(self): raise NotImplementedError("Maintaining order in groupby") if self.options.rolling: raise NotImplementedError("rolling window/groupby") - if any(GroupBy.check_agg(a) > 1 for a in self.agg_requests): + if any(GroupBy.check_agg(a.value) > 1 for a in self.agg_requests): raise NotImplementedError("Nested aggregations in groupby") self.agg_infos = [req.collect_agg(depth=0) for req in self.agg_requests] def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: """Evaluate and return a dataframe.""" df = self.df.evaluate(cache=cache) - keys = [k.evaluate(df) for k in self.keys] + keys = broadcast( + *(k.evaluate(df) for k in self.keys), target_length=df.num_rows + ) # TODO: use sorted information, need to expose column_order # and null_precedence in pylibcudf groupby constructor # sorted = ( @@ -379,7 +412,7 @@ def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: ) # TODO: uniquify requests = [] - replacements = [] + replacements: list[expr.Expr] = [] for info in self.agg_infos: for pre_eval, req, rep in info.requests: if pre_eval is None: @@ -389,17 +422,20 @@ def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: requests.append(plc.groupby.GroupByRequest(col, [req])) replacements.append(rep) group_keys, raw_tables = grouper.aggregate(requests) - raw_columns = [] + # TODO: names + raw_columns: list[NamedColumn] = [] for i, table in enumerate(raw_tables): (column,) = table.columns() - raw_columns.append(Column(column, f"column{i}")) + raw_columns.append(NamedColumn(column, f"tmp{i}")) mapping = dict(zip(replacements, raw_columns)) - result_keys = [Column(gk, k.name) for gk, k in zip(group_keys.columns(), keys)] - result_subs = DataFrame(raw_columns, []) + result_keys = [ + NamedColumn(gk, k.name) for gk, k in zip(group_keys.columns(), keys) + ] + result_subs = DataFrame(raw_columns) results = [ req.evaluate(result_subs, mapping=mapping) for req in self.agg_requests ] - return DataFrame([*result_keys, *results], []).slice(self.options.slice) + return DataFrame([*result_keys, *results]).slice(self.options.slice) @dataclass(slots=True) @@ -410,9 +446,9 @@ class Join(IR): """Left frame.""" right: IR """Right frame.""" - left_on: list[expr.Expr] + left_on: list[expr.NamedExpr] """List of expressions used as keys in the left frame.""" - right_on: list[expr.Expr] + right_on: list[expr.NamedExpr] """List of expressions used as keys in the right frame.""" options: tuple[ Literal["inner", "left", "full", "leftsemi", "leftanti"], @@ -479,8 +515,17 @@ def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: """Evaluate and return a dataframe.""" left = self.left.evaluate(cache=cache) right = self.right.evaluate(cache=cache) - left_on = DataFrame([e.evaluate(left) for e in self.left_on], []) - right_on = DataFrame([e.evaluate(right) for e in self.right_on], []) + left_on = DataFrame( + broadcast( + *(e.evaluate(left) for e in self.left_on), target_length=left.num_rows + ) + ) + right_on = DataFrame( + broadcast( + *(e.evaluate(right) for e in self.right_on), + target_length=right.num_rows, + ) + ) how, join_nulls, zlice, suffix, coalesce = self.options null_equality = ( plc.types.NullEquality.EQUAL @@ -510,7 +555,7 @@ def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: if coalesce 
and how != "inner": left = left.replace_columns( *( - Column( + NamedColumn( plc.replace.replace_nulls(left_col.obj, right_col.obj), left_col.name, ) @@ -538,20 +583,18 @@ class HStack(IR): df: IR """Input dataframe.""" - cse: list[expr.Expr] - """ - List of common subexpressions that will appear in the selected expressions. - - These must be evaluated before the returned expressions. - """ - columns: list[expr.Expr] + columns: list[expr.NamedExpr] """List of expressions to produce new columns.""" def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: """Evaluate and return a dataframe.""" df = self.df.evaluate(cache=cache) - ctx = df.copy().with_columns([e.evaluate(df) for e in self.cse]) - return df.with_columns([c.evaluate(ctx) for c in self.columns]) + columns = [c.evaluate(df) for c in self.columns] + # TODO: a bit of a hack, should inherit the should_broadcast + # property of polars' ProjectionOptions on the hstack node. + if not any(e.name.startswith("__POLARS_CSER_0x") for e in self.columns): + columns = broadcast(*columns, target_length=df.num_rows) + return df.with_columns(columns) @dataclass(slots=True) @@ -614,7 +657,10 @@ def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: plc.types.NanEquality.ALL_EQUAL, ) result = DataFrame( - [Column(c, old.name) for c, old in zip(table.columns(), df.columns)], [] + [ + NamedColumn(c, old.name).sorted_like(old) + for c, old in zip(table.columns(), df.columns) + ] ) if keys_sorted or self.stable: result = result.sorted_like(df) @@ -627,7 +673,7 @@ class Sort(IR): df: IR """Input.""" - by: list[expr.Expr] + by: list[expr.NamedExpr] """List of expressions to produce sort keys.""" do_sort: Callable[..., plc.Table] """pylibcudf sorting function.""" @@ -642,7 +688,7 @@ def __init__( self, schema: dict, df: IR, - by: list[expr.Expr], + by: list[expr.NamedExpr], options: Any, zlice: tuple[int, int] | None, ): @@ -661,7 +707,9 @@ def __init__( def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: """Evaluate and return a dataframe.""" df = self.df.evaluate(cache=cache) - sort_keys = [k.evaluate(df) for k in self.by] + sort_keys = broadcast( + *(k.evaluate(df) for k in self.by), target_length=df.num_rows + ) names = {c.name: i for i, c in enumerate(df.columns)} # TODO: More robust identification here. 
keys_in_result = [ @@ -675,7 +723,9 @@ def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: self.order, self.null_order, ) - columns = [Column(c, old.name) for c, old in zip(table.columns(), df.columns)] + columns = [ + NamedColumn(c, old.name) for c, old in zip(table.columns(), df.columns) + ] # If a sort key is in the result table, set the sortedness property for k, i in enumerate(keys_in_result): columns[i] = columns[i].set_sorted( @@ -683,7 +733,7 @@ def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: order=self.order[k], null_order=self.null_order[k], ) - return DataFrame(columns, []).slice(self.zlice) + return DataFrame(columns).slice(self.zlice) @dataclass(slots=True) @@ -709,13 +759,14 @@ class Filter(IR): df: IR """Input.""" - mask: expr.Expr + mask: expr.NamedExpr """Expression evaluating to a mask.""" def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: """Evaluate and return a dataframe.""" df = self.df.evaluate(cache=cache) - return df.filter(self.mask.evaluate(df)) + (mask,) = broadcast(self.mask.evaluate(df), target_length=df.num_rows) + return df.filter(mask) @dataclass(slots=True) @@ -729,7 +780,10 @@ def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: """Evaluate and return a dataframe.""" df = self.df.evaluate(cache=cache) # This can reorder things. - return df.select(list(self.schema.keys())) + columns = broadcast( + *df.select(list(self.schema.keys())).columns, target_length=df.num_rows + ) + return DataFrame(columns) @dataclass(slots=True) @@ -856,10 +910,8 @@ class HConcat(IR): def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: """Evaluate and return a dataframe.""" dfs = [df.evaluate(cache=cache) for df in self.dfs] - columns, scalars = zip(*((df.columns, df.scalars) for df in dfs)) return DataFrame( - list(itertools.chain.from_iterable(columns)), - list(itertools.chain.from_iterable(scalars)), + list(itertools.chain.from_iterable(df.columns for df in dfs)), ) diff --git a/python/cudf_polars/cudf_polars/dsl/translate.py b/python/cudf_polars/cudf_polars/dsl/translate.py index 9a301164beb..641176daff4 100644 --- a/python/cudf_polars/cudf_polars/dsl/translate.py +++ b/python/cudf_polars/cudf_polars/dsl/translate.py @@ -18,11 +18,25 @@ from cudf_polars.dsl import expr, ir from cudf_polars.utils import dtypes -__all__ = ["translate_ir", "translate_expr"] +__all__ = ["translate_ir", "translate_named_expr"] class set_node(AbstractContextManager): - """Run a block with current node set in the visitor.""" + """ + Run a block with current node set in the visitor. + + Parameters + ---------- + visitor + The internal Rust visitor object + n + The node to set as the current root. + + Notes + ----- + This is useful for translating expressions with a given node + active, restoring the node when the block exits. 
+ """ __slots__ = ("n", "visitor") @@ -52,7 +66,7 @@ def _(node: pl_ir.PythonScan, visitor: Any, schema: dict[str, plc.DataType]) -> return ir.PythonScan( schema, node.options, - translate_expr(visitor, n=node.predicate) + translate_named_expr(visitor, n=node.predicate) if node.predicate is not None else None, ) @@ -65,7 +79,7 @@ def _(node: pl_ir.Scan, visitor: Any, schema: dict[str, plc.DataType]) -> ir.IR: node.scan_type, node.paths, node.file_options, - translate_expr(visitor, n=node.predicate) + translate_named_expr(visitor, n=node.predicate) if node.predicate is not None else None, ) @@ -84,7 +98,7 @@ def _( schema, node.df, node.projection, - translate_expr(visitor, n=node.selection) + translate_named_expr(visitor, n=node.selection) if node.selection is not None else None, ) @@ -94,17 +108,16 @@ def _( def _(node: pl_ir.Select, visitor: Any, schema: dict[str, plc.DataType]) -> ir.IR: with set_node(visitor, node.input): inp = translate_ir(visitor, n=None) - cse_exprs = [translate_expr(visitor, n=e) for e in node.cse_expr] - exprs = [translate_expr(visitor, n=e) for e in node.expr] - return ir.Select(schema, inp, cse_exprs, exprs) + exprs = [translate_named_expr(visitor, n=e) for e in node.expr] + return ir.Select(schema, inp, exprs) @_translate_ir.register def _(node: pl_ir.GroupBy, visitor: Any, schema: dict[str, plc.DataType]) -> ir.IR: with set_node(visitor, node.input): inp = translate_ir(visitor, n=None) - aggs = [translate_expr(visitor, n=e) for e in node.aggs] - keys = [translate_expr(visitor, n=e) for e in node.keys] + aggs = [translate_named_expr(visitor, n=e) for e in node.aggs] + keys = [translate_named_expr(visitor, n=e) for e in node.keys] return ir.GroupBy( schema, inp, @@ -122,10 +135,10 @@ def _(node: pl_ir.Join, visitor: Any, schema: dict[str, plc.DataType]) -> ir.IR: # input active. 
with set_node(visitor, node.input_left): inp_left = translate_ir(visitor, n=None) - left_on = [translate_expr(visitor, n=e) for e in node.left_on] + left_on = [translate_named_expr(visitor, n=e) for e in node.left_on] with set_node(visitor, node.input_right): inp_right = translate_ir(visitor, n=None) - right_on = [translate_expr(visitor, n=e) for e in node.right_on] + right_on = [translate_named_expr(visitor, n=e) for e in node.right_on] return ir.Join(schema, inp_left, inp_right, left_on, right_on, node.options) @@ -133,16 +146,15 @@ def _(node: pl_ir.Join, visitor: Any, schema: dict[str, plc.DataType]) -> ir.IR: def _(node: pl_ir.HStack, visitor: Any, schema: dict[str, plc.DataType]) -> ir.IR: with set_node(visitor, node.input): inp = translate_ir(visitor, n=None) - cse_exprs = [translate_expr(visitor, n=e) for e in node.cse_exprs] - exprs = [translate_expr(visitor, n=e) for e in node.exprs] - return ir.HStack(schema, inp, cse_exprs, exprs) + exprs = [translate_named_expr(visitor, n=e) for e in node.exprs] + return ir.HStack(schema, inp, exprs) @_translate_ir.register def _(node: pl_ir.Reduce, visitor: Any, schema: dict[str, plc.DataType]) -> ir.IR: with set_node(visitor, node.input): inp = translate_ir(visitor, n=None) - exprs = [translate_expr(visitor, n=e) for e in node.expr] + exprs = [translate_named_expr(visitor, n=e) for e in node.expr] return ir.Reduce(schema, inp, exprs) @@ -159,7 +171,7 @@ def _(node: pl_ir.Distinct, visitor: Any, schema: dict[str, plc.DataType]) -> ir def _(node: pl_ir.Sort, visitor: Any, schema: dict[str, plc.DataType]) -> ir.IR: with set_node(visitor, node.input): inp = translate_ir(visitor, n=None) - by = [translate_expr(visitor, n=e) for e in node.by_column] + by = [translate_named_expr(visitor, n=e) for e in node.by_column] return ir.Sort(schema, inp, by, node.sort_options, node.slice) @@ -172,7 +184,7 @@ def _(node: pl_ir.Slice, visitor: Any, schema: dict[str, plc.DataType]) -> ir.IR def _(node: pl_ir.Filter, visitor: Any, schema: dict[str, plc.DataType]) -> ir.IR: with set_node(visitor, node.input): inp = translate_ir(visitor, n=None) - mask = translate_expr(visitor, n=node.predicate) + mask = translate_named_expr(visitor, n=node.predicate) return ir.Filter(schema, inp, mask) @@ -234,8 +246,8 @@ def translate_ir(visitor: Any, *, n: int | None = None) -> ir.IR: Raises ------ - NotImplementedError if we can't translate the nodes due to - unsupported functionality. + NotImplementedError + If we can't translate the nodes due to unsupported functionality. """ ctx: AbstractContextManager = ( set_node(visitor, n) if n is not None else noop_context @@ -246,17 +258,41 @@ def translate_ir(visitor: Any, *, n: int | None = None) -> ir.IR: return _translate_ir(node, visitor, schema) +def translate_named_expr(visitor: Any, *, n: pl_expr.PyExprIR) -> expr.NamedExpr: + """ + Translate a polars-internal named expression IR object into our representation. + + Parameters + ---------- + visitor + Polars NodeTraverser object + n + Node to translate, a named expression node. + + Returns + ------- + Translated IR object. + + Notes + ----- + The datatype of the internal expression will be obtained from the + visitor by calling ``get_dtype``, for this to work properly, the + caller should arrange that the expression is translated with the + node that it references "active" for the visitor (see :class:`set_node`). + + Raises + ------ + NotImplementedError + If any translation fails due to unsupported functionality. 
+ """ + return expr.NamedExpr(n.output_name, translate_expr(visitor, n=n.node)) + + @singledispatch def _translate_expr(node: Any, visitor: Any, dtype: plc.DataType) -> expr.Expr: raise NotImplementedError(f"Translation for {type(node).__name__}") -@_translate_expr.register -def _(node: pl_expr.PyExprIR, visitor: Any, dtype: plc.DataType) -> expr.Expr: - e = translate_expr(visitor, n=node.node) - return expr.NamedExpr(dtype, node.output_name, e) - - @_translate_expr.register def _(node: pl_expr.Function, visitor: Any, dtype: plc.DataType) -> expr.Expr: name, *options = node.function_data @@ -375,7 +411,7 @@ def _(node: pl_expr.Len, visitor: Any, dtype: plc.DataType) -> expr.Expr: return expr.Len(dtype) -def translate_expr(visitor: Any, *, n: int | pl_expr.PyExprIR) -> expr.Expr: +def translate_expr(visitor: Any, *, n: int) -> expr.Expr: """ Translate a polars-internal expression IR into our representation. @@ -384,8 +420,7 @@ def translate_expr(visitor: Any, *, n: int | pl_expr.PyExprIR) -> expr.Expr: visitor Polars NodeTraverser object n - Node to translate, either an integer referencing a polars - internal node, or a named expression node. + Node to translate, an integer referencing a polars internal node. Returns ------- @@ -393,14 +428,9 @@ def translate_expr(visitor: Any, *, n: int | pl_expr.PyExprIR) -> expr.Expr: Raises ------ - NotImplementedError if any translation fails due to unsupported functionality. + NotImplementedError + If any translation fails due to unsupported functionality. """ - if isinstance(n, pl_expr.PyExprIR): - # TODO: type narrowing doesn't rule out int since PyExprIR is Unknown - assert not isinstance(n, int) - node = n - dtype = dtypes.from_polars(visitor.get_dtype(node.node)) - else: - node = visitor.view_expression(n) - dtype = dtypes.from_polars(visitor.get_dtype(n)) + node = visitor.view_expression(n) + dtype = dtypes.from_polars(visitor.get_dtype(n)) return _translate_expr(node, visitor, dtype) diff --git a/python/cudf_polars/cudf_polars/testing/asserts.py b/python/cudf_polars/cudf_polars/testing/asserts.py index a6e26a6425c..2fbfa971fef 100644 --- a/python/cudf_polars/cudf_polars/testing/asserts.py +++ b/python/cudf_polars/cudf_polars/testing/asserts.py @@ -23,7 +23,7 @@ def assert_gpu_result_equal( *, check_row_order: bool = True, check_column_order: bool = True, - check_dtype: bool = True, + check_dtypes: bool = True, check_exact: bool = True, rtol: float = 1e-05, atol: float = 1e-08, @@ -40,7 +40,7 @@ def assert_gpu_result_equal( Expect rows to be in same order check_column_order Expect columns to be in same order - check_dtype + check_dtypes Expect dtypes to match check_exact Require exact equality for floats, if `False` compare using @@ -68,7 +68,7 @@ def assert_gpu_result_equal( got, check_row_order=check_row_order, check_column_order=check_column_order, - check_dtype=check_dtype, + check_dtypes=check_dtypes, check_exact=check_exact, rtol=rtol, atol=atol, diff --git a/python/cudf_polars/cudf_polars/utils/dtypes.py b/python/cudf_polars/cudf_polars/utils/dtypes.py index bede0de3c9f..7b0049daf11 100644 --- a/python/cudf_polars/cudf_polars/utils/dtypes.py +++ b/python/cudf_polars/cudf_polars/utils/dtypes.py @@ -32,7 +32,8 @@ def from_polars(dtype: pl.DataType) -> plc.DataType: Raises ------ - NotImplementedError for unsupported conversions. + NotImplementedError + For unsupported conversions. 
""" if isinstance(dtype, pl.Boolean): return plc.DataType(plc.TypeId.BOOL8) diff --git a/python/cudf_polars/cudf_polars/utils/sorting.py b/python/cudf_polars/cudf_polars/utils/sorting.py index b3ecfdd3dd4..d35459db20d 100644 --- a/python/cudf_polars/cudf_polars/utils/sorting.py +++ b/python/cudf_polars/cudf_polars/utils/sorting.py @@ -14,7 +14,7 @@ def sort_order( - descending: Sequence[bool], *, nulls_last: bool, num_keys: int + descending: Sequence[bool], *, nulls_last: Sequence[bool], num_keys: int ) -> tuple[list[plc.types.Order], list[plc.types.NullOrder]]: """ Produce sort order arguments. @@ -36,14 +36,18 @@ def sort_order( # Mimicking polars broadcast handling of descending if num_keys > (n := len(descending)) and n == 1: descending = [descending[0]] * num_keys + if num_keys > (n := len(nulls_last)) and n == 1: + nulls_last = [nulls_last[0]] * num_keys column_order = [ plc.types.Order.DESCENDING if d else plc.types.Order.ASCENDING for d in descending ] null_precedence = [] - for asc in column_order: - if (asc == plc.types.Order.ASCENDING) ^ (not nulls_last): + # TODO: use strict=True when we drop py39 + assert len(descending) == len(nulls_last) + for asc, null_last in zip(column_order, nulls_last): + if (asc == plc.types.Order.ASCENDING) ^ (not null_last): null_precedence.append(plc.types.NullOrder.AFTER) - elif (asc == plc.types.Order.ASCENDING) ^ nulls_last: + elif (asc == plc.types.Order.ASCENDING) ^ null_last: null_precedence.append(plc.types.NullOrder.BEFORE) return column_order, null_precedence diff --git a/python/cudf_polars/docs/overview.md b/python/cudf_polars/docs/overview.md index cbf012f5881..b50d01c26db 100644 --- a/python/cudf_polars/docs/overview.md +++ b/python/cudf_polars/docs/overview.md @@ -34,6 +34,8 @@ pip install --upgrade uv uv pip install --upgrade -r py-polars/requirements-dev.txt ``` +> ![NOTE] plain `pip install` works fine, but `uv` is _much_ faster! + Now we have the necessary machinery to build polars ```sh cd py-polars @@ -57,7 +59,7 @@ The executor for the polars logical plan lives in the cudf repo, in ```sh cd cudf/python/cudf_polars -pip install --no-deps -e . +uv pip install --no-build-isolation --no-deps -e . ``` You should now be able to run the tests in the `cudf_polars` package: @@ -96,6 +98,21 @@ This should either transparently run on the GPU and deliver a polars dataframe, or else fail (but be handled) and just run the normal CPU execution. +If you want to fail during translation, set the keyword argument +`raise_on_fail` to `True`: + +```python +from functools import partial +from cudf_polars.callback import execute_with_cudf + +result = q.collect( + post_opt_callback=partial(execute_with_cudf, raise_on_fail=True) +) +``` + +This is mostly useful when writing tests, since in that case we want +any failures to propagate, rather than falling back to the CPU mode. + ## Adding a handler for a new plan node Plan node definitions live in `cudf_polars/dsl/ir.py`, these are @@ -153,22 +170,84 @@ the logical plan in any case, so is reasonably natural. # Containers Containers should be constructed as relatively lightweight objects -around their pylibcudf counterparts. We have three (in +around their pylibcudf counterparts. We have four (in `cudf_polars/containers/`): -1. Scalar (a wrapper around a pylibcudf Scalar) -2. Column (a wrapper around a pylibcudf Column) -3. DataFrame (a wrapper around a pylibcudf Table) +1. `Scalar` (a wrapper around a pylibcudf `Scalar`) +2. `Column` (a wrapper around a pylibcudf `Column`) +3. 
`NamedColumn` a `Column` with an additional name +4. `DataFrame` (a wrapper around a pylibcudf `Table`) The interfaces offered by these are somewhat in flux, but broadly -speaking, a `DataFrame` is just a list of `Column`s which each hold -data plus a string `name`, along with a collection of `Scalar`s (this -might go away). +speaking, a `DataFrame` is just a list of `NamedColumn`s which each +hold a `Column` plus a string `name`. `NamedColumn`s are only ever +constructed via `NamedExpr`s, which are the top-level expression node +that lives inside an `IR` node. This means that the expression +evaluator never has to concern itself with column names: columns are +only ever decorated with names when constructing a `DataFrame`. The columns keep track of metadata (for example, whether or not they -are sorted). +are sorted). We could imagine tracking more metadata, like minimum and +maximum, though perhaps that is better left to libcudf itself. We offer some utility methods for transferring metadata when constructing new dataframes and columns, both `DataFrame` and `Column` -offer a `with_metadata(*, like: Self)` call which copies metadata from -the template. +offer a `sorted_like(like: Self)` call which copies metadata from the +template. + +All methods on containers that modify in place should return `self`, +to facilitate use in a ["fluent" +style](https://en.wikipedia.org/wiki/Fluent_interface). It makes it +much easier to write iteration over objects and collect the results if +everyone always returns a value. + +# Writing tests + +We use `pytest`, tests live in the `tests/` subdirectory, +organisationally the top-level test files each handle one of the `IR` +nodes. The goal is that they are parametrized over all the options +each node will handle, to have reasonable coverage. Tests of +expression functionality should live in `tests/expressions/`. + +To write a test an assert correctness, build a lazyframe as a query, +and then use the utility assertion function from +`cudf_polars.testing.asserts`. This runs the query using both the cudf +executor and polars CPU, and checks that they match. So: + +```python +from cudf_polars.testing.asserts import assert_gpu_result_equal + + +def test_whatever(): + query = pl.LazyFrame(...).(...) + + assert_gpu_result_equal(query) +``` + +# Debugging + +If the callback execution fails during the polars `collect` call, we +obtain an error, but are not able to drop into the debugger and +inspect the stack properly: we can't cross the language barrier. + +However, we can drive the translation and execution of the DSL by +hand. Given some `LazyFrame` representing a query, we can first +translate it to our intermediate representation (IR), and then execute +and convert back to polars: + +```python +from cudf_polars.dsl.translate import translate_ir + +q = ... + +# Convert to our IR +ir = translate_ir(q._ldf.visit()) + +# DataFrame living on the device +result = ir.evaluate(cache={}) + +# Polars dataframe +host_result = result.to_polars() +``` + +If we get any exceptions, we can then debug as normal in Python. 
diff --git a/python/cudf_polars/tests/expressions/test_agg.py b/python/cudf_polars/tests/expressions/test_agg.py index 645dbd26140..79018c80bf3 100644 --- a/python/cudf_polars/tests/expressions/test_agg.py +++ b/python/cudf_polars/tests/expressions/test_agg.py @@ -56,8 +56,8 @@ def test_agg(df, agg): q = df.select(expr) # https://github.com/rapidsai/cudf/issues/15852 - check_dtype = agg not in {"n_unique", "median"} - if not check_dtype and q.schema["a"] != pl.Float64: + check_dtypes = agg not in {"n_unique", "median"} + if not check_dtypes and q.schema["a"] != pl.Float64: with pytest.raises(AssertionError): assert_gpu_result_equal(q) - assert_gpu_result_equal(q, check_dtype=check_dtype, check_exact=False) + assert_gpu_result_equal(q, check_dtypes=check_dtypes, check_exact=False) diff --git a/python/cudf_polars/tests/test_select.py b/python/cudf_polars/tests/test_select.py index 503edef152e..037f3ab5428 100644 --- a/python/cudf_polars/tests/test_select.py +++ b/python/cudf_polars/tests/test_select.py @@ -36,3 +36,24 @@ def test_select_reduce(): ) assert_gpu_result_equal(query) + + +def test_select_with_cse_no_agg(): + df = pl.LazyFrame({"a": [1, 2, 3]}) + expr = pl.col("a") + pl.col("a") + + query = df.select(expr, (expr * 2).alias("b"), ((expr * 2) + 10).alias("c")) + + assert_gpu_result_equal(query) + + +def test_select_with_cse_with_agg(): + df = pl.LazyFrame({"a": [1, 2, 3]}) + expr = pl.col("a") + pl.col("a") + asum = pl.col("a").sum() + pl.col("a").sum() + + query = df.select( + expr, (expr * 2).alias("b"), asum.alias("c"), (asum + 10).alias("d") + ) + + assert_gpu_result_equal(query) diff --git a/python/cudf_polars/tests/test_union.py b/python/cudf_polars/tests/test_union.py index 2c85bb15a55..18cf4748692 100644 --- a/python/cudf_polars/tests/test_union.py +++ b/python/cudf_polars/tests/test_union.py @@ -2,14 +2,11 @@ # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations -import pytest - import polars as pl from cudf_polars.testing.asserts import assert_gpu_result_equal -@pytest.mark.xfail(reason="Need handling of null scalars that are cast") def test_union(): ldf = pl.DataFrame( { @@ -19,8 +16,6 @@ def test_union(): ).lazy() ldf2 = ldf.select((pl.col("a") + pl.col("b")).alias("c"), pl.col("a")) query = pl.concat([ldf, ldf2], how="diagonal") - # Plan for this produces a `None`.astype(Int64) which we don't - # handle correctly right now assert_gpu_result_equal(query) From 66895af970c19978e12c242f92f5b5676d91b9e3 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Thu, 6 Jun 2024 11:12:15 -0500 Subject: [PATCH 20/48] Implement chunked parquet reader in cudf-python (#15728) Partially Addresses: #14966 This PR implements chunked parquet bindings in python. 
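
A minimal usage sketch, mirroring the test added in this patch. Note that
`ParquetReader` lives in the internal `cudf._lib.parquet` module rather than
the public cudf API, and the data below is made up purely for illustration:

```python
from io import BytesIO

import pandas as pd

import cudf
from cudf._lib.parquet import ParquetReader

# Write a parquet file into an in-memory buffer.
buffer = BytesIO()
pd.DataFrame({"a": [1, 2, 3, 4] * 1_000_000}).to_parquet(buffer)

# The two limits bound how much data is materialized per chunk / per pass.
reader = ParquetReader(
    [buffer],
    chunk_read_limit=100_000_000,
    pass_read_limit=1_024_000_000,
)

# read() loops over the chunks and concatenates them into one DataFrame.
result = reader.read()

# Compare against the existing non-chunked reader.
expected = cudf.read_parquet(buffer)
print(result.equals(expected))
```
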
Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Thomas Li (https://github.com/lithomas1) URL: https://github.com/rapidsai/cudf/pull/15728 --- python/cudf/cudf/_lib/parquet.pyx | 242 +++++++++++++----- .../_lib/pylibcudf/libcudf/io/parquet.pxd | 12 + python/cudf/cudf/tests/test_parquet.py | 27 ++ 3 files changed, 220 insertions(+), 61 deletions(-) diff --git a/python/cudf/cudf/_lib/parquet.pyx b/python/cudf/cudf/_lib/parquet.pyx index ac592cedaac..f6f9cfa9a7c 100644 --- a/python/cudf/cudf/_lib/parquet.pyx +++ b/python/cudf/cudf/_lib/parquet.pyx @@ -26,6 +26,7 @@ from libc.stdint cimport uint8_t from libcpp cimport bool from libcpp.map cimport map from libcpp.memory cimport make_unique, unique_ptr +from libcpp.pair cimport pair from libcpp.string cimport string from libcpp.unordered_map cimport unordered_map from libcpp.utility cimport move @@ -44,6 +45,7 @@ from cudf._lib.io.utils cimport ( ) from cudf._lib.pylibcudf.libcudf.expressions cimport expression from cudf._lib.pylibcudf.libcudf.io.parquet cimport ( + chunked_parquet_reader as cpp_chunked_parquet_reader, chunked_parquet_writer_options, merge_row_group_metadata as parquet_merge_metadata, parquet_chunked_writer as cpp_parquet_chunked_writer, @@ -60,6 +62,7 @@ from cudf._lib.pylibcudf.libcudf.io.parquet_metadata cimport ( from cudf._lib.pylibcudf.libcudf.io.types cimport ( column_in_metadata, table_input_metadata, + table_metadata, ) from cudf._lib.pylibcudf.libcudf.table.table_view cimport table_view from cudf._lib.pylibcudf.libcudf.types cimport data_type, size_type @@ -126,50 +129,22 @@ def _parse_metadata(meta): return file_is_range_index, file_index_cols, file_column_dtype -cpdef read_parquet(filepaths_or_buffers, columns=None, row_groups=None, - use_pandas_metadata=True, - Expression filters=None): - """ - Cython function to call into libcudf API, see `read_parquet`. - - filters, if not None, should be an Expression that evaluates to a - boolean predicate as a function of columns being read. 
- - See Also - -------- - cudf.io.parquet.read_parquet - cudf.io.parquet.to_parquet - """ - - # Convert NativeFile buffers to NativeFileDatasource, - # but save original buffers in case we need to use - # pyarrow for metadata processing - # (See: https://github.com/rapidsai/cudf/issues/9599) - pa_buffers = [] - for i, datasource in enumerate(filepaths_or_buffers): - if isinstance(datasource, NativeFile): - pa_buffers.append(datasource) - filepaths_or_buffers[i] = NativeFileDatasource(datasource) +cdef pair[parquet_reader_options, bool] _setup_parquet_reader_options( + cudf_io_types.source_info source, + vector[vector[size_type]] row_groups, + bool use_pandas_metadata, + Expression filters, + object columns): - cdef cudf_io_types.source_info source = make_source_info( - filepaths_or_buffers) - - cdef bool cpp_use_pandas_metadata = use_pandas_metadata - - cdef vector[vector[size_type]] cpp_row_groups + cdef parquet_reader_options args + cdef parquet_reader_options_builder builder cdef data_type cpp_timestamp_type = cudf_types.data_type( cudf_types.type_id.EMPTY ) - if row_groups is not None: - cpp_row_groups = row_groups - - # Setup parquet reader arguments - cdef parquet_reader_options args - cdef parquet_reader_options_builder builder builder = ( parquet_reader_options.builder(source) - .row_groups(cpp_row_groups) - .use_pandas_metadata(cpp_use_pandas_metadata) + .row_groups(row_groups) + .use_pandas_metadata(use_pandas_metadata) .use_arrow_schema(True) .timestamp_type(cpp_timestamp_type) ) @@ -185,28 +160,28 @@ cpdef read_parquet(filepaths_or_buffers, columns=None, row_groups=None, for col in columns: cpp_columns.push_back(str(col).encode()) args.set_columns(cpp_columns) - # Filters don't handle the range index correctly allow_range_index &= filters is None - # Read Parquet - cdef cudf_io_types.table_with_metadata c_result - - with nogil: - c_result = move(parquet_reader(args)) - - names = [info.name.decode() for info in c_result.metadata.schema_info] - - # Access the Parquet per_file_user_data to find the index + return pair[parquet_reader_options, bool](args, allow_range_index) + +cdef object _process_metadata(object df, + table_metadata table_meta, + list names, + object row_groups, + object filepaths_or_buffers, + list pa_buffers, + bool allow_range_index, + bool use_pandas_metadata): + update_struct_field_names(df, table_meta.schema_info) index_col = None - cdef vector[unordered_map[string, string]] per_file_user_data = \ - c_result.metadata.per_file_user_data - + is_range_index = True column_index_type = None index_col_names = None - is_range_index = True + meta = None + cdef vector[unordered_map[string, string]] per_file_user_data = \ + table_meta.per_file_user_data for single_file in per_file_user_data: json_str = single_file[b'pandas'].decode('utf-8') - meta = None if json_str != "": meta = json.loads(json_str) file_is_range_index, index_col, column_index_type = _parse_metadata(meta) @@ -220,13 +195,6 @@ cpdef read_parquet(filepaths_or_buffers, columns=None, row_groups=None, if c['field_name'] == idx_col: index_col_names[idx_col] = c['name'] - df = cudf.DataFrame._from_data(*data_from_unique_ptr( - move(c_result.tbl), - column_names=names - )) - - update_struct_field_names(df, c_result.metadata.schema_info) - if meta is not None: # Book keep each column metadata as the order # of `meta["columns"]` and `column_names` are not @@ -319,9 +287,65 @@ cpdef read_parquet(filepaths_or_buffers, columns=None, row_groups=None, if use_pandas_metadata: df.index.names = index_col - # Set column 
dtype for empty types. if len(df._data.names) == 0 and column_index_type is not None: df._data.label_dtype = cudf.dtype(column_index_type) + + return df + + +cpdef read_parquet(filepaths_or_buffers, columns=None, row_groups=None, + use_pandas_metadata=True, + Expression filters=None): + """ + Cython function to call into libcudf API, see `read_parquet`. + + filters, if not None, should be an Expression that evaluates to a + boolean predicate as a function of columns being read. + + See Also + -------- + cudf.io.parquet.read_parquet + cudf.io.parquet.to_parquet + """ + + # Convert NativeFile buffers to NativeFileDatasource, + # but save original buffers in case we need to use + # pyarrow for metadata processing + # (See: https://github.com/rapidsai/cudf/issues/9599) + pa_buffers = [] + for i, datasource in enumerate(filepaths_or_buffers): + if isinstance(datasource, NativeFile): + pa_buffers.append(datasource) + filepaths_or_buffers[i] = NativeFileDatasource(datasource) + + cdef cudf_io_types.source_info source = make_source_info( + filepaths_or_buffers) + + cdef vector[vector[size_type]] cpp_row_groups + if row_groups is not None: + cpp_row_groups = row_groups + + # Setup parquet reader arguments + cdef parquet_reader_options args + cdef pair[parquet_reader_options, bool] c_res = _setup_parquet_reader_options( + source, cpp_row_groups, use_pandas_metadata, filters, columns) + args, allow_range_index = c_res.first, c_res.second + + # Read Parquet + cdef cudf_io_types.table_with_metadata c_result + + with nogil: + c_result = move(parquet_reader(args)) + + names = [info.name.decode() for info in c_result.metadata.schema_info] + + df = cudf.DataFrame._from_data(*data_from_unique_ptr( + move(c_result.tbl), + column_names=names + )) + df = _process_metadata(df, c_result.metadata, names, row_groups, + filepaths_or_buffers, pa_buffers, + allow_range_index, use_pandas_metadata) return df cpdef read_parquet_metadata(filepaths_or_buffers): @@ -767,6 +791,102 @@ cdef class ParquetWriter: self.initialized = True +cdef class ParquetReader: + cdef bool initialized + cdef unique_ptr[cpp_chunked_parquet_reader] reader + cdef size_t chunk_read_limit + cdef size_t pass_read_limit + cdef size_t row_group_size_bytes + cdef table_metadata result_meta + cdef vector[unordered_map[string, string]] per_file_user_data + cdef object pandas_meta + cdef list pa_buffers + cdef bool allow_range_index + cdef object row_groups + cdef object filepaths_or_buffers + cdef object names + cdef object column_index_type + cdef object index_col_names + cdef bool is_range_index + cdef object index_col + cdef bool cpp_use_pandas_metadata + + def __cinit__(self, filepaths_or_buffers, columns=None, row_groups=None, + use_pandas_metadata=True, + size_t chunk_read_limit=0, + size_t pass_read_limit=1024000000): + + # Convert NativeFile buffers to NativeFileDatasource, + # but save original buffers in case we need to use + # pyarrow for metadata processing + # (See: https://github.com/rapidsai/cudf/issues/9599) + + pa_buffers = [] + for i, datasource in enumerate(filepaths_or_buffers): + if isinstance(datasource, NativeFile): + pa_buffers.append(datasource) + filepaths_or_buffers[i] = NativeFileDatasource(datasource) + self.pa_buffers = pa_buffers + cdef cudf_io_types.source_info source = make_source_info( + filepaths_or_buffers) + + self.cpp_use_pandas_metadata = use_pandas_metadata + + cdef vector[vector[size_type]] cpp_row_groups + if row_groups is not None: + cpp_row_groups = row_groups + cdef parquet_reader_options args + cdef 
pair[parquet_reader_options, bool] c_res = _setup_parquet_reader_options( + source, cpp_row_groups, use_pandas_metadata, None, columns) + args, self.allow_range_index = c_res.first, c_res.second + + with nogil: + self.reader.reset( + new cpp_chunked_parquet_reader( + chunk_read_limit, + pass_read_limit, + args + ) + ) + self.initialized = False + self.row_groups = row_groups + self.filepaths_or_buffers = filepaths_or_buffers + + def _has_next(self): + cdef bool res + with nogil: + res = self.reader.get()[0].has_next() + return res + + def _read_chunk(self): + # Read Parquet + cdef cudf_io_types.table_with_metadata c_result + + with nogil: + c_result = move(self.reader.get()[0].read_chunk()) + + if not self.initialized: + self.names = [info.name.decode() for info in c_result.metadata.schema_info] + self.result_meta = c_result.metadata + + df = cudf.DataFrame._from_data(*data_from_unique_ptr( + move(c_result.tbl), + column_names=self.names, + )) + + self.initialized = True + return df + + def read(self): + dfs = [] + while self._has_next(): + dfs.append(self._read_chunk()) + df = cudf.concat(dfs) + df = _process_metadata(df, self.result_meta, self.names, self.row_groups, + self.filepaths_or_buffers, self.pa_buffers, + self.allow_range_index, self.cpp_use_pandas_metadata) + return df + cpdef merge_filemetadata(object filemetadata_list): """ Cython function to call into libcudf API, see `merge_row_group_metadata`. diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/io/parquet.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/io/parquet.pxd index 33a594b432f..fb98650308a 100644 --- a/python/cudf/cudf/_lib/pylibcudf/libcudf/io/parquet.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/io/parquet.pxd @@ -283,6 +283,18 @@ cdef extern from "cudf/io/parquet.hpp" namespace "cudf::io" nogil: vector[string] column_chunks_file_paths, ) except + + cdef cppclass chunked_parquet_reader: + chunked_parquet_reader() except + + chunked_parquet_reader( + size_t chunk_read_limit, + const parquet_reader_options& options) except + + chunked_parquet_reader( + size_t chunk_read_limit, + size_t pass_read_limit, + const parquet_reader_options& options) except + + bool has_next() except + + cudf_io_types.table_with_metadata read_chunk() except + + cdef unique_ptr[vector[uint8_t]] merge_row_group_metadata( const vector[unique_ptr[vector[uint8_t]]]& metadata_list ) except + diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py index e32fdacd8d6..2596fe8cd37 100644 --- a/python/cudf/cudf/tests/test_parquet.py +++ b/python/cudf/cudf/tests/test_parquet.py @@ -22,6 +22,7 @@ from pyarrow import fs as pa_fs, parquet as pq import cudf +from cudf._lib.parquet import ParquetReader from cudf.io.parquet import ( ParquetDatasetWriter, ParquetWriter, @@ -3407,3 +3408,29 @@ def test_parquet_reader_roundtrip_structs_with_arrow_schema(): # Check results assert_eq(expected, got) + + +@pytest.mark.parametrize("chunk_read_limit", [0, 240, 1024000000]) +@pytest.mark.parametrize("pass_read_limit", [0, 240, 1024000000]) +@pytest.mark.parametrize("use_pandas_metadata", [True, False]) +@pytest.mark.parametrize("row_groups", [[[0]], None, [[0, 1]]]) +def test_parquet_chunked_reader( + chunk_read_limit, pass_read_limit, use_pandas_metadata, row_groups +): + df = pd.DataFrame( + {"a": [1, 2, 3, 4] * 1000000, "b": ["av", "qw", "hi", "xyz"] * 1000000} + ) + buffer = BytesIO() + df.to_parquet(buffer) + reader = ParquetReader( + [buffer], + chunk_read_limit=chunk_read_limit, + pass_read_limit=pass_read_limit, + 
use_pandas_metadata=use_pandas_metadata, + row_groups=row_groups, + ) + expected = cudf.read_parquet( + buffer, use_pandas_metadata=use_pandas_metadata, row_groups=row_groups + ) + actual = reader.read() + assert_eq(expected, actual) From 61da92415f1449f64a4050d2dec47b29344389a9 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Thu, 6 Jun 2024 17:19:28 +0100 Subject: [PATCH 21/48] Document how to use cudf.pandas in tandem with multiprocessing (#15940) We need to arrange that cudf.pandas.install() is run on the workers, this requires that we programmatically install the metapath loader in our script. Unfortunately, passing an initializer function to the pool startup is not sufficient if any part of the script transitively loads pandas at the top level. - Closes #15246 Authors: - Lawrence Mitchell (https://github.com/wence-) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) URL: https://github.com/rapidsai/cudf/pull/15940 --- docs/cudf/source/cudf_pandas/usage.md | 30 +++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/docs/cudf/source/cudf_pandas/usage.md b/docs/cudf/source/cudf_pandas/usage.md index b174c606d66..376784439aa 100644 --- a/docs/cudf/source/cudf_pandas/usage.md +++ b/docs/cudf/source/cudf_pandas/usage.md @@ -26,6 +26,36 @@ From the command line, run your Python scripts with `-m cudf.pandas`: python -m cudf.pandas script.py ``` +### Usage in tandem with +[`multiprocessing`](https://docs.python.org/3/library/multiprocessing.html) +or +[`concurrent.futures`](https://docs.python.org/3/library/concurrent.futures.html) +process pools + +To use a pool of workers (for example +[`multiprocessing.Pool`](https://docs.python.org/3/library/multiprocessing.html#multiprocessing.pool.Pool) +or +[`concurrent.futures.ProcessPoolExecutor`](https://docs.python.org/3/library/concurrent.futures.html#concurrent.futures.ProcessPoolExecutor)) +in your script with `cudf.pandas`, the `cudf.pandas` module must be +loaded on the worker processes, as well as by the controlling script. +The most foolproof way to do this is to programmatically install +`cudf.pandas` at the top of your script, before anything else. +For example + +```python +# This is equivalent to python -m cudf.pandas, but will run on the +# workers too. These two lines must run before pandas is imported, +# either directly or transitively. +import cudf.pandas +cudf.pandas.install() + +from multiprocessing import Pool + +with Pool(4) as pool: + # use pool here + ... +``` + ## Understanding performance - the `cudf.pandas` profiler `cudf.pandas` will attempt to use the GPU whenever possible and fall From 3468fa1f5b9dfcf83a95bcb09fe5a4d8d3808620 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Thu, 6 Jun 2024 19:30:48 +0100 Subject: [PATCH 22/48] Add more complete type annotations in polars interpreter (#15942) We can check this with: pyright --verifytypes cudf_polars --ignoreexternal Which reports a "type completeness" score of around 94%. This will improve once pylibcudf gets type stubs. 
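
For a rough picture of the structural typing this relies on (hypothetical
names below, not part of this patch): a `typing.Protocol` such as the new
`NodeTraverser` is satisfied by any object whose methods match, so the
visitor object polars hands us never has to inherit from anything on our
side.

```python
from typing import Protocol


class SupportsGetNode(Protocol):
    """Anything with a get_node() -> int method satisfies this protocol."""

    def get_node(self) -> int: ...


class FakeVisitor:  # note: no inheritance from SupportsGetNode
    def get_node(self) -> int:
        return 0


def current_node(visitor: SupportsGetNode) -> int:
    # Type checkers accept FakeVisitor here purely by structural matching.
    return visitor.get_node()


print(current_node(FakeVisitor()))  # prints 0
```
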
Authors: - Lawrence Mitchell (https://github.com/wence-) Approvers: - James Lamb (https://github.com/jameslamb) - Matthew Roeschke (https://github.com/mroeschke) URL: https://github.com/rapidsai/cudf/pull/15942 --- .pre-commit-config.yaml | 2 +- python/cudf_polars/cudf_polars/__init__.py | 5 +- python/cudf_polars/cudf_polars/callback.py | 3 +- .../cudf_polars/containers/dataframe.py | 13 +- python/cudf_polars/cudf_polars/dsl/expr.py | 55 +++++--- python/cudf_polars/cudf_polars/dsl/ir.py | 110 +++++++-------- .../cudf_polars/cudf_polars/dsl/translate.py | 127 ++++++++++++------ python/cudf_polars/cudf_polars/py.typed | 0 .../cudf_polars/testing/asserts.py | 2 +- .../cudf_polars/typing/__init__.py | 91 +++++++++++++ python/cudf_polars/pyproject.toml | 2 - 11 files changed, 287 insertions(+), 123 deletions(-) create mode 100644 python/cudf_polars/cudf_polars/py.typed create mode 100644 python/cudf_polars/cudf_polars/typing/__init__.py diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 8865fb48e0d..4cdcac88091 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -134,7 +134,7 @@ repos: - id: rapids-dependency-file-generator args: ["--clean"] - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.4.3 + rev: v0.4.8 hooks: - id: ruff files: python/.*$ diff --git a/python/cudf_polars/cudf_polars/__init__.py b/python/cudf_polars/cudf_polars/__init__.py index 74547fe2448..b19a282129a 100644 --- a/python/cudf_polars/cudf_polars/__init__.py +++ b/python/cudf_polars/cudf_polars/__init__.py @@ -10,4 +10,7 @@ from __future__ import annotations -__all__: list[str] = [] +from cudf_polars.callback import execute_with_cudf +from cudf_polars.dsl.translate import translate_ir + +__all__: list[str] = ["execute_with_cudf", "translate_ir"] diff --git a/python/cudf_polars/cudf_polars/callback.py b/python/cudf_polars/cudf_polars/callback.py index aabb8498ce2..979087d5273 100644 --- a/python/cudf_polars/cudf_polars/callback.py +++ b/python/cudf_polars/cudf_polars/callback.py @@ -16,6 +16,7 @@ import polars as pl from cudf_polars.dsl.ir import IR + from cudf_polars.typing import NodeTraverser __all__: list[str] = ["execute_with_cudf"] @@ -33,7 +34,7 @@ def _callback( return ir.evaluate(cache={}).to_polars() -def execute_with_cudf(nt, *, raise_on_fail: bool = False) -> None: +def execute_with_cudf(nt: NodeTraverser, *, raise_on_fail: bool = False) -> None: """ A post optimization callback that attempts to execute the plan with cudf. 
diff --git a/python/cudf_polars/cudf_polars/containers/dataframe.py b/python/cudf_polars/cudf_polars/containers/dataframe.py index eeaf181be0c..ac7e748095e 100644 --- a/python/cudf_polars/cudf_polars/containers/dataframe.py +++ b/python/cudf_polars/cudf_polars/containers/dataframe.py @@ -6,7 +6,7 @@ from __future__ import annotations from functools import cached_property -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, cast import polars as pl @@ -17,6 +17,7 @@ if TYPE_CHECKING: from collections.abc import Mapping, Sequence, Set + import pyarrow as pa from typing_extensions import Self import cudf @@ -44,13 +45,13 @@ def copy(self) -> Self: def to_polars(self) -> pl.DataFrame: """Convert to a polars DataFrame.""" - return pl.from_arrow( - plc.interop.to_arrow( - self.table, - [plc.interop.ColumnMetadata(name=c.name) for c in self.columns], - ) + table: pa.Table = plc.interop.to_arrow( + self.table, + [plc.interop.ColumnMetadata(name=c.name) for c in self.columns], ) + return cast(pl.DataFrame, pl.from_arrow(table)) + @cached_property def column_names_set(self) -> frozenset[str]: """Return the column names as a set.""" diff --git a/python/cudf_polars/cudf_polars/dsl/expr.py b/python/cudf_polars/cudf_polars/dsl/expr.py index c7c11cf6c68..6d9435ce373 100644 --- a/python/cudf_polars/cudf_polars/dsl/expr.py +++ b/python/cudf_polars/cudf_polars/dsl/expr.py @@ -139,14 +139,14 @@ def is_equal(self, other: Any) -> bool: other.children ) - def __eq__(self, other) -> bool: + def __eq__(self, other: Any) -> bool: """Equality of expressions.""" if type(self) != type(other) or hash(self) != hash(other): return False else: return self.is_equal(other) - def __ne__(self, other) -> bool: + def __ne__(self, other: Any) -> bool: """Inequality of expressions.""" return not self.__eq__(other) @@ -285,6 +285,8 @@ class NamedExpr: # when evaluating expressions themselves, only when constructing # named return values in dataframe (IR) nodes. 
__slots__ = ("name", "value") + value: Expr + name: str def __init__(self, name: str, value: Expr) -> None: self.name = name @@ -298,7 +300,7 @@ def __repr__(self) -> str: """Repr of the expression.""" return f"NamedExpr({self.name}, {self.value}" - def __eq__(self, other) -> bool: + def __eq__(self, other: Any) -> bool: """Equality of two expressions.""" return ( type(self) is type(other) @@ -306,7 +308,7 @@ def __eq__(self, other) -> bool: and self.value == other.value ) - def __ne__(self, other) -> bool: + def __ne__(self, other: Any) -> bool: """Inequality of expressions.""" return not self.__eq__(other) @@ -344,9 +346,10 @@ def collect_agg(self, *, depth: int) -> AggInfo: class Literal(Expr): __slots__ = ("value",) _non_child = ("dtype", "value") - value: pa.Scalar + value: pa.Scalar[Any] + children: tuple[()] - def __init__(self, dtype: plc.DataType, value: pa.Scalar) -> None: + def __init__(self, dtype: plc.DataType, value: pa.Scalar[Any]) -> None: super().__init__(dtype) assert value.type == plc.interop.to_arrow(dtype) self.value = value @@ -367,6 +370,7 @@ class Col(Expr): __slots__ = ("name",) _non_child = ("dtype", "name") name: str + children: tuple[()] def __init__(self, dtype: plc.DataType, name: str) -> None: self.dtype = dtype @@ -388,6 +392,8 @@ def collect_agg(self, *, depth: int) -> AggInfo: class Len(Expr): + children: tuple[()] + def do_evaluate( self, df: DataFrame, @@ -410,8 +416,15 @@ def collect_agg(self, *, depth: int) -> AggInfo: class BooleanFunction(Expr): __slots__ = ("name", "options", "children") _non_child = ("dtype", "name", "options") + children: tuple[Expr, ...] - def __init__(self, dtype: plc.DataType, name: str, options: tuple, *children: Expr): + def __init__( + self, + dtype: plc.DataType, + name: pl_expr.BooleanFunction, + options: tuple[Any, ...], + *children: Expr, + ) -> None: super().__init__(dtype) self.options = options self.name = name @@ -610,14 +623,15 @@ def do_evaluate( class StringFunction(Expr): __slots__ = ("name", "options", "children") _non_child = ("dtype", "name", "options") + children: tuple[Expr, ...] def __init__( self, dtype: plc.DataType, name: pl_expr.StringFunction, - options: tuple, + options: tuple[Any, ...], *children: Expr, - ): + ) -> None: super().__init__(dtype) self.options = options self.name = name @@ -661,10 +675,11 @@ def do_evaluate( class Sort(Expr): __slots__ = ("options", "children") _non_child = ("dtype", "options") + children: tuple[Expr] def __init__( self, dtype: plc.DataType, options: tuple[bool, bool, bool], column: Expr - ): + ) -> None: super().__init__(dtype) self.options = options self.children = (column,) @@ -696,6 +711,7 @@ def do_evaluate( class SortBy(Expr): __slots__ = ("options", "children") _non_child = ("dtype", "options") + children: tuple[Expr, ...] 
def __init__( self, @@ -703,7 +719,7 @@ def __init__( options: tuple[bool, tuple[bool], tuple[bool]], column: Expr, *by: Expr, - ): + ) -> None: super().__init__(dtype) self.options = options self.children = (column, *by) @@ -734,8 +750,9 @@ def do_evaluate( class Gather(Expr): __slots__ = ("children",) _non_child = ("dtype",) + children: tuple[Expr, Expr] - def __init__(self, dtype: plc.DataType, values: Expr, indices: Expr): + def __init__(self, dtype: plc.DataType, values: Expr, indices: Expr) -> None: super().__init__(dtype) self.children = (values, indices) @@ -775,6 +792,7 @@ def do_evaluate( class Filter(Expr): __slots__ = ("children",) _non_child = ("dtype",) + children: tuple[Expr, Expr] def __init__(self, dtype: plc.DataType, values: Expr, indices: Expr): super().__init__(dtype) @@ -801,8 +819,9 @@ def do_evaluate( class RollingWindow(Expr): __slots__ = ("options", "children") _non_child = ("dtype", "options") + children: tuple[Expr] - def __init__(self, dtype: plc.DataType, options: Any, agg: Expr): + def __init__(self, dtype: plc.DataType, options: Any, agg: Expr) -> None: super().__init__(dtype) self.options = options self.children = (agg,) @@ -811,8 +830,9 @@ def __init__(self, dtype: plc.DataType, options: Any, agg: Expr): class GroupedRollingWindow(Expr): __slots__ = ("options", "children") _non_child = ("dtype", "options") + children: tuple[Expr, ...] - def __init__(self, dtype: plc.DataType, options: Any, agg: Expr, *by: Expr): + def __init__(self, dtype: plc.DataType, options: Any, agg: Expr, *by: Expr) -> None: super().__init__(dtype) self.options = options self.children = (agg, *by) @@ -821,8 +841,9 @@ def __init__(self, dtype: plc.DataType, options: Any, agg: Expr, *by: Expr): class Cast(Expr): __slots__ = ("children",) _non_child = ("dtype",) + children: tuple[Expr] - def __init__(self, dtype: plc.DataType, value: Expr): + def __init__(self, dtype: plc.DataType, value: Expr) -> None: super().__init__(dtype) self.children = (value,) @@ -848,6 +869,7 @@ def collect_agg(self, *, depth: int) -> AggInfo: class Agg(Expr): __slots__ = ("name", "options", "op", "request", "children") _non_child = ("dtype", "name", "options") + children: tuple[Expr] def __init__( self, dtype: plc.DataType, name: str, options: Any, value: Expr @@ -1007,7 +1029,7 @@ def _last(self, column: Column) -> Column: def do_evaluate( self, - df, + df: DataFrame, *, context: ExecutionContext = ExecutionContext.FRAME, mapping: Mapping[Expr, Column] | None = None, @@ -1022,6 +1044,7 @@ def do_evaluate( class BinOp(Expr): __slots__ = ("op", "children") _non_child = ("dtype", "op") + children: tuple[Expr, Expr] def __init__( self, diff --git a/python/cudf_polars/cudf_polars/dsl/ir.py b/python/cudf_polars/cudf_polars/dsl/ir.py index 0a72cbd9f83..665bbe5be41 100644 --- a/python/cudf_polars/cudf_polars/dsl/ir.py +++ b/python/cudf_polars/cudf_polars/dsl/ir.py @@ -1,7 +1,5 @@ # SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. # SPDX-License-Identifier: Apache-2.0 -# TODO: remove need for this -# ruff: noqa: D101 """ DSL nodes for the LogicalPlan of polars. 
@@ -15,11 +13,11 @@ from __future__ import annotations +import dataclasses import itertools import types -from dataclasses import dataclass from functools import cache -from typing import TYPE_CHECKING, Any, Callable, ClassVar +from typing import TYPE_CHECKING, Any, Callable, ClassVar, NoReturn import pyarrow as pa from typing_extensions import assert_never @@ -34,8 +32,11 @@ from cudf_polars.utils import sorting if TYPE_CHECKING: + from collections.abc import MutableMapping from typing import Literal + from cudf_polars.typing import Schema + __all__ = [ "IR", @@ -91,14 +92,14 @@ def broadcast( ] -@dataclass(slots=True) +@dataclasses.dataclass(slots=True) class IR: """Abstract plan node, representing an unevaluated dataframe.""" - schema: dict[str, plc.DataType] + schema: Schema """Mapping from column names to their data types.""" - def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: + def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: """ Evaluate the node and return a dataframe. @@ -123,7 +124,7 @@ def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: raise NotImplementedError -@dataclass(slots=True) +@dataclasses.dataclass(slots=True) class PythonScan(IR): """Representation of input from a python function.""" @@ -133,7 +134,7 @@ class PythonScan(IR): """Filter to apply to the constructed dataframe before returning it.""" -@dataclass(slots=True) +@dataclasses.dataclass(slots=True) class Scan(IR): """Input from files.""" @@ -153,14 +154,14 @@ class Scan(IR): predicate: expr.NamedExpr | None """Mask to apply to the read dataframe.""" - def __post_init__(self): + def __post_init__(self) -> None: """Validate preconditions.""" if self.file_options.n_rows is not None: raise NotImplementedError("row limit in scan") if self.typ not in ("csv", "parquet"): raise NotImplementedError(f"Unhandled scan type: {self.typ}") - def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: + def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: """Evaluate and return a dataframe.""" options = self.file_options with_columns = options.with_columns @@ -172,9 +173,9 @@ def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: ) ) elif self.typ == "parquet": - df = DataFrame.from_cudf( - cudf.read_parquet(self.paths, columns=with_columns) - ) + cdf = cudf.read_parquet(self.paths, columns=with_columns) + assert isinstance(cdf, cudf.DataFrame) + df = DataFrame.from_cudf(cdf) else: assert_never(self.typ) if row_index is not None: @@ -208,7 +209,7 @@ def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: return df.filter(mask) -@dataclass(slots=True) +@dataclasses.dataclass(slots=True) class Cache(IR): """ Return a cached plan node. @@ -221,7 +222,7 @@ class Cache(IR): value: IR """The unevaluated node to cache.""" - def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: + def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: """Evaluate and return a dataframe.""" try: return cache[self.key] @@ -229,7 +230,7 @@ def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: return cache.setdefault(self.key, self.value.evaluate(cache=cache)) -@dataclass(slots=True) +@dataclasses.dataclass(slots=True) class DataFrameScan(IR): """ Input from an existing polars DataFrame. 
@@ -244,7 +245,7 @@ class DataFrameScan(IR): predicate: expr.NamedExpr | None """Mask to apply.""" - def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: + def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: """Evaluate and return a dataframe.""" pdf = pl.DataFrame._from_pydf(self.df) if self.projection is not None: @@ -270,7 +271,7 @@ def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: return df -@dataclass(slots=True) +@dataclasses.dataclass(slots=True) class Select(IR): """Produce a new dataframe selecting given expressions from an input.""" @@ -279,7 +280,7 @@ class Select(IR): expr: list[expr.NamedExpr] """List of expressions to evaluate to form the new dataframe.""" - def evaluate(self, *, cache: dict[int, DataFrame]): + def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: """Evaluate and return a dataframe.""" df = self.df.evaluate(cache=cache) # Handle any broadcasting @@ -287,7 +288,7 @@ def evaluate(self, *, cache: dict[int, DataFrame]): return DataFrame(columns) -@dataclass(slots=True) +@dataclasses.dataclass(slots=True) class Reduce(IR): """ Produce a new dataframe selecting given expressions from an input. @@ -300,7 +301,7 @@ class Reduce(IR): expr: list[expr.NamedExpr] """List of expressions to evaluate to form the new dataframe.""" - def evaluate(self, *, cache: dict[int, DataFrame]): + def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: """Evaluate and return a dataframe.""" df = self.df.evaluate(cache=cache) columns = broadcast(*(e.evaluate(df) for e in self.expr)) @@ -308,7 +309,7 @@ def evaluate(self, *, cache: dict[int, DataFrame]): return DataFrame(columns) -def placeholder_column(n: int): +def placeholder_column(n: int) -> plc.Column: """ Produce a placeholder pylibcudf column with NO BACKING DATA. 
@@ -338,7 +339,7 @@ def placeholder_column(n: int): ) -@dataclass(slots=False) +@dataclasses.dataclass(slots=False) class GroupBy(IR): """Perform a groupby.""" @@ -352,6 +353,7 @@ class GroupBy(IR): """Should the order of the input dataframe be maintained?""" options: Any """Options controlling style of groupby.""" + agg_infos: list[expr.AggInfo] = dataclasses.field(init=False) @staticmethod def check_agg(agg: expr.Expr) -> int: @@ -383,7 +385,7 @@ def check_agg(agg: expr.Expr) -> int: else: raise NotImplementedError(f"No handler for {agg=}") - def __post_init__(self): + def __post_init__(self) -> None: """Check whether all the aggregations are implemented.""" if self.options.rolling is None and self.maintain_order: raise NotImplementedError("Maintaining order in groupby") @@ -393,7 +395,7 @@ def __post_init__(self): raise NotImplementedError("Nested aggregations in groupby") self.agg_infos = [req.collect_agg(depth=0) for req in self.agg_requests] - def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: + def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: """Evaluate and return a dataframe.""" df = self.df.evaluate(cache=cache) keys = broadcast( @@ -438,7 +440,7 @@ def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: return DataFrame([*result_keys, *results]).slice(self.options.slice) -@dataclass(slots=True) +@dataclasses.dataclass(slots=True) class Join(IR): """A join of two dataframes.""" @@ -466,7 +468,7 @@ class Join(IR): - coalesce: should key columns be coalesced (only makes sense for outer joins) """ - def __post_init__(self): + def __post_init__(self) -> None: """Validate preconditions.""" if self.options[0] == "cross": raise NotImplementedError("cross join not implemented") @@ -511,7 +513,7 @@ def _joiners( else: assert_never(how) - def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: + def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: """Evaluate and return a dataframe.""" left = self.left.evaluate(cache=cache) right = self.right.evaluate(cache=cache) @@ -577,7 +579,7 @@ def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: return result.slice(zlice) -@dataclass(slots=True) +@dataclasses.dataclass(slots=True) class HStack(IR): """Add new columns to a dataframe.""" @@ -586,7 +588,7 @@ class HStack(IR): columns: list[expr.NamedExpr] """List of expressions to produce new columns.""" - def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: + def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: """Evaluate and return a dataframe.""" df = self.df.evaluate(cache=cache) columns = [c.evaluate(df) for c in self.columns] @@ -597,7 +599,7 @@ def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: return df.with_columns(columns) -@dataclass(slots=True) +@dataclasses.dataclass(slots=True) class Distinct(IR): """Produce a new dataframe with distinct rows.""" @@ -619,7 +621,7 @@ class Distinct(IR): "any": plc.stream_compaction.DuplicateKeepOption.KEEP_ANY, } - def __init__(self, schema: dict, df: IR, options: Any): + def __init__(self, schema: Schema, df: IR, options: Any) -> None: self.schema = schema self.df = df (keep, subset, maintain_order, zlice) = options @@ -628,7 +630,7 @@ def __init__(self, schema: dict, df: IR, options: Any): self.stable = maintain_order self.zlice = zlice - def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: + def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: """Evaluate and return a dataframe.""" 
df = self.df.evaluate(cache=cache) if self.subset is None: @@ -667,7 +669,7 @@ def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: return result.slice(self.zlice) -@dataclass(slots=True) +@dataclasses.dataclass(slots=True) class Sort(IR): """Sort a dataframe.""" @@ -686,12 +688,12 @@ class Sort(IR): def __init__( self, - schema: dict, + schema: Schema, df: IR, by: list[expr.NamedExpr], options: Any, zlice: tuple[int, int] | None, - ): + ) -> None: self.schema = schema self.df = df self.by = by @@ -704,7 +706,7 @@ def __init__( plc.sorting.stable_sort_by_key if stable else plc.sorting.sort_by_key ) - def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: + def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: """Evaluate and return a dataframe.""" df = self.df.evaluate(cache=cache) sort_keys = broadcast( @@ -736,7 +738,7 @@ def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: return DataFrame(columns).slice(self.zlice) -@dataclass(slots=True) +@dataclasses.dataclass(slots=True) class Slice(IR): """Slice a dataframe.""" @@ -747,13 +749,13 @@ class Slice(IR): length: int """Length of the slice.""" - def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: + def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: """Evaluate and return a dataframe.""" df = self.df.evaluate(cache=cache) return df.slice((self.offset, self.length)) -@dataclass(slots=True) +@dataclasses.dataclass(slots=True) class Filter(IR): """Filter a dataframe with a boolean mask.""" @@ -762,21 +764,21 @@ class Filter(IR): mask: expr.NamedExpr """Expression evaluating to a mask.""" - def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: + def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: """Evaluate and return a dataframe.""" df = self.df.evaluate(cache=cache) (mask,) = broadcast(self.mask.evaluate(df), target_length=df.num_rows) return df.filter(mask) -@dataclass(slots=True) +@dataclasses.dataclass(slots=True) class Projection(IR): """Select a subset of columns from a dataframe.""" df: IR """Input.""" - def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: + def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: """Evaluate and return a dataframe.""" df = self.df.evaluate(cache=cache) # This can reorder things. 
@@ -786,7 +788,7 @@ def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: return DataFrame(columns) -@dataclass(slots=True) +@dataclasses.dataclass(slots=True) class MapFunction(IR): """Apply some function to a dataframe.""" @@ -807,7 +809,7 @@ class MapFunction(IR): ] ) - def __post_init__(self): + def __post_init__(self) -> None: """Validate preconditions.""" if self.name not in MapFunction._NAMES: raise NotImplementedError(f"Unhandled map function {self.name}") @@ -824,7 +826,7 @@ def __post_init__(self): if key_column not in self.df.dfs[0].schema: raise ValueError(f"Key column {key_column} not found") - def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: + def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: """Evaluate and return a dataframe.""" if self.name == "merge_sorted": # merge_sorted operates on Union inputs @@ -876,7 +878,7 @@ def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: raise AssertionError("Should never be reached") -@dataclass(slots=True) +@dataclasses.dataclass(slots=True) class Union(IR): """Concatenate dataframes vertically.""" @@ -885,13 +887,13 @@ class Union(IR): zlice: tuple[int, int] | None """Optional slice to apply after concatenation.""" - def __post_init__(self): + def __post_init__(self) -> None: """Validated preconditions.""" schema = self.dfs[0].schema if not all(s.schema == schema for s in self.dfs[1:]): raise ValueError("Schema mismatch") - def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: + def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: """Evaluate and return a dataframe.""" # TODO: only evaluate what we need if we have a slice dfs = [df.evaluate(cache=cache) for df in self.dfs] @@ -900,14 +902,14 @@ def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: ).slice(self.zlice) -@dataclass(slots=True) +@dataclasses.dataclass(slots=True) class HConcat(IR): """Concatenate dataframes horizontally.""" dfs: list[IR] """List of inputs.""" - def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: + def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: """Evaluate and return a dataframe.""" dfs = [df.evaluate(cache=cache) for df in self.dfs] return DataFrame( @@ -915,7 +917,7 @@ def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: ) -@dataclass(slots=True) +@dataclasses.dataclass(slots=True) class ExtContext(IR): """ Concatenate dataframes horizontally. @@ -928,7 +930,7 @@ class ExtContext(IR): extra: list[IR] """List of extra inputs.""" - def __post_init__(self): + def __post_init__(self) -> NoReturn: """Validate preconditions.""" raise NotImplementedError( "ExtContext will be deprecated, use horizontal concat instead." diff --git a/python/cudf_polars/cudf_polars/dsl/translate.py b/python/cudf_polars/cudf_polars/dsl/translate.py index 641176daff4..38107023365 100644 --- a/python/cudf_polars/cudf_polars/dsl/translate.py +++ b/python/cudf_polars/cudf_polars/dsl/translate.py @@ -16,12 +16,13 @@ import cudf._lib.pylibcudf as plc from cudf_polars.dsl import expr, ir +from cudf_polars.typing import NodeTraverser from cudf_polars.utils import dtypes __all__ = ["translate_ir", "translate_named_expr"] -class set_node(AbstractContextManager): +class set_node(AbstractContextManager[None]): """ Run a block with current node set in the visitor. 
@@ -39,30 +40,36 @@ class set_node(AbstractContextManager): """ __slots__ = ("n", "visitor") + visitor: NodeTraverser + n: int - def __init__(self, visitor, n: int): + def __init__(self, visitor: NodeTraverser, n: int) -> None: self.visitor = visitor self.n = n - def __enter__(self): + def __enter__(self) -> None: n = self.visitor.get_node() self.visitor.set_node(self.n) self.n = n - def __exit__(self, *args): + def __exit__(self, *args: Any) -> None: self.visitor.set_node(self.n) -noop_context: nullcontext = nullcontext() +noop_context: nullcontext[None] = nullcontext() @singledispatch -def _translate_ir(node: Any, visitor: Any, schema: dict[str, plc.DataType]) -> ir.IR: +def _translate_ir( + node: Any, visitor: NodeTraverser, schema: dict[str, plc.DataType] +) -> ir.IR: raise NotImplementedError(f"Translation for {type(node).__name__}") @_translate_ir.register -def _(node: pl_ir.PythonScan, visitor: Any, schema: dict[str, plc.DataType]) -> ir.IR: +def _( + node: pl_ir.PythonScan, visitor: NodeTraverser, schema: dict[str, plc.DataType] +) -> ir.IR: return ir.PythonScan( schema, node.options, @@ -73,7 +80,9 @@ def _(node: pl_ir.PythonScan, visitor: Any, schema: dict[str, plc.DataType]) -> @_translate_ir.register -def _(node: pl_ir.Scan, visitor: Any, schema: dict[str, plc.DataType]) -> ir.IR: +def _( + node: pl_ir.Scan, visitor: NodeTraverser, schema: dict[str, plc.DataType] +) -> ir.IR: return ir.Scan( schema, node.scan_type, @@ -86,13 +95,15 @@ def _(node: pl_ir.Scan, visitor: Any, schema: dict[str, plc.DataType]) -> ir.IR: @_translate_ir.register -def _(node: pl_ir.Cache, visitor: Any, schema: dict[str, plc.DataType]) -> ir.IR: +def _( + node: pl_ir.Cache, visitor: NodeTraverser, schema: dict[str, plc.DataType] +) -> ir.IR: return ir.Cache(schema, node.id_, translate_ir(visitor, n=node.input)) @_translate_ir.register def _( - node: pl_ir.DataFrameScan, visitor: Any, schema: dict[str, plc.DataType] + node: pl_ir.DataFrameScan, visitor: NodeTraverser, schema: dict[str, plc.DataType] ) -> ir.IR: return ir.DataFrameScan( schema, @@ -105,7 +116,9 @@ def _( @_translate_ir.register -def _(node: pl_ir.Select, visitor: Any, schema: dict[str, plc.DataType]) -> ir.IR: +def _( + node: pl_ir.Select, visitor: NodeTraverser, schema: dict[str, plc.DataType] +) -> ir.IR: with set_node(visitor, node.input): inp = translate_ir(visitor, n=None) exprs = [translate_named_expr(visitor, n=e) for e in node.expr] @@ -113,7 +126,9 @@ def _(node: pl_ir.Select, visitor: Any, schema: dict[str, plc.DataType]) -> ir.I @_translate_ir.register -def _(node: pl_ir.GroupBy, visitor: Any, schema: dict[str, plc.DataType]) -> ir.IR: +def _( + node: pl_ir.GroupBy, visitor: NodeTraverser, schema: dict[str, plc.DataType] +) -> ir.IR: with set_node(visitor, node.input): inp = translate_ir(visitor, n=None) aggs = [translate_named_expr(visitor, n=e) for e in node.aggs] @@ -129,7 +144,9 @@ def _(node: pl_ir.GroupBy, visitor: Any, schema: dict[str, plc.DataType]) -> ir. @_translate_ir.register -def _(node: pl_ir.Join, visitor: Any, schema: dict[str, plc.DataType]) -> ir.IR: +def _( + node: pl_ir.Join, visitor: NodeTraverser, schema: dict[str, plc.DataType] +) -> ir.IR: # Join key dtypes are dependent on the schema of the left and # right inputs, so these must be translated with the relevant # input active. 
@@ -143,7 +160,9 @@ def _(node: pl_ir.Join, visitor: Any, schema: dict[str, plc.DataType]) -> ir.IR: @_translate_ir.register -def _(node: pl_ir.HStack, visitor: Any, schema: dict[str, plc.DataType]) -> ir.IR: +def _( + node: pl_ir.HStack, visitor: NodeTraverser, schema: dict[str, plc.DataType] +) -> ir.IR: with set_node(visitor, node.input): inp = translate_ir(visitor, n=None) exprs = [translate_named_expr(visitor, n=e) for e in node.exprs] @@ -151,7 +170,9 @@ def _(node: pl_ir.HStack, visitor: Any, schema: dict[str, plc.DataType]) -> ir.I @_translate_ir.register -def _(node: pl_ir.Reduce, visitor: Any, schema: dict[str, plc.DataType]) -> ir.IR: +def _( + node: pl_ir.Reduce, visitor: NodeTraverser, schema: dict[str, plc.DataType] +) -> ir.IR: with set_node(visitor, node.input): inp = translate_ir(visitor, n=None) exprs = [translate_named_expr(visitor, n=e) for e in node.expr] @@ -159,7 +180,9 @@ def _(node: pl_ir.Reduce, visitor: Any, schema: dict[str, plc.DataType]) -> ir.I @_translate_ir.register -def _(node: pl_ir.Distinct, visitor: Any, schema: dict[str, plc.DataType]) -> ir.IR: +def _( + node: pl_ir.Distinct, visitor: NodeTraverser, schema: dict[str, plc.DataType] +) -> ir.IR: return ir.Distinct( schema, translate_ir(visitor, n=node.input), @@ -168,7 +191,9 @@ def _(node: pl_ir.Distinct, visitor: Any, schema: dict[str, plc.DataType]) -> ir @_translate_ir.register -def _(node: pl_ir.Sort, visitor: Any, schema: dict[str, plc.DataType]) -> ir.IR: +def _( + node: pl_ir.Sort, visitor: NodeTraverser, schema: dict[str, plc.DataType] +) -> ir.IR: with set_node(visitor, node.input): inp = translate_ir(visitor, n=None) by = [translate_named_expr(visitor, n=e) for e in node.by_column] @@ -176,12 +201,16 @@ def _(node: pl_ir.Sort, visitor: Any, schema: dict[str, plc.DataType]) -> ir.IR: @_translate_ir.register -def _(node: pl_ir.Slice, visitor: Any, schema: dict[str, plc.DataType]) -> ir.IR: +def _( + node: pl_ir.Slice, visitor: NodeTraverser, schema: dict[str, plc.DataType] +) -> ir.IR: return ir.Slice(schema, translate_ir(visitor, n=node.input), node.offset, node.len) @_translate_ir.register -def _(node: pl_ir.Filter, visitor: Any, schema: dict[str, plc.DataType]) -> ir.IR: +def _( + node: pl_ir.Filter, visitor: NodeTraverser, schema: dict[str, plc.DataType] +) -> ir.IR: with set_node(visitor, node.input): inp = translate_ir(visitor, n=None) mask = translate_named_expr(visitor, n=node.predicate) @@ -190,13 +219,17 @@ def _(node: pl_ir.Filter, visitor: Any, schema: dict[str, plc.DataType]) -> ir.I @_translate_ir.register def _( - node: pl_ir.SimpleProjection, visitor: Any, schema: dict[str, plc.DataType] + node: pl_ir.SimpleProjection, + visitor: NodeTraverser, + schema: dict[str, plc.DataType], ) -> ir.IR: return ir.Projection(schema, translate_ir(visitor, n=node.input)) @_translate_ir.register -def _(node: pl_ir.MapFunction, visitor: Any, schema: dict[str, plc.DataType]) -> ir.IR: +def _( + node: pl_ir.MapFunction, visitor: NodeTraverser, schema: dict[str, plc.DataType] +) -> ir.IR: name, *options = node.function return ir.MapFunction( schema, @@ -208,19 +241,25 @@ def _(node: pl_ir.MapFunction, visitor: Any, schema: dict[str, plc.DataType]) -> @_translate_ir.register -def _(node: pl_ir.Union, visitor: Any, schema: dict[str, plc.DataType]) -> ir.IR: +def _( + node: pl_ir.Union, visitor: NodeTraverser, schema: dict[str, plc.DataType] +) -> ir.IR: return ir.Union( schema, [translate_ir(visitor, n=n) for n in node.inputs], node.options ) @_translate_ir.register -def _(node: pl_ir.HConcat, 
visitor: Any, schema: dict[str, plc.DataType]) -> ir.IR: +def _( + node: pl_ir.HConcat, visitor: NodeTraverser, schema: dict[str, plc.DataType] +) -> ir.IR: return ir.HConcat(schema, [translate_ir(visitor, n=n) for n in node.inputs]) @_translate_ir.register -def _(node: pl_ir.ExtContext, visitor: Any, schema: dict[str, plc.DataType]) -> ir.IR: +def _( + node: pl_ir.ExtContext, visitor: NodeTraverser, schema: dict[str, plc.DataType] +) -> ir.IR: return ir.ExtContext( schema, translate_ir(visitor, n=node.input), @@ -228,7 +267,7 @@ def _(node: pl_ir.ExtContext, visitor: Any, schema: dict[str, plc.DataType]) -> ) -def translate_ir(visitor: Any, *, n: int | None = None) -> ir.IR: +def translate_ir(visitor: NodeTraverser, *, n: int | None = None) -> ir.IR: """ Translate a polars-internal IR node to our representation. @@ -249,7 +288,7 @@ def translate_ir(visitor: Any, *, n: int | None = None) -> ir.IR: NotImplementedError If we can't translate the nodes due to unsupported functionality. """ - ctx: AbstractContextManager = ( + ctx: AbstractContextManager[None] = ( set_node(visitor, n) if n is not None else noop_context ) with ctx: @@ -258,7 +297,9 @@ def translate_ir(visitor: Any, *, n: int | None = None) -> ir.IR: return _translate_ir(node, visitor, schema) -def translate_named_expr(visitor: Any, *, n: pl_expr.PyExprIR) -> expr.NamedExpr: +def translate_named_expr( + visitor: NodeTraverser, *, n: pl_expr.PyExprIR +) -> expr.NamedExpr: """ Translate a polars-internal named expression IR object into our representation. @@ -289,12 +330,14 @@ def translate_named_expr(visitor: Any, *, n: pl_expr.PyExprIR) -> expr.NamedExpr @singledispatch -def _translate_expr(node: Any, visitor: Any, dtype: plc.DataType) -> expr.Expr: +def _translate_expr( + node: Any, visitor: NodeTraverser, dtype: plc.DataType +) -> expr.Expr: raise NotImplementedError(f"Translation for {type(node).__name__}") @_translate_expr.register -def _(node: pl_expr.Function, visitor: Any, dtype: plc.DataType) -> expr.Expr: +def _(node: pl_expr.Function, visitor: NodeTraverser, dtype: plc.DataType) -> expr.Expr: name, *options = node.function_data options = tuple(options) if isinstance(name, pl_expr.StringFunction): @@ -316,7 +359,7 @@ def _(node: pl_expr.Function, visitor: Any, dtype: plc.DataType) -> expr.Expr: @_translate_expr.register -def _(node: pl_expr.Window, visitor: Any, dtype: plc.DataType) -> expr.Expr: +def _(node: pl_expr.Window, visitor: NodeTraverser, dtype: plc.DataType) -> expr.Expr: # TODO: raise in groupby? 
if node.partition_by is None: return expr.RollingWindow( @@ -332,19 +375,19 @@ def _(node: pl_expr.Window, visitor: Any, dtype: plc.DataType) -> expr.Expr: @_translate_expr.register -def _(node: pl_expr.Literal, visitor: Any, dtype: plc.DataType) -> expr.Expr: +def _(node: pl_expr.Literal, visitor: NodeTraverser, dtype: plc.DataType) -> expr.Expr: value = pa.scalar(node.value, type=plc.interop.to_arrow(dtype)) return expr.Literal(dtype, value) @_translate_expr.register -def _(node: pl_expr.Sort, visitor: Any, dtype: plc.DataType) -> expr.Expr: +def _(node: pl_expr.Sort, visitor: NodeTraverser, dtype: plc.DataType) -> expr.Expr: # TODO: raise in groupby return expr.Sort(dtype, node.options, translate_expr(visitor, n=node.expr)) @_translate_expr.register -def _(node: pl_expr.SortBy, visitor: Any, dtype: plc.DataType) -> expr.Expr: +def _(node: pl_expr.SortBy, visitor: NodeTraverser, dtype: plc.DataType) -> expr.Expr: return expr.SortBy( dtype, node.sort_options, @@ -354,7 +397,7 @@ def _(node: pl_expr.SortBy, visitor: Any, dtype: plc.DataType) -> expr.Expr: @_translate_expr.register -def _(node: pl_expr.Gather, visitor: Any, dtype: plc.DataType) -> expr.Expr: +def _(node: pl_expr.Gather, visitor: NodeTraverser, dtype: plc.DataType) -> expr.Expr: return expr.Gather( dtype, translate_expr(visitor, n=node.expr), @@ -363,7 +406,7 @@ def _(node: pl_expr.Gather, visitor: Any, dtype: plc.DataType) -> expr.Expr: @_translate_expr.register -def _(node: pl_expr.Filter, visitor: Any, dtype: plc.DataType) -> expr.Expr: +def _(node: pl_expr.Filter, visitor: NodeTraverser, dtype: plc.DataType) -> expr.Expr: return expr.Filter( dtype, translate_expr(visitor, n=node.input), @@ -372,7 +415,7 @@ def _(node: pl_expr.Filter, visitor: Any, dtype: plc.DataType) -> expr.Expr: @_translate_expr.register -def _(node: pl_expr.Cast, visitor: Any, dtype: plc.DataType) -> expr.Expr: +def _(node: pl_expr.Cast, visitor: NodeTraverser, dtype: plc.DataType) -> expr.Expr: inner = translate_expr(visitor, n=node.expr) # Push casts into literals so we can handle Cast(Literal(Null)) if isinstance(inner, expr.Literal): @@ -382,12 +425,12 @@ def _(node: pl_expr.Cast, visitor: Any, dtype: plc.DataType) -> expr.Expr: @_translate_expr.register -def _(node: pl_expr.Column, visitor: Any, dtype: plc.DataType) -> expr.Expr: +def _(node: pl_expr.Column, visitor: NodeTraverser, dtype: plc.DataType) -> expr.Expr: return expr.Col(dtype, node.name) @_translate_expr.register -def _(node: pl_expr.Agg, visitor: Any, dtype: plc.DataType) -> expr.Expr: +def _(node: pl_expr.Agg, visitor: NodeTraverser, dtype: plc.DataType) -> expr.Expr: return expr.Agg( dtype, node.name, @@ -397,7 +440,9 @@ def _(node: pl_expr.Agg, visitor: Any, dtype: plc.DataType) -> expr.Expr: @_translate_expr.register -def _(node: pl_expr.BinaryExpr, visitor: Any, dtype: plc.DataType) -> expr.Expr: +def _( + node: pl_expr.BinaryExpr, visitor: NodeTraverser, dtype: plc.DataType +) -> expr.Expr: return expr.BinOp( dtype, expr.BinOp._MAPPING[node.op], @@ -407,11 +452,11 @@ def _(node: pl_expr.BinaryExpr, visitor: Any, dtype: plc.DataType) -> expr.Expr: @_translate_expr.register -def _(node: pl_expr.Len, visitor: Any, dtype: plc.DataType) -> expr.Expr: +def _(node: pl_expr.Len, visitor: NodeTraverser, dtype: plc.DataType) -> expr.Expr: return expr.Len(dtype) -def translate_expr(visitor: Any, *, n: int) -> expr.Expr: +def translate_expr(visitor: NodeTraverser, *, n: int) -> expr.Expr: """ Translate a polars-internal expression IR into our representation. 
diff --git a/python/cudf_polars/cudf_polars/py.typed b/python/cudf_polars/cudf_polars/py.typed new file mode 100644 index 00000000000..e69de29bb2d diff --git a/python/cudf_polars/cudf_polars/testing/asserts.py b/python/cudf_polars/cudf_polars/testing/asserts.py index 2fbfa971fef..2f19b41cc3a 100644 --- a/python/cudf_polars/cudf_polars/testing/asserts.py +++ b/python/cudf_polars/cudf_polars/testing/asserts.py @@ -28,7 +28,7 @@ def assert_gpu_result_equal( rtol: float = 1e-05, atol: float = 1e-08, categorical_as_str: bool = False, -): +) -> None: """ Assert that collection of a lazyframe on GPU produces correct results. diff --git a/python/cudf_polars/cudf_polars/typing/__init__.py b/python/cudf_polars/cudf_polars/typing/__init__.py new file mode 100644 index 00000000000..287c977f4eb --- /dev/null +++ b/python/cudf_polars/cudf_polars/typing/__init__.py @@ -0,0 +1,91 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +# SPDX-License-Identifier: Apache-2.0 + +"""Typing utilities for cudf_polars.""" + +from __future__ import annotations + +from collections.abc import Mapping +from typing import TYPE_CHECKING, Protocol, TypeAlias + +from polars.polars import _expr_nodes as pl_expr, _ir_nodes as pl_ir + +import cudf._lib.pylibcudf as plc + +if TYPE_CHECKING: + from typing import Callable + + import polars as pl + +IR: TypeAlias = ( + pl_ir.PythonScan + | pl_ir.Scan + | pl_ir.Cache + | pl_ir.DataFrameScan + | pl_ir.Select + | pl_ir.GroupBy + | pl_ir.Join + | pl_ir.HStack + | pl_ir.Distinct + | pl_ir.Sort + | pl_ir.Slice + | pl_ir.Filter + | pl_ir.SimpleProjection + | pl_ir.MapFunction + | pl_ir.Union + | pl_ir.HConcat + | pl_ir.ExtContext +) + +Expr: TypeAlias = ( + pl_expr.Function + | pl_expr.Window + | pl_expr.Literal + | pl_expr.Sort + | pl_expr.SortBy + | pl_expr.Gather + | pl_expr.Filter + | pl_expr.Cast + | pl_expr.Column + | pl_expr.Agg + | pl_expr.BinaryExpr + | pl_expr.Len + | pl_expr.PyExprIR +) + +Schema: TypeAlias = Mapping[str, plc.DataType] + + +class NodeTraverser(Protocol): + """Abstract protocol for polars NodeTraverser.""" + + def get_node(self) -> int: + """Return current plan node id.""" + ... + + def set_node(self, n: int) -> None: + """Set the current plan node to n.""" + ... + + def view_current_node(self) -> IR: + """Convert current plan node to python rep.""" + ... + + def get_schema(self) -> Mapping[str, pl.DataType]: + """Get the schema of the current plan node.""" + ... + + def get_dtype(self, n: int) -> pl.DataType: + """Get the datatype of the given expression id.""" + ... + + def view_expression(self, n: int) -> Expr: + """Convert the given expression to python rep.""" + ... + + def set_udf( + self, + callback: Callable[[list[str] | None, str | None, int | None], pl.DataFrame], + ) -> None: + """Set the callback replacing the current node in the plan.""" + ... 
diff --git a/python/cudf_polars/pyproject.toml b/python/cudf_polars/pyproject.toml index e50ee76a9b9..2faf8c3193f 100644 --- a/python/cudf_polars/pyproject.toml +++ b/python/cudf_polars/pyproject.toml @@ -62,8 +62,6 @@ target-version = "py39" fix = true [tool.ruff.lint] -# __init__.py must re-export everything it imports -ignore-init-module-imports = false select = [ "E", # pycodestyle "W", # pycodestyle From 5f45803b2a68b49d330d94e2f701791a7590612a Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Thu, 6 Jun 2024 13:00:12 -0700 Subject: [PATCH 23/48] Migrate quantile.pxd to pylibcudf (#15874) xref #15162 Migrate quantile.pxd to use pylibcudf APIs. Authors: - Thomas Li (https://github.com/lithomas1) Approvers: - Lawrence Mitchell (https://github.com/wence-) - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/15874 --- cpp/src/quantiles/quantiles.cu | 4 +- cpp/tests/quantiles/quantiles_test.cpp | 9 +- .../user_guide/api_docs/pylibcudf/index.rst | 1 + .../api_docs/pylibcudf/quantiles.rst | 6 + .../cudf/cudf/_lib/pylibcudf/CMakeLists.txt | 1 + python/cudf/cudf/_lib/pylibcudf/__init__.pxd | 2 + python/cudf/cudf/_lib/pylibcudf/__init__.py | 2 + python/cudf/cudf/_lib/pylibcudf/quantiles.pxd | 25 ++ python/cudf/cudf/_lib/pylibcudf/quantiles.pyx | 152 ++++++++++++ python/cudf/cudf/_lib/quantiles.pyx | 102 ++------ python/cudf/cudf/pylibcudf_tests/conftest.py | 29 +++ .../cudf/pylibcudf_tests/test_quantiles.py | 234 ++++++++++++++++++ 12 files changed, 486 insertions(+), 81 deletions(-) create mode 100644 docs/cudf/source/user_guide/api_docs/pylibcudf/quantiles.rst create mode 100644 python/cudf/cudf/_lib/pylibcudf/quantiles.pxd create mode 100644 python/cudf/cudf/_lib/pylibcudf/quantiles.pyx create mode 100644 python/cudf/cudf/pylibcudf_tests/test_quantiles.py diff --git a/cpp/src/quantiles/quantiles.cu b/cpp/src/quantiles/quantiles.cu index c0f536536ce..af3bda2e62e 100644 --- a/cpp/src/quantiles/quantiles.cu +++ b/cpp/src/quantiles/quantiles.cu @@ -34,6 +34,7 @@ #include #include +#include #include namespace cudf { @@ -78,7 +79,8 @@ std::unique_ptr quantiles(table_view const& input, CUDF_EXPECTS(interp == interpolation::HIGHER || interp == interpolation::LOWER || interp == interpolation::NEAREST, - "multi-column quantiles require a non-arithmetic interpolation strategy."); + "multi-column quantiles require a non-arithmetic interpolation strategy.", + std::invalid_argument); CUDF_EXPECTS(input.num_rows() > 0, "multi-column quantiles require at least one input row."); diff --git a/cpp/tests/quantiles/quantiles_test.cpp b/cpp/tests/quantiles/quantiles_test.cpp index 5b7b6dd2718..b7faa20e8c1 100644 --- a/cpp/tests/quantiles/quantiles_test.cpp +++ b/cpp/tests/quantiles/quantiles_test.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -25,6 +25,8 @@ #include #include +#include + template struct QuantilesTest : public cudf::test::BaseFixture {}; @@ -104,9 +106,10 @@ TYPED_TEST(QuantilesTest, TestMultiColumnArithmeticInterpolation) cudf::test::fixed_width_column_wrapper input_b({}); auto input = cudf::table_view({input_a}); - EXPECT_THROW(cudf::quantiles(input, {0.0f}, cudf::interpolation::LINEAR), cudf::logic_error); + EXPECT_THROW(cudf::quantiles(input, {0.0f}, cudf::interpolation::LINEAR), std::invalid_argument); - EXPECT_THROW(cudf::quantiles(input, {0.0f}, cudf::interpolation::MIDPOINT), cudf::logic_error); + EXPECT_THROW(cudf::quantiles(input, {0.0f}, cudf::interpolation::MIDPOINT), + std::invalid_argument); } TYPED_TEST(QuantilesTest, TestMultiColumnUnsorted) diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst index 870ed8856d1..1e03fa80bb5 100644 --- a/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst @@ -21,6 +21,7 @@ This page provides API documentation for pylibcudf. join lists merge + quantiles reduce reshape rolling diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/quantiles.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/quantiles.rst new file mode 100644 index 00000000000..3417c1ff59d --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/quantiles.rst @@ -0,0 +1,6 @@ +========= +quantiles +========= + +.. automodule:: cudf._lib.pylibcudf.quantiles + :members: diff --git a/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt b/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt index 6beb7b0f506..ed396208f98 100644 --- a/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt +++ b/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt @@ -26,6 +26,7 @@ set(cython_sources join.pyx lists.pyx merge.pyx + quantiles.pyx reduce.pyx replace.pyx reshape.pyx diff --git a/python/cudf/cudf/_lib/pylibcudf/__init__.pxd b/python/cudf/cudf/_lib/pylibcudf/__init__.pxd index b289d112a90..a628ecdb038 100644 --- a/python/cudf/cudf/_lib/pylibcudf/__init__.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/__init__.pxd @@ -12,6 +12,7 @@ from . cimport ( join, lists, merge, + quantiles, reduce, replace, reshape, @@ -48,6 +49,7 @@ __all__ = [ "join", "lists", "merge", + "quantiles", "reduce", "replace", "rolling", diff --git a/python/cudf/cudf/_lib/pylibcudf/__init__.py b/python/cudf/cudf/_lib/pylibcudf/__init__.py index 2565332f3ed..46d0fe13cd1 100644 --- a/python/cudf/cudf/_lib/pylibcudf/__init__.py +++ b/python/cudf/cudf/_lib/pylibcudf/__init__.py @@ -12,6 +12,7 @@ join, lists, merge, + quantiles, reduce, replace, reshape, @@ -48,6 +49,7 @@ "join", "lists", "merge", + "quantiles", "reduce", "replace", "rolling", diff --git a/python/cudf/cudf/_lib/pylibcudf/quantiles.pxd b/python/cudf/cudf/_lib/pylibcudf/quantiles.pxd new file mode 100644 index 00000000000..70ff135ca77 --- /dev/null +++ b/python/cudf/cudf/_lib/pylibcudf/quantiles.pxd @@ -0,0 +1,25 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+from libcpp.vector cimport vector + +from cudf._lib.pylibcudf.libcudf.types cimport interpolation, sorted + +from .column cimport Column +from .table cimport Table + + +cpdef Column quantile( + Column input, + vector[double] q, + interpolation interp = *, + Column ordered_indices = *, + bint exact = * +) + +cpdef Table quantiles( + Table input, + vector[double] q, + interpolation interp = *, + sorted is_input_sorted = *, + list column_order = *, + list null_precedence = *, +) diff --git a/python/cudf/cudf/_lib/pylibcudf/quantiles.pyx b/python/cudf/cudf/_lib/pylibcudf/quantiles.pyx new file mode 100644 index 00000000000..c1f0e30ccd3 --- /dev/null +++ b/python/cudf/cudf/_lib/pylibcudf/quantiles.pyx @@ -0,0 +1,152 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from libcpp cimport bool +from libcpp.memory cimport unique_ptr +from libcpp.utility cimport move +from libcpp.vector cimport vector + +from cudf._lib.pylibcudf.libcudf.column.column cimport column +from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view +from cudf._lib.pylibcudf.libcudf.quantiles cimport ( + quantile as cpp_quantile, + quantiles as cpp_quantiles, +) +from cudf._lib.pylibcudf.libcudf.table.table cimport table +from cudf._lib.pylibcudf.libcudf.types cimport null_order, order, sorted + +from .column cimport Column +from .table cimport Table +from .types cimport interpolation + + +cpdef Column quantile( + Column input, + vector[double] q, + interpolation interp = interpolation.LINEAR, + Column ordered_indices = None, + bool exact=True +): + """Computes quantiles with interpolation. + + Computes the specified quantiles by interpolating values between which they lie, + using the interpolation strategy specified in interp. + + Parameters + ---------- + input: Column + The Column to calculate quantiles on. + q: array-like that implements buffer-protocol + The quantiles to calculate in range [0,1] + interp: Interpolation, default Interpolation.LINEAR + The strategy used to select between values adjacent to a specified quantile. + ordered_indices: Column, default empty column + The column containing the sorted order of input. + + If empty, all input values are used in existing order. + Indices must be in range [0, input.size()), but are not required to be unique. + Values not indexed by this column will be ignored. + exact: bool, default True + Returns doubles if True. Otherwise, returns same type as input + + For details, see :cpp:func:`quantile`. + + Returns + ------- + Column + A Column containing specified quantiles, with nulls for indeterminable values + """ + cdef: + unique_ptr[column] c_result + column_view ordered_indices_view + + if ordered_indices is None: + ordered_indices_view = column_view() + else: + ordered_indices_view = ordered_indices.view() + + with nogil: + c_result = move( + cpp_quantile( + input.view(), + q, + interp, + ordered_indices_view, + exact, + ) + ) + + return Column.from_libcudf(move(c_result)) + + +cpdef Table quantiles( + Table input, + vector[double] q, + interpolation interp = interpolation.NEAREST, + sorted is_input_sorted = sorted.NO, + list column_order = None, + list null_precedence = None, +): + """Computes row quantiles with interpolation. + + Computes the specified quantiles by retrieving the row corresponding to the + specified quantiles. In the event a quantile lies in between rows, the specified + interpolation strategy is used to pick between the rows. + + Parameters + ---------- + input: Table + The Table to calculate row quantiles on. 
+ q: array-like + The quantiles to calculate in range [0,1] + interp: Interpolation, default Interpolation.NEAREST + The strategy used to select between values adjacent to a specified quantile. + + Must be a non-arithmetic interpolation strategy + (i.e. one of + {`Interpolation.HIGHER`, `Interpolation.LOWER`, `Interpolation.NEAREST`}) + is_input_sorted: Sorted, default Sorted.NO + Whether the input table has been pre-sorted or not. + column_order: list, default None + A list of `Order` enums, + indicating the desired sort order for each column. + By default, will sort all columns so that they are in ascending order. + + Ignored if `is_input_sorted` is `Sorted.YES` + null_precedence: list, default None + A list of `NullOrder` enums, + indicating how nulls should be sorted. + By default, will sort all columns so that nulls appear before + all other elements. + + Ignored if `is_input_sorted` is `Sorted.YES` + + For details, see :cpp:func:`quantiles`. + + Returns + ------- + Column + A Column containing specified quantiles, with nulls for indeterminable values + """ + cdef: + unique_ptr[table] c_result + vector[order] column_order_vec + vector[null_order] null_precedence_vec + + if column_order is not None: + column_order_vec = column_order + if null_precedence is not None: + null_precedence_vec = null_precedence + + with nogil: + c_result = move( + cpp_quantiles( + input.view(), + q, + interp, + is_input_sorted, + column_order_vec, + null_precedence_vec, + ) + ) + + return Table.from_libcudf(move(c_result)) diff --git a/python/cudf/cudf/_lib/quantiles.pyx b/python/cudf/cudf/_lib/quantiles.pyx index 3d20454a7ce..7b50c00919a 100644 --- a/python/cudf/cudf/_lib/quantiles.pyx +++ b/python/cudf/cudf/_lib/quantiles.pyx @@ -3,76 +3,43 @@ from cudf.core.buffer import acquire_spill_lock from libcpp cimport bool -from libcpp.memory cimport unique_ptr -from libcpp.utility cimport move from libcpp.vector cimport vector from cudf._lib.column cimport Column from cudf._lib.types cimport ( underlying_type_t_interpolation, - underlying_type_t_null_order, - underlying_type_t_order, underlying_type_t_sorted, ) from cudf._lib.types import Interpolation -from cudf._lib.pylibcudf.libcudf.column.column cimport column -from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view -from cudf._lib.pylibcudf.libcudf.quantiles cimport ( - quantile as cpp_quantile, - quantiles as cpp_quantile_table, -) -from cudf._lib.pylibcudf.libcudf.table.table cimport table -from cudf._lib.pylibcudf.libcudf.table.table_view cimport table_view -from cudf._lib.pylibcudf.libcudf.types cimport ( - interpolation, - null_order, - order, - sorted, -) -from cudf._lib.utils cimport columns_from_unique_ptr, table_view_from_columns +from cudf._lib.pylibcudf.libcudf.types cimport interpolation, sorted +from cudf._lib.utils cimport columns_from_pylibcudf_table + +import cudf._lib.pylibcudf as plc @acquire_spill_lock() def quantile( Column input, - object q, + vector[double] q, str interp, Column ordered_indices, bool exact, - ): - cdef column_view c_input = input.view() - cdef column_view c_ordered_indices = ( - column_view() if ordered_indices is None - else ordered_indices.view() - ) cdef interpolation c_interp = ( Interpolation[interp.upper()] ) - cdef bool c_exact = exact - - cdef vector[double] c_q - c_q.reserve(len(q)) - - for value in q: - c_q.push_back(value) - cdef unique_ptr[column] c_result - - with nogil: - c_result = move( - cpp_quantile( - c_input, - c_q, - c_interp, - c_ordered_indices, - c_exact, - ) + return 
Column.from_pylibcudf( + plc.quantiles.quantile( + input.to_pylibcudf(mode="read"), + q, + c_interp, + ordered_indices.to_pylibcudf(mode="read"), + exact ) - - return Column.from_unique_ptr(move(c_result)) + ) def quantile_table( @@ -83,42 +50,23 @@ def quantile_table( list column_order, list null_precedence, ): - cdef table_view c_input = table_view_from_columns(source_columns) - cdef vector[double] c_q = q + cdef interpolation c_interp = ( interp ) cdef sorted c_is_input_sorted = ( is_input_sorted ) - cdef vector[order] c_column_order - cdef vector[null_order] c_null_precedence - - c_column_order.reserve(len(column_order)) - c_null_precedence.reserve(len(null_precedence)) - - for value in column_order: - c_column_order.push_back( - ( value) - ) - for value in null_precedence: - c_null_precedence.push_back( - ( value) + return columns_from_pylibcudf_table( + plc.quantiles.quantiles( + plc.Table([ + c.to_pylibcudf(mode="read") for c in source_columns + ]), + q, + c_interp, + c_is_input_sorted, + column_order, + null_precedence ) - - cdef unique_ptr[table] c_result - - with nogil: - c_result = move( - cpp_quantile_table( - c_input, - c_q, - c_interp, - c_is_input_sorted, - c_column_order, - c_null_precedence, - ) - ) - - return columns_from_unique_ptr(move(c_result)) + ) diff --git a/python/cudf/cudf/pylibcudf_tests/conftest.py b/python/cudf/cudf/pylibcudf_tests/conftest.py index 6d8284fb3db..f3c6584ef8c 100644 --- a/python/cudf/cudf/pylibcudf_tests/conftest.py +++ b/python/cudf/cudf/pylibcudf_tests/conftest.py @@ -7,6 +7,8 @@ import pyarrow as pa import pytest +import cudf._lib.pylibcudf as plc + sys.path.insert(0, os.path.join(os.path.dirname(__file__), "common")) from utils import DEFAULT_STRUCT_TESTING_TYPE @@ -29,3 +31,30 @@ ) def pa_type(request): return request.param + + +@pytest.fixture( + scope="session", + params=[ + pa.int64(), + pa.float64(), + pa.uint64(), + ], +) +def numeric_pa_type(request): + return request.param + + +@pytest.fixture( + scope="session", params=[opt for opt in plc.types.Interpolation] +) +def interp_opt(request): + return request.param + + +@pytest.fixture( + scope="session", + params=[opt for opt in plc.types.Sorted], +) +def sorted_opt(request): + return request.param diff --git a/python/cudf/cudf/pylibcudf_tests/test_quantiles.py b/python/cudf/cudf/pylibcudf_tests/test_quantiles.py new file mode 100644 index 00000000000..a5d332a7795 --- /dev/null +++ b/python/cudf/cudf/pylibcudf_tests/test_quantiles.py @@ -0,0 +1,234 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+ +import numpy as np +import pyarrow as pa +import pyarrow.compute as pc +import pytest +from utils import assert_column_eq, assert_table_eq + +import cudf._lib.pylibcudf as plc + +# Map pylibcudf interpolation options to pyarrow options +interp_mapping = { + plc.types.Interpolation.LINEAR: "linear", + plc.types.Interpolation.LOWER: "lower", + plc.types.Interpolation.HIGHER: "higher", + plc.types.Interpolation.MIDPOINT: "midpoint", + plc.types.Interpolation.NEAREST: "nearest", +} + + +@pytest.fixture(scope="module", params=[[1, 2, 3, 4, 5], [5, 4, 3, 2, 1]]) +def pa_col_data(request, numeric_pa_type): + return pa.array(request.param, type=numeric_pa_type) + + +@pytest.fixture(scope="module") +def plc_col_data(pa_col_data): + return plc.interop.from_arrow(pa_col_data) + + +@pytest.fixture( + scope="module", + params=[ + { + "arrays": [[1, 2, 3, 5, 4], [5.0, 6.0, 8.0, 7.0, 9.0]], + "schema": pa.schema( + [ + ("a", pa.int64()), + ("b", pa.int64()), + ] + ), + }, + { + "arrays": [ + [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], + [1, 2.0, 2.2, 2.3, 2.4, None, None, 3.5, 4.5, 5.5], + ], + "schema": pa.schema( + [ + ("a", pa.int64()), + ("b", pa.float64()), + ] + ), + }, + ], +) +def plc_tbl_data(request): + return plc.interop.from_arrow(pa.Table.from_arrays(**request.param)) + + +@pytest.mark.parametrize("q", [[], [0], [0.5], [0.1, 0.5, 0.7, 0.9]]) +@pytest.mark.parametrize("exact", [True, False]) +def test_quantile(pa_col_data, plc_col_data, interp_opt, q, exact): + ordered_indices = plc.interop.from_arrow( + pc.cast(pc.sort_indices(pa_col_data), pa.int32()) + ) + res = plc.quantiles.quantile( + plc_col_data, q, interp_opt, ordered_indices, exact + ) + + pa_interp_opt = interp_mapping[interp_opt] + + if exact: + pa_col_data = pc.cast(pa_col_data, pa.float64()) + + if len(q) > 0: + # pyarrow quantile doesn't support empty q + exp = pc.quantile(pa_col_data, q=q, interpolation=pa_interp_opt) + else: + exp = pa.array([], type=pa.float64()) + + if not exact: + exp = pc.cast(exp, pa_col_data.type, safe=False) + + assert_column_eq(exp, res) + + +def _pyarrow_quantiles( + pa_tbl_data, + q, + interp_opt=plc.types.Interpolation.NEAREST, + sorted_opt=plc.types.Sorted.NO, + column_order=None, + null_precedence=None, +): + """ + The pyarrow equivalent of plc.quantiles.quantiles + + Takes the same arguments (except input should be a pyarrow table instead of + of a pylibcudf table) + + NOTE: This function doesn't support having different null precedences because of + a lack of support in pyarrow. + """ + if len(q) > 0: + # pyarrow quantile doesn't support empty q + pa_interp_opt = interp_mapping[interp_opt] + + if sorted_opt == plc.types.Sorted.NO: + order_mapper = { + plc.types.Order.ASCENDING: "ascending", + plc.types.Order.DESCENDING: "descending", + } + if null_precedence is None: + null_precedence = [plc.types.NullOrder.BEFORE] * len( + pa_tbl_data.columns + ) + if column_order is None: + column_order = [plc.types.Order.ASCENDING] * len( + pa_tbl_data.columns + ) + + if not all( + [ + null_prec == null_precedence[0] + for null_prec in null_precedence + ] + ): + raise NotImplementedError( + "Having varying null precendences is not implemented!" 
+ ) + + pa_tbl_data = pa_tbl_data.sort_by( + [ + (name, order_mapper[order]) + for name, order in zip( + pa_tbl_data.column_names, column_order + ) + ], + null_placement="at_start" + if null_precedence[0] == plc.types.NullOrder.BEFORE + else "at_end", + ) + row_idxs = pc.quantile( + np.arange(0, len(pa_tbl_data)), q=q, interpolation=pa_interp_opt + ) + exp = pa_tbl_data.take(row_idxs) + else: + exp = pa.Table.from_arrays( + [[] for _ in range(len(pa_tbl_data.schema))], + schema=pa_tbl_data.schema, + ) + return exp + + +@pytest.mark.parametrize( + "q", [[], [0.1], [0.2], [0.3], [0.4], [0.5], [0.1, 0.5, 0.7, 0.9]] +) +@pytest.mark.parametrize( + "column_order", [[plc.types.Order.ASCENDING, plc.types.Order.ASCENDING]] +) +@pytest.mark.parametrize( + "null_precedence", + [ + [plc.types.NullOrder.BEFORE, plc.types.NullOrder.BEFORE], + [plc.types.NullOrder.AFTER, plc.types.NullOrder.AFTER], + ], +) +def test_quantiles( + plc_tbl_data, interp_opt, q, sorted_opt, column_order, null_precedence +): + if interp_opt in { + plc.types.Interpolation.LINEAR, + plc.types.Interpolation.MIDPOINT, + }: + pytest.skip( + "interp cannot be an arithmetic interpolation strategy for quantiles" + ) + + pa_tbl_data = plc.interop.to_arrow(plc_tbl_data, ["a", "b"]) + + exp = _pyarrow_quantiles( + pa_tbl_data, + q=q, + interp_opt=interp_opt, + sorted_opt=sorted_opt, + column_order=column_order, + null_precedence=null_precedence, + ) + + res = plc.quantiles.quantiles( + plc_tbl_data, q, interp_opt, sorted_opt, column_order, null_precedence + ) + + assert_table_eq(exp, res) + + +@pytest.mark.parametrize( + "invalid_interp", + [plc.types.Interpolation.LINEAR, plc.types.Interpolation.MIDPOINT], +) +def test_quantiles_invalid_interp(plc_tbl_data, invalid_interp): + with pytest.raises(ValueError): + plc.quantiles.quantiles( + plc_tbl_data, q=np.array([0.1]), interp=invalid_interp + ) + + +@pytest.mark.parametrize( + "q", + [[0.1], (0.1,), np.array([0.1])], +) +def test_quantile_q_array_like(pa_col_data, plc_col_data, q): + ordered_indices = plc.interop.from_arrow( + pc.cast(pc.sort_indices(pa_col_data), pa.int32()) + ) + res = plc.quantiles.quantile( + plc_col_data, + q=q, + ordered_indices=ordered_indices, + ) + exp = pc.quantile(pa_col_data, q=q) + assert_column_eq(exp, res) + + +@pytest.mark.parametrize( + "q", + [[0.1], (0.1,), np.array([0.1])], +) +def test_quantiles_q_array_like(plc_tbl_data, q): + res = plc.quantiles.quantiles(plc_tbl_data, q=q) + pa_tbl_data = plc.interop.to_arrow(plc_tbl_data, ["a", "b"]) + exp = _pyarrow_quantiles(pa_tbl_data, q=q) + assert_table_eq(exp, res) From d4dd474f0db6047b2404c2c98b86cf4446445e1b Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Thu, 6 Jun 2024 17:52:50 -0400 Subject: [PATCH 24/48] Use offsetalator in cudf::io::json::detail::parse_string (#15900) Updates the `cudf::io::json::detail::parse_string` function to use the offsetalator for building a strings column instead of `size_type` pointers. The output row sizes are computed in the first pass through the kernels and then converted to offsets. The offsets are wrapped with an offsetalator on the 2nd pass to locate each individual rows' output position in the chars data. 
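For reference, a minimal sketch of that two-pass shape (illustrative only — the real kernels and setup live in `data_casting.cu`; apart from the `make_offsets_child_column` and offsetalator calls taken from this diff, the names below are placeholders):

```cpp
// Illustrative sketch only: shows the sizes -> offsets -> offsetalator flow.
#include <cudf/detail/offsets_iterator_factory.cuh>
#include <cudf/strings/detail/strings_children.cuh>
#include <cudf/types.hpp>

#include <rmm/cuda_stream_view.hpp>
#include <rmm/device_uvector.hpp>
#include <rmm/resource_ref.hpp>

void two_pass_sketch(cudf::size_type num_rows,
                     rmm::cuda_stream_view stream,
                     rmm::device_async_resource_ref mr)
{
  // Pass 1: the parsing kernels write one output size (in bytes) per row.
  rmm::device_uvector<cudf::size_type> sizes(num_rows, stream);
  // ... launch the kernels with d_sizes = sizes.data() and no chars output ...

  // Convert the sizes to an offsets child column; it may hold INT32 or INT64 offsets
  // depending on the total number of output bytes.
  auto [offsets, bytes] =
    cudf::strings::detail::make_offsets_child_column(sizes.begin(), sizes.end(), stream, mr);

  // Wrap the offsets with the offsetalator so pass 2 is agnostic to the offset width.
  auto const d_offsets =
    cudf::detail::offsetalator_factory::make_input_iterator(offsets->view());

  // Pass 2: each row writes its characters starting at chars.data() + d_offsets[row];
  // the offsets column and chars buffer then become the children of the strings column.
  rmm::device_uvector<char> chars(bytes, stream, mr);
}
```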
Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Muhammad Haseeb (https://github.com/mhaseeb123) - Karthikeyan (https://github.com/karthikeyann) URL: https://github.com/rapidsai/cudf/pull/15900 --- cpp/src/io/utilities/data_casting.cu | 56 ++++++++++++++++------------ cpp/tests/io/json_test.cpp | 1 - 2 files changed, 32 insertions(+), 25 deletions(-) diff --git a/cpp/src/io/utilities/data_casting.cu b/cpp/src/io/utilities/data_casting.cu index 60cbfbc0dae..288a5690282 100644 --- a/cpp/src/io/utilities/data_casting.cu +++ b/cpp/src/io/utilities/data_casting.cu @@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include @@ -417,6 +418,7 @@ struct bitfield_block { * @param null_mask Null mask * @param null_count_data pointer to store null count * @param options Settings for controlling string processing behavior + * @param d_sizes Output size of each row * @param d_offsets Offsets to identify where to store the results for each string * @param d_chars Character array to store the characters of strings */ @@ -427,7 +429,8 @@ CUDF_KERNEL void parse_fn_string_parallel(str_tuple_it str_tuples, bitmask_type* null_mask, size_type* null_count_data, cudf::io::parse_options_view const options, - size_type* d_offsets, + size_type* d_sizes, + cudf::detail::input_offsetalator d_offsets, char* d_chars) { constexpr auto BLOCK_SIZE = @@ -455,7 +458,7 @@ CUDF_KERNEL void parse_fn_string_parallel(str_tuple_it str_tuples, istring = get_next_string()) { // skip nulls if (null_mask != nullptr && not bit_is_set(null_mask, istring)) { - if (!d_chars && lane == 0) d_offsets[istring] = 0; + if (!d_chars && lane == 0) { d_sizes[istring] = 0; } continue; // gride-stride return; } @@ -476,7 +479,7 @@ CUDF_KERNEL void parse_fn_string_parallel(str_tuple_it str_tuples, if (lane == 0) { clear_bit(null_mask, istring); atomicAdd(null_count_data, 1); - if (!d_chars) d_offsets[istring] = 0; + if (!d_chars) { d_sizes[istring] = 0; } } continue; // gride-stride return; } @@ -491,7 +494,7 @@ CUDF_KERNEL void parse_fn_string_parallel(str_tuple_it str_tuples, // Copy literal/numeric value if (not is_string_value) { if (!d_chars) { - if (lane == 0) { d_offsets[istring] = in_end - in_begin; } + if (lane == 0) { d_sizes[istring] = in_end - in_begin; } } else { for (thread_index_type char_index = lane; char_index < (in_end - in_begin); char_index += BLOCK_SIZE) { @@ -621,8 +624,8 @@ CUDF_KERNEL void parse_fn_string_parallel(str_tuple_it str_tuples, clear_bit(null_mask, istring); atomicAdd(null_count_data, 1); } - last_offset = 0; - d_offsets[istring] = 0; + last_offset = 0; + d_sizes[istring] = 0; } if constexpr (!is_warp) { __syncthreads(); } break; // gride-stride return; @@ -729,7 +732,7 @@ CUDF_KERNEL void parse_fn_string_parallel(str_tuple_it str_tuples, } } } // char for-loop - if (!d_chars && lane == 0) { d_offsets[istring] = last_offset; } + if (!d_chars && lane == 0) { d_sizes[istring] = last_offset; } } // grid-stride for-loop } @@ -739,13 +742,14 @@ struct string_parse { bitmask_type* null_mask; size_type* null_count_data; cudf::io::parse_options_view const options; - size_type* d_offsets{}; + size_type* d_sizes{}; + cudf::detail::input_offsetalator d_offsets; char* d_chars{}; __device__ void operator()(size_type idx) { if (null_mask != nullptr && not bit_is_set(null_mask, idx)) { - if (!d_chars) d_offsets[idx] = 0; + if (!d_chars) { d_sizes[idx] = 0; } return; } auto const in_begin = str_tuples[idx].first; @@ -761,7 +765,7 @@ struct string_parse { if (is_null_literal && null_mask != 
nullptr) { clear_bit(null_mask, idx); atomicAdd(null_count_data, 1); - if (!d_chars) d_offsets[idx] = 0; + if (!d_chars) { d_sizes[idx] = 0; } return; } } @@ -773,9 +777,9 @@ struct string_parse { clear_bit(null_mask, idx); atomicAdd(null_count_data, 1); } - if (!d_chars) d_offsets[idx] = 0; + if (!d_chars) { d_sizes[idx] = 0; } } else { - if (!d_chars) d_offsets[idx] = str_process_info.bytes; + if (!d_chars) { d_sizes[idx] = str_process_info.bytes; } } } }; @@ -811,13 +815,12 @@ static std::unique_ptr parse_string(string_view_pair_it str_tuples, size_type{0}, thrust::maximum{}); - auto offsets = cudf::make_numeric_column( - data_type{type_to_id()}, col_size + 1, cudf::mask_state::UNALLOCATED, stream, mr); - auto d_offsets = offsets->mutable_view().data(); + auto sizes = rmm::device_uvector(col_size, stream); + auto d_sizes = sizes.data(); auto null_count_data = d_null_count.data(); auto single_thread_fn = string_parse{ - str_tuples, static_cast(null_mask.data()), null_count_data, options, d_offsets}; + str_tuples, static_cast(null_mask.data()), null_count_data, options, d_sizes}; thrust::for_each_n(rmm::exec_policy(stream), thrust::make_counting_iterator(0), col_size, @@ -838,7 +841,8 @@ static std::unique_ptr parse_string(string_view_pair_it str_tuples, static_cast(null_mask.data()), null_count_data, options, - d_offsets, + d_sizes, + cudf::detail::input_offsetalator{}, nullptr); } @@ -853,20 +857,22 @@ static std::unique_ptr parse_string(string_view_pair_it str_tuples, static_cast(null_mask.data()), null_count_data, options, - d_offsets, + d_sizes, + cudf::detail::input_offsetalator{}, nullptr); } - auto const bytes = - cudf::detail::sizes_to_offsets(d_offsets, d_offsets + col_size + 1, d_offsets, stream); - CUDF_EXPECTS(bytes <= std::numeric_limits::max(), - "Size of output exceeds the column size limit", - std::overflow_error); + + auto [offsets, bytes] = + cudf::strings::detail::make_offsets_child_column(sizes.begin(), sizes.end(), stream, mr); + auto d_offsets = cudf::detail::offsetalator_factory::make_input_iterator(offsets->view()); // CHARS column rmm::device_uvector chars(bytes, stream, mr); auto d_chars = chars.data(); - single_thread_fn.d_chars = d_chars; + single_thread_fn.d_chars = d_chars; + single_thread_fn.d_offsets = d_offsets; + thrust::for_each_n(rmm::exec_policy(stream), thrust::make_counting_iterator(0), col_size, @@ -882,6 +888,7 @@ static std::unique_ptr parse_string(string_view_pair_it str_tuples, static_cast(null_mask.data()), null_count_data, options, + d_sizes, d_offsets, d_chars); } @@ -897,6 +904,7 @@ static std::unique_ptr parse_string(string_view_pair_it str_tuples, static_cast(null_mask.data()), null_count_data, options, + d_sizes, d_offsets, d_chars); } diff --git a/cpp/tests/io/json_test.cpp b/cpp/tests/io/json_test.cpp index 5d790e73246..57aa2721756 100644 --- a/cpp/tests/io/json_test.cpp +++ b/cpp/tests/io/json_test.cpp @@ -2374,7 +2374,6 @@ TEST_F(JsonReaderTest, MapTypes) EXPECT_EQ(col.type().id(), types[i]) << "column[" << i << "].type"; i++; } - std::cout << "\n"; }; // json From 582d237e1b07696de86a3f4df16dca2922dda5eb Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Thu, 6 Jun 2024 17:55:06 -0400 Subject: [PATCH 25/48] Fix offsetalator when accessing over 268 million rows (#15921) Fixes an access error when the `offsetalator` wraps an INT64 offsets column with more than 268,435,455 rows. The row access type is `size_type` and is used to calculate the appropriate position within the offsets buffer. 
This fix promotes the multiplication to int64 to properly resolve the correct pointer position. Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Bradley Dice (https://github.com/bdice) - Yunsong Wang (https://github.com/PointKernel) URL: https://github.com/rapidsai/cudf/pull/15921 --- cpp/include/cudf/detail/offsets_iterator.cuh | 6 +- cpp/tests/CMakeLists.txt | 1 + .../large_strings/large_strings_fixture.cpp | 11 +++ .../large_strings/large_strings_fixture.hpp | 11 +++ .../large_strings/many_strings_tests.cpp | 67 +++++++++++++++++++ 5 files changed, 93 insertions(+), 3 deletions(-) create mode 100644 cpp/tests/large_strings/many_strings_tests.cpp diff --git a/cpp/include/cudf/detail/offsets_iterator.cuh b/cpp/include/cudf/detail/offsets_iterator.cuh index 15b334245ff..1ab1fd46230 100644 --- a/cpp/include/cudf/detail/offsets_iterator.cuh +++ b/cpp/include/cudf/detail/offsets_iterator.cuh @@ -53,7 +53,7 @@ struct input_offsetalator : base_normalator { */ __device__ inline int64_t operator[](size_type idx) const { - void const* tp = p_ + (idx * this->width_); + void const* tp = p_ + (static_cast(idx) * this->width_); return this->width_ == sizeof(int32_t) ? static_cast(*static_cast(tp)) : *static_cast(tp); } @@ -79,7 +79,7 @@ struct input_offsetalator : base_normalator { cudf_assert((dtype.id() == type_id::INT32 || dtype.id() == type_id::INT64) && "Unexpected offsets type"); #endif - p_ += (this->width_ * offset); + p_ += (this->width_ * static_cast(offset)); } protected: @@ -121,7 +121,7 @@ struct output_offsetalator : base_normalator { __device__ inline output_offsetalator const operator[](size_type idx) const { output_offsetalator tmp{*this}; - tmp.p_ += (idx * this->width_); + tmp.p_ += (static_cast(idx) * this->width_); return tmp; } diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index a0d9083c4a4..826f879ddc0 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -570,6 +570,7 @@ ConfigureTest( large_strings/concatenate_tests.cpp large_strings/case_tests.cpp large_strings/large_strings_fixture.cpp + large_strings/many_strings_tests.cpp large_strings/merge_tests.cpp large_strings/parquet_tests.cpp large_strings/reshape_tests.cpp diff --git a/cpp/tests/large_strings/large_strings_fixture.cpp b/cpp/tests/large_strings/large_strings_fixture.cpp index 59e0cd43d05..416b106c5a5 100644 --- a/cpp/tests/large_strings/large_strings_fixture.cpp +++ b/cpp/tests/large_strings/large_strings_fixture.cpp @@ -95,6 +95,17 @@ cudf::column_view StringsLargeTest::long_column() return g_ls_data->get_column(name); } +cudf::column_view StringsLargeTest::very_long_column() +{ + std::string name("long2"); + if (!g_ls_data->has_key(name)) { + auto itr = thrust::constant_iterator("12345"); + auto input = cudf::test::strings_column_wrapper(itr, itr + 30'000'000); + g_ls_data->add_column(name, input.release()); + } + return g_ls_data->get_column(name); +} + std::unique_ptr StringsLargeTest::get_ls_data() { CUDF_EXPECTS(g_ls_data == nullptr, "invalid call to get_ls_data"); diff --git a/cpp/tests/large_strings/large_strings_fixture.hpp b/cpp/tests/large_strings/large_strings_fixture.hpp index 8827b65f1ce..fb7b1cd00b8 100644 --- a/cpp/tests/large_strings/large_strings_fixture.hpp +++ b/cpp/tests/large_strings/large_strings_fixture.hpp @@ -33,14 +33,25 @@ class LargeStringsData; struct StringsLargeTest : public cudf::test::BaseFixture { /** * @brief Returns a column of long strings + * + * This returns 8 rows of 400 bytes */ cudf::column_view wide_column(); /** * 
@brief Returns a long column of strings + * + * This returns 5 million rows of 50 bytes */ cudf::column_view long_column(); + /** + * @brief Returns a very long column of strings + * + * This returns 30 million rows of 5 bytes + */ + cudf::column_view very_long_column(); + large_strings_enabler g_ls_enabler; static LargeStringsData* g_ls_data; diff --git a/cpp/tests/large_strings/many_strings_tests.cpp b/cpp/tests/large_strings/many_strings_tests.cpp new file mode 100644 index 00000000000..73fbb21d014 --- /dev/null +++ b/cpp/tests/large_strings/many_strings_tests.cpp @@ -0,0 +1,67 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "large_strings_fixture.hpp" + +#include + +#include +#include +#include +#include +#include +#include + +#include +#include + +struct StringsManyTest : public cudf::test::StringsLargeTest {}; + +TEST_F(StringsManyTest, Replace) +{ + auto const expected = this->very_long_column(); + auto const view = cudf::column_view(expected); + // force addressing (rows > max_size_type/sizeof(int64)) in a 64-bit offsets column + int constexpr max_size_type = std::numeric_limits::max(); + // minimum number of duplicates to achieve large strings (64-bit offsets) + int const min_size_multiplier = + (max_size_type / cudf::strings_column_view(view).chars_size(cudf::get_default_stream())) + 1; + // minimum row multiplier to create max_size_type/sizeof(int64) = 268,435,455 rows + int const min_row_multiplier = ((max_size_type / sizeof(int64_t)) / view.size()) + 1; + int const multiplier = std::max(min_size_multiplier, min_row_multiplier); + + std::vector input_cols(multiplier, view); + std::vector splits; + std::generate_n(std::back_inserter(splits), multiplier - 1, [view, n = 1]() mutable { + return view.size() * (n++); + }); + + auto large_input = cudf::concatenate(input_cols); // 480 million rows + auto const sv = cudf::strings_column_view(large_input->view()); + EXPECT_EQ(sv.size(), view.size() * multiplier); + EXPECT_EQ(sv.offsets().type(), cudf::data_type{cudf::type_id::INT64}); + + // Using replace tests reading large strings as well as creating large strings + auto const target = cudf::string_scalar("3"); // fake the actual replace; + auto const repl = cudf::string_scalar("3"); // logic still builds the output + auto result = cudf::strings::replace(sv, target, repl); + + // verify results in sections + auto sliced = cudf::split(result->view(), splits); + for (auto c : sliced) { + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(c, expected); + } +} From 451d12a2d8d69f63d2b9491286b8895ace6f87ba Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Thu, 6 Jun 2024 18:57:04 -0500 Subject: [PATCH 26/48] Allow anonymous user in devcontainer name. (#15784) In https://github.com/rapidsai/cudf/pull/15572, we updated the devcontainer name to include the current user's name. However, in GitHub Codespaces, the username is not defined. As a result, the container name starts with a dash. 
This is not allowed by GitHub Codespaces, so it fails to launch. This PR adds a default value of `anon` to the devcontainer username. Authors: - Bradley Dice (https://github.com/bdice) Approvers: - James Lamb (https://github.com/jameslamb) - Paul Taylor (https://github.com/trxcllnt) URL: https://github.com/rapidsai/cudf/pull/15784 --- .devcontainer/cuda11.8-conda/devcontainer.json | 2 +- .devcontainer/cuda11.8-pip/devcontainer.json | 2 +- .devcontainer/cuda12.2-conda/devcontainer.json | 2 +- .devcontainer/cuda12.2-pip/devcontainer.json | 2 +- .github/CODEOWNERS | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/.devcontainer/cuda11.8-conda/devcontainer.json b/.devcontainer/cuda11.8-conda/devcontainer.json index c62e18512a0..8423fe21c29 100644 --- a/.devcontainer/cuda11.8-conda/devcontainer.json +++ b/.devcontainer/cuda11.8-conda/devcontainer.json @@ -11,7 +11,7 @@ "runArgs": [ "--rm", "--name", - "${localEnv:USER}-rapids-${localWorkspaceFolderBasename}-24.08-cuda11.8-conda" + "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-24.08-cuda11.8-conda" ], "hostRequirements": {"gpu": "optional"}, "features": { diff --git a/.devcontainer/cuda11.8-pip/devcontainer.json b/.devcontainer/cuda11.8-pip/devcontainer.json index 4ab4bd75643..4945d6cf753 100644 --- a/.devcontainer/cuda11.8-pip/devcontainer.json +++ b/.devcontainer/cuda11.8-pip/devcontainer.json @@ -11,7 +11,7 @@ "runArgs": [ "--rm", "--name", - "${localEnv:USER}-rapids-${localWorkspaceFolderBasename}-24.08-cuda11.8-pip" + "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-24.08-cuda11.8-pip" ], "hostRequirements": {"gpu": "optional"}, "features": { diff --git a/.devcontainer/cuda12.2-conda/devcontainer.json b/.devcontainer/cuda12.2-conda/devcontainer.json index 2b50454410f..05bf9173d25 100644 --- a/.devcontainer/cuda12.2-conda/devcontainer.json +++ b/.devcontainer/cuda12.2-conda/devcontainer.json @@ -11,7 +11,7 @@ "runArgs": [ "--rm", "--name", - "${localEnv:USER}-rapids-${localWorkspaceFolderBasename}-24.08-cuda12.2-conda" + "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-24.08-cuda12.2-conda" ], "hostRequirements": {"gpu": "optional"}, "features": { diff --git a/.devcontainer/cuda12.2-pip/devcontainer.json b/.devcontainer/cuda12.2-pip/devcontainer.json index fc5abc56094..74420214726 100644 --- a/.devcontainer/cuda12.2-pip/devcontainer.json +++ b/.devcontainer/cuda12.2-pip/devcontainer.json @@ -11,7 +11,7 @@ "runArgs": [ "--rm", "--name", - "${localEnv:USER}-rapids-${localWorkspaceFolderBasename}-24.08-cuda12.2-pip" + "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-24.08-cuda12.2-pip" ], "hostRequirements": {"gpu": "optional"}, "features": { diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index 9efac3f1904..5e2f46714d9 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -22,7 +22,7 @@ java/ @rapidsai/cudf-java-codeowners /.pre-commit-config.yaml @rapidsai/ci-codeowners #packaging code owners -/.devcontainers/ @rapidsai/packaging-codeowners +/.devcontainer/ @rapidsai/packaging-codeowners /conda/ @rapidsai/packaging-codeowners /dependencies.yaml @rapidsai/packaging-codeowners /build.sh @rapidsai/packaging-codeowners From 9bd16bb719e14ed1e0ee3edbd8c8417c03ac2f25 Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Thu, 6 Jun 2024 18:50:23 -0700 Subject: [PATCH 27/48] Reland "Fix docs for IO readers and strings_convert" (#15872)" (#15941) This reverts commit 2b031e06a7fe18eec462db445eea1c596b93a9f1. 
We got the go ahead to remove the text docs from @taureandyernv. Authors: - Thomas Li (https://github.com/lithomas1) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/15941 --- ci/build_docs.sh | 6 ------ docs/cudf/source/libcudf_docs/api_docs/io_readers.rst | 2 +- docs/cudf/source/libcudf_docs/api_docs/strings_convert.rst | 2 +- 3 files changed, 2 insertions(+), 8 deletions(-) diff --git a/ci/build_docs.sh b/ci/build_docs.sh index db306046667..67a5415f353 100755 --- a/ci/build_docs.sh +++ b/ci/build_docs.sh @@ -46,9 +46,6 @@ pushd docs/cudf make dirhtml mkdir -p "${RAPIDS_DOCS_DIR}/cudf/html" mv build/dirhtml/* "${RAPIDS_DOCS_DIR}/cudf/html" -make text -mkdir -p "${RAPIDS_DOCS_DIR}/cudf/txt" -mv build/text/* "${RAPIDS_DOCS_DIR}/cudf/txt" popd rapids-logger "Build dask-cuDF Sphinx docs" @@ -56,9 +53,6 @@ pushd docs/dask_cudf make dirhtml mkdir -p "${RAPIDS_DOCS_DIR}/dask-cudf/html" mv build/dirhtml/* "${RAPIDS_DOCS_DIR}/dask-cudf/html" -make text -mkdir -p "${RAPIDS_DOCS_DIR}/dask-cudf/txt" -mv build/text/* "${RAPIDS_DOCS_DIR}/dask-cudf/txt" popd rapids-upload-docs diff --git a/docs/cudf/source/libcudf_docs/api_docs/io_readers.rst b/docs/cudf/source/libcudf_docs/api_docs/io_readers.rst index a835673dee4..f94a5ddb403 100644 --- a/docs/cudf/source/libcudf_docs/api_docs/io_readers.rst +++ b/docs/cudf/source/libcudf_docs/api_docs/io_readers.rst @@ -2,4 +2,4 @@ Io Readers ========== .. doxygengroup:: io_readers - :desc-only: + :members: diff --git a/docs/cudf/source/libcudf_docs/api_docs/strings_convert.rst b/docs/cudf/source/libcudf_docs/api_docs/strings_convert.rst index ae5d78fb1a1..f2f320bd0e4 100644 --- a/docs/cudf/source/libcudf_docs/api_docs/strings_convert.rst +++ b/docs/cudf/source/libcudf_docs/api_docs/strings_convert.rst @@ -2,4 +2,4 @@ Strings Convert =============== .. doxygengroup:: strings_convert - :desc-only: + :members: From d83d086afda1d25f5711a0aecf4ecfe6c05f7b9d Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 7 Jun 2024 07:30:32 -1000 Subject: [PATCH 28/48] Define Column.nan_as_null to return self (#15923) While trying to clean all the `fillna` logic, I needed to have a `Column.nan_as_null` defined to make the `fillna` logic more re-useable. This allows other `nan_as_null` usages in cudf to avoiding checking whether it's defined on the column or not. 
Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Lawrence Mitchell (https://github.com/wence-) - GALI PREM SAGAR (https://github.com/galipremsagar) URL: https://github.com/rapidsai/cudf/pull/15923 --- python/cudf/cudf/core/_base_index.py | 7 +---- python/cudf/cudf/core/column/categorical.py | 6 ++-- python/cudf/cudf/core/column/column.py | 14 +++++---- python/cudf/cudf/core/column/numerical.py | 6 ++-- .../cudf/cudf/core/column/numerical_base.py | 4 +-- python/cudf/cudf/core/indexed_frame.py | 29 ++++++------------- python/cudf/cudf/core/reshape.py | 4 +-- python/cudf/cudf/tests/test_replace.py | 8 +++++ python/cudf/cudf/tests/test_series.py | 7 +++++ 9 files changed, 42 insertions(+), 43 deletions(-) diff --git a/python/cudf/cudf/core/_base_index.py b/python/cudf/cudf/core/_base_index.py index baca7b19e58..5d0f7c4ede4 100644 --- a/python/cudf/cudf/core/_base_index.py +++ b/python/cudf/cudf/core/_base_index.py @@ -2072,12 +2072,7 @@ def dropna(self, how="any"): pass # This is to be consistent with IndexedFrame.dropna to handle nans # as nulls by default - data_columns = [ - col.nans_to_nulls() - if isinstance(col, cudf.core.column.NumericalColumn) - else col - for col in self._columns - ] + data_columns = [col.nans_to_nulls() for col in self._columns] return self._from_columns_like_self( drop_nulls( diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py index 1828c5ce97b..de20b2ace1d 100644 --- a/python/cudf/cudf/core/column/categorical.py +++ b/python/cudf/cudf/core/column/categorical.py @@ -816,10 +816,8 @@ def to_pandas( .values_host ) - cats = col.categories - if cats.dtype.kind in "biuf": - cats = cats.nans_to_nulls().dropna() # type: ignore[attr-defined] - elif not isinstance(cats.dtype, IntervalDtype): + cats = col.categories.nans_to_nulls() + if not isinstance(cats.dtype, IntervalDtype): # leaving out dropna because it temporarily changes an interval # index into a struct and throws off results. # TODO: work on interval index dropna diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 68079371b85..475d52d0fbb 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -281,7 +281,7 @@ def any(self, skipna: bool = True) -> bool: return libcudf.reduce.reduce("any", self, dtype=np.bool_) - def dropna(self) -> ColumnBase: + def dropna(self) -> Self: return drop_nulls([self])[0]._with_type_metadata(self.dtype) def to_arrow(self) -> pa.Array: @@ -695,7 +695,9 @@ def fillna( Returns a copy with null filled. """ return libcudf.replace.replace_nulls( - input_col=self, replacement=fill_value, method=method + input_col=self.nans_to_nulls(), + replacement=fill_value, + method=method, )._with_type_metadata(self.dtype) def isnull(self) -> ColumnBase: @@ -1240,6 +1242,10 @@ def unary_operator(self, unaryop: str): f"Operation {unaryop} not supported for dtype {self.dtype}." 
) + def nans_to_nulls(self: Self) -> Self: + """Convert NaN to NA.""" + return self + def normalize_binop_value( self, other: ScalarLike ) -> Union[ColumnBase, ScalarLike]: @@ -1802,9 +1808,7 @@ def as_column( data = as_buffer(arbitrary, exposed=cudf.get_option("copy_on_write")) col = build_column(data, dtype=arbitrary.dtype, mask=mask) - if ( - nan_as_null or (mask is None and nan_as_null is None) - ) and col.dtype.kind == "f": + if nan_as_null or (mask is None and nan_as_null is None): col = col.nans_to_nulls() if dtype is not None: col = col.astype(dtype) diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py index fb413959eb9..6fb4f17b76d 100644 --- a/python/cudf/cudf/core/column/numerical.py +++ b/python/cudf/cudf/core/column/numerical.py @@ -536,7 +536,7 @@ def fillna( return col if method is not None: - return super(NumericalColumn, col).fillna(fill_value, method) + return super().fillna(fill_value, method) if fill_value is None: raise ValueError("Must specify either 'fill_value' or 'method'") @@ -545,7 +545,7 @@ def fillna( isinstance(fill_value, cudf.Scalar) and fill_value.dtype == col.dtype ): - return super(NumericalColumn, col).fillna(fill_value, method) + return super().fillna(fill_value, method) if np.isscalar(fill_value): # cast safely to the same dtype as self @@ -572,7 +572,7 @@ def fillna( else: fill_value = fill_value.astype(col.dtype) - return super(NumericalColumn, col).fillna(fill_value, method) + return super().fillna(fill_value, method) def can_cast_safely(self, to_dtype: DtypeObj) -> bool: """ diff --git a/python/cudf/cudf/core/column/numerical_base.py b/python/cudf/cudf/core/column/numerical_base.py index 541c32a2520..d38ec9cf30f 100644 --- a/python/cudf/cudf/core/column/numerical_base.py +++ b/python/cudf/cudf/core/column/numerical_base.py @@ -49,7 +49,7 @@ def kurtosis(self, skipna: Optional[bool] = None) -> float: if len(self) == 0 or self._can_return_nan(skipna=skipna): return cudf.utils.dtypes._get_nan_for_dtype(self.dtype) - self = self.nans_to_nulls().dropna() # type: ignore + self = self.nans_to_nulls().dropna() if len(self) < 4: return cudf.utils.dtypes._get_nan_for_dtype(self.dtype) @@ -74,7 +74,7 @@ def skew(self, skipna: Optional[bool] = None) -> ScalarLike: if len(self) == 0 or self._can_return_nan(skipna=skipna): return cudf.utils.dtypes._get_nan_for_dtype(self.dtype) - self = self.nans_to_nulls().dropna() # type: ignore + self = self.nans_to_nulls().dropna() if len(self) < 3: return cudf.utils.dtypes._get_nan_for_dtype(self.dtype) diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index ecfcec15337..d898eb4b9c3 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -420,10 +420,7 @@ def _scan(self, op, axis=None, skipna=True): results = {} for name, col in self._data.items(): if skipna: - try: - result_col = col.nans_to_nulls() - except AttributeError: - result_col = col + result_col = col.nans_to_nulls() else: if col.has_nulls(include_nan=True): first_index = col.isnull().find_first_value(True) @@ -1915,12 +1912,12 @@ def nans_to_nulls(self): 1 3.14 2 """ - result = ( - col.nans_to_nulls() - if isinstance(col, cudf.core.column.NumericalColumn) - else col.copy() - for col in self._data.columns - ) + result = [] + for col in self._data.columns: + converted = col.nans_to_nulls() + if converted is col: + converted = converted.copy() + result.append(converted) return self._from_data_like_self( 
self._data._from_columns_like_self(result) ) @@ -4228,10 +4225,7 @@ def _drop_na_columns(self, how="any", subset=None, thresh=None): thresh = len(df) for name, col in df._data.items(): - try: - check_col = col.nans_to_nulls() - except AttributeError: - check_col = col + check_col = col.nans_to_nulls() no_threshold_valid_count = ( len(col) - check_col.null_count ) < thresh @@ -4261,12 +4255,7 @@ def _drop_na_rows(self, how="any", subset=None, thresh=None): if len(subset) == 0: return self.copy(deep=True) - data_columns = [ - col.nans_to_nulls() - if isinstance(col, cudf.core.column.NumericalColumn) - else col - for col in self._columns - ] + data_columns = [col.nans_to_nulls() for col in self._columns] return self._from_columns_like_self( libcudf.stream_compaction.drop_nulls( diff --git a/python/cudf/cudf/core/reshape.py b/python/cudf/cudf/core/reshape.py index d4772d5b4c2..53239cb7ea0 100644 --- a/python/cudf/cudf/core/reshape.py +++ b/python/cudf/cudf/core/reshape.py @@ -1210,9 +1210,7 @@ def _get_unique(column, dummy_na): else: unique = column.unique().sort_values() if not dummy_na: - if np.issubdtype(unique.dtype, np.floating): - unique = unique.nans_to_nulls() - unique = unique.dropna() + unique = unique.nans_to_nulls().dropna() return unique diff --git a/python/cudf/cudf/tests/test_replace.py b/python/cudf/cudf/tests/test_replace.py index d77ec596271..9466398964a 100644 --- a/python/cudf/cudf/tests/test_replace.py +++ b/python/cudf/cudf/tests/test_replace.py @@ -6,6 +6,7 @@ import numpy as np import pandas as pd +import pyarrow as pa import pytest import cudf @@ -1370,3 +1371,10 @@ def test_fillna_columns_multiindex(): actual = gdf.fillna(10) assert_eq(expected, actual) + + +def test_fillna_nan_and_null(): + ser = cudf.Series(pa.array([float("nan"), None, 1.1]), nan_as_null=False) + result = ser.fillna(2.2) + expected = cudf.Series([2.2, 2.2, 1.1]) + assert_eq(result, expected) diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py index 323716d5fc3..f47c42d9a1d 100644 --- a/python/cudf/cudf/tests/test_series.py +++ b/python/cudf/cudf/tests/test_series.py @@ -2841,3 +2841,10 @@ def test_series_from_series_index_no_shallow_copy(): ser1 = cudf.Series(range(3), index=list("abc")) ser2 = cudf.Series(ser1) assert ser1.index is ser2.index + + +@pytest.mark.parametrize("value", [1, 1.1]) +def test_nans_to_nulls_noop_copies_column(value): + ser1 = cudf.Series([value]) + ser2 = ser1.nans_to_nulls() + assert ser1._column is not ser2._column From 39c5b86645dc61bf0c59d7bf733ca13872b46a44 Mon Sep 17 00:00:00 2001 From: Nghia Truong <7416935+ttnghia@users.noreply.github.com> Date: Fri, 7 Jun 2024 10:53:53 -0700 Subject: [PATCH 29/48] Handling for `NaN` and `inf` when converting floating point to fixed point types (#15885) This PR adds the ability to check for `NaN` and `inf` values when converting floating point types to fixed point types. For these input values, the corresponding output will be `null`. Closes https://github.com/rapidsai/cudf/issues/15883. 
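A small illustration of the new behavior, modeled on the test added below (the values and the scale of -3 are arbitrary):

```cpp
// Rows holding NaN or inf become nulls in the result; finite rows are rescaled as usual.
#include <cudf/unary.hpp>
#include <cudf_test/column_wrapper.hpp>

#include <limits>

void nan_inf_to_decimal_example()
{
  auto const nan = std::numeric_limits<double>::quiet_NaN();
  auto const inf = std::numeric_limits<double>::infinity();

  cudf::test::fixed_width_column_wrapper<double> input{1.729, nan, inf, 172.9};

  auto const result = cudf::cast(input, cudf::data_type{cudf::type_id::DECIMAL64, -3});
  // result holds [1.729, null, null, 172.900] at scale -3
}
```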
Authors: - Nghia Truong (https://github.com/ttnghia) Approvers: - Paul Mattione (https://github.com/pmattione-nvidia) - Shruti Shivakumar (https://github.com/shrshi) URL: https://github.com/rapidsai/cudf/pull/15885 --- cpp/src/unary/cast_ops.cu | 43 ++++++++++++++++++++++++++++++++-- cpp/tests/unary/cast_tests.cpp | 21 +++++++++++++++++ 2 files changed, 62 insertions(+), 2 deletions(-) diff --git a/cpp/src/unary/cast_ops.cu b/cpp/src/unary/cast_ops.cu index 98c412f805d..64427326d87 100644 --- a/cpp/src/unary/cast_ops.cu +++ b/cpp/src/unary/cast_ops.cu @@ -15,11 +15,13 @@ */ #include +#include #include #include #include #include #include +#include #include #include #include @@ -219,6 +221,28 @@ std::unique_ptr rescale(column_view input, } }; +/** + * @brief Check if a floating point value is convertible to fixed point type. + * + * A floating point value is convertible if it is not null, not `NaN`, and not `inf`. + * + * Note that convertible input values may be out of the representable range of the target fixed + * point type. Values out of the representable range need to be checked separately. + */ +template +struct is_convertible_floating_point { + column_device_view d_input; + + bool __device__ operator()(size_type idx) const + { + static_assert(std::is_floating_point_v); + + if (d_input.is_null(idx)) { return false; } + auto const value = d_input.element(idx); + return std::isfinite(value); + } +}; + template struct dispatch_unary_cast_to { column_view input; @@ -294,8 +318,8 @@ struct dispatch_unary_cast_to { std::make_unique(type, size, rmm::device_buffer{size * cudf::size_of(type), stream, mr}, - detail::copy_bitmask(input, stream, mr), - input.null_count()); + rmm::device_buffer{}, + 0); mutable_column_view output_mutable = *output; @@ -308,6 +332,21 @@ struct dispatch_unary_cast_to { output_mutable.begin(), fixed_point_unary_cast{scale}); + if constexpr (cudf::is_floating_point()) { + // For floating-point values, beside input nulls, we also need to set nulls for the output + // rows corresponding to NaN and inf in the input. 
+ auto const d_input_ptr = column_device_view::create(input, stream); + auto [null_mask, null_count] = + cudf::detail::valid_if(thrust::make_counting_iterator(0), + thrust::make_counting_iterator(size), + is_convertible_floating_point{*d_input_ptr}, + stream, + mr); + if (null_count > 0) { output->set_null_mask(std::move(null_mask), null_count); } + } else { + output->set_null_mask(detail::copy_bitmask(input, stream, mr), input.null_count()); + } + return output; } diff --git a/cpp/tests/unary/cast_tests.cpp b/cpp/tests/unary/cast_tests.cpp index a82449ffc10..ebeafc82039 100644 --- a/cpp/tests/unary/cast_tests.cpp +++ b/cpp/tests/unary/cast_tests.cpp @@ -665,6 +665,27 @@ TYPED_TEST(FixedPointTests, CastFromDouble) CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view()); } +TYPED_TEST(FixedPointTests, CastFromDoubleWithNaNAndInf) +{ + using namespace numeric; + using decimalXX = TypeParam; + using RepType = cudf::device_storage_type_t; + using fp_wrapper = cudf::test::fixed_point_column_wrapper; + using fw_wrapper = cudf::test::fixed_width_column_wrapper; + + auto const NaN = std::numeric_limits::quiet_NaN(); + auto const inf = std::numeric_limits::infinity(); + auto const null = 0; + + auto const input = fw_wrapper{1.729, -inf, NaN, 172.9, -inf, NaN, inf, 1.23, inf}; + auto const expected = fp_wrapper{{1729, null, null, 172900, null, null, null, 1230, null}, + {true, false, false, true, false, false, false, true, false}, + scale_type{-3}}; + auto const result = cudf::cast(input, make_fixed_point_data_type(-3)); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view()); +} + TYPED_TEST(FixedPointTests, CastFromDoubleLarge) { using namespace numeric; From 0067444597127f23a09a349f1c97dc33b9ec3958 Mon Sep 17 00:00:00 2001 From: Matthew Murray <41342305+Matt711@users.noreply.github.com> Date: Fri, 7 Jun 2024 16:10:22 -0400 Subject: [PATCH 30/48] cudf.pandas documentation improvement (#15948) Added some more about the generality of the fast-slow proxy scheme from a suggestion from @wence- Authors: - Matthew Murray (https://github.com/Matt711) Approvers: - Matthew Roeschke (https://github.com/mroeschke) URL: https://github.com/rapidsai/cudf/pull/15948 --- docs/cudf/source/developer_guide/cudf_pandas.md | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/docs/cudf/source/developer_guide/cudf_pandas.md b/docs/cudf/source/developer_guide/cudf_pandas.md index aeb43f66b2d..827ba18a4a4 100644 --- a/docs/cudf/source/developer_guide/cudf_pandas.md +++ b/docs/cudf/source/developer_guide/cudf_pandas.md @@ -3,8 +3,16 @@ The use of the cuDF pandas accelerator mode (`cudf.pandas`) is explained [in the The purpose of this document is to explain how the fast-slow proxy mechanism works and document internal environment variables that can be used to debug `cudf.pandas` itself. ## fast-slow proxy mechanism -`cudf.pandas` works by wrapping each Pandas type and its corresponding cuDF type in a new proxy type also known as a fast-slow proxy type. -The purpose of proxy types is to attempt computations on the fast (cuDF) object first, and then fall back to running on the slow (Pandas) object if the fast version fails. +The core of `cudf.pandas` is implemented through proxy types defined in [`fast_slow_proxy.py`](https://github.com/rapidsai/cudf/blob/5f45803b2a68b49d330d94e2f701791a7590612a/python/cudf/cudf/pandas/fast_slow_proxy.py), which link a pair of "fast" and "slow" libraries. 
+`cudf.pandas` works by wrapping each "slow" type and its corresponding "fast" type in a new proxy type, also known as a fast-slow proxy type. +The purpose of these proxy types is so we can first attempt computations on the fast object, and then fall back to the slow object if the fast version fails. +While the core wrapping functionality is generic, the current usage mainly involves providing a proxy pair using cuDF and Pandas. +In the rest of this document, to maintain a concrete pair of libraries in mind, we use cuDF and Pandas interchangeably as names for the "fast" and "slow" libraries, respectively, with the understanding that any pair of API-matching libraries could be used. +For example, future support could include pairs such as CuPy (as the "fast" library) and NumPy (as the "slow" library). + +```{note} +We currently do not wrap the entire NumPy library because it exposes a C API. But we do wrap NumPy's `numpy.ndarray` and CuPy's `cupy.ndarray` in a proxy type. +``` ### Types: #### Wrapped Types and Proxy Types From 139ed6c3085feac8116085e35c7897cad141ce69 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 7 Jun 2024 10:49:05 -1000 Subject: [PATCH 31/48] Add __array_interface__ to cudf.pandas numpy.ndarray proxy (#15936) closes #15926 Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Matthew Murray (https://github.com/Matt711) URL: https://github.com/rapidsai/cudf/pull/15936 --- python/cudf/cudf/pandas/_wrappers/common.py | 5 +++++ python/cudf/cudf/pandas/_wrappers/numpy.py | 2 ++ 2 files changed, 7 insertions(+) diff --git a/python/cudf/cudf/pandas/_wrappers/common.py b/python/cudf/cudf/pandas/_wrappers/common.py index 468c5687c15..66a51a83896 100644 --- a/python/cudf/cudf/pandas/_wrappers/common.py +++ b/python/cudf/cudf/pandas/_wrappers/common.py @@ -46,5 +46,10 @@ def cuda_array_interface(self: _FastSlowProxy): return self._fsproxy_fast.__cuda_array_interface__ +@property # type: ignore +def array_interface(self: _FastSlowProxy): + return self._fsproxy_slow.__array_interface__ + + def custom_iter(self: _FastSlowProxy): return iter(self._fsproxy_slow) diff --git a/python/cudf/cudf/pandas/_wrappers/numpy.py b/python/cudf/cudf/pandas/_wrappers/numpy.py index 94298872213..c445be46f58 100644 --- a/python/cudf/cudf/pandas/_wrappers/numpy.py +++ b/python/cudf/cudf/pandas/_wrappers/numpy.py @@ -15,6 +15,7 @@ make_intermediate_proxy_type, ) from .common import ( + array_interface, array_method, arrow_array_method, cuda_array_interface, @@ -115,6 +116,7 @@ def wrap_ndarray(cls, arr: cupy.ndarray | numpy.ndarray, constructor): # So that pa.array(wrapped-numpy-array) works "__arrow_array__": arrow_array_method, "__cuda_array_interface__": cuda_array_interface, + "__array_interface__": array_interface, # ndarrays are unhashable "__hash__": None, # iter(cupy-array) produces an iterable of zero-dim device From 8e40fe7e6b01a399c3ea406a59d4cbcbc9bfce5c Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Fri, 7 Jun 2024 16:08:42 -0700 Subject: [PATCH 32/48] Remove unused parsing utilities (#15955) Some parsing utilities have been unused since legacy JSON removal. This PR removes these functions. 
Authors: - Vukasin Milovanovic (https://github.com/vuule) Approvers: - Bradley Dice (https://github.com/bdice) - Nghia Truong (https://github.com/ttnghia) - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/15955 --- cpp/CMakeLists.txt | 1 - cpp/src/io/utilities/parsing_utils.cu | 221 ------------------------- cpp/src/io/utilities/parsing_utils.cuh | 76 --------- 3 files changed, 298 deletions(-) delete mode 100644 cpp/src/io/utilities/parsing_utils.cu diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index f637db66c2c..ca85996b990 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -439,7 +439,6 @@ add_library( src/io/utilities/data_sink.cpp src/io/utilities/datasource.cpp src/io/utilities/file_io_utilities.cpp - src/io/utilities/parsing_utils.cu src/io/utilities/row_selection.cpp src/io/utilities/type_inference.cu src/io/utilities/trie.cu diff --git a/cpp/src/io/utilities/parsing_utils.cu b/cpp/src/io/utilities/parsing_utils.cu deleted file mode 100644 index cb8be380c5b..00000000000 --- a/cpp/src/io/utilities/parsing_utils.cu +++ /dev/null @@ -1,221 +0,0 @@ -/* - * Copyright (c) 2019-2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include -#include -#include -#include - -#include - -#include - -#include - -namespace cudf { -namespace io { -namespace { -// When processing the input in chunks, this is the maximum size of each chunk. -// Only one chunk is loaded on the GPU at a time, so this value is chosen to -// be small enough to fit on the GPU in most cases. -constexpr size_t max_chunk_bytes = 256 * 1024 * 1024; // 256MB - -constexpr int bytes_per_find_thread = 64; - -using pos_key_pair = thrust::pair; - -template -constexpr T divCeil(T dividend, T divisor) noexcept -{ - return (dividend + divisor - 1) / divisor; -} - -/** - * @brief Sets the specified element of the array to the passed value - */ -template -__device__ __forceinline__ void setElement(T* array, cudf::size_type idx, T const& t, V const&) -{ - array[idx] = t; -} - -/** - * @brief Sets the specified element of the array of pairs using the two passed - * parameters. - */ -template -__device__ __forceinline__ void setElement(thrust::pair* array, - cudf::size_type idx, - T const& t, - V const& v) -{ - array[idx] = {t, v}; -} - -/** - * @brief Overloads the setElement() functions for void* arrays. - * Does not do anything, indexing is not allowed with void* arrays. - */ -template -__device__ __forceinline__ void setElement(void*, cudf::size_type, T const&, V const&) -{ -} - -/** - * @brief CUDA kernel that finds all occurrences of a character in the given - * character array. If the 'positions' parameter is not void*, - * positions of all occurrences are stored in the output array. 
- * - * @param[in] data Pointer to the input character array - * @param[in] size Number of bytes in the input array - * @param[in] offset Offset to add to the output positions - * @param[in] key Character to find in the array - * @param[in,out] count Pointer to the number of found occurrences - * @param[out] positions Array containing the output positions - */ -template -CUDF_KERNEL void count_and_set_positions(char const* data, - uint64_t size, - uint64_t offset, - char const key, - cudf::size_type* count, - T* positions) -{ - // thread IDs range per block, so also need the block id - auto const tid = cudf::detail::grid_1d::global_thread_id(); - auto const did = tid * bytes_per_find_thread; - - char const* raw = (data + did); - - long const byteToProcess = - ((did + bytes_per_find_thread) < size) ? bytes_per_find_thread : (size - did); - - // Process the data - for (long i = 0; i < byteToProcess; i++) { - if (raw[i] == key) { - auto const idx = atomicAdd(count, static_cast(1)); - setElement(positions, idx, did + offset + i, key); - } - } -} - -} // namespace - -template -cudf::size_type find_all_from_set(device_span data, - std::vector const& keys, - uint64_t result_offset, - T* positions, - rmm::cuda_stream_view stream) -{ - int block_size = 0; // suggested thread count to use - int min_grid_size = 0; // minimum block count required - CUDF_CUDA_TRY( - cudaOccupancyMaxPotentialBlockSize(&min_grid_size, &block_size, count_and_set_positions)); - int const grid_size = divCeil(data.size(), (size_t)block_size); - - auto d_count = cudf::detail::make_zeroed_device_uvector_async( - 1, stream, rmm::mr::get_current_device_resource()); - for (char key : keys) { - count_and_set_positions<<>>( - data.data(), data.size(), result_offset, key, d_count.data(), positions); - } - - return cudf::detail::make_std_vector_sync(d_count, stream)[0]; -} - -template -cudf::size_type find_all_from_set(host_span data, - std::vector const& keys, - uint64_t result_offset, - T* positions, - rmm::cuda_stream_view stream) -{ - rmm::device_buffer d_chunk(std::min(max_chunk_bytes, data.size()), stream); - auto d_count = cudf::detail::make_zeroed_device_uvector_async( - 1, stream, rmm::mr::get_current_device_resource()); - - int block_size = 0; // suggested thread count to use - int min_grid_size = 0; // minimum block count required - CUDF_CUDA_TRY( - cudaOccupancyMaxPotentialBlockSize(&min_grid_size, &block_size, count_and_set_positions)); - - size_t const chunk_count = divCeil(data.size(), max_chunk_bytes); - for (size_t ci = 0; ci < chunk_count; ++ci) { - auto const chunk_offset = ci * max_chunk_bytes; - auto const h_chunk = data.data() + chunk_offset; - int const chunk_bytes = std::min((size_t)(data.size() - ci * max_chunk_bytes), max_chunk_bytes); - auto const chunk_bits = divCeil(chunk_bytes, bytes_per_find_thread); - int const grid_size = divCeil(chunk_bits, block_size); - - // Copy chunk to device - CUDF_CUDA_TRY( - cudaMemcpyAsync(d_chunk.data(), h_chunk, chunk_bytes, cudaMemcpyDefault, stream.value())); - - for (char key : keys) { - count_and_set_positions - <<>>(static_cast(d_chunk.data()), - chunk_bytes, - chunk_offset + result_offset, - key, - d_count.data(), - positions); - } - } - - return cudf::detail::make_std_vector_sync(d_count, stream)[0]; -} - -template cudf::size_type find_all_from_set(device_span data, - std::vector const& keys, - uint64_t result_offset, - uint64_t* positions, - rmm::cuda_stream_view stream); - -template cudf::size_type find_all_from_set(device_span data, - std::vector const& keys, - 
uint64_t result_offset, - pos_key_pair* positions, - rmm::cuda_stream_view stream); - -template cudf::size_type find_all_from_set(host_span data, - std::vector const& keys, - uint64_t result_offset, - uint64_t* positions, - rmm::cuda_stream_view stream); - -template cudf::size_type find_all_from_set(host_span data, - std::vector const& keys, - uint64_t result_offset, - pos_key_pair* positions, - rmm::cuda_stream_view stream); - -cudf::size_type count_all_from_set(device_span data, - std::vector const& keys, - rmm::cuda_stream_view stream) -{ - return find_all_from_set(data, keys, 0, nullptr, stream); -} - -cudf::size_type count_all_from_set(host_span data, - std::vector const& keys, - rmm::cuda_stream_view stream) -{ - return find_all_from_set(data, keys, 0, nullptr, stream); -} - -} // namespace io -} // namespace cudf diff --git a/cpp/src/io/utilities/parsing_utils.cuh b/cpp/src/io/utilities/parsing_utils.cuh index faee05541cc..bc2722441d0 100644 --- a/cpp/src/io/utilities/parsing_utils.cuh +++ b/cpp/src/io/utilities/parsing_utils.cuh @@ -414,82 +414,6 @@ __device__ __inline__ cudf::size_type* infer_integral_field_counter(char const* } // namespace gpu -/** - * @brief Searches the input character array for each of characters in a set. - * Sums up the number of occurrences. If the 'positions' parameter is not void*, - * positions of all occurrences are stored in the output device array. - * - * @param[in] d_data Input character array in device memory - * @param[in] keys Vector containing the keys to count in the buffer - * @param[in] result_offset Offset to add to the output positions - * @param[out] positions Array containing the output positions - * @param[in] stream CUDA stream used for device memory operations and kernel launches - * - * @return cudf::size_type total number of occurrences - */ -template -cudf::size_type find_all_from_set(device_span data, - std::vector const& keys, - uint64_t result_offset, - T* positions, - rmm::cuda_stream_view stream); - -/** - * @brief Searches the input character array for each of characters in a set. - * Sums up the number of occurrences. If the 'positions' parameter is not void*, - * positions of all occurrences are stored in the output device array. - * - * Does not load the entire file into the GPU memory at any time, so it can - * be used to parse large files. Output array needs to be preallocated. - * - * @param[in] h_data Pointer to the input character array - * @param[in] h_size Number of bytes in the input array - * @param[in] keys Vector containing the keys to count in the buffer - * @param[in] result_offset Offset to add to the output positions - * @param[out] positions Array containing the output positions - * @param[in] stream CUDA stream used for device memory operations and kernel launches - * - * @return cudf::size_type total number of occurrences - */ -template -cudf::size_type find_all_from_set(host_span data, - std::vector const& keys, - uint64_t result_offset, - T* positions, - rmm::cuda_stream_view stream); - -/** - * @brief Searches the input character array for each of characters in a set - * and sums up the number of occurrences. 
- * - * @param d_data Input data buffer in device memory - * @param keys Vector containing the keys to count in the buffer - * @param stream CUDA stream used for device memory operations and kernel launches - * - * @return cudf::size_type total number of occurrences - */ -cudf::size_type count_all_from_set(device_span data, - std::vector const& keys, - rmm::cuda_stream_view stream); - -/** - * @brief Searches the input character array for each of characters in a set - * and sums up the number of occurrences. - * - * Does not load the entire buffer into the GPU memory at any time, so it can - * be used with buffers of any size. - * - * @param h_data Pointer to the data in host memory - * @param h_size Size of the input data, in bytes - * @param keys Vector containing the keys to count in the buffer - * @param stream CUDA stream used for device memory operations and kernel launches - * - * @return cudf::size_type total number of occurrences - */ -cudf::size_type count_all_from_set(host_span data, - std::vector const& keys, - rmm::cuda_stream_view stream); - /** * @brief Checks whether the given character is a whitespace character. * From bfad68c66fba06cb87327265b8b74ab329c58e4e Mon Sep 17 00:00:00 2001 From: Matthew Murray <41342305+Matt711@users.noreply.github.com> Date: Sun, 9 Jun 2024 09:17:12 -0400 Subject: [PATCH 33/48] Add an Environment Variable for debugging the fast path in cudf.pandas (#15837) Part of #14975 This PR adds a pandas debugging option to `_fast_slow_function_call` that runs the slow path after the fast and returns a warning if the results differ. Authors: - Matthew Murray (https://github.com/Matt711) - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Bradley Dice (https://github.com/bdice) - GALI PREM SAGAR (https://github.com/galipremsagar) URL: https://github.com/rapidsai/cudf/pull/15837 --- python/cudf/cudf/pandas/fast_slow_proxy.py | 63 ++++++++++++++++-- .../cudf_pandas_tests/test_cudf_pandas.py | 64 ++++++++++++++++++- 2 files changed, 121 insertions(+), 6 deletions(-) diff --git a/python/cudf/cudf/pandas/fast_slow_proxy.py b/python/cudf/cudf/pandas/fast_slow_proxy.py index 169dd80e132..5f4cf2e6cc6 100644 --- a/python/cudf/cudf/pandas/fast_slow_proxy.py +++ b/python/cudf/cudf/pandas/fast_slow_proxy.py @@ -9,6 +9,7 @@ import operator import pickle import types +import warnings from collections.abc import Iterator from enum import IntEnum from typing import ( @@ -23,6 +24,10 @@ Type, ) +import numpy as np + +from ..options import _env_get_bool +from ..testing._utils import assert_eq from .annotation import nvtx @@ -808,7 +813,9 @@ def __get__(self, instance, owner) -> Any: else: # for anything else, use a fast-slow attribute: self._attr, _ = _fast_slow_function_call( - getattr, owner, self._name + getattr, + owner, + self._name, ) if isinstance( @@ -829,9 +836,11 @@ def __get__(self, instance, owner) -> Any: getattr(instance._fsproxy_slow, self._name), None, # type: ignore ) - return _fast_slow_function_call(getattr, instance, self._name)[ - 0 - ] + return _fast_slow_function_call( + getattr, + instance, + self._name, + )[0] return self._attr @@ -866,7 +875,17 @@ def __name__(self, value): setattr(self._fsproxy_slow, "__name__", value) -def _fast_slow_function_call(func: Callable, /, *args, **kwargs) -> Any: +def _assert_fast_slow_eq(left, right): + if _is_final_type(type(left)) or type(left) in NUMPY_TYPES: + assert_eq(left, right) + + +def _fast_slow_function_call( + func: Callable, + /, + *args, + **kwargs, +) -> Any: """ Call `func` with all 
`args` and `kwargs` converted to their respective fast type. If that fails, call `func` with all @@ -890,6 +909,37 @@ def _fast_slow_function_call(func: Callable, /, *args, **kwargs) -> Any: # try slow path raise Exception() fast = True + if _env_get_bool("CUDF_PANDAS_DEBUGGING", False): + try: + with nvtx.annotate( + "EXECUTE_SLOW_DEBUG", + color=_CUDF_PANDAS_NVTX_COLORS["EXECUTE_SLOW"], + domain="cudf_pandas", + ): + slow_args, slow_kwargs = ( + _slow_arg(args), + _slow_arg(kwargs), + ) + with disable_module_accelerator(): + slow_result = func(*slow_args, **slow_kwargs) + except Exception as e: + warnings.warn( + "The result from pandas could not be computed. " + f"The exception was {e}." + ) + else: + try: + _assert_fast_slow_eq(result, slow_result) + except AssertionError as e: + warnings.warn( + "The results from cudf and pandas were different. " + f"The exception was {e}." + ) + except Exception as e: + warnings.warn( + "Pandas debugging mode failed. " + f"The exception was {e}." + ) except Exception: with nvtx.annotate( "EXECUTE_SLOW", @@ -1135,6 +1185,9 @@ def _replace_closurevars( ) +NUMPY_TYPES: Set[str] = set(np.sctypeDict.values()) + + _SPECIAL_METHODS: Set[str] = { "__abs__", "__add__", diff --git a/python/cudf/cudf_pandas_tests/test_cudf_pandas.py b/python/cudf/cudf_pandas_tests/test_cudf_pandas.py index fef829b17fc..72e9ad5fca3 100644 --- a/python/cudf/cudf_pandas_tests/test_cudf_pandas.py +++ b/python/cudf/cudf_pandas_tests/test_cudf_pandas.py @@ -41,8 +41,9 @@ get_calendar, ) -# Accelerated pandas has the real pandas module as an attribute +# Accelerated pandas has the real pandas and cudf modules as attributes pd = xpd._fsproxy_slow +cudf = xpd._fsproxy_fast @pytest.fixture @@ -1424,5 +1425,66 @@ def test_holidays_within_dates(holiday, start, expected): ) == [utc.localize(dt) for dt in expected] +def test_cudf_pandas_debugging_different_results(monkeypatch): + cudf_mean = cudf.Series.mean + + def mock_mean_one(self, *args, **kwargs): + return np.float64(1.0) + + with monkeypatch.context() as monkeycontext: + monkeypatch.setattr(xpd.Series.mean, "_fsproxy_fast", mock_mean_one) + monkeycontext.setenv("CUDF_PANDAS_DEBUGGING", "True") + s = xpd.Series([1, 2]) + with pytest.warns( + UserWarning, + match="The results from cudf and pandas were different.", + ): + assert s.mean() == 1.0 + # Must explicitly undo the patch. Proxy dispatch doesn't work with monkeypatch contexts. + monkeypatch.setattr(xpd.Series.mean, "_fsproxy_fast", cudf_mean) + + +def test_cudf_pandas_debugging_pandas_error(monkeypatch): + pd_mean = pd.Series.mean + + def mock_mean_exception(self, *args, **kwargs): + raise Exception() + + with monkeypatch.context() as monkeycontext: + monkeycontext.setattr( + xpd.Series.mean, "_fsproxy_slow", mock_mean_exception + ) + monkeycontext.setenv("CUDF_PANDAS_DEBUGGING", "True") + s = xpd.Series([1, 2]) + with pytest.warns( + UserWarning, + match="The result from pandas could not be computed.", + ): + s = xpd.Series([1, 2]) + assert s.mean() == 1.5 + # Must explicitly undo the patch. Proxy dispatch doesn't work with monkeypatch contexts. 
+ monkeypatch.setattr(xpd.Series.mean, "_fsproxy_slow", pd_mean) + + +def test_cudf_pandas_debugging_failed(monkeypatch): + pd_mean = pd.Series.mean + + def mock_mean_none(self, *args, **kwargs): + return None + + with monkeypatch.context() as monkeycontext: + monkeycontext.setattr(xpd.Series.mean, "_fsproxy_slow", mock_mean_none) + monkeycontext.setenv("CUDF_PANDAS_DEBUGGING", "True") + s = xpd.Series([1, 2]) + with pytest.warns( + UserWarning, + match="Pandas debugging mode failed.", + ): + s = xpd.Series([1, 2]) + assert s.mean() == 1.5 + # Must explicitly undo the patch. Proxy dispatch doesn't work with monkeypatch contexts. + monkeypatch.setattr(xpd.Series.mean, "_fsproxy_slow", pd_mean) + + def test_excelwriter_pathlike(): assert isinstance(pd.ExcelWriter("foo.xlsx"), os.PathLike) From c02260f2fb1c162eabf0da0604cc6f08f2cc74ff Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Sun, 9 Jun 2024 22:09:44 -0700 Subject: [PATCH 34/48] Refactor Parquet writer options and builders (#15831) Adding options to the Parquet writer is made somewhat tedious by the duplication of code between the two current sets of options/builder classes; one each for the chunked and non-chunked Parquet writers. This PR pulls common options into a parent options class, and common setters into a parent builder class. The builder parent uses CRTP to allow chaining of options. Authors: - Ed Seidl (https://github.com/etseidl) - Vyas Ramasubramani (https://github.com/vyasr) - Mike Wilson (https://github.com/hyperbolic2346) - Muhammad Haseeb (https://github.com/mhaseeb123) - Vukasin Milovanovic (https://github.com/vuule) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) - Vukasin Milovanovic (https://github.com/vuule) - Muhammad Haseeb (https://github.com/mhaseeb123) - Mike Wilson (https://github.com/hyperbolic2346) URL: https://github.com/rapidsai/cudf/pull/15831 --- cpp/include/cudf/io/parquet.hpp | 906 ++++-------------- cpp/src/io/functions.cpp | 271 ++++-- .../_lib/pylibcudf/libcudf/io/parquet.pxd | 173 ++-- 3 files changed, 410 insertions(+), 940 deletions(-) diff --git a/cpp/include/cudf/io/parquet.hpp b/cpp/include/cudf/io/parquet.hpp index b2f949cdcee..51eeed5b721 100644 --- a/cpp/include/cudf/io/parquet.hpp +++ b/cpp/include/cudf/io/parquet.hpp @@ -29,6 +29,7 @@ #include #include #include +#include #include namespace cudf::io { @@ -576,22 +577,16 @@ struct sorting_column { bool is_nulls_first{true}; //!< true if nulls come before non-null values }; -class parquet_writer_options_builder; - /** - * @brief Settings for `write_parquet()`. + * @brief Base settings for `write_parquet()` and `parquet_chunked_writer`. */ -class parquet_writer_options { +class parquet_writer_options_base { // Specify the sink to use for writer output sink_info _sink; // Specify the compression format to use compression_type _compression = compression_type::SNAPPY; // Specify the level of statistics in the output file statistics_freq _stats_level = statistics_freq::STATISTICS_ROWGROUP; - // Sets of columns to output - table_view _table; - // Partitions described as {start_row, num_rows} pairs - std::vector _partitions; // Optional associated metadata std::optional _metadata; // Optional footer key_value_metadata @@ -602,8 +597,6 @@ class parquet_writer_options { // Parquet writer can write timestamps as UTC // Defaults to true because libcudf timestamps are implicitly UTC bool _write_timestamps_as_UTC = true; - // Column chunks file paths to be set in the raw output metadata. 
One per output file - std::vector _column_chunks_file_paths; // Maximum size of each row group (unless smaller than a single page) size_t _row_group_size_bytes = default_row_group_size_bytes; // Maximum number of rows in row group (unless smaller than a single page) @@ -627,18 +620,13 @@ class parquet_writer_options { // Which columns in _table are used for sorting std::optional> _sorting_columns; + protected: /** - * @brief Constructor from sink and table. + * @brief Constructor from sink. * * @param sink The sink used for writer output - * @param table Table to be written to output */ - explicit parquet_writer_options(sink_info const& sink, table_view const& table) - : _sink(sink), _table(table) - { - } - - friend parquet_writer_options_builder; + explicit parquet_writer_options_base(sink_info const& sink) : _sink(sink) {} public: /** @@ -646,24 +634,7 @@ class parquet_writer_options { * * This has been added since Cython requires a default constructor to create objects on stack. */ - parquet_writer_options() = default; - - /** - * @brief Create builder to create `parquet_writer_options`. - * - * @param sink The sink used for writer output - * @param table Table to be written to output - * - * @return Builder to build parquet_writer_options - */ - static parquet_writer_options_builder builder(sink_info const& sink, table_view const& table); - - /** - * @brief Create builder to create `parquet_writer_options`. - * - * @return parquet_writer_options_builder - */ - static parquet_writer_options_builder builder(); + parquet_writer_options_base() = default; /** * @brief Returns sink info. @@ -686,20 +657,6 @@ class parquet_writer_options { */ [[nodiscard]] statistics_freq get_stats_level() const { return _stats_level; } - /** - * @brief Returns table_view. - * - * @return Table view - */ - [[nodiscard]] table_view get_table() const { return _table; } - - /** - * @brief Returns partitions. - * - * @return Partitions - */ - [[nodiscard]] std::vector const& get_partitions() const { return _partitions; } - /** * @brief Returns associated metadata. * @@ -712,7 +669,8 @@ class parquet_writer_options { * * @return Key-Value footer metadata information */ - std::vector> const& get_key_value_metadata() const + [[nodiscard]] std::vector> const& get_key_value_metadata() + const { return _user_data; } @@ -722,7 +680,7 @@ class parquet_writer_options { * * @return `true` if timestamps will be written as INT96 */ - bool is_enabled_int96_timestamps() const { return _write_timestamps_as_int96; } + [[nodiscard]] bool is_enabled_int96_timestamps() const { return _write_timestamps_as_int96; } /** * @brief Returns `true` if timestamps will be written as UTC @@ -731,29 +689,19 @@ class parquet_writer_options { */ [[nodiscard]] auto is_enabled_utc_timestamps() const { return _write_timestamps_as_UTC; } - /** - * @brief Returns Column chunks file paths to be set in the raw output metadata. - * - * @return Column chunks file paths to be set in the raw output metadata - */ - std::vector const& get_column_chunks_file_paths() const - { - return _column_chunks_file_paths; - } - /** * @brief Returns maximum row group size, in bytes. * * @return Maximum row group size, in bytes */ - auto get_row_group_size_bytes() const { return _row_group_size_bytes; } + [[nodiscard]] auto get_row_group_size_bytes() const { return _row_group_size_bytes; } /** * @brief Returns maximum row group size, in rows. 
* * @return Maximum row group size, in rows */ - auto get_row_group_size_rows() const { return _row_group_size_rows; } + [[nodiscard]] auto get_row_group_size_rows() const { return _row_group_size_rows; } /** * @brief Returns the maximum uncompressed page size, in bytes. @@ -762,7 +710,7 @@ class parquet_writer_options { * * @return Maximum uncompressed page size, in bytes */ - auto get_max_page_size_bytes() const + [[nodiscard]] auto get_max_page_size_bytes() const { return std::min(_max_page_size_bytes, get_row_group_size_bytes()); } @@ -774,7 +722,7 @@ class parquet_writer_options { * * @return Maximum page size, in rows */ - auto get_max_page_size_rows() const + [[nodiscard]] auto get_max_page_size_rows() const { return std::min(_max_page_size_rows, get_row_group_size_rows()); } @@ -784,7 +732,10 @@ class parquet_writer_options { * * @return length min/max will be truncated to */ - auto get_column_index_truncate_length() const { return _column_index_truncate_length; } + [[nodiscard]] auto get_column_index_truncate_length() const + { + return _column_index_truncate_length; + } /** * @brief Returns policy for dictionary use. @@ -831,20 +782,12 @@ class parquet_writer_options { */ [[nodiscard]] auto const& get_sorting_columns() const { return _sorting_columns; } - /** - * @brief Sets partitions. - * - * @param partitions Partitions of input table in {start_row, num_rows} pairs. If specified, must - * be same size as number of sinks in sink_info - */ - void set_partitions(std::vector partitions); - /** * @brief Sets metadata. * * @param metadata Associated metadata */ - void set_metadata(table_input_metadata metadata) { _metadata = std::move(metadata); } + void set_metadata(table_input_metadata metadata); /** * @brief Sets metadata. @@ -858,14 +801,13 @@ class parquet_writer_options { * * @param sf Level of statistics requested in the output file */ - void set_stats_level(statistics_freq sf) { _stats_level = sf; } - + void set_stats_level(statistics_freq sf); /** * @brief Sets compression type. * * @param compression The compression type to use */ - void set_compression(compression_type compression) { _compression = compression; } + void set_compression(compression_type compression); /** * @brief Sets timestamp writing preferences. INT96 timestamps will be written @@ -873,22 +815,14 @@ class parquet_writer_options { * * @param req Boolean value to enable/disable writing of INT96 timestamps */ - void enable_int96_timestamps(bool req) { _write_timestamps_as_int96 = req; } + void enable_int96_timestamps(bool req); /** * @brief Sets preference for writing timestamps as UTC. Write timestamps as UTC if set to `true`. * * @param val Boolean value to enable/disable writing of timestamps as UTC. */ - void enable_utc_timestamps(bool val) { _write_timestamps_as_UTC = val; } - - /** - * @brief Sets column chunks file path to be set in the raw output metadata. - * - * @param file_paths Vector of Strings which indicates file path. Must be same size as number of - * data sinks in sink info - */ - void set_column_chunks_file_paths(std::vector file_paths); + void enable_utc_timestamps(bool val); /** * @brief Sets the maximum row group size, in bytes. 
@@ -951,116 +885,84 @@ class parquet_writer_options { * * @param comp_stats Pointer to compression statistics to be updated after writing */ - void set_compression_statistics(std::shared_ptr comp_stats) - { - _compression_stats = std::move(comp_stats); - } + void set_compression_statistics(std::shared_ptr comp_stats); /** * @brief Sets preference for V2 page headers. Write V2 page headers if set to `true`. * * @param val Boolean value to enable/disable writing of V2 page headers. */ - void enable_write_v2_headers(bool val) { _v2_page_headers = val; } + void enable_write_v2_headers(bool val); /** * @brief Sets sorting columns. * * @param sorting_columns Column sort order metadata */ - void set_sorting_columns(std::vector sorting_columns) - { - _sorting_columns = std::move(sorting_columns); - } + void set_sorting_columns(std::vector sorting_columns); }; /** - * @brief Class to build `parquet_writer_options`. + * @brief Base class for Parquet options builders. */ -class parquet_writer_options_builder { - parquet_writer_options options; +template +class parquet_writer_options_builder_base { + OptionsT _options; - public: + protected: /** - * @brief Default constructor. + * @brief Return reference to the options object being built * - * This has been added since Cython requires a default constructor to create objects on stack. + * @return the options object */ - explicit parquet_writer_options_builder() = default; + inline OptionsT& get_options() { return _options; } /** - * @brief Constructor from sink and table. + * @brief Constructor from options. * - * @param sink The sink used for writer output - * @param table Table to be written to output + * @param options Options object to build */ - explicit parquet_writer_options_builder(sink_info const& sink, table_view const& table) - : options(sink, table) - { - } + explicit parquet_writer_options_builder_base(OptionsT options); + public: /** - * @brief Sets partitions in parquet_writer_options. + * @brief Default constructor. * - * @param partitions Partitions of input table in {start_row, num_rows} pairs. If specified, must - * be same size as number of sinks in sink_info - * @return this for chaining + * This has been added since Cython requires a default constructor to create objects on stack. */ - parquet_writer_options_builder& partitions(std::vector partitions); + explicit parquet_writer_options_builder_base() = default; /** - * @brief Sets metadata in parquet_writer_options. + * @brief Sets metadata. * * @param metadata Associated metadata * @return this for chaining */ - parquet_writer_options_builder& metadata(table_input_metadata metadata) - { - options._metadata = std::move(metadata); - return *this; - } + BuilderT& metadata(table_input_metadata metadata); /** - * @brief Sets Key-Value footer metadata in parquet_writer_options. + * @brief Sets Key-Value footer metadata. * * @param metadata Key-Value footer metadata * @return this for chaining */ - parquet_writer_options_builder& key_value_metadata( - std::vector> metadata); + BuilderT& key_value_metadata(std::vector> metadata); /** - * @brief Sets the level of statistics in parquet_writer_options. + * @brief Sets the level of statistics. * * @param sf Level of statistics requested in the output file * @return this for chaining */ - parquet_writer_options_builder& stats_level(statistics_freq sf) - { - options._stats_level = sf; - return *this; - } + BuilderT& stats_level(statistics_freq sf); /** - * @brief Sets compression type in parquet_writer_options. 
+ * @brief Sets compression type. * * @param compression The compression type to use * @return this for chaining */ - parquet_writer_options_builder& compression(compression_type compression) - { - options._compression = compression; - return *this; - } - - /** - * @brief Sets column chunks file path to be set in the raw output metadata. - * - * @param file_paths Vector of Strings which indicates file path. Must be same size as number of - * data sinks - * @return this for chaining - */ - parquet_writer_options_builder& column_chunks_file_paths(std::vector file_paths); + BuilderT& compression(compression_type compression); /** * @brief Sets the maximum row group size, in bytes. @@ -1068,11 +970,7 @@ class parquet_writer_options_builder { * @param val maximum row group size * @return this for chaining */ - parquet_writer_options_builder& row_group_size_bytes(size_t val) - { - options.set_row_group_size_bytes(val); - return *this; - } + BuilderT& row_group_size_bytes(size_t val); /** * @brief Sets the maximum number of rows in output row groups. @@ -1080,11 +978,7 @@ class parquet_writer_options_builder { * @param val maximum number or rows * @return this for chaining */ - parquet_writer_options_builder& row_group_size_rows(size_type val) - { - options.set_row_group_size_rows(val); - return *this; - } + BuilderT& row_group_size_rows(size_type val); /** * @brief Sets the maximum uncompressed page size, in bytes. @@ -1096,11 +990,7 @@ class parquet_writer_options_builder { * @param val maximum page size * @return this for chaining */ - parquet_writer_options_builder& max_page_size_bytes(size_t val) - { - options.set_max_page_size_bytes(val); - return *this; - } + BuilderT& max_page_size_bytes(size_t val); /** * @brief Sets the maximum page size, in rows. Counts only top-level rows, ignoring any nesting. @@ -1109,11 +999,7 @@ class parquet_writer_options_builder { * @param val maximum rows per page * @return this for chaining */ - parquet_writer_options_builder& max_page_size_rows(size_type val) - { - options.set_max_page_size_rows(val); - return *this; - } + BuilderT& max_page_size_rows(size_type val); /** * @brief Sets the desired maximum size in bytes for min and max values in the column index. @@ -1128,11 +1014,7 @@ class parquet_writer_options_builder { * @param val length min/max will be truncated to, with 0 indicating no truncation * @return this for chaining */ - parquet_writer_options_builder& column_index_truncate_length(int32_t val) - { - options.set_column_index_truncate_length(val); - return *this; - } + BuilderT& column_index_truncate_length(int32_t val); /** * @brief Sets the policy for dictionary use. @@ -1151,7 +1033,7 @@ class parquet_writer_options_builder { * @param val policy for dictionary use * @return this for chaining */ - parquet_writer_options_builder& dictionary_policy(enum dictionary_policy val); + BuilderT& dictionary_policy(enum dictionary_policy val); /** * @brief Sets the maximum dictionary size, in bytes. @@ -1164,7 +1046,7 @@ class parquet_writer_options_builder { * @param val maximum dictionary size * @return this for chaining */ - parquet_writer_options_builder& max_dictionary_size(size_t val); + BuilderT& max_dictionary_size(size_t val); /** * @brief Sets the maximum page fragment size, in rows. 
@@ -1176,7 +1058,7 @@ class parquet_writer_options_builder { * @param val maximum page fragment size * @return this for chaining */ - parquet_writer_options_builder& max_page_fragment_size(size_type val); + BuilderT& max_page_fragment_size(size_type val); /** * @brief Sets the pointer to the output compression statistics. @@ -1184,24 +1066,16 @@ class parquet_writer_options_builder { * @param comp_stats Pointer to compression statistics to be filled once writer is done * @return this for chaining */ - parquet_writer_options_builder& compression_statistics( - std::shared_ptr const& comp_stats) - { - options._compression_stats = comp_stats; - return *this; - } + BuilderT& compression_statistics( + std::shared_ptr const& comp_stats); /** - * @brief Sets whether int96 timestamps are written or not in parquet_writer_options. + * @brief Sets whether int96 timestamps are written or not. * * @param enabled Boolean value to enable/disable int96 timestamps * @return this for chaining */ - parquet_writer_options_builder& int96_timestamps(bool enabled) - { - options._write_timestamps_as_int96 = enabled; - return *this; - } + BuilderT& int96_timestamps(bool enabled); /** * @brief Set to true if timestamps are to be written as UTC. @@ -1209,126 +1083,60 @@ class parquet_writer_options_builder { * @param enabled Boolean value to enable/disable writing of timestamps as UTC. * @return this for chaining */ - parquet_writer_options_builder& utc_timestamps(bool enabled) - { - options._write_timestamps_as_UTC = enabled; - return *this; - } - + BuilderT& utc_timestamps(bool enabled); /** * @brief Set to true if V2 page headers are to be written. * * @param enabled Boolean value to enable/disable writing of V2 page headers. * @return this for chaining */ - parquet_writer_options_builder& write_v2_headers(bool enabled); + BuilderT& write_v2_headers(bool enabled); /** - * @brief Sets column sorting metadata to chunked_parquet_writer_options. + * @brief Sets column sorting metadata. * * @param sorting_columns Column sort order metadata * @return this for chaining */ - parquet_writer_options_builder& sorting_columns(std::vector sorting_columns); + BuilderT& sorting_columns(std::vector sorting_columns); /** - * @brief move parquet_writer_options member once it's built. + * @brief move options member once it's built. */ - operator parquet_writer_options&&() { return std::move(options); } + operator OptionsT&&(); /** - * @brief move parquet_writer_options member once it's built. + * @brief move options member once it's built. * * This has been added since Cython does not support overloading of conversion operators. * * @return Built `parquet_writer_options` object's r-value reference */ - parquet_writer_options&& build() { return std::move(options); } + OptionsT&& build(); }; -/** - * @brief Writes a set of columns to parquet format. - * - * The following code snippet demonstrates how to write columns to a file: - * @code - * auto destination = cudf::io::sink_info("dataset.parquet"); - * auto options = cudf::io::parquet_writer_options::builder(destination, table->view()); - * cudf::io::write_parquet(options); - * @endcode - * - * @param options Settings for controlling writing behavior - * @param stream CUDA stream used for device memory operations and kernel launches - * @return A blob that contains the file metadata (parquet FileMetadata thrift message) if - * requested in parquet_writer_options (empty blob otherwise). 
- */ - -std::unique_ptr> write_parquet( - parquet_writer_options const& options, rmm::cuda_stream_view stream = cudf::get_default_stream()); +class parquet_writer_options_builder; /** - * @brief Merges multiple raw metadata blobs that were previously created by write_parquet - * into a single metadata blob. - * - * @ingroup io_writers - * - * @param[in] metadata_list List of input file metadata - * @return A parquet-compatible blob that contains the data for all row groups in the list + * @brief Settings for `write_parquet()`. */ -std::unique_ptr> merge_row_group_metadata( - std::vector>> const& metadata_list); - -class chunked_parquet_writer_options_builder; +class parquet_writer_options : public parquet_writer_options_base { + // Sets of columns to output + table_view _table; + // Partitions described as {start_row, num_rows} pairs + std::vector _partitions; + // Column chunks file paths to be set in the raw output metadata. One per output file + std::vector _column_chunks_file_paths; -/** - * @brief Settings for `write_parquet_chunked()`. - */ -class chunked_parquet_writer_options { - // Specify the sink to use for writer output - sink_info _sink; - // Specify the compression format to use - compression_type _compression = compression_type::AUTO; - // Specify the level of statistics in the output file - statistics_freq _stats_level = statistics_freq::STATISTICS_ROWGROUP; - // Optional associated metadata. - std::optional _metadata; - // Optional footer key_value_metadata - std::vector> _user_data; - // Parquet writer can write INT96 or TIMESTAMP_MICROS. Defaults to TIMESTAMP_MICROS. - // If true then overrides any per-column setting in _metadata. - bool _write_timestamps_as_int96 = false; - // Parquet writer can write timestamps as UTC. Defaults to true. - bool _write_timestamps_as_UTC = true; - // Maximum size of each row group (unless smaller than a single page) - size_t _row_group_size_bytes = default_row_group_size_bytes; - // Maximum number of rows in row group (unless smaller than a single page) - size_type _row_group_size_rows = default_row_group_size_rows; - // Maximum size of each page (uncompressed) - size_t _max_page_size_bytes = default_max_page_size_bytes; - // Maximum number of rows in a page - size_type _max_page_size_rows = default_max_page_size_rows; - // Maximum size of min or max values in column index - int32_t _column_index_truncate_length = default_column_index_truncate_length; - // When to use dictionary encoding for data - dictionary_policy _dictionary_policy = dictionary_policy::ADAPTIVE; - // Maximum size of column chunk dictionary (in bytes) - size_t _max_dictionary_size = default_max_dictionary_size; - // Maximum number of rows in a page fragment - std::optional _max_page_fragment_size; - // Optional compression statistics - std::shared_ptr _compression_stats; - // write V2 page headers? - bool _v2_page_headers = false; - // Which columns in _table are used for sorting - std::optional> _sorting_columns; + friend parquet_writer_options_builder; /** - * @brief Constructor from sink. + * @brief Constructor from sink and table. 
* - * @param sink Sink used for writer output + * @param sink The sink used for writer output + * @param table Table to be written to output */ - explicit chunked_parquet_writer_options(sink_info const& sink) : _sink(sink) {} - - friend chunked_parquet_writer_options_builder; + explicit parquet_writer_options(sink_info const& sink, table_view const& table); public: /** @@ -1336,277 +1144,160 @@ class chunked_parquet_writer_options { * * This has been added since Cython requires a default constructor to create objects on stack. */ - chunked_parquet_writer_options() = default; + parquet_writer_options() = default; /** - * @brief Returns sink info. + * @brief Create builder to create `parquet_writer_options`. * - * @return Sink info + * @param sink The sink used for writer output + * @param table Table to be written to output + * + * @return Builder to build parquet_writer_options */ - [[nodiscard]] sink_info const& get_sink() const { return _sink; } + static parquet_writer_options_builder builder(sink_info const& sink, table_view const& table); /** - * @brief Returns compression format used. + * @brief Create builder to create `parquet_writer_options`. * - * @return Compression format + * @return parquet_writer_options_builder */ - [[nodiscard]] compression_type get_compression() const { return _compression; } + static parquet_writer_options_builder builder(); /** - * @brief Returns level of statistics requested in output file. + * @brief Returns table_view. * - * @return Level of statistics requested in output file + * @return Table view */ - [[nodiscard]] statistics_freq get_stats_level() const { return _stats_level; } + [[nodiscard]] table_view get_table() const { return _table; } /** - * @brief Returns metadata information. + * @brief Returns partitions. * - * @return Metadata information + * @return Partitions */ - [[nodiscard]] auto const& get_metadata() const { return _metadata; } + [[nodiscard]] std::vector const& get_partitions() const { return _partitions; } /** - * @brief Returns Key-Value footer metadata information. + * @brief Returns Column chunks file paths to be set in the raw output metadata. * - * @return Key-Value footer metadata information + * @return Column chunks file paths to be set in the raw output metadata */ - std::vector> const& get_key_value_metadata() const + [[nodiscard]] std::vector const& get_column_chunks_file_paths() const { - return _user_data; - } - - /** - * @brief Returns `true` if timestamps will be written as INT96 - * - * @return `true` if timestamps will be written as INT96 - */ - bool is_enabled_int96_timestamps() const { return _write_timestamps_as_int96; } - - /** - * @brief Returns `true` if timestamps will be written as UTC - * - * @return `true` if timestamps will be written as UTC - */ - [[nodiscard]] auto is_enabled_utc_timestamps() const { return _write_timestamps_as_UTC; } - - /** - * @brief Returns maximum row group size, in bytes. - * - * @return Maximum row group size, in bytes - */ - auto get_row_group_size_bytes() const { return _row_group_size_bytes; } - - /** - * @brief Returns maximum row group size, in rows. - * - * @return Maximum row group size, in rows - */ - auto get_row_group_size_rows() const { return _row_group_size_rows; } - - /** - * @brief Returns maximum uncompressed page size, in bytes. - * - * If set larger than the row group size, then this will return the - * row group size. 
- * - * @return Maximum uncompressed page size, in bytes - */ - auto get_max_page_size_bytes() const - { - return std::min(_max_page_size_bytes, get_row_group_size_bytes()); - } - - /** - * @brief Returns maximum page size, in rows. - * - * If set larger than the row group size, then this will return the row group size. - * - * @return Maximum page size, in rows - */ - auto get_max_page_size_rows() const - { - return std::min(_max_page_size_rows, get_row_group_size_rows()); - } - - /** - * @brief Returns maximum length of min or max values in column index, in bytes. - * - * @return length min/max will be truncated to - */ - auto get_column_index_truncate_length() const { return _column_index_truncate_length; } - - /** - * @brief Returns policy for dictionary use. - * - * @return policy for dictionary use - */ - [[nodiscard]] dictionary_policy get_dictionary_policy() const { return _dictionary_policy; } - - /** - * @brief Returns maximum dictionary size, in bytes. - * - * @return Maximum dictionary size, in bytes. - */ - [[nodiscard]] auto get_max_dictionary_size() const { return _max_dictionary_size; } - - /** - * @brief Returns maximum page fragment size, in rows. - * - * @return Maximum page fragment size, in rows. - */ - [[nodiscard]] auto get_max_page_fragment_size() const { return _max_page_fragment_size; } - - /** - * @brief Returns a shared pointer to the user-provided compression statistics. - * - * @return Compression statistics - */ - [[nodiscard]] std::shared_ptr get_compression_statistics() const - { - return _compression_stats; + return _column_chunks_file_paths; } /** - * @brief Returns `true` if V2 page headers should be written. - * - * @return `true` if V2 page headers should be written. - */ - [[nodiscard]] auto is_enabled_write_v2_headers() const { return _v2_page_headers; } - - /** - * @brief Returns the sorting_columns. - * - * @return Column sort order metadata - */ - [[nodiscard]] auto const& get_sorting_columns() const { return _sorting_columns; } - - /** - * @brief Sets metadata. - * - * @param metadata Associated metadata - */ - void set_metadata(table_input_metadata metadata) { _metadata = std::move(metadata); } - - /** - * @brief Sets Key-Value footer metadata. - * - * @param metadata Key-Value footer metadata - */ - void set_key_value_metadata(std::vector> metadata); - - /** - * @brief Sets the level of statistics in parquet_writer_options. - * - * @param sf Level of statistics requested in the output file - */ - void set_stats_level(statistics_freq sf) { _stats_level = sf; } - - /** - * @brief Sets compression type. - * - * @param compression The compression type to use - */ - void set_compression(compression_type compression) { _compression = compression; } - - /** - * @brief Sets timestamp writing preferences. - * - * INT96 timestamps will be written if `true` and TIMESTAMP_MICROS will be written if `false`. + * @brief Sets partitions. * - * @param req Boolean value to enable/disable writing of INT96 timestamps + * @param partitions Partitions of input table in {start_row, num_rows} pairs. If specified, must + * be same size as number of sinks in sink_info */ - void enable_int96_timestamps(bool req) { _write_timestamps_as_int96 = req; } + void set_partitions(std::vector partitions); /** - * @brief Sets preference for writing timestamps as UTC. Write timestamps as UTC if set to `true`. + * @brief Sets column chunks file path to be set in the raw output metadata. * - * @param val Boolean value to enable/disable writing of timestamps as UTC. 
+ * @param file_paths Vector of Strings which indicates file path. Must be same size as number of + * data sinks in sink info */ - void enable_utc_timestamps(bool val) { _write_timestamps_as_UTC = val; } + void set_column_chunks_file_paths(std::vector file_paths); +}; +/** + * @brief Class to build `parquet_writer_options`. + */ +class parquet_writer_options_builder + : public parquet_writer_options_builder_base { + public: /** - * @brief Sets the maximum row group size, in bytes. + * @brief Default constructor. * - * @param size_bytes Maximum row group size, in bytes to set + * This has been added since Cython requires a default constructor to create objects on stack. */ - void set_row_group_size_bytes(size_t size_bytes); + explicit parquet_writer_options_builder() = default; /** - * @brief Sets the maximum row group size, in rows. + * @brief Constructor from sink and table. * - * @param size_rows The maximum row group size, in rows to set + * @param sink The sink used for writer output + * @param table Table to be written to output */ - void set_row_group_size_rows(size_type size_rows); + explicit parquet_writer_options_builder(sink_info const& sink, table_view const& table); /** - * @brief Sets the maximum uncompressed page size, in bytes. + * @brief Sets partitions in parquet_writer_options. * - * @param size_bytes Maximum uncompressed page size, in bytes to set + * @param partitions Partitions of input table in {start_row, num_rows} pairs. If specified, must + * be same size as number of sinks in sink_info + * @return this for chaining */ - void set_max_page_size_bytes(size_t size_bytes); + parquet_writer_options_builder& partitions(std::vector partitions); /** - * @brief Sets the maximum page size, in rows. + * @brief Sets column chunks file path to be set in the raw output metadata. * - * @param size_rows The maximum page size, in rows to set + * @param file_paths Vector of Strings which indicates file path. Must be same size as number of + * data sinks + * @return this for chaining */ - void set_max_page_size_rows(size_type size_rows); + parquet_writer_options_builder& column_chunks_file_paths(std::vector file_paths); +}; - /** - * @brief Sets the maximum length of min or max values in column index, in bytes. - * - * @param size_bytes length min/max will be truncated to - */ - void set_column_index_truncate_length(int32_t size_bytes); +/** + * @brief Writes a set of columns to parquet format. + * + * The following code snippet demonstrates how to write columns to a file: + * @code + * auto destination = cudf::io::sink_info("dataset.parquet"); + * auto options = cudf::io::parquet_writer_options::builder(destination, table->view()); + * cudf::io::write_parquet(options); + * @endcode + * + * @param options Settings for controlling writing behavior + * @param stream CUDA stream used for device memory operations and kernel launches + * @return A blob that contains the file metadata (parquet FileMetadata thrift message) if + * requested in parquet_writer_options (empty blob otherwise). + */ - /** - * @brief Sets the policy for dictionary use. - * - * @param policy Policy for dictionary use - */ - void set_dictionary_policy(dictionary_policy policy); +std::unique_ptr> write_parquet( + parquet_writer_options const& options, rmm::cuda_stream_view stream = cudf::get_default_stream()); - /** - * @brief Sets the maximum dictionary size, in bytes. 
- * - * @param size_bytes Maximum dictionary size, in bytes - */ - void set_max_dictionary_size(size_t size_bytes); +/** + * @brief Merges multiple raw metadata blobs that were previously created by write_parquet + * into a single metadata blob. + * + * @ingroup io_writers + * + * @param[in] metadata_list List of input file metadata + * @return A parquet-compatible blob that contains the data for all row groups in the list + */ +std::unique_ptr> merge_row_group_metadata( + std::vector>> const& metadata_list); - /** - * @brief Sets the maximum page fragment size, in rows. - * - * @param size_rows Maximum page fragment size, in rows. - */ - void set_max_page_fragment_size(size_type size_rows); +class chunked_parquet_writer_options_builder; +/** + * @brief Settings for `parquet_chunked_writer`. + */ +class chunked_parquet_writer_options : public parquet_writer_options_base { /** - * @brief Sets the pointer to the output compression statistics. + * @brief Constructor from sink. * - * @param comp_stats Pointer to compression statistics to be updated after writing + * @param sink Sink used for writer output */ - void set_compression_statistics(std::shared_ptr comp_stats) - { - _compression_stats = std::move(comp_stats); - } + explicit chunked_parquet_writer_options(sink_info const& sink); - /** - * @brief Sets preference for V2 page headers. Write V2 page headers if set to `true`. - * - * @param val Boolean value to enable/disable writing of V2 page headers. - */ - void enable_write_v2_headers(bool val) { _v2_page_headers = val; } + friend chunked_parquet_writer_options_builder; + public: /** - * @brief Sets sorting columns. + * @brief Default constructor. * - * @param sorting_columns Column sort order metadata + * This has been added since Cython requires a default constructor to create objects on stack. */ - void set_sorting_columns(std::vector sorting_columns) - { - _sorting_columns = std::move(sorting_columns); - } + chunked_parquet_writer_options() = default; /** * @brief creates builder to build chunked_parquet_writer_options. @@ -1619,11 +1310,11 @@ class chunked_parquet_writer_options { }; /** - * @brief Builds options for chunked_parquet_writer_options. + * @brief Class to build `chunked_parquet_writer_options`. */ -class chunked_parquet_writer_options_builder { - chunked_parquet_writer_options options; - +class chunked_parquet_writer_options_builder + : public parquet_writer_options_builder_base { public: /** * @brief Default constructor. @@ -1637,238 +1328,7 @@ class chunked_parquet_writer_options_builder { * * @param sink The sink used for writer output */ - chunked_parquet_writer_options_builder(sink_info const& sink) : options(sink){}; - - /** - * @brief Sets metadata to chunked_parquet_writer_options. - * - * @param metadata Associated metadata - * @return this for chaining - */ - chunked_parquet_writer_options_builder& metadata(table_input_metadata metadata) - { - options._metadata = std::move(metadata); - return *this; - } - - /** - * @brief Sets Key-Value footer metadata in parquet_writer_options. - * - * @param metadata Key-Value footer metadata - * @return this for chaining - */ - chunked_parquet_writer_options_builder& key_value_metadata( - std::vector> metadata); - - /** - * @brief Sets the level of statistics in chunked_parquet_writer_options. 
- * - * @param sf Level of statistics requested in the output file - * @return this for chaining - */ - chunked_parquet_writer_options_builder& stats_level(statistics_freq sf) - { - options._stats_level = sf; - return *this; - } - - /** - * @brief Sets compression type to chunked_parquet_writer_options. - * - * @param compression The compression type to use - * @return this for chaining - */ - chunked_parquet_writer_options_builder& compression(compression_type compression) - { - options._compression = compression; - return *this; - } - - /** - * @brief Set to true if timestamps should be written as - * int96 types instead of int64 types. Even though int96 is deprecated and is - * not an internal type for cudf, it needs to be written for backwards - * compatibility reasons. - * - * @param enabled Boolean value to enable/disable int96 timestamps - * @return this for chaining - */ - chunked_parquet_writer_options_builder& int96_timestamps(bool enabled) - { - options._write_timestamps_as_int96 = enabled; - return *this; - } - - /** - * @brief Set to true if timestamps are to be written as UTC. - * - * @param enabled Boolean value to enable/disable writing of timestamps as UTC. - * @return this for chaining - */ - chunked_parquet_writer_options_builder& utc_timestamps(bool enabled) - { - options._write_timestamps_as_UTC = enabled; - return *this; - } - - /** - * @brief Set to true if V2 page headers are to be written. - * - * @param enabled Boolean value to enable/disable writing of V2 page headers. - * @return this for chaining - */ - chunked_parquet_writer_options_builder& write_v2_headers(bool enabled); - - /** - * @brief Sets the maximum row group size, in bytes. - * - * @param val maximum row group size - * @return this for chaining - */ - chunked_parquet_writer_options_builder& row_group_size_bytes(size_t val) - { - options.set_row_group_size_bytes(val); - return *this; - } - - /** - * @brief Sets the maximum number of rows in output row groups. - * - * @param val maximum number or rows - * @return this for chaining - */ - chunked_parquet_writer_options_builder& row_group_size_rows(size_type val) - { - options.set_row_group_size_rows(val); - return *this; - } - - /** - * @brief Sets the maximum uncompressed page size, in bytes. - * - * Serves as a hint to the writer, and can be exceeded under certain circumstances. Cannot be - * larger than the row group size in bytes, and will be adjusted to match if it is. - * - * @param val maximum page size - * @return this for chaining - */ - chunked_parquet_writer_options_builder& max_page_size_bytes(size_t val) - { - options.set_max_page_size_bytes(val); - return *this; - } - - /** - * @brief Sets the maximum page size, in rows. Counts only top-level rows, ignoring any nesting. - * Cannot be larger than the row group size in rows, and will be adjusted to match if it is. - * - * @param val maximum rows per page - * @return this for chaining - */ - chunked_parquet_writer_options_builder& max_page_size_rows(size_type val) - { - options.set_max_page_size_rows(val); - return *this; - } - - /** - * @brief Sets the desired maximum size in bytes for min and max values in the column index. - * - * Values exceeding this limit will be truncated, but modified such that they will still - * be valid lower and upper bounds. This only applies to variable length types, such as string. - * Maximum values will not be truncated if there is no suitable truncation that results in - * a valid upper bound. - * - * Default value is 64. 
- * - * @param val length min/max will be truncated to, with 0 indicating no truncation - * @return this for chaining - */ - chunked_parquet_writer_options_builder& column_index_truncate_length(int32_t val) - { - options.set_column_index_truncate_length(val); - return *this; - } - - /** - * @brief Sets the policy for dictionary use. - * - * Certain compression algorithms (e.g Zstandard) have limits on how large of a buffer can - * be compressed. In some circumstances, the dictionary can grow beyond this limit, which - * will prevent the column from being compressed. This setting controls how the writer - * should act in these circumstances. A setting of dictionary_policy::ADAPTIVE will disable - * dictionary encoding for columns where the dictionary exceeds the limit. A setting of - * dictionary_policy::NEVER will disable the use of dictionary encoding globally. A setting of - * dictionary_policy::ALWAYS will allow the use of dictionary encoding even if it will result in - * the disabling of compression for columns that would otherwise be compressed. - * - * The default value is dictionary_policy::ADAPTIVE. - * - * @param val policy for dictionary use - * @return this for chaining - */ - chunked_parquet_writer_options_builder& dictionary_policy(enum dictionary_policy val); - - /** - * @brief Sets the maximum dictionary size, in bytes. - * - * Disables dictionary encoding for any column chunk where the dictionary will - * exceed this limit. Only used when the dictionary_policy is set to 'ADAPTIVE'. - * - * Default value is 1048576 (1MiB). - * - * @param val maximum dictionary size - * @return this for chaining - */ - chunked_parquet_writer_options_builder& max_dictionary_size(size_t val); - - /** - * @brief Sets the maximum page fragment size, in rows. - * - * Files with nested schemas or very long strings may need a page fragment size - * smaller than the default value of 5000 to ensure a single fragment will not - * exceed the desired maximum page size in bytes. - * - * @param val maximum page fragment size - * @return this for chaining - */ - chunked_parquet_writer_options_builder& max_page_fragment_size(size_type val); - - /** - * @brief Sets the pointer to the output compression statistics. - * - * @param comp_stats Pointer to compression statistics to be filled once writer is done - * @return this for chaining - */ - chunked_parquet_writer_options_builder& compression_statistics( - std::shared_ptr const& comp_stats) - { - options._compression_stats = comp_stats; - return *this; - } - - /** - * @brief Sets column sorting metadata to chunked_parquet_writer_options. - * - * @param sorting_columns Column sort order metadata - * @return this for chaining - */ - chunked_parquet_writer_options_builder& sorting_columns( - std::vector sorting_columns); - - /** - * @brief move chunked_parquet_writer_options member once it's built. - */ - operator chunked_parquet_writer_options&&() { return std::move(options); } - - /** - * @brief move chunked_parquet_writer_options member once it's is built. - * - * This has been added since Cython does not support overloading of conversion operators. 
- * - * @return Built `chunked_parquet_writer_options` object's r-value reference - */ - chunked_parquet_writer_options&& build() { return std::move(options); } + chunked_parquet_writer_options_builder(sink_info const& sink); }; /** diff --git a/cpp/src/io/functions.cpp b/cpp/src/io/functions.cpp index 3ba2facf276..1ed8ee5ce06 100644 --- a/cpp/src/io/functions.cpp +++ b/cpp/src/io/functions.cpp @@ -115,7 +115,7 @@ parquet_writer_options_builder parquet_writer_options::builder() chunked_parquet_writer_options_builder chunked_parquet_writer_options::builder( sink_info const& sink) { - return chunked_parquet_writer_options_builder(sink); + return chunked_parquet_writer_options_builder{sink}; } namespace { @@ -740,29 +740,37 @@ void parquet_reader_options::set_num_rows(size_type val) _num_rows = val; } -void parquet_writer_options::set_partitions(std::vector partitions) +void parquet_writer_options_base::set_metadata(table_input_metadata metadata) { - CUDF_EXPECTS(partitions.size() == _sink.num_sinks(), - "Mismatch between number of sinks and number of partitions"); - _partitions = std::move(partitions); + _metadata = std::move(metadata); } -void parquet_writer_options::set_key_value_metadata( +void parquet_writer_options_base::set_key_value_metadata( std::vector> metadata) { - CUDF_EXPECTS(metadata.size() == _sink.num_sinks(), + CUDF_EXPECTS(metadata.size() == get_sink().num_sinks(), "Mismatch between number of sinks and number of metadata maps"); _user_data = std::move(metadata); } -void parquet_writer_options::set_column_chunks_file_paths(std::vector file_paths) +void parquet_writer_options_base::set_stats_level(statistics_freq sf) { _stats_level = sf; } + +void parquet_writer_options_base::set_compression(compression_type compression) { - CUDF_EXPECTS(file_paths.size() == _sink.num_sinks(), - "Mismatch between number of sinks and number of chunk paths to set"); - _column_chunks_file_paths = std::move(file_paths); + _compression = compression; +} + +void parquet_writer_options_base::enable_int96_timestamps(bool req) +{ + _write_timestamps_as_int96 = req; +} + +void parquet_writer_options_base::enable_utc_timestamps(bool val) +{ + _write_timestamps_as_UTC = val; } -void parquet_writer_options::set_row_group_size_bytes(size_t size_bytes) +void parquet_writer_options_base::set_row_group_size_bytes(size_t size_bytes) { CUDF_EXPECTS( size_bytes >= 1024, @@ -770,13 +778,13 @@ void parquet_writer_options::set_row_group_size_bytes(size_t size_bytes) _row_group_size_bytes = size_bytes; } -void parquet_writer_options::set_row_group_size_rows(size_type size_rows) +void parquet_writer_options_base::set_row_group_size_rows(size_type size_rows) { CUDF_EXPECTS(size_rows > 0, "The maximum row group row count must be a positive integer."); _row_group_size_rows = size_rows; } -void parquet_writer_options::set_max_page_size_bytes(size_t size_bytes) +void parquet_writer_options_base::set_max_page_size_bytes(size_t size_bytes) { CUDF_EXPECTS(size_bytes >= 1024, "The maximum page size cannot be smaller than 1KB."); CUDF_EXPECTS(size_bytes <= static_cast(std::numeric_limits::max()), @@ -784,190 +792,249 @@ void parquet_writer_options::set_max_page_size_bytes(size_t size_bytes) _max_page_size_bytes = size_bytes; } -void parquet_writer_options::set_max_page_size_rows(size_type size_rows) +void parquet_writer_options_base::set_max_page_size_rows(size_type size_rows) { CUDF_EXPECTS(size_rows > 0, "The maximum page row count must be a positive integer."); _max_page_size_rows = size_rows; } -void 
parquet_writer_options::set_column_index_truncate_length(int32_t size_bytes) +void parquet_writer_options_base::set_column_index_truncate_length(int32_t size_bytes) { CUDF_EXPECTS(size_bytes >= 0, "Column index truncate length cannot be negative."); _column_index_truncate_length = size_bytes; } -void parquet_writer_options::set_dictionary_policy(dictionary_policy policy) +void parquet_writer_options_base::set_dictionary_policy(dictionary_policy policy) { _dictionary_policy = policy; } -void parquet_writer_options::set_max_dictionary_size(size_t size_bytes) +void parquet_writer_options_base::set_max_dictionary_size(size_t size_bytes) { CUDF_EXPECTS(size_bytes <= static_cast(std::numeric_limits::max()), "The maximum dictionary size cannot exceed 2GB."); _max_dictionary_size = size_bytes; } -void parquet_writer_options::set_max_page_fragment_size(size_type size_rows) +void parquet_writer_options_base::set_max_page_fragment_size(size_type size_rows) { CUDF_EXPECTS(size_rows > 0, "Page fragment size must be a positive integer."); _max_page_fragment_size = size_rows; } -parquet_writer_options_builder& parquet_writer_options_builder::partitions( - std::vector partitions) +void parquet_writer_options_base::set_compression_statistics( + std::shared_ptr comp_stats) { - options.set_partitions(std::move(partitions)); - return *this; + _compression_stats = std::move(comp_stats); +} + +void parquet_writer_options_base::enable_write_v2_headers(bool val) { _v2_page_headers = val; } + +void parquet_writer_options_base::set_sorting_columns(std::vector sorting_columns) +{ + _sorting_columns = std::move(sorting_columns); +} + +parquet_writer_options::parquet_writer_options(sink_info const& sink, table_view const& table) + : parquet_writer_options_base(sink), _table(table) +{ +} + +void parquet_writer_options::set_partitions(std::vector partitions) +{ + CUDF_EXPECTS(partitions.size() == get_sink().num_sinks(), + "Mismatch between number of sinks and number of partitions"); + _partitions = std::move(partitions); +} + +void parquet_writer_options::set_column_chunks_file_paths(std::vector file_paths) +{ + CUDF_EXPECTS(file_paths.size() == get_sink().num_sinks(), + "Mismatch between number of sinks and number of chunk paths to set"); + _column_chunks_file_paths = std::move(file_paths); +} + +template +parquet_writer_options_builder_base::parquet_writer_options_builder_base( + OptionsT options) + : _options(std::move(options)) +{ +} + +template +BuilderT& parquet_writer_options_builder_base::metadata( + table_input_metadata metadata) +{ + _options.set_metadata(std::move(metadata)); + return static_cast(*this); } -parquet_writer_options_builder& parquet_writer_options_builder::key_value_metadata( +template +BuilderT& parquet_writer_options_builder_base::key_value_metadata( std::vector> metadata) { - options.set_key_value_metadata(std::move(metadata)); - return *this; + _options.set_key_value_metadata(std::move(metadata)); + return static_cast(*this); } -parquet_writer_options_builder& parquet_writer_options_builder::column_chunks_file_paths( - std::vector file_paths) +template +BuilderT& parquet_writer_options_builder_base::stats_level(statistics_freq sf) { - options.set_column_chunks_file_paths(std::move(file_paths)); - return *this; + _options.set_stats_level(sf); + return static_cast(*this); } -parquet_writer_options_builder& parquet_writer_options_builder::dictionary_policy( - enum dictionary_policy val) +template +BuilderT& parquet_writer_options_builder_base::compression( + compression_type compression) { - 
options.set_dictionary_policy(val); - return *this; + _options.set_compression(compression); + return static_cast(*this); } -parquet_writer_options_builder& parquet_writer_options_builder::max_dictionary_size(size_t val) +template +BuilderT& parquet_writer_options_builder_base::row_group_size_bytes(size_t val) { - options.set_max_dictionary_size(val); - return *this; + _options.set_row_group_size_bytes(val); + return static_cast(*this); } -parquet_writer_options_builder& parquet_writer_options_builder::max_page_fragment_size( +template +BuilderT& parquet_writer_options_builder_base::row_group_size_rows( size_type val) { - options.set_max_page_fragment_size(val); - return *this; + _options.set_row_group_size_rows(val); + return static_cast(*this); } -parquet_writer_options_builder& parquet_writer_options_builder::write_v2_headers(bool enabled) +template +BuilderT& parquet_writer_options_builder_base::max_page_size_bytes(size_t val) { - options.enable_write_v2_headers(enabled); - return *this; + _options.set_max_page_size_bytes(val); + return static_cast(*this); } -parquet_writer_options_builder& parquet_writer_options_builder::sorting_columns( - std::vector sorting_columns) +template +BuilderT& parquet_writer_options_builder_base::max_page_size_rows(size_type val) { - options._sorting_columns = std::move(sorting_columns); - return *this; + _options.set_max_page_size_rows(val); + return static_cast(*this); } -void chunked_parquet_writer_options::set_key_value_metadata( - std::vector> metadata) +template +BuilderT& parquet_writer_options_builder_base::column_index_truncate_length( + int32_t val) { - CUDF_EXPECTS(metadata.size() == _sink.num_sinks(), - "Mismatch between number of sinks and number of metadata maps"); - _user_data = std::move(metadata); + _options.set_column_index_truncate_length(val); + return static_cast(*this); } -void chunked_parquet_writer_options::set_row_group_size_bytes(size_t size_bytes) +template +BuilderT& parquet_writer_options_builder_base::dictionary_policy( + enum dictionary_policy val) { - CUDF_EXPECTS( - size_bytes >= 1024, - "The maximum row group size cannot be smaller than the minimum page size, which is 1KB."); - _row_group_size_bytes = size_bytes; + _options.set_dictionary_policy(val); + return static_cast(*this); } -void chunked_parquet_writer_options::set_row_group_size_rows(size_type size_rows) +template +BuilderT& parquet_writer_options_builder_base::max_dictionary_size(size_t val) { - CUDF_EXPECTS(size_rows > 0, "The maximum row group row count must be a positive integer."); - _row_group_size_rows = size_rows; + _options.set_max_dictionary_size(val); + return static_cast(*this); } -void chunked_parquet_writer_options::set_max_page_size_bytes(size_t size_bytes) +template +BuilderT& parquet_writer_options_builder_base::max_page_fragment_size( + size_type val) { - CUDF_EXPECTS(size_bytes >= 1024, "The maximum page size cannot be smaller than 1KB."); - CUDF_EXPECTS(size_bytes <= static_cast(std::numeric_limits::max()), - "The maximum page size cannot exceed 2GB."); - _max_page_size_bytes = size_bytes; + _options.set_max_page_fragment_size(val); + return static_cast(*this); } -void chunked_parquet_writer_options::set_max_page_size_rows(size_type size_rows) +template +BuilderT& parquet_writer_options_builder_base::compression_statistics( + std::shared_ptr const& comp_stats) { - CUDF_EXPECTS(size_rows > 0, "The maximum page row count must be a positive integer."); - _max_page_size_rows = size_rows; + _options.set_compression_statistics(comp_stats); + return 
static_cast(*this); } -void chunked_parquet_writer_options::set_column_index_truncate_length(int32_t size_bytes) +template +BuilderT& parquet_writer_options_builder_base::int96_timestamps(bool enabled) { - CUDF_EXPECTS(size_bytes >= 0, "Column index truncate length cannot be negative."); - _column_index_truncate_length = size_bytes; + _options.enable_int96_timestamps(enabled); + return static_cast(*this); } -void chunked_parquet_writer_options::set_dictionary_policy(dictionary_policy policy) +template +BuilderT& parquet_writer_options_builder_base::utc_timestamps(bool enabled) { - _dictionary_policy = policy; + _options.enable_utc_timestamps(enabled); + return static_cast(*this); } -void chunked_parquet_writer_options::set_max_dictionary_size(size_t size_bytes) +template +BuilderT& parquet_writer_options_builder_base::write_v2_headers(bool enabled) { - CUDF_EXPECTS(size_bytes <= static_cast(std::numeric_limits::max()), - "The maximum dictionary size cannot exceed 2GB."); - _max_dictionary_size = size_bytes; + _options.enable_write_v2_headers(enabled); + return static_cast(*this); } -void chunked_parquet_writer_options::set_max_page_fragment_size(size_type size_rows) +template +BuilderT& parquet_writer_options_builder_base::sorting_columns( + std::vector sorting_columns) { - CUDF_EXPECTS(size_rows > 0, "Page fragment size must be a positive integer."); - _max_page_fragment_size = size_rows; + _options.set_sorting_columns(std::move(sorting_columns)); + return static_cast(*this); } -chunked_parquet_writer_options_builder& chunked_parquet_writer_options_builder::key_value_metadata( - std::vector> metadata) +template +parquet_writer_options_builder_base::operator OptionsT&&() { - options.set_key_value_metadata(std::move(metadata)); - return *this; + return std::move(_options); } -chunked_parquet_writer_options_builder& chunked_parquet_writer_options_builder::dictionary_policy( - enum dictionary_policy val) +template +OptionsT&& parquet_writer_options_builder_base::build() { - options.set_dictionary_policy(val); - return *this; + return std::move(_options); } -chunked_parquet_writer_options_builder& chunked_parquet_writer_options_builder::max_dictionary_size( - size_t val) +template class parquet_writer_options_builder_base; +template class parquet_writer_options_builder_base; + +parquet_writer_options_builder::parquet_writer_options_builder(sink_info const& sink, + table_view const& table) + : parquet_writer_options_builder_base(parquet_writer_options{sink, table}) { - options.set_max_dictionary_size(val); - return *this; } -chunked_parquet_writer_options_builder& chunked_parquet_writer_options_builder::write_v2_headers( - bool enabled) +parquet_writer_options_builder& parquet_writer_options_builder::partitions( + std::vector partitions) { - options.enable_write_v2_headers(enabled); + get_options().set_partitions(std::move(partitions)); return *this; } -chunked_parquet_writer_options_builder& chunked_parquet_writer_options_builder::sorting_columns( - std::vector sorting_columns) +parquet_writer_options_builder& parquet_writer_options_builder::column_chunks_file_paths( + std::vector file_paths) { - options._sorting_columns = std::move(sorting_columns); + get_options().set_column_chunks_file_paths(std::move(file_paths)); return *this; } -chunked_parquet_writer_options_builder& -chunked_parquet_writer_options_builder::max_page_fragment_size(size_type val) +chunked_parquet_writer_options::chunked_parquet_writer_options(sink_info const& sink) + : parquet_writer_options_base(sink) +{ +} + 
+chunked_parquet_writer_options_builder::chunked_parquet_writer_options_builder( + sink_info const& sink) + : parquet_writer_options_builder_base(chunked_parquet_writer_options{sink}) { - options.set_max_page_fragment_size(val); - return *this; } } // namespace cudf::io diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/io/parquet.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/io/parquet.pxd index fb98650308a..36654457995 100644 --- a/python/cudf/cudf/_lib/pylibcudf/libcudf/io/parquet.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/io/parquet.pxd @@ -66,24 +66,19 @@ cdef extern from "cudf/io/parquet.hpp" namespace "cudf::io" nogil: cdef cudf_io_types.table_with_metadata read_parquet( parquet_reader_options args) except + - cdef cppclass parquet_writer_options: - parquet_writer_options() except + + cdef cppclass parquet_writer_options_base: + parquet_writer_options_base() except + cudf_io_types.sink_info get_sink_info() except + cudf_io_types.compression_type get_compression() except + cudf_io_types.statistics_freq get_stats_level() except + - cudf_table_view.table_view get_table() except + const optional[cudf_io_types.table_input_metadata]& get_metadata( ) except + - string get_column_chunks_file_paths() except + size_t get_row_group_size_bytes() except + size_type get_row_group_size_rows() except + size_t get_max_page_size_bytes() except + size_type get_max_page_size_rows() except + size_t get_max_dictionary_size() except + - void set_partitions( - vector[cudf_io_types.partition_info] partitions - ) except + void set_metadata( cudf_io_types.table_input_metadata m ) except + @@ -96,9 +91,6 @@ cdef extern from "cudf/io/parquet.hpp" namespace "cudf::io" nogil: void set_compression( cudf_io_types.compression_type compression ) except + - void set_column_chunks_file_paths( - vector[string] column_chunks_file_paths - ) except + void set_int96_timestamps( bool enabled ) except + @@ -113,161 +105,112 @@ cdef extern from "cudf/io/parquet.hpp" namespace "cudf::io" nogil: void enable_write_v2_headers(bool val) except + void set_dictionary_policy(cudf_io_types.dictionary_policy policy) except + + cdef cppclass parquet_writer_options(parquet_writer_options_base): + parquet_writer_options() except + + cudf_table_view.table_view get_table() except + + string get_column_chunks_file_paths() except + + void set_partitions( + vector[cudf_io_types.partition_info] partitions + ) except + + void set_column_chunks_file_paths( + vector[string] column_chunks_file_paths + ) except + + @staticmethod parquet_writer_options_builder builder( cudf_io_types.sink_info sink_, cudf_table_view.table_view table_ ) except + - cdef cppclass parquet_writer_options_builder: - + cdef cppclass parquet_writer_options_builder_base[BuilderT, OptionsT]: parquet_writer_options_builder() except + - parquet_writer_options_builder( - cudf_io_types.sink_info sink_, - cudf_table_view.table_view table_ - ) except + - parquet_writer_options_builder& partitions( - vector[cudf_io_types.partition_info] partitions - ) except + - parquet_writer_options_builder& metadata( + + BuilderT& metadata( cudf_io_types.table_input_metadata m ) except + - parquet_writer_options_builder& key_value_metadata( + BuilderT& key_value_metadata( vector[map[string, string]] kvm ) except + - parquet_writer_options_builder& stats_level( + BuilderT& stats_level( cudf_io_types.statistics_freq sf ) except + - parquet_writer_options_builder& compression( + BuilderT& compression( cudf_io_types.compression_type compression ) except + - parquet_writer_options_builder& 
column_chunks_file_paths( - vector[string] column_chunks_file_paths - ) except + - parquet_writer_options_builder& int96_timestamps( + BuilderT& int96_timestamps( bool enabled ) except + - parquet_writer_options_builder& utc_timestamps( + BuilderT& utc_timestamps( bool enabled ) except + - parquet_writer_options_builder& row_group_size_bytes( + BuilderT& row_group_size_bytes( size_t val ) except + - parquet_writer_options_builder& row_group_size_rows( + BuilderT& row_group_size_rows( size_type val ) except + - parquet_writer_options_builder& max_page_size_bytes( + BuilderT& max_page_size_bytes( size_t val ) except + - parquet_writer_options_builder& max_page_size_rows( + BuilderT& max_page_size_rows( size_type val ) except + - parquet_writer_options_builder& max_dictionary_size( + BuilderT& max_dictionary_size( size_t val ) except + - parquet_writer_options_builder& write_v2_headers( + BuilderT& write_v2_headers( bool val ) except + - parquet_writer_options_builder& dictionary_policy( + BuilderT& dictionary_policy( cudf_io_types.dictionary_policy val ) except + + # FIXME: the following two functions actually belong in + # parquet_writer_options_builder, but placing them there yields a + # "'parquet_writer_options_builder' is not a type identifier" error. + # This is probably a bug in cython since a simpler CRTP example that + # has methods returning references to a child class seem to work. + # Calling these from the chunked options builder will fail at compile + # time, so this should be safe. + # NOTE: these two are never actually called from libcudf. Instead these + # properties are set in the options after calling build(), so perhaps + # they can be removed. + BuilderT& partitions( + vector[cudf_io_types.partition_info] partitions + ) except + + BuilderT& column_chunks_file_paths( + vector[string] column_chunks_file_paths + ) except + + OptionsT build() except + - parquet_writer_options build() except + + cdef cppclass parquet_writer_options_builder( + parquet_writer_options_builder_base[parquet_writer_options_builder, + parquet_writer_options]): + parquet_writer_options_builder() except + + parquet_writer_options_builder( + cudf_io_types.sink_info sink_, + cudf_table_view.table_view table_ + ) except + cdef unique_ptr[vector[uint8_t]] write_parquet( parquet_writer_options args ) except + - cdef cppclass chunked_parquet_writer_options: + cdef cppclass chunked_parquet_writer_options(parquet_writer_options_base): chunked_parquet_writer_options() except + - cudf_io_types.sink_info get_sink() except + - cudf_io_types.compression_type get_compression() except + - cudf_io_types.statistics_freq get_stats_level() except + - const optional[cudf_io_types.table_input_metadata]& get_metadata( - ) except + - size_t get_row_group_size_bytes() except + - size_type get_row_group_size_rows() except + - size_t get_max_page_size_bytes() except + - size_type get_max_page_size_rows() except + - size_t get_max_dictionary_size() except + - - void set_metadata( - cudf_io_types.table_input_metadata m - ) except + - void set_key_value_metadata( - vector[map[string, string]] kvm - ) except + - void set_stats_level( - cudf_io_types.statistics_freq sf - ) except + - void set_compression( - cudf_io_types.compression_type compression - ) except + - void set_int96_timestamps( - bool enabled - ) except + - void set_utc_timestamps( - bool enabled - ) except + - void set_row_group_size_bytes(size_t val) except + - void set_row_group_size_rows(size_type val) except + - void set_max_page_size_bytes(size_t val) except + - 
void set_max_page_size_rows(size_type val) except + - void set_max_dictionary_size(size_t val) except + - void enable_write_v2_headers(bool val) except + - void set_dictionary_policy(cudf_io_types.dictionary_policy policy) except + @staticmethod chunked_parquet_writer_options_builder builder( cudf_io_types.sink_info sink_, ) except + - cdef cppclass chunked_parquet_writer_options_builder: + cdef cppclass chunked_parquet_writer_options_builder( + parquet_writer_options_builder_base[chunked_parquet_writer_options_builder, + chunked_parquet_writer_options] + ): chunked_parquet_writer_options_builder() except + chunked_parquet_writer_options_builder( cudf_io_types.sink_info sink_, ) except + - chunked_parquet_writer_options_builder& metadata( - cudf_io_types.table_input_metadata m - ) except + - chunked_parquet_writer_options_builder& key_value_metadata( - vector[map[string, string]] kvm - ) except + - chunked_parquet_writer_options_builder& stats_level( - cudf_io_types.statistics_freq sf - ) except + - chunked_parquet_writer_options_builder& compression( - cudf_io_types.compression_type compression - ) except + - chunked_parquet_writer_options_builder& int96_timestamps( - bool enabled - ) except + - chunked_parquet_writer_options_builder& utc_timestamps( - bool enabled - ) except + - chunked_parquet_writer_options_builder& row_group_size_bytes( - size_t val - ) except + - chunked_parquet_writer_options_builder& row_group_size_rows( - size_type val - ) except + - chunked_parquet_writer_options_builder& max_page_size_bytes( - size_t val - ) except + - chunked_parquet_writer_options_builder& max_page_size_rows( - size_type val - ) except + - chunked_parquet_writer_options_builder& max_dictionary_size( - size_t val - ) except + - parquet_writer_options_builder& write_v2_headers( - bool val - ) except + - parquet_writer_options_builder& dictionary_policy( - cudf_io_types.dictionary_policy val - ) except + - - chunked_parquet_writer_options build() except + cdef cppclass parquet_chunked_writer: parquet_chunked_writer() except + From ae12634c834a82d3d8884110c9de07d91877c828 Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Mon, 10 Jun 2024 09:51:28 -0400 Subject: [PATCH 35/48] Fix large strings handling in nvtext::character_tokenize (#15829) Fix logic for `nvtext::character_tokenize` to handle large strings input. The output for > 2GB input strings column will turn characters into rows and so will likely overflow the `size_type` rows as expected. The `thrust::count_if` is replaced with a raw kernel to produce the appropriate count that can be checked against max row size. Also changed the API to not accept null rows since the code does not check for them and can return invalid results for inputs with unsanitized-null rows. 
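As a rough caller-side sketch (using the public cudf Python API; the data and variable names are illustrative, not taken from this patch), null rows now need to be sanitized before tokenizing:

    import cudf

    s = cudf.Series(["hello world", None])
    # character tokenization no longer accepts null rows, so fill them first;
    # the Python layer in this change does the equivalent via fillna("")
    tokens = s.fillna("").str.character_tokenize()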
Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Bradley Dice (https://github.com/bdice) - Yunsong Wang (https://github.com/PointKernel) - Lawrence Mitchell (https://github.com/wence-) URL: https://github.com/rapidsai/cudf/pull/15829 --- cpp/benchmarks/text/tokenize.cpp | 6 +- cpp/include/nvtext/tokenize.hpp | 3 +- cpp/src/text/tokenize.cu | 66 ++++++++++++++----- cpp/tests/text/tokenize_tests.cpp | 10 +-- python/cudf/cudf/core/column/string.py | 13 ++-- .../cudf/cudf/tests/text/test_text_methods.py | 2 - 6 files changed, 66 insertions(+), 34 deletions(-) diff --git a/cpp/benchmarks/text/tokenize.cpp b/cpp/benchmarks/text/tokenize.cpp index 2151b28d637..e83310e0343 100644 --- a/cpp/benchmarks/text/tokenize.cpp +++ b/cpp/benchmarks/text/tokenize.cpp @@ -39,8 +39,10 @@ static void bench_tokenize(nvbench::state& state) state.skip("Skip benchmarks greater than size_type limit"); } - data_profile const profile = data_profile_builder().distribution( - cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width); + data_profile const profile = + data_profile_builder() + .distribution(cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width) + .no_validity(); auto const column = create_random_column(cudf::type_id::STRING, row_count{num_rows}, profile); cudf::strings_column_view input(column->view()); diff --git a/cpp/include/nvtext/tokenize.hpp b/cpp/include/nvtext/tokenize.hpp index ea1b9c716f0..29fed0759c7 100644 --- a/cpp/include/nvtext/tokenize.hpp +++ b/cpp/include/nvtext/tokenize.hpp @@ -176,7 +176,8 @@ std::unique_ptr count_tokens( * t is now ["h","e","l","l","o"," ","w","o","r","l","d","g","o","o","d","b","y","e"] * @endcode * - * All null row entries are ignored and the output contains all valid rows. + * @throw std::invalid_argument if `input` contains nulls + * @throw std::overflow_error if the output would produce more than max size_type rows * * @param input Strings column to tokenize * @param stream CUDA stream used for device memory operations and kernel launches diff --git a/cpp/src/text/tokenize.cu b/cpp/src/text/tokenize.cu index 0b16305a81a..25406bce759 100644 --- a/cpp/src/text/tokenize.cu +++ b/cpp/src/text/tokenize.cu @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include @@ -35,6 +36,7 @@ #include #include +#include #include #include #include @@ -99,6 +101,31 @@ std::unique_ptr tokenize_fn(cudf::size_type strings_count, return cudf::strings::detail::make_strings_column(tokens.begin(), tokens.end(), stream, mr); } +constexpr int64_t block_size = 512; // number of threads per block +constexpr int64_t bytes_per_thread = 4; // bytes processed per thread + +CUDF_KERNEL void count_characters(uint8_t const* d_chars, int64_t chars_bytes, int64_t* d_output) +{ + auto const idx = cudf::detail::grid_1d::global_thread_id(); + auto const byte_idx = static_cast(idx) * bytes_per_thread; + auto const lane_idx = static_cast(threadIdx.x); + + using block_reduce = cub::BlockReduce; + __shared__ typename block_reduce::TempStorage temp_storage; + + int64_t count = 0; + // each thread processes multiple bytes + for (auto i = byte_idx; (i < (byte_idx + bytes_per_thread)) && (i < chars_bytes); ++i) { + count += cudf::strings::detail::is_begin_utf8_char(d_chars[i]); + } + auto const total = block_reduce(temp_storage).Reduce(count, cub::Sum()); + + if ((lane_idx == 0) && (total > 0)) { + cuda::atomic_ref ref{*d_output}; + ref.fetch_add(total, cuda::std::memory_order_relaxed); + } +} + } // namespace // detail APIs @@ -176,11 +203,17 @@ 
std::unique_ptr character_tokenize(cudf::strings_column_view const return cudf::make_empty_column(cudf::data_type{cudf::type_id::STRING}); } - auto offsets = strings_column.offsets(); - auto offset = cudf::strings::detail::get_offset_value(offsets, strings_column.offset(), stream); - auto chars_bytes = cudf::strings::detail::get_offset_value( - offsets, strings_column.offset() + strings_count, stream) - - offset; + CUDF_EXPECTS( + strings_column.null_count() == 0, "input must not contain nulls", std::invalid_argument); + + auto const offsets = strings_column.offsets(); + auto const offset = + cudf::strings::detail::get_offset_value(offsets, strings_column.offset(), stream); + auto const chars_bytes = cudf::strings::detail::get_offset_value( + offsets, strings_column.offset() + strings_count, stream) - + offset; + // no bytes -- this could happen in an all-empty column + if (chars_bytes == 0) { return cudf::make_empty_column(cudf::type_id::STRING); } auto d_chars = strings_column.parent().data(); // unsigned is necessary for checking bits d_chars += offset; @@ -188,23 +221,26 @@ std::unique_ptr character_tokenize(cudf::strings_column_view const // To minimize memory, count the number of characters so we can // build the output offsets without an intermediate buffer. // In the worst case each byte is a character so the output is 4x the input. - cudf::size_type num_characters = thrust::count_if( - rmm::exec_policy(stream), d_chars, d_chars + chars_bytes, [] __device__(uint8_t byte) { - return cudf::strings::detail::is_begin_utf8_char(byte); - }); + rmm::device_scalar d_count(0, stream); + auto const num_blocks = cudf::util::div_rounding_up_safe( + cudf::util::div_rounding_up_safe(chars_bytes, static_cast(bytes_per_thread)), + block_size); + count_characters<<>>( + d_chars, chars_bytes, d_count.data()); + auto const num_characters = d_count.value(stream); - // no characters check -- this could happen in all-empty or all-null strings column - if (num_characters == 0) { - return cudf::make_empty_column(cudf::data_type{cudf::type_id::STRING}); - } + // number of characters becomes the number of rows so need to check the row limit + CUDF_EXPECTS( + num_characters + 1 < static_cast(std::numeric_limits::max()), + "output exceeds the column size limit", + std::overflow_error); // create output offsets column - // -- conditionally copy a counting iterator where - // the first byte of each character is located auto offsets_column = cudf::make_numeric_column( offsets.type(), num_characters + 1, cudf::mask_state::UNALLOCATED, stream, mr); auto d_new_offsets = cudf::detail::offsetalator_factory::make_output_iterator(offsets_column->mutable_view()); + // offsets are at the beginning byte of each character cudf::detail::copy_if_safe( thrust::counting_iterator(0), thrust::counting_iterator(chars_bytes + 1), diff --git a/cpp/tests/text/tokenize_tests.cpp b/cpp/tests/text/tokenize_tests.cpp index 6a6bcda87cc..a59a54169d7 100644 --- a/cpp/tests/text/tokenize_tests.cpp +++ b/cpp/tests/text/tokenize_tests.cpp @@ -111,17 +111,13 @@ TEST_F(TextTokenizeTest, TokenizeErrorTest) TEST_F(TextTokenizeTest, CharacterTokenize) { - std::vector h_strings{"the mousé ate the cheese", nullptr, ""}; - cudf::test::strings_column_wrapper strings( - h_strings.begin(), - h_strings.end(), - thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; })); + cudf::test::strings_column_wrapper input({"the mousé ate the cheese", ""}); cudf::test::strings_column_wrapper expected{"t", "h", "e", " ", "m", "o", 
"u", "s", "é", " ", "a", "t", "e", " ", "t", "h", "e", " ", "c", "h", "e", "e", "s", "e"}; - auto results = nvtext::character_tokenize(cudf::strings_column_view(strings)); + auto results = nvtext::character_tokenize(cudf::strings_column_view(input)); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } @@ -151,8 +147,6 @@ TEST_F(TextTokenizeTest, TokenizeEmptyTest) EXPECT_EQ(results->size(), 0); results = nvtext::character_tokenize(all_empty); EXPECT_EQ(results->size(), 0); - results = nvtext::character_tokenize(all_null); - EXPECT_EQ(results->size(), 0); auto const delimiter = cudf::string_scalar{""}; results = nvtext::tokenize_with_vocabulary(view, all_empty, delimiter); EXPECT_EQ(results->size(), 0); diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index d12aa80e9a3..ad7dbe5e52e 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -552,16 +552,17 @@ def join( return self._return_or_inplace(data) def _split_by_character(self): - result_col = libstrings.character_tokenize(self._column) + col = self._column.fillna("") # sanitize nulls + result_col = libstrings.character_tokenize(col) - offset_col = self._column.children[0] + offset_col = col.children[0] return cudf.core.column.ListColumn( - size=len(self._column), - dtype=cudf.ListDtype(self._column.dtype), - mask=self._column.mask, + size=len(col), + dtype=cudf.ListDtype(col.dtype), + mask=col.mask, offset=0, - null_count=self._column.null_count, + null_count=0, children=(offset_col, result_col), ) diff --git a/python/cudf/cudf/tests/text/test_text_methods.py b/python/cudf/cudf/tests/text/test_text_methods.py index 6bd3b99bae1..36f7f3de828 100644 --- a/python/cudf/cudf/tests/text/test_text_methods.py +++ b/python/cudf/cudf/tests/text/test_text_methods.py @@ -426,7 +426,6 @@ def test_character_tokenize_series(): [ "hello world", "sdf", - None, ( "goodbye, one-two:three~four+five_six@sev" "en#eight^nine heŒŽ‘•™œ$µ¾ŤƠé DŽ" @@ -543,7 +542,6 @@ def test_character_tokenize_index(): [ "hello world", "sdf", - None, ( "goodbye, one-two:three~four+five_six@sev" "en#eight^nine heŒŽ‘•™œ$µ¾ŤƠé DŽ" From 9b2c35f346b91b598238cbf54e40a463820708c0 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Mon, 10 Jun 2024 11:40:08 -0500 Subject: [PATCH 36/48] Support arbitrary CUDA versions in UDF code (#15950) This PR eliminates the manual mapping from PTX versions to CUDA versions, to help support CUDA 12.5 and newer without requiring a manual update to `_numba.py` for every CUDA release. This also updates the minimum compute capability PTX file from arch 60 to arch 70, since that is now the minimum required by RAPIDS. 
Authors: - Bradley Dice (https://github.com/bdice) Approvers: - Graham Markall (https://github.com/gmarkall) - https://github.com/brandon-b-miller URL: https://github.com/rapidsai/cudf/pull/15950 --- .../_lib/pylibcudf/libcudf/strings_udf.pxd | 1 + python/cudf/cudf/_lib/strings_udf.pyx | 5 ++ python/cudf/cudf/utils/_numba.py | 84 +++---------------- python/cudf/udf_cpp/CMakeLists.txt | 2 +- .../include/cudf/strings/udf/udf_apis.hpp | 9 +- .../strings/src/strings/udf/udf_apis.cu | 2 + 6 files changed, 30 insertions(+), 73 deletions(-) diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/strings_udf.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/strings_udf.pxd index b895d5e6925..804ad30dfb1 100644 --- a/python/cudf/cudf/_lib/pylibcudf/libcudf/strings_udf.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/strings_udf.pxd @@ -18,6 +18,7 @@ cdef extern from "cudf/strings/udf/udf_string.hpp" namespace \ cdef extern from "cudf/strings/udf/udf_apis.hpp" namespace \ "cudf::strings::udf" nogil: + cdef int get_cuda_build_version() except + cdef unique_ptr[device_buffer] to_string_view_array(column_view) except + cdef unique_ptr[column] column_from_udf_string_array( udf_string* strings, size_type size, diff --git a/python/cudf/cudf/_lib/strings_udf.pyx b/python/cudf/cudf/_lib/strings_udf.pyx index e952492c45d..7610cad0b40 100644 --- a/python/cudf/cudf/_lib/strings_udf.pyx +++ b/python/cudf/cudf/_lib/strings_udf.pyx @@ -22,11 +22,16 @@ from cudf._lib.pylibcudf.libcudf.column.column cimport column, column_view from cudf._lib.pylibcudf.libcudf.strings_udf cimport ( column_from_udf_string_array as cpp_column_from_udf_string_array, free_udf_string_array as cpp_free_udf_string_array, + get_cuda_build_version as cpp_get_cuda_build_version, to_string_view_array as cpp_to_string_view_array, udf_string, ) +def get_cuda_build_version(): + return cpp_get_cuda_build_version() + + def column_to_string_view_array(Column strings_col): cdef unique_ptr[device_buffer] c_buffer cdef column_view input_view = strings_col.view() diff --git a/python/cudf/cudf/utils/_numba.py b/python/cudf/cudf/utils/_numba.py index 494b48b3cfd..d9dde58d998 100644 --- a/python/cudf/cudf/utils/_numba.py +++ b/python/cudf/cudf/utils/_numba.py @@ -12,16 +12,14 @@ # strings_udf. 
This is the easiest way to break an otherwise circular import # loop of _lib.*->cudautils->_numba->_lib.strings_udf @lru_cache -def _get_cc_60_ptx_file(): +def _get_cuda_build_version(): from cudf._lib import strings_udf - return os.path.join( - os.path.dirname(strings_udf.__file__), - "..", - "core", - "udf", - "shim_60.ptx", - ) + # The version is an integer, parsed as 1000 * major + 10 * minor + cuda_build_version = strings_udf.get_cuda_build_version() + cuda_major_version = cuda_build_version // 1000 + cuda_minor_version = (cuda_build_version % 1000) // 10 + return (cuda_major_version, cuda_minor_version) def _get_best_ptx_file(archs, max_compute_capability): @@ -38,8 +36,8 @@ def _get_best_ptx_file(archs, max_compute_capability): def _get_ptx_file(path, prefix): if "RAPIDS_NO_INITIALIZE" in os.environ: - # cc=60 ptx is always built - cc = int(os.environ.get("STRINGS_UDF_CC", "60")) + # cc=70 ptx is always built + cc = int(os.environ.get("STRINGS_UDF_CC", "70")) else: from numba import cuda @@ -120,15 +118,13 @@ def _setup_numba(): versions = safe_get_versions() if versions != NO_DRIVER: driver_version, runtime_version = versions - ptx_toolkit_version = _get_cuda_version_from_ptx_file( - _get_cc_60_ptx_file() - ) + shim_ptx_cuda_version = _get_cuda_build_version() # MVC is required whenever any PTX is newer than the driver - # This could be the shipped PTX file or the PTX emitted by - # the version of NVVM on the user system, the latter aligning - # with the runtime version - if (driver_version < ptx_toolkit_version) or ( + # This could be the shipped shim PTX file (determined by the CUDA + # version used at build time) or the PTX emitted by the version of NVVM + # on the user system (determined by the user's CUDA runtime version) + if (driver_version < shim_ptx_cuda_version) or ( driver_version < runtime_version ): if driver_version < (12, 0): @@ -139,60 +135,6 @@ def _setup_numba(): patch_numba_linker() -def _get_cuda_version_from_ptx_file(path): - """ - https://docs.nvidia.com/cuda/parallel-thread-execution/ - Each PTX module must begin with a .version - directive specifying the PTX language version - - example header: - // - // Generated by NVIDIA NVVM Compiler - // - // Compiler Build ID: CL-31057947 - // Cuda compilation tools, release 11.6, V11.6.124 - // Based on NVVM 7.0.1 - // - - .version 7.6 - .target sm_52 - .address_size 64 - - """ - with open(path) as ptx_file: - for line in ptx_file: - if line.startswith(".version"): - ver_line = line - break - else: - raise ValueError("Could not read CUDA version from ptx file.") - version = ver_line.strip("\n").split(" ")[1] - # This dictionary maps from supported versions of NVVM to the - # PTX version it produces. The lowest value should be the minimum - # CUDA version required to compile the library. Currently CUDA 11.5 - # or higher is required to build cudf. New CUDA versions should - # be added to this dictionary when officially supported. 
- ver_map = { - "7.5": (11, 5), - "7.6": (11, 6), - "7.7": (11, 7), - "7.8": (11, 8), - "8.0": (12, 0), - "8.1": (12, 1), - "8.2": (12, 2), - "8.3": (12, 3), - "8.4": (12, 4), - } - - cuda_ver = ver_map.get(version) - if cuda_ver is None: - raise ValueError( - f"Could not map PTX version {version} to a CUDA version" - ) - - return cuda_ver - - class _CUDFNumbaConfig: def __enter__(self): self.CUDA_LOW_OCCUPANCY_WARNINGS = ( diff --git a/python/cudf/udf_cpp/CMakeLists.txt b/python/cudf/udf_cpp/CMakeLists.txt index fe7f9d0b00d..fa7855cfc65 100644 --- a/python/cudf/udf_cpp/CMakeLists.txt +++ b/python/cudf/udf_cpp/CMakeLists.txt @@ -60,7 +60,7 @@ set(SHIM_CUDA_FLAGS --expt-relaxed-constexpr -rdc=true) # always build a default PTX file in case RAPIDS_NO_INITIALIZE is set and the device cc can't be # safely queried through a context -list(INSERT CMAKE_CUDA_ARCHITECTURES 0 "60") +list(INSERT CMAKE_CUDA_ARCHITECTURES 0 "70") list(TRANSFORM CMAKE_CUDA_ARCHITECTURES REPLACE "-real" "") list(TRANSFORM CMAKE_CUDA_ARCHITECTURES REPLACE "-virtual" "") diff --git a/python/cudf/udf_cpp/strings/include/cudf/strings/udf/udf_apis.hpp b/python/cudf/udf_cpp/strings/include/cudf/strings/udf/udf_apis.hpp index 219dbe27682..8635b1280de 100644 --- a/python/cudf/udf_cpp/strings/include/cudf/strings/udf/udf_apis.hpp +++ b/python/cudf/udf_cpp/strings/include/cudf/strings/udf/udf_apis.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -27,6 +27,13 @@ namespace cudf { namespace strings { namespace udf { +/** + * @brief Get the CUDA version used at build time. + * + * @return The CUDA version as an integer, parsed as major * 1000 + minor * 10. 
+ */ +int get_cuda_build_version(); + class udf_string; /** diff --git a/python/cudf/udf_cpp/strings/src/strings/udf/udf_apis.cu b/python/cudf/udf_cpp/strings/src/strings/udf/udf_apis.cu index 9cf86b5ea48..941e61e6787 100644 --- a/python/cudf/udf_cpp/strings/src/strings/udf/udf_apis.cu +++ b/python/cudf/udf_cpp/strings/src/strings/udf/udf_apis.cu @@ -101,6 +101,8 @@ void free_udf_string_array(cudf::strings::udf::udf_string* d_strings, // external APIs +int get_cuda_build_version() { return CUDA_VERSION; } + std::unique_ptr to_string_view_array(cudf::column_view const input) { return detail::to_string_view_array(input, cudf::get_default_stream()); From e3ba131baf340dfcf575abc99a872cdb36671307 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Mon, 10 Jun 2024 06:48:41 -1000 Subject: [PATCH 37/48] Support timezone aware pandas inputs in cudf (#15935) closes #13611 (This technically does not support pandas objects have interval types that are timezone aware) @rjzamora let me know if the test I adapted from your PR in https://github.com/rapidsai/cudf/pull/15929 is adequate Authors: - Matthew Roeschke (https://github.com/mroeschke) - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Lawrence Mitchell (https://github.com/wence-) URL: https://github.com/rapidsai/cudf/pull/15935 --- python/cudf/cudf/core/column/column.py | 27 +++++-------------- python/cudf/cudf/core/index.py | 11 +++----- .../cudf/tests/series/test_datetimelike.py | 13 +++++++++ python/cudf/cudf/tests/test_datetime.py | 26 +++--------------- .../dask_cudf/io/tests/test_parquet.py | 20 ++++++++++++++ 5 files changed, 48 insertions(+), 49 deletions(-) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 475d52d0fbb..f87797a1fa3 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -332,10 +332,6 @@ def from_arrow(cls, array: pa.Array) -> ColumnBase: "yet supported in pyarrow, see: " "https://github.com/apache/arrow/issues/20213" ) - elif pa.types.is_timestamp(array.type) and array.type.tz is not None: - raise NotImplementedError( - "cuDF does not yet support timezone-aware datetimes" - ) elif isinstance(array.type, ArrowIntervalType): return cudf.core.column.IntervalColumn.from_arrow(array) elif pa.types.is_large_string(array.type): @@ -992,9 +988,9 @@ def astype(self, dtype: Dtype, copy: bool = False) -> ColumnBase: return col elif isinstance(dtype, cudf.core.dtypes.DecimalDtype): return col.as_decimal_column(dtype) - elif np.issubdtype(cast(Any, dtype), np.datetime64): + elif dtype.kind == "M": return col.as_datetime_column(dtype) - elif np.issubdtype(cast(Any, dtype), np.timedelta64): + elif dtype.kind == "m": return col.as_timedelta_column(dtype) elif dtype.kind == "O": if cudf.get_option("mode.pandas_compatible") and was_object: @@ -1846,21 +1842,11 @@ def as_column( and arbitrary.freq is not None ): raise NotImplementedError("freq is not implemented yet") - elif ( - isinstance(arbitrary.dtype, pd.DatetimeTZDtype) - or ( - isinstance(arbitrary.dtype, pd.IntervalDtype) - and isinstance(arbitrary.dtype.subtype, pd.DatetimeTZDtype) - ) - or ( - isinstance(arbitrary.dtype, pd.CategoricalDtype) - and isinstance( - arbitrary.dtype.categories.dtype, pd.DatetimeTZDtype - ) - ) + elif isinstance(arbitrary.dtype, pd.IntervalDtype) and isinstance( + arbitrary.dtype.subtype, pd.DatetimeTZDtype ): raise NotImplementedError( - "cuDF does not yet support timezone-aware datetimes" + "cuDF 
does not yet support Intervals with timezone-aware datetimes" ) elif _is_pandas_nullable_extension_dtype(arbitrary.dtype): if cudf.get_option("mode.pandas_compatible"): @@ -1876,7 +1862,8 @@ def as_column( length=length, ) elif isinstance( - arbitrary.dtype, (pd.CategoricalDtype, pd.IntervalDtype) + arbitrary.dtype, + (pd.CategoricalDtype, pd.IntervalDtype, pd.DatetimeTZDtype), ): return as_column( pa.array(arbitrary, from_pandas=True), diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 7297ac4e929..732e5cdb01a 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -1757,13 +1757,10 @@ def __init__( name = _getdefault_name(data, name=name) data = column.as_column(data) - # TODO: Remove this if statement and fix tests now that - # there's timezone support - if isinstance(data.dtype, pd.DatetimeTZDtype): - raise NotImplementedError( - "cuDF does not yet support timezone-aware datetimes" - ) - data = data.astype(dtype) + # TODO: if data.dtype.kind == "M" (i.e. data is already datetime type) + # We probably shouldn't always astype to datetime64[ns] + if not isinstance(data.dtype, pd.DatetimeTZDtype): + data = data.astype(dtype) if copy: data = data.copy() diff --git a/python/cudf/cudf/tests/series/test_datetimelike.py b/python/cudf/cudf/tests/series/test_datetimelike.py index 7ef55761b2b..58ffc610c3c 100644 --- a/python/cudf/cudf/tests/series/test_datetimelike.py +++ b/python/cudf/cudf/tests/series/test_datetimelike.py @@ -223,3 +223,16 @@ def test_contains_tz_aware(item, expected): def test_tz_convert_naive_typeerror(): with pytest.raises(TypeError): cudf.date_range("2020", periods=2, freq="D").tz_convert(None) + + +@pytest.mark.parametrize( + "klass", ["Series", "DatetimeIndex", "Index", "CategoricalIndex"] +) +def test_from_pandas_obj_tz_aware(klass): + tz_aware_data = [ + pd.Timestamp("2020-01-01", tz="UTC").tz_convert("US/Pacific") + ] + pandas_obj = getattr(pd, klass)(tz_aware_data) + result = cudf.from_pandas(pandas_obj) + expected = getattr(cudf, klass)(tz_aware_data) + assert_eq(result, expected) diff --git a/python/cudf/cudf/tests/test_datetime.py b/python/cudf/cudf/tests/test_datetime.py index 4186fff038a..e3ecaafae5b 100644 --- a/python/cudf/cudf/tests/test_datetime.py +++ b/python/cudf/cudf/tests/test_datetime.py @@ -2088,25 +2088,6 @@ def test_datetime_constructor(data, dtype): assert_eq(expected, actual) -@pytest.mark.parametrize( - "data", - [ - [pd.Timestamp("2001-01-01", tz="America/New_York")], - pd.Series(["2001-01-01"], dtype="datetime64[ns, America/New_York]"), - pd.Index(["2001-01-01"], dtype="datetime64[ns, America/New_York]"), - ], -) -def test_construction_from_tz_timestamps(data): - with pytest.raises(NotImplementedError): - _ = cudf.Series(data) - with pytest.raises(NotImplementedError): - _ = cudf.Index(data) - with pytest.raises(NotImplementedError): - _ = cudf.DatetimeIndex(data) - with pytest.raises(NotImplementedError): - cudf.CategoricalIndex(data) - - @pytest.mark.parametrize("op", _cmpops) def test_datetime_binop_tz_timestamp(op): s = cudf.Series([1, 2, 3], dtype="datetime64[ns]") @@ -2391,13 +2372,14 @@ def test_datetime_raise_warning(freqstr): t.dt.ceil(freqstr) -def test_timezone_array_notimplemented(): +def test_timezone_pyarrow_array(): pa_array = pa.array( [datetime.datetime(2020, 1, 1, tzinfo=datetime.timezone.utc)], type=pa.timestamp("ns", "UTC"), ) - with pytest.raises(NotImplementedError): - cudf.Series(pa_array) + result = cudf.Series(pa_array) + expected = pa_array.to_pandas() + 
assert_eq(result, expected) def test_to_datetime_errors_ignore_deprecated(): diff --git a/python/dask_cudf/dask_cudf/io/tests/test_parquet.py b/python/dask_cudf/dask_cudf/io/tests/test_parquet.py index 39800145585..f3e3911e6c7 100644 --- a/python/dask_cudf/dask_cudf/io/tests/test_parquet.py +++ b/python/dask_cudf/dask_cudf/io/tests/test_parquet.py @@ -596,3 +596,23 @@ def test_parquet_read_filter_and_project(tmpdir): # Check result expected = df[(df.a == 5) & (df.c > 20)][columns].reset_index(drop=True) dd.assert_eq(got, expected) + + +def test_timezone_column(tmpdir): + path = str(tmpdir.join("test.parquet")) + pdf = pd.DataFrame( + { + "time": pd.to_datetime( + ["1996-01-02", "1996-12-01"], + utc=True, + ), + "x": [1, 2], + } + ) + pdf.to_parquet(path) + got = dask_cudf.read_parquet(path) + # cudf.read_parquet does not support reading timezone aware types yet + assert got["time"].dtype == pd.DatetimeTZDtype("ns", "UTC") + got["time"] = got["time"].astype("datetime64[ns]") + expected = cudf.read_parquet(path) + dd.assert_eq(got, expected) From f9b0fc3d1986d5ac8994c09229d62063854c0856 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Mon, 10 Jun 2024 08:34:15 -1000 Subject: [PATCH 38/48] Preserve column type and class information in more DataFrame operations (#15949) Narrowing down to a pattern of using `ColumnAccessor._from_columns_like_self` to preserve the column information and then calling `Frame._from_data_like_self` to preserve the `.index`/`.name` information. This is specifically for operations that operates column wise and the result should be the same shape as the input. Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Lawrence Mitchell (https://github.com/wence-) URL: https://github.com/rapidsai/cudf/pull/15949 --- python/cudf/cudf/core/dataframe.py | 3 +- python/cudf/cudf/core/indexed_frame.py | 131 +++++++++++------------ python/cudf/cudf/core/window/rolling.py | 41 ++----- python/cudf/cudf/tests/test_dataframe.py | 12 ++- 4 files changed, 83 insertions(+), 104 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 9307267b227..e1b6cc45dd3 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -2688,6 +2688,7 @@ def _set_columns_like(self, other: ColumnAccessor) -> None: self._data = ColumnAccessor( data=dict(zip(other.names, self._data.columns)), multiindex=other.multiindex, + rangeindex=other.rangeindex, level_names=other.level_names, label_dtype=other.label_dtype, verify=False, @@ -7534,7 +7535,7 @@ def _sample_axis_1( def _from_columns_like_self( self, columns: List[ColumnBase], - column_names: abc.Iterable[str], + column_names: Optional[abc.Iterable[str]] = None, index_names: Optional[List[str]] = None, *, override_dtypes: Optional[abc.Iterable[Optional[Dtype]]] = None, diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index d898eb4b9c3..fdc78005996 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -40,8 +40,6 @@ from cudf.api.extensions import no_default from cudf.api.types import ( _is_non_decimal_numeric_dtype, - is_bool_dtype, - is_decimal_dtype, is_dict_like, is_list_like, is_scalar, @@ -372,7 +370,6 @@ def _mimic_inplace( self._index = result.index return super()._mimic_inplace(result, inplace) - # Scans @_cudf_nvtx_annotate def _scan(self, op, axis=None, skipna=True): """ @@ -417,8 +414,8 @@ def _scan(self, op, 
axis=None, skipna=True): cast_to_int = op in ("cumsum", "cumprod") skipna = True if skipna is None else skipna - results = {} - for name, col in self._data.items(): + results = [] + for col in self._columns: if skipna: result_col = col.nans_to_nulls() else: @@ -429,19 +426,14 @@ def _scan(self, op, axis=None, skipna=True): else: result_col = col - if ( - cast_to_int - and not is_decimal_dtype(result_col.dtype) - and ( - np.issubdtype(result_col.dtype, np.integer) - or np.issubdtype(result_col.dtype, np.bool_) - ) - ): + if cast_to_int and result_col.dtype.kind in "uib": # For reductions that accumulate a value (e.g. sum, not max) # pandas returns an int64 dtype for all int or bool dtypes. result_col = result_col.astype(np.int64) - results[name] = getattr(result_col, op)() - return self._from_data(results, self.index) + results.append(getattr(result_col, op)()) + return self._from_data_like_self( + self._data._from_columns_like_self(results) + ) def _check_data_index_length_match(self) -> None: # Validate that the number of rows in the data matches the index if the @@ -880,7 +872,6 @@ def replace( FutureWarning, ) if not (to_replace is None and value is no_default): - copy_data = {} ( all_na_per_column, to_replace_per_column, @@ -890,10 +881,10 @@ def replace( value=value, columns_dtype_map=dict(self._dtypes), ) - + copy_data = [] for name, col in self._data.items(): try: - copy_data[name] = col.find_and_replace( + replaced = col.find_and_replace( to_replace_per_column[name], replacements_per_column[name], all_na_per_column[name], @@ -906,11 +897,13 @@ def replace( # that exists in `copy_data`. # ii. There is an OverflowError while trying to cast # `to_replace_per_column` to `replacements_per_column`. - copy_data[name] = col.copy(deep=True) + replaced = col.copy(deep=True) + copy_data.append(replaced) + result = self._from_data_like_self( + self._data._from_columns_like_self(copy_data) + ) else: - copy_data = self._data.copy(deep=True) - - result = self._from_data(copy_data, self.index) + result = self.copy() return self._mimic_inplace(result, inplace=inplace) @@ -1031,12 +1024,13 @@ def clip(self, lower=None, upper=None, inplace=False, axis=1): ): lower[0], upper[0] = upper[0], lower[0] - data = { - name: col.clip(lower[i], upper[i]) - for i, (name, col) in enumerate(self._data.items()) - } - output = self._from_data(data, self.index) - output._copy_type_metadata(self, include_index=False) + data = ( + col.clip(low, high) + for col, low, high in zip(self._columns, lower, upper) + ) + output = self._from_data_like_self( + self._data._from_columns_like_self(data) + ) return self._mimic_inplace(output, inplace=inplace) @_cudf_nvtx_annotate @@ -1913,7 +1907,7 @@ def nans_to_nulls(self): 2 """ result = [] - for col in self._data.columns: + for col in self._columns: converted = col.nans_to_nulls() if converted is col: converted = converted.copy() @@ -2028,8 +2022,8 @@ def interpolate( ) interpolator = cudf.core.algorithms.get_column_interpolator(method) - columns = {} - for colname, col in data._data.items(): + columns = [] + for col in data._columns: if isinstance(col, cudf.core.column.StringColumn): warnings.warn( f"{type(self).__name__}.interpolate with object dtype is " @@ -2040,9 +2034,12 @@ def interpolate( col = col.astype("float64").fillna(np.nan) # Interpolation methods may or may not need the index - columns[colname] = interpolator(col, index=data.index) + columns.append(interpolator(col, index=data.index)) - result = self._from_data(columns, index=data.index) + result = 
self._from_data_like_self( + self._data._from_columns_like_self(columns) + ) + result.index = data.index return ( result @@ -2069,8 +2066,8 @@ def shift(self, periods=1, freq=None, axis=0, fill_value=None): data_columns = ( col.shift(periods, fill_value) for col in self._columns ) - return self.__class__._from_data( - zip(self._column_names, data_columns), self.index + return self._from_data_like_self( + self._data._from_columns_like_self(data_columns) ) @_cudf_nvtx_annotate @@ -3011,8 +3008,6 @@ def _slice(self, arg: slice, keep_index: bool = True) -> Self: self._column_names, None if has_range_index or not keep_index else self.index.names, ) - result._data.label_dtype = self._data.label_dtype - result._data.rangeindex = self._data.rangeindex if keep_index and has_range_index: result.index = self.index[start:stop] @@ -3561,11 +3556,6 @@ def sort_values( ), keep_index=not ignore_index, ) - if ( - isinstance(self, cudf.core.dataframe.DataFrame) - and self._data.multiindex - ): - out.columns = self._data.to_pandas_index() return out def _n_largest_or_smallest( @@ -3659,14 +3649,12 @@ def _align_to_index( result = result.sort_values(sort_col_id) del result[sort_col_id] - result = self.__class__._from_data( - data=result._data, index=result.index + out = self._from_data( + self._data._from_columns_like_self(result._columns) ) - result._data.multiindex = self._data.multiindex - result._data._level_names = self._data._level_names - result.index.names = self.index.names - - return result + out.index = result.index + out.index.names = self.index.names + return out @_cudf_nvtx_annotate def _reindex( @@ -3898,24 +3886,14 @@ def round(self, decimals=0, how="half_even"): "decimals must be an integer, a dict-like or a Series" ) - cols = { - name: col.round(decimals[name], how=how) - if ( - name in decimals - and _is_non_decimal_numeric_dtype(col.dtype) - and not is_bool_dtype(col.dtype) - ) + cols = ( + col.round(decimals[name], how=how) + if name in decimals and col.dtype.kind in "fiu" else col.copy(deep=True) for name, col in self._data.items() - } - - return self.__class__._from_data( - data=cudf.core.column_accessor.ColumnAccessor( - cols, - multiindex=self._data.multiindex, - level_names=self._data.level_names, - ), - index=self.index, + ) + return self._from_data_like_self( + self._data._from_columns_like_self(cols) ) def resample( @@ -6238,6 +6216,8 @@ def rank( f"axis={axis} is not yet supported in rank" ) + num_cols = self._num_columns + dropped_cols = False source = self if numeric_only: if isinstance( @@ -6255,15 +6235,28 @@ def rank( source = self._get_columns_by_label(numeric_cols) if source.empty: return source.astype("float64") + elif source._num_columns != num_cols: + dropped_cols = True result_columns = libcudf.sort.rank_columns( [*source._columns], method_enum, na_option, ascending, pct ) - return self.__class__._from_data( - dict(zip(source._column_names, result_columns)), - index=source.index, - ).astype(np.float64) + if dropped_cols: + result = type(source)._from_data( + ColumnAccessor( + dict(zip(source._column_names, result_columns)), + multiindex=self._data.multiindex, + level_names=self._data.level_names, + label_dtype=self._data.label_dtype, + ), + ) + else: + result = source._from_data_like_self( + self._data._from_columns_like_self(result_columns) + ) + result.index = source.index + return result.astype(np.float64) def convert_dtypes( self, diff --git a/python/cudf/cudf/core/window/rolling.py b/python/cudf/cudf/core/window/rolling.py index 2037b1682db..7d140a1ffa5 100644 --- 
a/python/cudf/cudf/core/window/rolling.py +++ b/python/cudf/cudf/core/window/rolling.py @@ -1,7 +1,5 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION -import itertools - import numba import pandas as pd from pandas.api.indexers import BaseIndexer @@ -251,27 +249,13 @@ def _apply_agg_column(self, source_column, agg_name): agg_params=self.agg_params, ) - def _apply_agg_dataframe(self, df, agg_name): - return cudf.DataFrame._from_data( - { - col_name: self._apply_agg_column(col, agg_name) - for col_name, col in df._data.items() - }, - index=df.index, - ) - def _apply_agg(self, agg_name): - if isinstance(self.obj, cudf.Series): - return cudf.Series._from_data( - { - self.obj.name: self._apply_agg_column( - self.obj._column, agg_name - ) - }, - index=self.obj.index, - ) - else: - return self._apply_agg_dataframe(self.obj, agg_name) + applied = ( + self._apply_agg_column(col, agg_name) for col in self.obj._columns + ) + return self.obj._from_data_like_self( + self.obj._data._from_columns_like_self(applied) + ) def _reduce( self, @@ -533,18 +517,9 @@ def _window_to_window_sizes(self, window): ) def _apply_agg(self, agg_name): - index = cudf.MultiIndex.from_frame( - cudf.DataFrame( - { - key: value - for key, value in itertools.chain( - self._group_keys._data.items(), - self.obj.index._data.items(), - ) - } - ) + index = cudf.MultiIndex._from_data( + {**self._group_keys._data, **self.obj.index._data} ) - result = super()._apply_agg(agg_name) result.index = index return result diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index d76d5eb8065..98e9f9881c7 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -10980,7 +10980,7 @@ def test_squeeze(axis, data): assert_eq(result, expected) -@pytest.mark.parametrize("column", [range(1), np.array([1], dtype=np.int8)]) +@pytest.mark.parametrize("column", [range(1, 2), np.array([1], dtype=np.int8)]) @pytest.mark.parametrize( "operation", [ @@ -10991,6 +10991,16 @@ def test_squeeze(axis, data): lambda df: abs(df), lambda df: -df, lambda df: ~df, + lambda df: df.cumsum(), + lambda df: df.replace(1, 2), + lambda df: df.replace(10, 20), + lambda df: df.clip(0, 10), + lambda df: df.rolling(1).mean(), + lambda df: df.interpolate(), + lambda df: df.shift(), + lambda df: df.sort_values(1), + lambda df: df.round(), + lambda df: df.rank(), ], ) def test_op_preserves_column_metadata(column, operation): From 58a15a84078c42b331ced4fd4384724d42328258 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Mon, 10 Jun 2024 11:42:11 -0700 Subject: [PATCH 39/48] Explicitly build for all GPU architectures (#15959) The libcudf conda package is not specifying to build for all supported architectures and is instead letting build.sh fall back to NATIVE. However, because the default behavior of rapids-cmake is to build SASS for all supported architectures if NATIVE is specified but no local architecture is detected, we're still ending up with all of the RAPIDS architectures having SASS built for them. The problem is that we are failing to build PTX for the latest version, which would be produced if we used RAPIDS instead of NATIVE. This PR should resolve that issue. 
Authors:
  - Vyas Ramasubramani (https://github.com/vyasr)

Approvers:
  - Robert Maynard (https://github.com/robertmaynard)
  - James Lamb (https://github.com/jameslamb)

URL: https://github.com/rapidsai/cudf/pull/15959
---
 conda/recipes/libcudf/build.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/conda/recipes/libcudf/build.sh b/conda/recipes/libcudf/build.sh
index fef3dabd733..a3a0415575b 100644
--- a/conda/recipes/libcudf/build.sh
+++ b/conda/recipes/libcudf/build.sh
@@ -5,5 +5,5 @@ export cudf_ROOT="$(realpath ./cpp/build)"
 ./build.sh -n -v \
     libcudf libcudf_kafka benchmarks tests \
-    --build_metrics --incl_cache_stats \
+    --build_metrics --incl_cache_stats --allgpuarch \
     --cmake-args=\"-DCMAKE_INSTALL_LIBDIR=lib -DCUDF_ENABLE_ARROW_S3=ON\"

From 719a8a6934ae5eaeb22764d1bfdeb75893750bae Mon Sep 17 00:00:00 2001
From: Ray Bell
Date: Mon, 10 Jun 2024 15:57:17 -0400
Subject: [PATCH 40/48] Update PandasCompat.py to resolve references (#15704)

This PR allows the PandasCompat Sphinx extension to contain resolved
references. For example, you can now add intersphinx mappings to the content
of the admonition.

### Motivation

I enjoy connecting the PyData communities, and this PR allows for more
opportunities to use intersphinx mapping to link back to the pandas docs.

### History

I first tried this in a previous PR
(https://github.com/rapidsai/cudf/pull/15383#discussion_r1537888240) and
commented here
(https://github.com/rapidsai/cudf/pull/15383#issuecomment-2028451487) that I
may get around to investigating this further. I finally had time to work on
this and made a bit of progress.

### Testing

I created a separate repo for this at
https://github.com/raybellwaves/compatsphinxext which deploys straight to
https://raybellwaves.github.io/compatsphinxext. You can see it's working as
expected here: https://raybellwaves.github.io/compatsphinxext/compat.html.
You should be able to fork that and tinker pretty quickly.

### Further work

This could be cleaned up (for example, I couldn't get the [source] link to
display in the admonition as I worked from the latest sphinx todo extension
(https://github.com/sphinx-doc/sphinx/blob/master/sphinx/ext/todo.py)). The
existing pandas-compat admonitions could be switched to this if agreed. In
addition, the documentation around how to write pandas-compat entries going
forward
(https://github.com/rapidsai/cudf/blob/branch-24.06/docs/cudf/source/developer_guide/documentation.md#comparing-to-pandas)
will also have to be updated. Longer term, the extension could be published
and used across RAPIDS libraries where there are differences in compatibility
with PyData libraries (e.g. pandas, networkx, scikit-learn) to simplify
linking to those docs. I'm not sure if I'll have time to work on this though.
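
For concreteness, a minimal `conf.py` sketch of the intended wiring is below.
It is illustrative only: the `_ext` path and the pandas intersphinx entry are
assumptions chosen for the example, and only `include_pandas_compat` comes
from the extension itself.

```python
# Sketch of a docs/cudf/source/conf.py fragment -- illustrative, not part of this diff
import os
import sys

# assumption: PandasCompat.py lives next to conf.py under _ext/
sys.path.insert(0, os.path.abspath("_ext"))

extensions = [
    "PandasCompat",            # registers the pandas-compat directive and domain
    "sphinx.ext.intersphinx",  # resolves references inside the admonition content
]

# assumption: mapping name and URL chosen for illustration
intersphinx_mapping = {
    "pandas": ("https://pandas.pydata.org/docs/", None),
}

# config value registered by the extension; set to False to drop the admonitions
include_pandas_compat = True
```

With something like this in place, a cross-reference such as
:meth:`pandas.DataFrame.mode` written inside a `pandas-compat` entry resolves
through intersphinx instead of rendering as plain text.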
Authors: - Ray Bell (https://github.com/raybellwaves) - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/15704 --- docs/cudf/source/_ext/PandasCompat.py | 143 +++++++++++++++++--------- docs/cudf/source/conf.py | 2 + 2 files changed, 94 insertions(+), 51 deletions(-) diff --git a/docs/cudf/source/_ext/PandasCompat.py b/docs/cudf/source/_ext/PandasCompat.py index af2b16035c3..331495c981e 100644 --- a/docs/cudf/source/_ext/PandasCompat.py +++ b/docs/cudf/source/_ext/PandasCompat.py @@ -1,14 +1,20 @@ -# Copyright (c) 2021-2022, NVIDIA CORPORATION +# Copyright (c) 2021-2024, NVIDIA CORPORATION # This file is adapted from official sphinx tutorial for `todo` extension: # https://www.sphinx-doc.org/en/master/development/tutorials/todo.html +from __future__ import annotations + +from typing import cast from docutils import nodes +from docutils.nodes import Element from docutils.parsers.rst import Directive -from sphinx.locale import get_translation -from sphinx.util.docutils import SphinxDirective - -translator = get_translation("sphinx") +from docutils.parsers.rst.directives.admonitions import BaseAdmonition +from sphinx import addnodes +from sphinx.domains import Domain +from sphinx.errors import NoUri +from sphinx.locale import _ as get_translation_sphinx +from sphinx.util.docutils import SphinxDirective, new_document class PandasCompat(nodes.Admonition, nodes.Element): @@ -32,7 +38,7 @@ def run(self): return [PandasCompatList("")] -class PandasCompatDirective(SphinxDirective): +class PandasCompatDirective(BaseAdmonition, SphinxDirective): # this enables content in the directive has_content = True @@ -43,9 +49,11 @@ def run(self): PandasCompat_node = PandasCompat("\n".join(self.content)) PandasCompat_node += nodes.title( - translator("Pandas Compatibility Note"), - translator("Pandas Compatibility Note"), + get_translation_sphinx("Pandas Compatibility Note"), + get_translation_sphinx("Pandas Compatibility Note"), ) + PandasCompat_node["docname"] = self.env.docname + PandasCompat_node["target"] = targetnode self.state.nested_parse( self.content, self.content_offset, PandasCompat_node ) @@ -84,71 +92,104 @@ def merge_PandasCompats(app, env, docnames, other): ) -def process_PandasCompat_nodes(app, doctree, fromdocname): - if not app.config.include_pandas_compat: - for node in doctree.traverse(PandasCompat): - node.parent.remove(node) +class PandasCompatDomain(Domain): + name = "pandascompat" + label = "pandascompat" - # Replace all PandasCompatList nodes with a list of the collected - # PandasCompats. Augment each PandasCompat with a backlink to the - # original location. 
- env = app.builder.env + @property + def pandascompats(self): + return self.data.setdefault("pandascompats", {}) - if not hasattr(env, "PandasCompat_all_pandas_compat"): - env.PandasCompat_all_pandas_compat = [] + def clear_doc(self, docname): + self.pandascompats.pop(docname, None) + + def merge_domaindata(self, docnames, otherdata): + for docname in docnames: + self.pandascompats[docname] = otherdata["pandascompats"][docname] + + def process_doc(self, env, docname, document): + pandascompats = self.pandascompats.setdefault(docname, []) + for pandascompat in document.findall(PandasCompat): + env.app.emit("pandascompat-defined", pandascompat) + pandascompats.append(pandascompat) - for node in doctree.traverse(PandasCompatList): - if not app.config.include_pandas_compat: - node.replace_self([]) - continue - content = [] +class PandasCompatListProcessor: + def __init__(self, app, doctree, docname): + self.builder = app.builder + self.config = app.config + self.env = app.env + self.domain = cast(PandasCompatDomain, app.env.get_domain("pandascompat")) + self.document = new_document("") + self.process(doctree, docname) - for PandasCompat_info in env.PandasCompat_all_pandas_compat: - para = nodes.paragraph() + def process(self, doctree: nodes.document, docname: str) -> None: + pandascompats = [v for vals in self.domain.pandascompats.values() for v in vals] + for node in doctree.findall(PandasCompatList): + if not self.config.include_pandas_compat: + node.parent.remove(node) + continue - # Create a reference back to the original docstring - newnode = nodes.reference("", "") - innernode = nodes.emphasis( - translator("[source]"), translator("[source]") - ) - newnode["refdocname"] = PandasCompat_info["docname"] - newnode["refuri"] = app.builder.get_relative_uri( - fromdocname, PandasCompat_info["docname"] - ) - newnode["refuri"] += "#" + PandasCompat_info["target"]["refid"] - newnode.append(innernode) - para += newnode + content: list[Element | None] = [nodes.target()] if node.get("ids") else [] - # Insert the reference node into PandasCompat node - # Note that this node is a deepcopy from the original copy - # in the docstring, so changing this does not affect that in the - # doc. - PandasCompat_info["PandasCompat"].append(para) + for pandascompat in pandascompats: + # Create a copy of the pandascompat node + new_pandascompat = pandascompat.deepcopy() + new_pandascompat["ids"].clear() - # Insert the PandasCompand node into the PandasCompatList Node - content.append(PandasCompat_info["PandasCompat"]) + self.resolve_reference(new_pandascompat, docname) + content.append(new_pandascompat) - node.replace_self(content) + ref = self.create_reference(pandascompat, docname) + content.append(ref) + + node.replace_self(content) + + def create_reference(self, pandascompat, docname): + para = nodes.paragraph() + newnode = nodes.reference("", "") + innernode = nodes.emphasis( + get_translation_sphinx("[source]"), get_translation_sphinx("[source]") + ) + newnode["refdocname"] = pandascompat["docname"] + try: + newnode["refuri"] = self.builder.get_relative_uri( + docname, pandascompat["docname"] + ) + "#" + pandascompat["target"]["refid"] + except NoUri: + # ignore if no URI can be determined, e.g. 
for LaTeX output + pass + newnode.append(innernode) + para += newnode + return para + + def resolve_reference(self, todo, docname: str) -> None: + """Resolve references in the todo content.""" + for node in todo.findall(addnodes.pending_xref): + if "refdoc" in node: + node["refdoc"] = docname + + # Note: To resolve references, it is needed to wrap it with document node + self.document += todo + self.env.resolve_references(self.document, docname, self.builder) + self.document.remove(todo) def setup(app): app.add_config_value("include_pandas_compat", False, "html") - app.add_node(PandasCompatList) app.add_node( PandasCompat, html=(visit_PandasCompat_node, depart_PandasCompat_node), latex=(visit_PandasCompat_node, depart_PandasCompat_node), text=(visit_PandasCompat_node, depart_PandasCompat_node), + man=(visit_PandasCompat_node, depart_PandasCompat_node), + texinfo=(visit_PandasCompat_node, depart_PandasCompat_node), ) - - # Sphinx directives are lower-cased app.add_directive("pandas-compat", PandasCompatDirective) app.add_directive("pandas-compat-list", PandasCompatListDirective) - app.connect("doctree-resolved", process_PandasCompat_nodes) - app.connect("env-purge-doc", purge_PandasCompats) - app.connect("env-merge-info", merge_PandasCompats) + app.add_domain(PandasCompatDomain) + app.connect("doctree-resolved", PandasCompatListProcessor) return { "version": "0.1", diff --git a/docs/cudf/source/conf.py b/docs/cudf/source/conf.py index 73d8b4445d3..e9c760e288e 100644 --- a/docs/cudf/source/conf.py +++ b/docs/cudf/source/conf.py @@ -617,6 +617,8 @@ def linkcode_resolve(domain, info) -> str | None: f"branch-{version}/python/cudf/cudf/{fn}{linespec}" ) +# Needed for avoid build warning for PandasCompat extension +suppress_warnings = ["myst.domains"] def setup(app): app.add_css_file("https://docs.rapids.ai/assets/css/custom.css") From 570df6c5fbb0a2120b539aba0a65702c2190527f Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Mon, 10 Jun 2024 15:24:40 -1000 Subject: [PATCH 41/48] Add typing to single_column_frame (#15965) Also removes an extra copy from `.flatten()` when calling `.values` or `.values_host` Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Thomas Li (https://github.com/lithomas1) URL: https://github.com/rapidsai/cudf/pull/15965 --- python/cudf/cudf/api/types.py | 7 ++- python/cudf/cudf/core/column/column.py | 4 +- python/cudf/cudf/core/single_column_frame.py | 58 ++++++++------------ 3 files changed, 29 insertions(+), 40 deletions(-) diff --git a/python/cudf/cudf/api/types.py b/python/cudf/cudf/api/types.py index 417d8b0922a..42b1524bd76 100644 --- a/python/cudf/cudf/api/types.py +++ b/python/cudf/cudf/api/types.py @@ -8,7 +8,7 @@ from collections import abc from functools import wraps from inspect import isclass -from typing import List, Union +from typing import List, Union, cast import cupy as cp import numpy as np @@ -238,7 +238,10 @@ def _union_categoricals( raise TypeError("ignore_order is not yet implemented") result_col = cudf.core.column.CategoricalColumn._concat( - [obj._column for obj in to_union] + [ + cast(cudf.core.column.CategoricalColumn, obj._column) + for obj in to_union + ] ) if sort_categories: sorted_categories = result_col.categories.sort_values(ascending=True) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index f87797a1fa3..7abdbc85720 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -212,7 
+212,7 @@ def to_pandas( return pd.Index(pa_array.to_pandas()) @property - def values_host(self) -> "np.ndarray": + def values_host(self) -> np.ndarray: """ Return a numpy representation of the Column. """ @@ -226,7 +226,7 @@ def values_host(self) -> "np.ndarray": return self.data_array_view(mode="read").copy_to_host() @property - def values(self) -> "cupy.ndarray": + def values(self) -> cupy.ndarray: """ Return a CuPy representation of the Column. """ diff --git a/python/cudf/cudf/core/single_column_frame.py b/python/cudf/cudf/core/single_column_frame.py index d864b563208..acc74129a29 100644 --- a/python/cudf/cudf/core/single_column_frame.py +++ b/python/cudf/cudf/core/single_column_frame.py @@ -7,9 +7,11 @@ import cupy import numpy +import pyarrow as pa +from typing_extensions import Self import cudf -from cudf._typing import Dtype, NotImplementedType, ScalarLike +from cudf._typing import NotImplementedType, ScalarLike from cudf.api.extensions import no_default from cudf.api.types import ( _is_scalar_or_zero_d_array, @@ -27,8 +29,8 @@ class SingleColumnFrame(Frame, NotIterable): """A one-dimensional frame. - Frames with only a single column share certain logic that is encoded in - this class. + Frames with only a single column (Index or Series) + share certain logic that is encoded in this class. """ _SUPPORT_AXIS_LOOKUP = { @@ -47,7 +49,7 @@ def _reduce( if axis not in (None, 0, no_default): raise NotImplementedError("axis parameter is not implemented yet") - if numeric_only and not is_numeric_dtype(self._column): + if numeric_only and not is_numeric_dtype(self.dtype): raise TypeError( f"Series.{op} does not allow numeric_only={numeric_only} " "with non-numeric dtypes." @@ -68,7 +70,7 @@ def _scan(self, op, axis=None, *args, **kwargs): @_cudf_nvtx_annotate def name(self): """Get the name of this object.""" - return next(iter(self._data.names)) + return next(iter(self._column_names)) @name.setter # type: ignore @_cudf_nvtx_annotate @@ -83,7 +85,7 @@ def ndim(self) -> int: # noqa: D401 @property # type: ignore @_cudf_nvtx_annotate - def shape(self): + def shape(self) -> tuple[int]: """Get a tuple representing the dimensionality of the Index.""" return (len(self),) @@ -95,45 +97,27 @@ def __bool__(self): @property # type: ignore @_cudf_nvtx_annotate - def _num_columns(self): + def _num_columns(self) -> int: return 1 @property # type: ignore @_cudf_nvtx_annotate - def _column(self): - return self._data[self.name] + def _column(self) -> ColumnBase: + return next(iter(self._columns)) @property # type: ignore @_cudf_nvtx_annotate - def values(self): # noqa: D102 + def values(self) -> cupy.ndarray: # noqa: D102 return self._column.values @property # type: ignore @_cudf_nvtx_annotate - def values_host(self): # noqa: D102 + def values_host(self) -> numpy.ndarray: # noqa: D102 return self._column.values_host - @_cudf_nvtx_annotate - def to_cupy( - self, - dtype: Union[Dtype, None] = None, - copy: bool = True, - na_value=None, - ) -> cupy.ndarray: # noqa: D102 - return super().to_cupy(dtype, copy, na_value).flatten() - - @_cudf_nvtx_annotate - def to_numpy( - self, - dtype: Union[Dtype, None] = None, - copy: bool = True, - na_value=None, - ) -> numpy.ndarray: # noqa: D102 - return super().to_numpy(dtype, copy, na_value).flatten() - @classmethod @_cudf_nvtx_annotate - def from_arrow(cls, array): + def from_arrow(cls, array) -> Self: """Create from PyArrow Array/ChunkedArray. 
Parameters @@ -164,7 +148,7 @@ def from_arrow(cls, array): return cls(ColumnBase.from_arrow(array)) @_cudf_nvtx_annotate - def to_arrow(self): + def to_arrow(self) -> pa.Array: """ Convert to a PyArrow Array. @@ -196,7 +180,7 @@ def to_arrow(self): @property # type: ignore @_cudf_nvtx_annotate - def is_unique(self): + def is_unique(self) -> bool: """Return boolean if values in the object are unique. Returns @@ -207,7 +191,7 @@ def is_unique(self): @property # type: ignore @_cudf_nvtx_annotate - def is_monotonic_increasing(self): + def is_monotonic_increasing(self) -> bool: """Return boolean if values in the object are monotonically increasing. Returns @@ -218,7 +202,7 @@ def is_monotonic_increasing(self): @property # type: ignore @_cudf_nvtx_annotate - def is_monotonic_decreasing(self): + def is_monotonic_decreasing(self) -> bool: """Return boolean if values in the object are monotonically decreasing. Returns @@ -243,7 +227,9 @@ def __cuda_array_interface__(self): ) @_cudf_nvtx_annotate - def factorize(self, sort=False, use_na_sentinel=True): + def factorize( + self, sort: bool = False, use_na_sentinel: bool = True + ) -> tuple[cupy.ndarray, cudf.Index]: """Encode the input values as integer labels. Parameters @@ -335,7 +321,7 @@ def _make_operands_for_binop( return {result_name: (self._column, other, reflect, fill_value)} @_cudf_nvtx_annotate - def nunique(self, dropna: bool = True): + def nunique(self, dropna: bool = True) -> int: """ Return count of unique values for the column. From 1bd210d76ab05c669aea230b9287b76a03328efa Mon Sep 17 00:00:00 2001 From: Ben Jarmak <104460670+jarmak-nv@users.noreply.github.com> Date: Mon, 10 Jun 2024 21:35:46 -0400 Subject: [PATCH 42/48] Add external issue label and project automation (#15945) This PR creates two new GitHub Actions around issue and PR tracking ### `external_issue_labeler.yml` This action automatically adds a label, currently `External`, to any issue or PR that is opened by someone that is not either an owner, member, or collaborator to the cuDF repo ### `pr_issue_status_automation.yml` This action uses the [shared workflows](https://github.com/rapidsai/shared-workflows/tree/branch-24.08/.github/workflows) in rapdsai/shared-workflows to, on open/edit/synchronize of an open PR, to: 1. Set the PR to `in progress` 2. Set all linked issues `in progress` 3. Set the PR's sprint to the current iteration 4. Set all linked issues to the current iteration Edit triggers on edit of the PR description, (so new linked issues will get synchronized to `in progress`). Synchronize triggers on push and rebase events - this really is to cover the "what are we working on right now" because anything we touch goes into the current sprint in the project. 
Authors: - Ben Jarmak (https://github.com/jarmak-nv) - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/15945 --- .github/workflows/external_issue_labeler.yml | 55 ++++++++++++++++ .../workflows/pr_issue_status_automation.yml | 64 +++++++++++++++++++ 2 files changed, 119 insertions(+) create mode 100644 .github/workflows/external_issue_labeler.yml create mode 100644 .github/workflows/pr_issue_status_automation.yml diff --git a/.github/workflows/external_issue_labeler.yml b/.github/workflows/external_issue_labeler.yml new file mode 100644 index 00000000000..e6d987e9f34 --- /dev/null +++ b/.github/workflows/external_issue_labeler.yml @@ -0,0 +1,55 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +name: Label external issues and PRs + +on: + issues: + types: + - opened + + pull_request: + types: + - opened + +env: + GITHUB_TOKEN: ${{ github.token }} + +permissions: + issues: write + pull-requests: write + +jobs: + Label-Issue: + runs-on: ubuntu-latest + # Only run if the issue author is not part of RAPIDS + if: ${{ ! contains(fromJSON('["OWNER", "MEMBER", "COLLABORATOR"]'), github.event.issue.author_association)}} + steps: + - name: add-external-labels + run: | + issue_url=${{ github.event.issue.html_url }} + gh issue edit ${issue_url} --add-label "External" + continue-on-error: true + + Label-PR: + runs-on: ubuntu-latest + # Only run if the issue author is not part of RAPIDS + if: ${{ ! contains(fromJSON('["OWNER", "MEMBER", "COLLABORATOR"]'), github.event.pull_request.author_association)}} + steps: + - name: add-external-labels + run: | + pr_url=${{ github.event.pull_request.html_url }} + gh issue edit ${pr_url} --add-label "External" + continue-on-error: true diff --git a/.github/workflows/pr_issue_status_automation.yml b/.github/workflows/pr_issue_status_automation.yml new file mode 100644 index 00000000000..aaece1bfa3e --- /dev/null +++ b/.github/workflows/pr_issue_status_automation.yml @@ -0,0 +1,64 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +name: Set PR and Issue Project Fields + +on: + pull_request_target: + # This job runs when a PR is first opened, or it is updated + # Only runs if the PR is open (we don't want to update the status of a closed PR) + types: [opened, edited, synchronize] + +jobs: + get-project-id: + uses: rapidsai/shared-workflows/.github/workflows/project-get-item-id.yaml@branch-24.08 + if: github.event.pull_request.state == 'open' + secrets: inherit + permissions: + contents: read + with: + PROJECT_ID: "PVT_kwDOAp2shc4AiNzl" + ITEM_NODE_ID: "${{ github.event.pull_request.node_id }}" + + update-status: + # This job sets the PR and its linked issues to "In Progress" status + uses: rapidsai/shared-workflows/.github/workflows/project-get-set-single-select-field.yaml@branch-24.08 + if: github.event.pull_request.state == 'open' + needs: get-project-id + with: + PROJECT_ID: "PVT_kwDOAp2shc4AiNzl" + SINGLE_SELECT_FIELD_ID: "PVTSSF_lADOAp2shc4AiNzlzgaxNac" + SINGLE_SELECT_FIELD_NAME: "Status" + SINGLE_SELECT_OPTION_VALUE: "In Progress" + ITEM_PROJECT_ID: "${{ needs.get-project-id.outputs.ITEM_PROJECT_ID }}" + ITEM_NODE_ID: "${{ github.event.pull_request.node_id }}" + UPDATE_ITEM: true + UPDATE_LINKED_ISSUES: true + secrets: inherit + + update-sprint: + # This job sets the PR and its linked issues to the current "Weekly Sprint" + uses: jarmak-nv/shared-workflows/.github/workflows/project-get-set-iteration-field.yaml@branch-24.08 + if: github.event.pull_request.state == 'open' + needs: get-project-id + with: + PROJECT_ID: "PVT_kwDOAp2shc4AiNzl" + ITERATION_FIELD_ID: "PVTIF_lADOAp2shc4AiNzlzgbU_po" + ITERATION_FIELD_NAME: "Weekly Sprint" + ITEM_PROJECT_ID: "${{ needs.get-project-id.outputs.ITEM_PROJECT_ID }}" + ITEM_NODE_ID: "${{ github.event.pull_request.node_id }}" + UPDATE_ITEM: true + UPDATE_LINKED_ISSUES: true + secrets: inherit From ff1e4bb82ce4ab8ac54bc8715bf761a3700024bc Mon Sep 17 00:00:00 2001 From: Srinivas Yadav <43375352+srinivasyadav18@users.noreply.github.com> Date: Mon, 10 Jun 2024 19:34:00 -0700 Subject: [PATCH 43/48] Migrate left join and conditional join benchmarks to use nvbench (#15931) The current [left join](https://github.com/rapidsai/cudf/blob/580ee40bf5fe1a66eaba914cdddb718a09193bab/cpp/benchmarks/join/left_join.cu) and [conditional join](https://github.com/rapidsai/cudf/blob/580ee40bf5fe1a66eaba914cdddb718a09193bab/cpp/benchmarks/join/conditional_join.cu) benchmarks are still using gbench. This PR migrates the **left join** and **conditional join** benchmarks to use **nvbench**. Closes #15699. 
- [x] Migrate from gbench to nvbench - [x] Similar to #15644, use `JOIN_KEY_TYPE_RANGE`, `JOIN_NULLABLE_RANGE` and `JOIN_SIZE_RANGE` to reduce the number of test cases and simplify the implementation - [x] Get rid of the dispatching between gbench and nvbench in [join_common.hpp](https://github.com/rapidsai/cudf/blob/580ee40bf5fe1a66eaba914cdddb718a09193bab/cpp/benchmarks/join/join_common.hpp) Authors: - Srinivas Yadav (https://github.com/srinivasyadav18) - Yunsong Wang (https://github.com/PointKernel) Approvers: - Yunsong Wang (https://github.com/PointKernel) - Shruti Shivakumar (https://github.com/shrshi) - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/15931 --- cpp/benchmarks/CMakeLists.txt | 6 +- cpp/benchmarks/join/conditional_join.cu | 288 ++++-------------------- cpp/benchmarks/join/join_common.hpp | 99 +++----- cpp/benchmarks/join/left_join.cu | 152 ++++--------- 4 files changed, 116 insertions(+), 429 deletions(-) diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index 10f645dfec0..49504e53424 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ b/cpp/benchmarks/CMakeLists.txt @@ -163,8 +163,10 @@ ConfigureNVBench( # ################################################################################################## # * join benchmark -------------------------------------------------------------------------------- -ConfigureBench(JOIN_BENCH join/left_join.cu join/conditional_join.cu) -ConfigureNVBench(JOIN_NVBENCH join/join.cu join/mixed_join.cu join/distinct_join.cu) +ConfigureNVBench( + JOIN_NVBENCH join/left_join.cu join/conditional_join.cu join/join.cu join/mixed_join.cu + join/distinct_join.cu +) # ################################################################################################## # * iterator benchmark ---------------------------------------------------------------------------- diff --git a/cpp/benchmarks/join/conditional_join.cu b/cpp/benchmarks/join/conditional_join.cu index d95fc0a5b59..e332d09d31b 100644 --- a/cpp/benchmarks/join/conditional_join.cu +++ b/cpp/benchmarks/join/conditional_join.cu @@ -14,250 +14,44 @@ * limitations under the License. */ -#include - -template -class ConditionalJoin : public cudf::benchmark {}; - -// For compatibility with the shared logic for equality (hash) joins, all of -// the join lambdas defined by these macros accept a null_equality parameter -// but ignore it (don't forward it to the underlying join implementation) -// because conditional joins do not use this parameter. 
-#define CONDITIONAL_INNER_JOIN_BENCHMARK_DEFINE(name, Key, Nullable) \ - BENCHMARK_TEMPLATE_DEFINE_F(ConditionalJoin, name, Key) \ - (::benchmark::State & st) \ - { \ - auto join = [](cudf::table_view const& left, \ - cudf::table_view const& right, \ - cudf::ast::operation binary_pred, \ - cudf::null_equality compare_nulls) { \ - return cudf::conditional_inner_join(left, right, binary_pred); \ - }; \ - BM_join(st, join); \ - } - -CONDITIONAL_INNER_JOIN_BENCHMARK_DEFINE(conditional_inner_join_32bit, int32_t, false); -CONDITIONAL_INNER_JOIN_BENCHMARK_DEFINE(conditional_inner_join_64bit, int64_t, false); -CONDITIONAL_INNER_JOIN_BENCHMARK_DEFINE(conditional_inner_join_32bit_nulls, int32_t, true); -CONDITIONAL_INNER_JOIN_BENCHMARK_DEFINE(conditional_inner_join_64bit_nulls, int64_t, true); - -#define CONDITIONAL_LEFT_JOIN_BENCHMARK_DEFINE(name, Key, Nullable) \ - BENCHMARK_TEMPLATE_DEFINE_F(ConditionalJoin, name, Key) \ - (::benchmark::State & st) \ - { \ - auto join = [](cudf::table_view const& left, \ - cudf::table_view const& right, \ - cudf::ast::operation binary_pred, \ - cudf::null_equality compare_nulls) { \ - return cudf::conditional_left_join(left, right, binary_pred); \ - }; \ - BM_join(st, join); \ - } - -CONDITIONAL_LEFT_JOIN_BENCHMARK_DEFINE(conditional_left_join_32bit, int32_t, false); -CONDITIONAL_LEFT_JOIN_BENCHMARK_DEFINE(conditional_left_join_64bit, int64_t, false); -CONDITIONAL_LEFT_JOIN_BENCHMARK_DEFINE(conditional_left_join_32bit_nulls, int32_t, true); -CONDITIONAL_LEFT_JOIN_BENCHMARK_DEFINE(conditional_left_join_64bit_nulls, int64_t, true); - -#define CONDITIONAL_FULL_JOIN_BENCHMARK_DEFINE(name, Key, Nullable) \ - BENCHMARK_TEMPLATE_DEFINE_F(ConditionalJoin, name, Key) \ - (::benchmark::State & st) \ - { \ - auto join = [](cudf::table_view const& left, \ - cudf::table_view const& right, \ - cudf::ast::operation binary_pred, \ - cudf::null_equality compare_nulls) { \ - return cudf::conditional_full_join(left, right, binary_pred); \ - }; \ - BM_join(st, join); \ - } - -CONDITIONAL_FULL_JOIN_BENCHMARK_DEFINE(conditional_full_join_32bit, int32_t, false); -CONDITIONAL_FULL_JOIN_BENCHMARK_DEFINE(conditional_full_join_64bit, int64_t, false); -CONDITIONAL_FULL_JOIN_BENCHMARK_DEFINE(conditional_full_join_32bit_nulls, int32_t, true); -CONDITIONAL_FULL_JOIN_BENCHMARK_DEFINE(conditional_full_join_64bit_nulls, int64_t, true); - -#define CONDITIONAL_LEFT_ANTI_JOIN_BENCHMARK_DEFINE(name, Key, Nullable) \ - BENCHMARK_TEMPLATE_DEFINE_F(ConditionalJoin, name, Key) \ - (::benchmark::State & st) \ - { \ - auto join = [](cudf::table_view const& left, \ - cudf::table_view const& right, \ - cudf::ast::operation binary_pred, \ - cudf::null_equality compare_nulls) { \ - return cudf::conditional_left_anti_join(left, right, binary_pred); \ - }; \ - BM_join(st, join); \ - } - -CONDITIONAL_LEFT_ANTI_JOIN_BENCHMARK_DEFINE(conditional_left_anti_join_32bit, int32_t, false); -CONDITIONAL_LEFT_ANTI_JOIN_BENCHMARK_DEFINE(conditional_left_anti_join_64bit, int64_t, false); -CONDITIONAL_LEFT_ANTI_JOIN_BENCHMARK_DEFINE(conditional_left_anti_join_32bit_nulls, int32_t, true); -CONDITIONAL_LEFT_ANTI_JOIN_BENCHMARK_DEFINE(conditional_left_anti_join_64bit_nulls, int64_t, true); - -#define CONDITIONAL_LEFT_SEMI_JOIN_BENCHMARK_DEFINE(name, Key, Nullable) \ - BENCHMARK_TEMPLATE_DEFINE_F(ConditionalJoin, name, Key) \ - (::benchmark::State & st) \ - { \ - auto join = [](cudf::table_view const& left, \ - cudf::table_view const& right, \ - cudf::ast::operation binary_pred, \ - cudf::null_equality compare_nulls) { \ - 
return cudf::conditional_left_semi_join(left, right, binary_pred); \ - }; \ - BM_join(st, join); \ - } - -CONDITIONAL_LEFT_SEMI_JOIN_BENCHMARK_DEFINE(conditional_left_semi_join_32bit, int32_t, false); -CONDITIONAL_LEFT_SEMI_JOIN_BENCHMARK_DEFINE(conditional_left_semi_join_64bit, int64_t, false); -CONDITIONAL_LEFT_SEMI_JOIN_BENCHMARK_DEFINE(conditional_left_semi_join_32bit_nulls, int32_t, true); -CONDITIONAL_LEFT_SEMI_JOIN_BENCHMARK_DEFINE(conditional_left_semi_join_64bit_nulls, int64_t, true); - -// inner join ----------------------------------------------------------------------- -BENCHMARK_REGISTER_F(ConditionalJoin, conditional_inner_join_32bit) - ->Unit(benchmark::kMillisecond) - ->Args({100'000, 100'000}) - ->Args({100'000, 400'000}) - ->Args({400'000, 100'000}) - ->Args({100'000, 1'000'000}) - ->UseManualTime(); - -BENCHMARK_REGISTER_F(ConditionalJoin, conditional_inner_join_64bit) - ->Unit(benchmark::kMillisecond) - ->Args({100'000, 100'000}) - ->Args({100'000, 400'000}) - ->Args({400'000, 100'000}) - ->Args({100'000, 1'000'000}) - ->UseManualTime(); - -BENCHMARK_REGISTER_F(ConditionalJoin, conditional_inner_join_32bit_nulls) - ->Unit(benchmark::kMillisecond) - ->Args({100'000, 100'000}) - ->Args({100'000, 400'000}) - ->Args({400'000, 100'000}) - ->Args({100'000, 1'000'000}) - ->UseManualTime(); - -BENCHMARK_REGISTER_F(ConditionalJoin, conditional_inner_join_64bit_nulls) - ->Unit(benchmark::kMillisecond) - ->Args({100'000, 100'000}) - ->Args({100'000, 400'000}) - ->Args({400'000, 100'000}) - ->Args({100'000, 1'000'000}) - ->UseManualTime(); - -// left join ----------------------------------------------------------------------- -BENCHMARK_REGISTER_F(ConditionalJoin, conditional_left_join_32bit) - ->Unit(benchmark::kMillisecond) - ->Args({100'000, 100'000}) - ->Args({100'000, 400'000}) - ->Args({100'000, 1'000'000}) - ->UseManualTime(); - -BENCHMARK_REGISTER_F(ConditionalJoin, conditional_left_join_64bit) - ->Unit(benchmark::kMillisecond) - ->Args({100'000, 100'000}) - ->Args({100'000, 400'000}) - ->Args({100'000, 1'000'000}) - ->UseManualTime(); - -BENCHMARK_REGISTER_F(ConditionalJoin, conditional_left_join_32bit_nulls) - ->Unit(benchmark::kMillisecond) - ->Args({100'000, 100'000}) - ->Args({100'000, 400'000}) - ->Args({100'000, 1'000'000}) - ->UseManualTime(); - -BENCHMARK_REGISTER_F(ConditionalJoin, conditional_left_join_64bit_nulls) - ->Unit(benchmark::kMillisecond) - ->Args({100'000, 100'000}) - ->Args({100'000, 400'000}) - ->Args({100'000, 1'000'000}) - ->UseManualTime(); - -// full join ----------------------------------------------------------------------- -BENCHMARK_REGISTER_F(ConditionalJoin, conditional_full_join_32bit) - ->Unit(benchmark::kMillisecond) - ->Args({100'000, 100'000}) - ->Args({100'000, 400'000}) - ->Args({100'000, 1'000'000}) - ->UseManualTime(); - -BENCHMARK_REGISTER_F(ConditionalJoin, conditional_full_join_64bit) - ->Unit(benchmark::kMillisecond) - ->Args({100'000, 100'000}) - ->Args({100'000, 400'000}) - ->Args({100'000, 1'000'000}) - ->UseManualTime(); - -BENCHMARK_REGISTER_F(ConditionalJoin, conditional_full_join_32bit_nulls) - ->Unit(benchmark::kMillisecond) - ->Args({100'000, 100'000}) - ->Args({100'000, 400'000}) - ->Args({100'000, 1'000'000}) - ->UseManualTime(); - -BENCHMARK_REGISTER_F(ConditionalJoin, conditional_full_join_64bit_nulls) - ->Unit(benchmark::kMillisecond) - ->Args({100'000, 100'000}) - ->Args({100'000, 400'000}) - ->Args({100'000, 1'000'000}) - ->UseManualTime(); - -// left anti-join 
------------------------------------------------------------- -BENCHMARK_REGISTER_F(ConditionalJoin, conditional_left_anti_join_32bit) - ->Unit(benchmark::kMillisecond) - ->Args({100'000, 100'000}) - ->Args({100'000, 400'000}) - ->Args({100'000, 1'000'000}) - ->UseManualTime(); - -BENCHMARK_REGISTER_F(ConditionalJoin, conditional_left_anti_join_64bit) - ->Unit(benchmark::kMillisecond) - ->Args({100'000, 100'000}) - ->Args({100'000, 400'000}) - ->Args({100'000, 1'000'000}) - ->UseManualTime(); - -BENCHMARK_REGISTER_F(ConditionalJoin, conditional_left_anti_join_32bit_nulls) - ->Unit(benchmark::kMillisecond) - ->Args({100'000, 100'000}) - ->Args({100'000, 400'000}) - ->Args({100'000, 1'000'000}) - ->UseManualTime(); - -BENCHMARK_REGISTER_F(ConditionalJoin, conditional_left_anti_join_64bit_nulls) - ->Unit(benchmark::kMillisecond) - ->Args({100'000, 100'000}) - ->Args({100'000, 400'000}) - ->Args({100'000, 1'000'000}) - ->UseManualTime(); - -// left semi-join ------------------------------------------------------------- -BENCHMARK_REGISTER_F(ConditionalJoin, conditional_left_semi_join_32bit) - ->Unit(benchmark::kMillisecond) - ->Args({100'000, 100'000}) - ->Args({100'000, 400'000}) - ->Args({100'000, 1'000'000}) - ->UseManualTime(); - -BENCHMARK_REGISTER_F(ConditionalJoin, conditional_left_semi_join_64bit) - ->Unit(benchmark::kMillisecond) - ->Args({100'000, 100'000}) - ->Args({100'000, 400'000}) - ->Args({100'000, 1'000'000}) - ->UseManualTime(); - -BENCHMARK_REGISTER_F(ConditionalJoin, conditional_left_semi_join_32bit_nulls) - ->Unit(benchmark::kMillisecond) - ->Args({100'000, 100'000}) - ->Args({100'000, 400'000}) - ->Args({100'000, 1'000'000}) - ->UseManualTime(); - -BENCHMARK_REGISTER_F(ConditionalJoin, conditional_left_semi_join_64bit_nulls) - ->Unit(benchmark::kMillisecond) - ->Args({100'000, 100'000}) - ->Args({100'000, 400'000}) - ->Args({100'000, 1'000'000}) - ->UseManualTime(); +#include "join_common.hpp" + +template +void nvbench_conditional_inner_join(nvbench::state& state, + nvbench::type_list>) +{ + auto join = [](cudf::table_view const& left, + cudf::table_view const& right, + cudf::ast::operation binary_pred, + cudf::null_equality compare_nulls) { + return cudf::conditional_inner_join(left, right, binary_pred); + }; + BM_join(state, join); +} + +template +void nvbench_conditional_left_join(nvbench::state& state, + nvbench::type_list>) +{ + auto join = [](cudf::table_view const& left, + cudf::table_view const& right, + cudf::ast::operation binary_pred, + cudf::null_equality compare_nulls) { + return cudf::conditional_left_join(left, right, binary_pred); + }; + BM_join(state, join); +} + +NVBENCH_BENCH_TYPES(nvbench_conditional_inner_join, + NVBENCH_TYPE_AXES(JOIN_KEY_TYPE_RANGE, JOIN_NULLABLE_RANGE)) + .set_name("conditional_inner_join") + .set_type_axes_names({"Key", "Nullable"}) + .add_int64_axis("left_size", JOIN_SIZE_RANGE) + .add_int64_axis("right_size", JOIN_SIZE_RANGE); + +NVBENCH_BENCH_TYPES(nvbench_conditional_left_join, + NVBENCH_TYPE_AXES(JOIN_KEY_TYPE_RANGE, JOIN_NULLABLE_RANGE)) + .set_name("conditional_left_join") + .set_type_axes_names({"Key", "Nullable"}) + .add_int64_axis("left_size", JOIN_SIZE_RANGE) + .add_int64_axis("right_size", JOIN_SIZE_RANGE); diff --git a/cpp/benchmarks/join/join_common.hpp b/cpp/benchmarks/join/join_common.hpp index e6792b9dbfb..3d9d9c57548 100644 --- a/cpp/benchmarks/join/join_common.hpp +++ b/cpp/benchmarks/join/join_common.hpp @@ -19,7 +19,6 @@ #include "generate_input_tables.cuh" #include -#include #include #include @@ -67,28 +66,12 
@@ template void BM_join(state_type& state, Join JoinFunc) { - auto const right_size = [&]() { - if constexpr (std::is_same_v) { - return static_cast(state.range(0)); - } - if constexpr (std::is_same_v) { - return static_cast(state.get_int64("right_size")); - } - }(); - auto const left_size = [&]() { - if constexpr (std::is_same_v) { - return static_cast(state.range(1)); - } - if constexpr (std::is_same_v) { - return static_cast(state.get_int64("left_size")); - } - }(); + auto const right_size = static_cast(state.get_int64("right_size")); + auto const left_size = static_cast(state.get_int64("left_size")); - if constexpr (std::is_same_v) { - if (right_size > left_size) { - state.skip("Skip large right table"); - return; - } + if (right_size > left_size) { + state.skip("Skip large right table"); + return; } double const selectivity = 0.3; @@ -165,57 +148,37 @@ void BM_join(state_type& state, Join JoinFunc) // Setup join parameters and result table [[maybe_unused]] std::vector columns_to_join = {0}; - - // Benchmark the inner join operation - if constexpr (std::is_same_v and - (join_type != join_t::CONDITIONAL)) { - for (auto _ : state) { - cuda_event_timer raii(state, true, cudf::get_default_stream()); - - auto result = JoinFunc(left_table.select(columns_to_join), - right_table.select(columns_to_join), - cudf::null_equality::UNEQUAL); - } - } - if constexpr (std::is_same_v and (join_type != join_t::CONDITIONAL)) { - state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); - if constexpr (join_type == join_t::MIXED) { - auto const col_ref_left_0 = cudf::ast::column_reference(0); - auto const col_ref_right_0 = - cudf::ast::column_reference(0, cudf::ast::table_reference::RIGHT); - auto left_zero_eq_right_zero = - cudf::ast::operation(cudf::ast::ast_operator::EQUAL, col_ref_left_0, col_ref_right_0); - state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { - auto result = JoinFunc(left_table.select(columns_to_join), - right_table.select(columns_to_join), - left_table.select({1}), - right_table.select({1}), - left_zero_eq_right_zero, - cudf::null_equality::UNEQUAL); - }); - } - if constexpr (join_type == join_t::HASH) { - state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { - auto result = JoinFunc(left_table.select(columns_to_join), - right_table.select(columns_to_join), - cudf::null_equality::UNEQUAL); - }); - } - } - - // Benchmark conditional join - if constexpr (std::is_same_v and join_type == join_t::CONDITIONAL) { - // Common column references. 
+ state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); + if constexpr (join_type == join_t::CONDITIONAL) { auto const col_ref_left_0 = cudf::ast::column_reference(0); auto const col_ref_right_0 = cudf::ast::column_reference(0, cudf::ast::table_reference::RIGHT); auto left_zero_eq_right_zero = cudf::ast::operation(cudf::ast::ast_operator::EQUAL, col_ref_left_0, col_ref_right_0); - - for (auto _ : state) { - cuda_event_timer raii(state, true, cudf::get_default_stream()); - + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { auto result = JoinFunc(left_table, right_table, left_zero_eq_right_zero, cudf::null_equality::UNEQUAL); - } + ; + }); + } + if constexpr (join_type == join_t::MIXED) { + auto const col_ref_left_0 = cudf::ast::column_reference(0); + auto const col_ref_right_0 = cudf::ast::column_reference(0, cudf::ast::table_reference::RIGHT); + auto left_zero_eq_right_zero = + cudf::ast::operation(cudf::ast::ast_operator::EQUAL, col_ref_left_0, col_ref_right_0); + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + auto result = JoinFunc(left_table.select(columns_to_join), + right_table.select(columns_to_join), + left_table.select({1}), + right_table.select({1}), + left_zero_eq_right_zero, + cudf::null_equality::UNEQUAL); + }); + } + if constexpr (join_type == join_t::HASH) { + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + auto result = JoinFunc(left_table.select(columns_to_join), + right_table.select(columns_to_join), + cudf::null_equality::UNEQUAL); + }); } } diff --git a/cpp/benchmarks/join/left_join.cu b/cpp/benchmarks/join/left_join.cu index 3e398e721fa..92123ce1621 100644 --- a/cpp/benchmarks/join/left_join.cu +++ b/cpp/benchmarks/join/left_join.cu @@ -14,115 +14,43 @@ * limitations under the License. 
*/ -#include - -template -class Join : public cudf::benchmark {}; - -#define LEFT_ANTI_JOIN_BENCHMARK_DEFINE(name, Key, Nullable) \ - BENCHMARK_TEMPLATE_DEFINE_F(Join, name, Key) \ - (::benchmark::State & st) \ - { \ - auto join = [](cudf::table_view const& left, \ - cudf::table_view const& right, \ - cudf::null_equality compare_nulls) { \ - return cudf::left_anti_join(left, right, compare_nulls); \ - }; \ - BM_join(st, join); \ - } - -LEFT_ANTI_JOIN_BENCHMARK_DEFINE(left_anti_join_32bit, int32_t, false); -LEFT_ANTI_JOIN_BENCHMARK_DEFINE(left_anti_join_64bit, int64_t, false); -LEFT_ANTI_JOIN_BENCHMARK_DEFINE(left_anti_join_32bit_nulls, int32_t, true); -LEFT_ANTI_JOIN_BENCHMARK_DEFINE(left_anti_join_64bit_nulls, int64_t, true); - -#define LEFT_SEMI_JOIN_BENCHMARK_DEFINE(name, Key, Nullable) \ - BENCHMARK_TEMPLATE_DEFINE_F(Join, name, Key) \ - (::benchmark::State & st) \ - { \ - auto join = [](cudf::table_view const& left, \ - cudf::table_view const& right, \ - cudf::null_equality compare_nulls) { \ - return cudf::left_semi_join(left, right, compare_nulls); \ - }; \ - BM_join(st, join); \ - } - -LEFT_SEMI_JOIN_BENCHMARK_DEFINE(left_semi_join_32bit, int32_t, false); -LEFT_SEMI_JOIN_BENCHMARK_DEFINE(left_semi_join_64bit, int64_t, false); -LEFT_SEMI_JOIN_BENCHMARK_DEFINE(left_semi_join_32bit_nulls, int32_t, true); -LEFT_SEMI_JOIN_BENCHMARK_DEFINE(left_semi_join_64bit_nulls, int64_t, true); - -// left anti-join ------------------------------------------------------------- -BENCHMARK_REGISTER_F(Join, left_anti_join_32bit) - ->Unit(benchmark::kMillisecond) - ->Args({100'000, 100'000}) - ->Args({100'000, 400'000}) - ->Args({100'000, 1'000'000}) - ->Args({10'000'000, 10'000'000}) - ->Args({10'000'000, 40'000'000}) - ->Args({10'000'000, 100'000'000}) - ->Args({100'000'000, 100'000'000}) - ->Args({80'000'000, 240'000'000}) - ->UseManualTime(); - -BENCHMARK_REGISTER_F(Join, left_anti_join_64bit) - ->Unit(benchmark::kMillisecond) - ->Args({50'000'000, 50'000'000}) - ->Args({40'000'000, 120'000'000}) - ->UseManualTime(); - -BENCHMARK_REGISTER_F(Join, left_anti_join_32bit_nulls) - ->Unit(benchmark::kMillisecond) - ->Args({100'000, 100'000}) - ->Args({100'000, 400'000}) - ->Args({100'000, 1'000'000}) - ->Args({10'000'000, 10'000'000}) - ->Args({10'000'000, 40'000'000}) - ->Args({10'000'000, 100'000'000}) - ->Args({100'000'000, 100'000'000}) - ->Args({80'000'000, 240'000'000}) - ->UseManualTime(); - -BENCHMARK_REGISTER_F(Join, left_anti_join_64bit_nulls) - ->Unit(benchmark::kMillisecond) - ->Args({50'000'000, 50'000'000}) - ->Args({40'000'000, 120'000'000}) - ->UseManualTime(); - -// left semi-join ------------------------------------------------------------- -BENCHMARK_REGISTER_F(Join, left_semi_join_32bit) - ->Unit(benchmark::kMillisecond) - ->Args({100'000, 100'000}) - ->Args({100'000, 400'000}) - ->Args({100'000, 1'000'000}) - ->Args({10'000'000, 10'000'000}) - ->Args({10'000'000, 40'000'000}) - ->Args({10'000'000, 100'000'000}) - ->Args({100'000'000, 100'000'000}) - ->Args({80'000'000, 240'000'000}) - ->UseManualTime(); - -BENCHMARK_REGISTER_F(Join, left_semi_join_64bit) - ->Unit(benchmark::kMillisecond) - ->Args({50'000'000, 50'000'000}) - ->Args({40'000'000, 120'000'000}) - ->UseManualTime(); - -BENCHMARK_REGISTER_F(Join, left_semi_join_32bit_nulls) - ->Unit(benchmark::kMillisecond) - ->Args({100'000, 100'000}) - ->Args({100'000, 400'000}) - ->Args({100'000, 1'000'000}) - ->Args({10'000'000, 10'000'000}) - ->Args({10'000'000, 40'000'000}) - ->Args({10'000'000, 100'000'000}) - ->Args({100'000'000, 
100'000'000}) - ->Args({80'000'000, 240'000'000}) - ->UseManualTime(); - -BENCHMARK_REGISTER_F(Join, left_semi_join_64bit_nulls) - ->Unit(benchmark::kMillisecond) - ->Args({50'000'000, 50'000'000}) - ->Args({40'000'000, 120'000'000}) - ->UseManualTime(); +#include "join_common.hpp" + +template +void nvbench_left_anti_join(nvbench::state& state, + nvbench::type_list>) +{ + auto join = [](cudf::table_view const& left, + cudf::table_view const& right, + cudf::null_equality compare_nulls) { + return cudf::left_anti_join(left, right, compare_nulls); + }; + + BM_join(state, join); +} + +template +void nvbench_left_semi_join(nvbench::state& state, + nvbench::type_list>) +{ + auto join = [](cudf::table_view const& left, + cudf::table_view const& right, + cudf::null_equality compare_nulls) { + return cudf::left_semi_join(left, right, compare_nulls); + }; + BM_join(state, join); +} + +NVBENCH_BENCH_TYPES(nvbench_left_anti_join, + NVBENCH_TYPE_AXES(JOIN_KEY_TYPE_RANGE, JOIN_NULLABLE_RANGE)) + .set_name("left_anti_join") + .set_type_axes_names({"Key", "Nullable"}) + .add_int64_axis("left_size", JOIN_SIZE_RANGE) + .add_int64_axis("right_size", JOIN_SIZE_RANGE); + +NVBENCH_BENCH_TYPES(nvbench_left_semi_join, + NVBENCH_TYPE_AXES(JOIN_KEY_TYPE_RANGE, JOIN_NULLABLE_RANGE)) + .set_name("left_semi_join") + .set_type_axes_names({"Key", "Nullable"}) + .add_int64_axis("left_size", JOIN_SIZE_RANGE) + .add_int64_axis("right_size", JOIN_SIZE_RANGE); From 66c2f4fded3aa5d83745fada3e4c4d5eee7895b2 Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Tue, 11 Jun 2024 07:24:19 -0700 Subject: [PATCH 44/48] Condense pylibcudf data fixtures (#15958) Condense all pa_foo/plc_foo data fixtures into just foo, as recommended by https://github.com/rapidsai/cudf/pull/15839#discussion_r1626769872. 
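
The consolidated pattern is sketched below; this is a minimal, self-contained
illustration only, where the `plc` import alias, the example data, and the
trivial test body are assumptions for the sketch rather than code from this
PR.

```python
# Sketch of the tuple-returning fixture pattern (illustrative only)
import pyarrow as pa
import pytest

import cudf._lib.pylibcudf as plc  # assumption: the import alias used by these tests


@pytest.fixture(scope="module")
def input_column():
    # Build the pyarrow data once and return both representations together,
    # instead of maintaining paired pa_input_column / plc_input_column fixtures.
    pa_array = pa.array([1, 2, 3])
    return pa_array, plc.interop.from_arrow(pa_array)


def test_round_trip(input_column):
    # Tests unpack the tuple; use `_` when only one half is needed.
    pa_col, plc_col = input_column
    assert plc_col.size() == len(pa_col)
```

Keeping the pyarrow source data and its pylibcudf conversion in one fixture
means the two can no longer drift apart, and it halves the number of fixtures
each test has to request.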
Authors: - Thomas Li (https://github.com/lithomas1) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/15958 --- .../cudf/cudf/pylibcudf_tests/test_copying.py | 499 ++++++++++-------- .../cudf/pylibcudf_tests/test_quantiles.py | 16 +- .../cudf/cudf/pylibcudf_tests/test_reshape.py | 20 +- .../pylibcudf_tests/test_string_capitalize.py | 54 +- .../pylibcudf_tests/test_string_contains.py | 15 +- .../cudf/pylibcudf_tests/test_string_find.py | 78 ++- 6 files changed, 358 insertions(+), 324 deletions(-) diff --git a/python/cudf/cudf/pylibcudf_tests/test_copying.py b/python/cudf/cudf/pylibcudf_tests/test_copying.py index cd70ce4abf5..da3ca3a6d1e 100644 --- a/python/cudf/cudf/pylibcudf_tests/test_copying.py +++ b/python/cudf/cudf/pylibcudf_tests/test_copying.py @@ -20,121 +20,104 @@ # TODO: Test nullable data @pytest.fixture(scope="module") -def pa_input_column(pa_type): +def input_column(pa_type): if pa.types.is_integer(pa_type) or pa.types.is_floating(pa_type): - return pa.array([1, 2, 3], type=pa_type) + pa_array = pa.array([1, 2, 3], type=pa_type) elif pa.types.is_string(pa_type): - return pa.array(["a", "b", "c"], type=pa_type) + pa_array = pa.array(["a", "b", "c"], type=pa_type) elif pa.types.is_boolean(pa_type): - return pa.array([True, True, False], type=pa_type) + pa_array = pa.array([True, True, False], type=pa_type) elif pa.types.is_list(pa_type): # TODO: Add heterogenous sizes - return pa.array([[1], [2], [3]], type=pa_type) + pa_array = pa.array([[1], [2], [3]], type=pa_type) elif pa.types.is_struct(pa_type): - return pa.array([{"v": 1}, {"v": 2}, {"v": 3}], type=pa_type) - raise ValueError("Unsupported type") - - -@pytest.fixture(scope="module") -def input_column(pa_input_column): - return plc.interop.from_arrow(pa_input_column) + pa_array = pa.array([{"v": 1}, {"v": 2}, {"v": 3}], type=pa_type) + else: + raise ValueError("Unsupported type") + return pa_array, plc.interop.from_arrow(pa_array) @pytest.fixture(scope="module") -def pa_index_column(): +def index_column(): # Index column for testing gather/scatter, always integral. 
- return pa.array([1, 2, 3]) - - -@pytest.fixture(scope="module") -def index_column(pa_index_column): - return plc.interop.from_arrow(pa_index_column) + pa_array = pa.array([1, 2, 3]) + return pa_array, plc.interop.from_arrow(pa_array) @pytest.fixture(scope="module") -def pa_target_column(pa_type): +def target_column(pa_type): if pa.types.is_integer(pa_type) or pa.types.is_floating(pa_type): - return pa.array([4, 5, 6, 7, 8, 9], type=pa_type) + pa_array = pa.array([4, 5, 6, 7, 8, 9], type=pa_type) elif pa.types.is_string(pa_type): - return pa.array(["d", "e", "f", "g", "h", "i"], type=pa_type) + pa_array = pa.array(["d", "e", "f", "g", "h", "i"], type=pa_type) elif pa.types.is_boolean(pa_type): - return pa.array([False, True, True, False, True, False], type=pa_type) + pa_array = pa.array( + [False, True, True, False, True, False], type=pa_type + ) elif pa.types.is_list(pa_type): # TODO: Add heterogenous sizes - return pa.array([[4], [5], [6], [7], [8], [9]], type=pa_type) + pa_array = pa.array([[4], [5], [6], [7], [8], [9]], type=pa_type) elif pa.types.is_struct(pa_type): - return pa.array( + pa_array = pa.array( [{"v": 4}, {"v": 5}, {"v": 6}, {"v": 7}, {"v": 8}, {"v": 9}], type=pa_type, ) - raise ValueError("Unsupported type") - - -@pytest.fixture(scope="module") -def target_column(pa_target_column): - return plc.interop.from_arrow(pa_target_column) + else: + raise ValueError("Unsupported type") + return pa_array, plc.interop.from_arrow(pa_array) @pytest.fixture def mutable_target_column(target_column): - return target_column.copy() + _, plc_target_column = target_column + return plc_target_column.copy() @pytest.fixture(scope="module") -def pa_source_table(pa_input_column): - return pa.table([pa_input_column] * 3, [""] * 3) +def source_table(input_column): + pa_input_column, _ = input_column + pa_table = pa.table([pa_input_column] * 3, [""] * 3) + return pa_table, plc.interop.from_arrow(pa_table) @pytest.fixture(scope="module") -def source_table(pa_source_table): - return plc.interop.from_arrow(pa_source_table) +def target_table(target_column): + pa_target_column, _ = target_column + pa_table = pa.table([pa_target_column] * 3, [""] * 3) + return pa_table, plc.interop.from_arrow(pa_table) @pytest.fixture(scope="module") -def pa_target_table(pa_target_column): - return pa.table([pa_target_column] * 3, [""] * 3) - - -@pytest.fixture(scope="module") -def target_table(pa_target_table): - return plc.interop.from_arrow(pa_target_table) - - -@pytest.fixture(scope="module") -def pa_source_scalar(pa_type): +def source_scalar(pa_type): if pa.types.is_integer(pa_type) or pa.types.is_floating(pa_type): - return pa.scalar(1, type=pa_type) + pa_scalar = pa.scalar(1, type=pa_type) elif pa.types.is_string(pa_type): - return pa.scalar("a", type=pa_type) + pa_scalar = pa.scalar("a", type=pa_type) elif pa.types.is_boolean(pa_type): - return pa.scalar(False, type=pa_type) + pa_scalar = pa.scalar(False, type=pa_type) elif pa.types.is_list(pa_type): # TODO: Longer list? 
- return pa.scalar([1], type=pa_type) + pa_scalar = pa.scalar([1], type=pa_type) elif pa.types.is_struct(pa_type): - return pa.scalar({"v": 1}, type=pa_type) - raise ValueError("Unsupported type") - - -@pytest.fixture(scope="module") -def source_scalar(pa_source_scalar): - return plc.interop.from_arrow(pa_source_scalar) - - -@pytest.fixture(scope="module") -def pa_mask(pa_target_column): - return pa.array([True, False] * (len(pa_target_column) // 2)) + pa_scalar = pa.scalar({"v": 1}, type=pa_type) + else: + raise ValueError("Unsupported type") + return pa_scalar, plc.interop.from_arrow(pa_scalar) @pytest.fixture(scope="module") -def mask(pa_mask): - return plc.interop.from_arrow(pa_mask) +def mask(target_column): + pa_target_column, _ = target_column + pa_mask = pa.array([True, False] * (len(pa_target_column) // 2)) + return pa_mask, plc.interop.from_arrow(pa_mask) -def test_gather(target_table, pa_target_table, index_column, pa_index_column): +def test_gather(target_table, index_column): + pa_target_table, plc_target_table = target_table + pa_index_column, plc_index_column = index_column result = plc.copying.gather( - target_table, - index_column, + plc_target_table, + plc_index_column, plc.copying.OutOfBoundsPolicy.DONT_CHECK, ) expected = pa_target_table.take(pa_index_column) @@ -142,10 +125,11 @@ def test_gather(target_table, pa_target_table, index_column, pa_index_column): def test_gather_map_has_nulls(target_table): + _, plc_target_table = target_table gather_map = plc.interop.from_arrow(pa.array([0, 1, None])) with cudf_raises(ValueError): plc.copying.gather( - target_table, + plc_target_table, gather_map, plc.copying.OutOfBoundsPolicy.DONT_CHECK, ) @@ -185,16 +169,16 @@ def _pyarrow_boolean_mask_scatter_table(source, mask, target_table): def test_scatter_table( source_table, - pa_source_table, index_column, - pa_index_column, target_table, - pa_target_table, ): + pa_source_table, plc_source_table = source_table + pa_index_column, plc_index_column = index_column + pa_target_table, plc_target_table = target_table result = plc.copying.scatter( - source_table, - index_column, - target_table, + plc_source_table, + plc_index_column, + plc_target_table, ) if pa.types.is_list( @@ -247,68 +231,80 @@ def test_scatter_table_num_col_mismatch( source_table, index_column, target_table ): # Number of columns in source and target must match. + _, plc_source_table = source_table + _, plc_index_column = index_column + _, plc_target_table = target_table with cudf_raises(ValueError): plc.copying.scatter( - plc.Table(source_table.columns()[:2]), - index_column, - target_table, + plc.Table(plc_source_table.columns()[:2]), + plc_index_column, + plc_target_table, ) def test_scatter_table_num_row_mismatch(source_table, target_table): # Number of rows in source and scatter map must match. 
+ _, plc_source_table = source_table + _, plc_target_table = target_table with cudf_raises(ValueError): plc.copying.scatter( - source_table, + plc_source_table, plc.interop.from_arrow( - pa.array(range(source_table.num_rows() * 2)) + pa.array(range(plc_source_table.num_rows() * 2)) ), - target_table, + plc_target_table, ) def test_scatter_table_map_has_nulls(source_table, target_table): + _, plc_source_table = source_table + _, plc_target_table = target_table with cudf_raises(ValueError): plc.copying.scatter( - source_table, - plc.interop.from_arrow(pa.array([None] * source_table.num_rows())), - target_table, + plc_source_table, + plc.interop.from_arrow( + pa.array([None] * plc_source_table.num_rows()) + ), + plc_target_table, ) def test_scatter_table_type_mismatch(source_table, index_column, target_table): + _, plc_source_table = source_table + _, plc_index_column = index_column + _, plc_target_table = target_table with cudf_raises(TypeError): if is_integer( - dtype := target_table.columns()[0].type() + dtype := plc_target_table.columns()[0].type() ) or is_floating(dtype): - pa_array = pa.array([True] * source_table.num_rows()) + pa_array = pa.array([True] * plc_source_table.num_rows()) else: - pa_array = pa.array([1] * source_table.num_rows()) - ncol = source_table.num_columns() + pa_array = pa.array([1] * plc_source_table.num_rows()) + ncol = plc_source_table.num_columns() pa_table = pa.table([pa_array] * ncol, [""] * ncol) plc.copying.scatter( plc.interop.from_arrow(pa_table), - index_column, - target_table, + plc_index_column, + plc_target_table, ) def test_scatter_scalars( source_scalar, - pa_source_scalar, index_column, - pa_index_column, target_table, - pa_target_table, ): + pa_source_scalar, plc_source_scalar = source_scalar + pa_index_column, plc_index_column = index_column + pa_target_table, plc_target_table = target_table result = plc.copying.scatter( - [source_scalar] * target_table.num_columns(), - index_column, - target_table, + [plc_source_scalar] * plc_target_table.num_columns(), + plc_index_column, + plc_target_table, ) expected = _pyarrow_boolean_mask_scatter_table( - [pa_source_scalar] * target_table.num_columns(), + [pa_source_scalar] * plc_target_table.num_columns(), pc.invert( _pyarrow_index_to_mask(pa_index_column, pa_target_table.num_rows) ), @@ -321,85 +317,103 @@ def test_scatter_scalars( def test_scatter_scalars_num_scalars_mismatch( source_scalar, index_column, target_table ): + _, plc_source_scalar = source_scalar + _, plc_index_column = index_column + _, plc_target_table = target_table with cudf_raises(ValueError): plc.copying.scatter( - [source_scalar] * (target_table.num_columns() - 1), - index_column, - target_table, + [plc_source_scalar] * (plc_target_table.num_columns() - 1), + plc_index_column, + plc_target_table, ) def test_scatter_scalars_map_has_nulls(source_scalar, target_table): + _, plc_source_scalar = source_scalar + _, plc_target_table = target_table with cudf_raises(ValueError): plc.copying.scatter( - [source_scalar] * target_table.num_columns(), + [plc_source_scalar] * plc_target_table.num_columns(), plc.interop.from_arrow(pa.array([None, None])), - target_table, + plc_target_table, ) def test_scatter_scalars_type_mismatch(index_column, target_table): + _, plc_index_column = index_column + _, plc_target_table = target_table with cudf_raises(TypeError): if is_integer( - dtype := target_table.columns()[0].type() + dtype := plc_target_table.columns()[0].type() ) or is_floating(dtype): - source_scalar = [plc.interop.from_arrow(pa.scalar(True))] + 
plc_source_scalar = [plc.interop.from_arrow(pa.scalar(True))] else: - source_scalar = [plc.interop.from_arrow(pa.scalar(1))] + plc_source_scalar = [plc.interop.from_arrow(pa.scalar(1))] plc.copying.scatter( - source_scalar * target_table.num_columns(), - index_column, - target_table, + plc_source_scalar * plc_target_table.num_columns(), + plc_index_column, + plc_target_table, ) def test_empty_like_column(input_column): - result = plc.copying.empty_like(input_column) - assert result.type() == input_column.type() + _, plc_input_column = input_column + result = plc.copying.empty_like(plc_input_column) + assert result.type() == plc_input_column.type() def test_empty_like_table(source_table): - result = plc.copying.empty_like(source_table) - assert result.num_columns() == source_table.num_columns() - for icol, rcol in zip(source_table.columns(), result.columns()): + _, plc_source_table = source_table + result = plc.copying.empty_like(plc_source_table) + assert result.num_columns() == plc_source_table.num_columns() + for icol, rcol in zip(plc_source_table.columns(), result.columns()): assert rcol.type() == icol.type() @pytest.mark.parametrize("size", [None, 10]) def test_allocate_like(input_column, size): - if is_fixed_width(input_column.type()): + _, plc_input_column = input_column + if is_fixed_width(plc_input_column.type()): result = plc.copying.allocate_like( - input_column, plc.copying.MaskAllocationPolicy.RETAIN, size=size + plc_input_column, + plc.copying.MaskAllocationPolicy.RETAIN, + size=size, + ) + assert result.type() == plc_input_column.type() + assert result.size() == ( + plc_input_column.size() if size is None else size ) - assert result.type() == input_column.type() - assert result.size() == (input_column.size() if size is None else size) else: with pytest.raises(TypeError): plc.copying.allocate_like( - input_column, + plc_input_column, plc.copying.MaskAllocationPolicy.RETAIN, size=size, ) def test_copy_range_in_place( - input_column, pa_input_column, mutable_target_column, pa_target_column + input_column, mutable_target_column, target_column ): + pa_input_column, plc_input_column = input_column + + pa_target_column, _ = target_column + if not is_fixed_width(mutable_target_column.type()): with pytest.raises(TypeError): plc.copying.copy_range_in_place( - input_column, + plc_input_column, mutable_target_column, 0, - input_column.size(), + plc_input_column.size(), 0, ) else: plc.copying.copy_range_in_place( - input_column, + plc_input_column, mutable_target_column, 0, - input_column.size(), + plc_input_column.size(), 0, ) expected = _pyarrow_boolean_mask_scatter_column( @@ -415,36 +429,40 @@ def test_copy_range_in_place( def test_copy_range_in_place_out_of_bounds( input_column, mutable_target_column ): + _, plc_input_column = input_column + if is_fixed_width(mutable_target_column.type()): with cudf_raises(IndexError): plc.copying.copy_range_in_place( - input_column, + plc_input_column, mutable_target_column, 5, - 5 + input_column.size(), + 5 + plc_input_column.size(), 0, ) def test_copy_range_in_place_different_types(mutable_target_column): if is_integer(dtype := mutable_target_column.type()) or is_floating(dtype): - input_column = plc.interop.from_arrow(pa.array(["a", "b", "c"])) + plc_input_column = plc.interop.from_arrow(pa.array(["a", "b", "c"])) else: - input_column = plc.interop.from_arrow(pa.array([1, 2, 3])) + plc_input_column = plc.interop.from_arrow(pa.array([1, 2, 3])) with cudf_raises(TypeError): plc.copying.copy_range_in_place( - input_column, + plc_input_column, 
mutable_target_column, 0, - input_column.size(), + plc_input_column.size(), 0, ) def test_copy_range_in_place_null_mismatch( - pa_input_column, mutable_target_column + input_column, mutable_target_column ): + pa_input_column, _ = input_column + if is_fixed_width(mutable_target_column.type()): pa_input_column = pc.if_else( _pyarrow_index_to_mask([0], len(pa_input_column)), @@ -462,15 +480,15 @@ def test_copy_range_in_place_null_mismatch( ) -def test_copy_range( - input_column, pa_input_column, target_column, pa_target_column -): - if is_fixed_width(dtype := target_column.type()) or is_string(dtype): +def test_copy_range(input_column, target_column): + pa_input_column, plc_input_column = input_column + pa_target_column, plc_target_column = target_column + if is_fixed_width(dtype := plc_target_column.type()) or is_string(dtype): result = plc.copying.copy_range( - input_column, - target_column, + plc_input_column, + plc_target_column, 0, - input_column.size(), + plc_input_column.size(), 0, ) expected = _pyarrow_boolean_mask_scatter_column( @@ -484,137 +502,152 @@ def test_copy_range( else: with pytest.raises(TypeError): plc.copying.copy_range( - input_column, - target_column, + plc_input_column, + plc_target_column, 0, - input_column.size(), + plc_input_column.size(), 0, ) def test_copy_range_out_of_bounds(input_column, target_column): + _, plc_input_column = input_column + _, plc_target_column = target_column with cudf_raises(IndexError): plc.copying.copy_range( - input_column, - target_column, + plc_input_column, + plc_target_column, 5, - 5 + input_column.size(), + 5 + plc_input_column.size(), 0, ) def test_copy_range_different_types(target_column): - if is_integer(dtype := target_column.type()) or is_floating(dtype): - input_column = plc.interop.from_arrow(pa.array(["a", "b", "c"])) + _, plc_target_column = target_column + if is_integer(dtype := plc_target_column.type()) or is_floating(dtype): + plc_input_column = plc.interop.from_arrow(pa.array(["a", "b", "c"])) else: - input_column = plc.interop.from_arrow(pa.array([1, 2, 3])) + plc_input_column = plc.interop.from_arrow(pa.array([1, 2, 3])) with cudf_raises(TypeError): plc.copying.copy_range( - input_column, - target_column, + plc_input_column, + plc_target_column, 0, - input_column.size(), + plc_input_column.size(), 0, ) -def test_shift( - target_column, pa_target_column, source_scalar, pa_source_scalar -): +def test_shift(target_column, source_scalar): + pa_source_scalar, plc_source_scalar = source_scalar + pa_target_column, plc_target_column = target_column shift = 2 - if is_fixed_width(dtype := target_column.type()) or is_string(dtype): - result = plc.copying.shift(target_column, shift, source_scalar) + if is_fixed_width(dtype := plc_target_column.type()) or is_string(dtype): + result = plc.copying.shift(plc_target_column, shift, plc_source_scalar) expected = pa.concat_arrays( [pa.array([pa_source_scalar] * shift), pa_target_column[:-shift]] ) assert_column_eq(expected, result) else: with pytest.raises(TypeError): - plc.copying.shift(target_column, shift, source_scalar) + plc.copying.shift(plc_target_column, shift, source_scalar) def test_shift_type_mismatch(target_column): - if is_integer(dtype := target_column.type()) or is_floating(dtype): + _, plc_target_column = target_column + if is_integer(dtype := plc_target_column.type()) or is_floating(dtype): fill_value = plc.interop.from_arrow(pa.scalar("a")) else: fill_value = plc.interop.from_arrow(pa.scalar(1)) with cudf_raises(TypeError): - plc.copying.shift(target_column, 2, 
fill_value) + plc.copying.shift(plc_target_column, 2, fill_value) -def test_slice_column(target_column, pa_target_column): +def test_slice_column(target_column): + pa_target_column, plc_target_column = target_column bounds = list(range(6)) upper_bounds = bounds[1::2] lower_bounds = bounds[::2] - result = plc.copying.slice(target_column, bounds) + result = plc.copying.slice(plc_target_column, bounds) for lb, ub, slice_ in zip(lower_bounds, upper_bounds, result): assert_column_eq(pa_target_column[lb:ub], slice_) def test_slice_column_wrong_length(target_column): + _, plc_target_column = target_column with cudf_raises(ValueError): - plc.copying.slice(target_column, list(range(5))) + plc.copying.slice(plc_target_column, list(range(5))) def test_slice_column_decreasing(target_column): + _, plc_target_column = target_column with cudf_raises(ValueError): - plc.copying.slice(target_column, list(range(5, -1, -1))) + plc.copying.slice(plc_target_column, list(range(5, -1, -1))) def test_slice_column_out_of_bounds(target_column): + _, plc_target_column = target_column with cudf_raises(IndexError): - plc.copying.slice(target_column, list(range(2, 8))) + plc.copying.slice(plc_target_column, list(range(2, 8))) -def test_slice_table(target_table, pa_target_table): +def test_slice_table(target_table): + pa_target_table, plc_target_table = target_table bounds = list(range(6)) upper_bounds = bounds[1::2] lower_bounds = bounds[::2] - result = plc.copying.slice(target_table, bounds) + result = plc.copying.slice(plc_target_table, bounds) for lb, ub, slice_ in zip(lower_bounds, upper_bounds, result): assert_table_eq(pa_target_table[lb:ub], slice_) -def test_split_column(target_column, pa_target_column): +def test_split_column(target_column): upper_bounds = [1, 3, 5] lower_bounds = [0] + upper_bounds[:-1] - result = plc.copying.split(target_column, upper_bounds) + pa_target_column, plc_target_column = target_column + result = plc.copying.split(plc_target_column, upper_bounds) for lb, ub, split in zip(lower_bounds, upper_bounds, result): assert_column_eq(pa_target_column[lb:ub], split) def test_split_column_decreasing(target_column): + _, plc_target_column = target_column with cudf_raises(ValueError): - plc.copying.split(target_column, list(range(5, -1, -1))) + plc.copying.split(plc_target_column, list(range(5, -1, -1))) def test_split_column_out_of_bounds(target_column): + _, plc_target_column = target_column with cudf_raises(IndexError): - plc.copying.split(target_column, list(range(5, 8))) + plc.copying.split(plc_target_column, list(range(5, 8))) -def test_split_table(target_table, pa_target_table): +def test_split_table(target_table): + pa_target_table, plc_target_table = target_table upper_bounds = [1, 3, 5] lower_bounds = [0] + upper_bounds[:-1] - result = plc.copying.split(target_table, upper_bounds) + result = plc.copying.split(plc_target_table, upper_bounds) for lb, ub, split in zip(lower_bounds, upper_bounds, result): assert_table_eq(pa_target_table[lb:ub], split) -def test_copy_if_else_column_column( - target_column, pa_target_column, pa_source_scalar, mask, pa_mask -): +def test_copy_if_else_column_column(target_column, mask, source_scalar): + pa_target_column, plc_target_column = target_column + pa_source_scalar, _ = source_scalar + pa_mask, plc_mask = mask + pa_other_column = pa.concat_arrays( [pa.array([pa_source_scalar] * 2), pa_target_column[:-2]] ) - other_column = plc.interop.from_arrow(pa_other_column) + plc_other_column = plc.interop.from_arrow(pa_other_column) result = 
plc.copying.copy_if_else( - target_column, - other_column, - mask, + plc_target_column, + plc_other_column, + plc_mask, ) expected = pc.if_else( @@ -626,46 +659,51 @@ def test_copy_if_else_column_column( def test_copy_if_else_wrong_type(target_column, mask): - if is_integer(dtype := target_column.type()) or is_floating(dtype): - input_column = plc.interop.from_arrow( - pa.array(["a"] * target_column.size()) + _, plc_target_column = target_column + _, plc_mask = mask + if is_integer(dtype := plc_target_column.type()) or is_floating(dtype): + plc_input_column = plc.interop.from_arrow( + pa.array(["a"] * plc_target_column.size()) ) else: - input_column = plc.interop.from_arrow( - pa.array([1] * target_column.size()) + plc_input_column = plc.interop.from_arrow( + pa.array([1] * plc_target_column.size()) ) with cudf_raises(TypeError): - plc.copying.copy_if_else(input_column, target_column, mask) + plc.copying.copy_if_else(plc_input_column, plc_target_column, plc_mask) def test_copy_if_else_wrong_type_mask(target_column): + _, plc_target_column = target_column with cudf_raises(TypeError): plc.copying.copy_if_else( - target_column, - target_column, + plc_target_column, + plc_target_column, plc.interop.from_arrow( - pa.array([1.0, 2.0] * (target_column.size() // 2)) + pa.array([1.0, 2.0] * (plc_target_column.size() // 2)) ), ) def test_copy_if_else_wrong_size(target_column): + _, plc_target_column = target_column with cudf_raises(ValueError): plc.copying.copy_if_else( plc.interop.from_arrow(pa.array([1])), - target_column, + plc_target_column, plc.interop.from_arrow( - pa.array([True, False] * (target_column.size() // 2)) + pa.array([True, False] * (plc_target_column.size() // 2)) ), ) def test_copy_if_else_wrong_size_mask(target_column): + _, plc_target_column = target_column with cudf_raises(ValueError): plc.copying.copy_if_else( - target_column, - target_column, + plc_target_column, + plc_target_column, plc.interop.from_arrow(pa.array([True])), ) @@ -673,21 +711,21 @@ def test_copy_if_else_wrong_size_mask(target_column): @pytest.mark.parametrize("array_left", [True, False]) def test_copy_if_else_column_scalar( target_column, - pa_target_column, source_scalar, - pa_source_scalar, array_left, mask, - pa_mask, ): + pa_target_column, plc_target_column = target_column + pa_source_scalar, plc_source_scalar = source_scalar + pa_mask, plc_mask = mask args = ( - (target_column, source_scalar) + (plc_target_column, plc_source_scalar) if array_left - else (source_scalar, target_column) + else (plc_source_scalar, plc_target_column) ) result = plc.copying.copy_if_else( *args, - mask, + plc_mask, ) pa_args = ( @@ -704,16 +742,17 @@ def test_copy_if_else_column_scalar( def test_boolean_mask_scatter_from_table( source_table, - pa_source_table, target_table, - pa_target_table, mask, - pa_mask, ): + pa_source_table, plc_source_table = source_table + pa_target_table, plc_target_table = target_table + pa_mask, plc_mask = mask + result = plc.copying.boolean_mask_scatter( - source_table, - target_table, - mask, + plc_source_table, + plc_target_table, + plc_mask, ) if pa.types.is_list( @@ -757,28 +796,34 @@ def test_boolean_mask_scatter_from_table( def test_boolean_mask_scatter_from_wrong_num_cols(source_table, target_table): + _, plc_source_table = source_table + _, plc_target_table = target_table with cudf_raises(ValueError): plc.copying.boolean_mask_scatter( - plc.Table(source_table.columns()[:2]), - target_table, + plc.Table(plc_source_table.columns()[:2]), + plc_target_table, 
plc.interop.from_arrow(pa.array([True, False] * 3)), ) def test_boolean_mask_scatter_from_wrong_mask_size(source_table, target_table): + _, plc_source_table = source_table + _, plc_target_table = target_table with cudf_raises(ValueError): plc.copying.boolean_mask_scatter( - source_table, - target_table, + plc_source_table, + plc_target_table, plc.interop.from_arrow(pa.array([True, False] * 2)), ) def test_boolean_mask_scatter_from_wrong_num_true(source_table, target_table): + _, plc_source_table = source_table + _, plc_target_table = target_table with cudf_raises(ValueError): plc.copying.boolean_mask_scatter( - plc.Table(source_table.columns()[:2]), - target_table, + plc.Table(plc_source_table.columns()[:2]), + plc_target_table, plc.interop.from_arrow( pa.array([True, False] * 2 + [False, False]) ), @@ -786,44 +831,48 @@ def test_boolean_mask_scatter_from_wrong_num_true(source_table, target_table): def test_boolean_mask_scatter_from_wrong_col_type(target_table, mask): - if is_integer(dtype := target_table.columns()[0].type()) or is_floating( - dtype - ): + _, plc_target_table = target_table + _, plc_mask = mask + if is_integer( + dtype := plc_target_table.columns()[0].type() + ) or is_floating(dtype): input_column = plc.interop.from_arrow(pa.array(["a", "b", "c"])) else: input_column = plc.interop.from_arrow(pa.array([1, 2, 3])) with cudf_raises(TypeError): plc.copying.boolean_mask_scatter( - plc.Table([input_column] * 3), target_table, mask + plc.Table([input_column] * 3), plc_target_table, plc_mask ) def test_boolean_mask_scatter_from_wrong_mask_type(source_table, target_table): + _, plc_source_table = source_table + _, plc_target_table = target_table with cudf_raises(TypeError): plc.copying.boolean_mask_scatter( - source_table, - target_table, + plc_source_table, + plc_target_table, plc.interop.from_arrow(pa.array([1.0, 2.0] * 3)), ) def test_boolean_mask_scatter_from_scalars( source_scalar, - pa_source_scalar, target_table, - pa_target_table, mask, - pa_mask, ): + pa_source_scalar, plc_source_scalar = source_scalar + pa_target_table, plc_target_table = target_table + pa_mask, plc_mask = mask result = plc.copying.boolean_mask_scatter( - [source_scalar] * 3, - target_table, - mask, + [plc_source_scalar] * 3, + plc_target_table, + plc_mask, ) expected = _pyarrow_boolean_mask_scatter_table( - [pa_source_scalar] * target_table.num_columns(), + [pa_source_scalar] * plc_target_table.num_columns(), pc.invert(pa_mask), pa_target_table, ) @@ -831,9 +880,10 @@ def test_boolean_mask_scatter_from_scalars( assert_table_eq(expected, result) -def test_get_element(input_column, pa_input_column): +def test_get_element(input_column): index = 1 - result = plc.copying.get_element(input_column, index) + pa_input_column, plc_input_column = input_column + result = plc.copying.get_element(plc_input_column, index) assert ( plc.interop.to_arrow( @@ -844,5 +894,6 @@ def test_get_element(input_column, pa_input_column): def test_get_element_out_of_bounds(input_column): + _, plc_input_column = input_column with cudf_raises(IndexError): - plc.copying.get_element(input_column, 100) + plc.copying.get_element(plc_input_column, 100) diff --git a/python/cudf/cudf/pylibcudf_tests/test_quantiles.py b/python/cudf/cudf/pylibcudf_tests/test_quantiles.py index a5d332a7795..13f3b037606 100644 --- a/python/cudf/cudf/pylibcudf_tests/test_quantiles.py +++ b/python/cudf/cudf/pylibcudf_tests/test_quantiles.py @@ -19,13 +19,9 @@ @pytest.fixture(scope="module", params=[[1, 2, 3, 4, 5], [5, 4, 3, 2, 1]]) -def pa_col_data(request, 
numeric_pa_type): - return pa.array(request.param, type=numeric_pa_type) - - -@pytest.fixture(scope="module") -def plc_col_data(pa_col_data): - return plc.interop.from_arrow(pa_col_data) +def col_data(request, numeric_pa_type): + pa_array = pa.array(request.param, type=numeric_pa_type) + return pa_array, plc.interop.from_arrow(pa_array) @pytest.fixture( @@ -60,7 +56,8 @@ def plc_tbl_data(request): @pytest.mark.parametrize("q", [[], [0], [0.5], [0.1, 0.5, 0.7, 0.9]]) @pytest.mark.parametrize("exact", [True, False]) -def test_quantile(pa_col_data, plc_col_data, interp_opt, q, exact): +def test_quantile(col_data, interp_opt, q, exact): + pa_col_data, plc_col_data = col_data ordered_indices = plc.interop.from_arrow( pc.cast(pc.sort_indices(pa_col_data), pa.int32()) ) @@ -210,7 +207,8 @@ def test_quantiles_invalid_interp(plc_tbl_data, invalid_interp): "q", [[0.1], (0.1,), np.array([0.1])], ) -def test_quantile_q_array_like(pa_col_data, plc_col_data, q): +def test_quantile_q_array_like(col_data, q): + pa_col_data, plc_col_data = col_data ordered_indices = plc.interop.from_arrow( pc.cast(pc.sort_indices(pa_col_data), pa.int32()) ) diff --git a/python/cudf/cudf/pylibcudf_tests/test_reshape.py b/python/cudf/cudf/pylibcudf_tests/test_reshape.py index 32d79257f4f..da1157e5832 100644 --- a/python/cudf/cudf/pylibcudf_tests/test_reshape.py +++ b/python/cudf/cudf/pylibcudf_tests/test_reshape.py @@ -10,20 +10,15 @@ @pytest.fixture(scope="module") def reshape_data(): data = [[1, 2, 3], [4, 5, 6]] - return data + arrow_tbl = pa.Table.from_arrays(data, names=["a", "b"]) + return data, plc.interop.from_arrow(arrow_tbl) -@pytest.fixture(scope="module") -def reshape_plc_tbl(reshape_data): - arrow_tbl = pa.Table.from_arrays(reshape_data, names=["a", "b"]) - plc_tbl = plc.interop.from_arrow(arrow_tbl) - return plc_tbl - - -def test_interleave_columns(reshape_data, reshape_plc_tbl): +def test_interleave_columns(reshape_data): + raw_data, reshape_plc_tbl = reshape_data res = plc.reshape.interleave_columns(reshape_plc_tbl) - interleaved_data = [pa.array(pair) for pair in zip(*reshape_data)] + interleaved_data = [pa.array(pair) for pair in zip(*raw_data)] expect = pa.concat_arrays(interleaved_data) @@ -31,10 +26,11 @@ def test_interleave_columns(reshape_data, reshape_plc_tbl): @pytest.mark.parametrize("cnt", [0, 1, 3]) -def test_tile(reshape_data, reshape_plc_tbl, cnt): +def test_tile(reshape_data, cnt): + raw_data, reshape_plc_tbl = reshape_data res = plc.reshape.tile(reshape_plc_tbl, cnt) - tiled_data = [pa.array(col * cnt) for col in reshape_data] + tiled_data = [pa.array(col * cnt) for col in raw_data] expect = pa.Table.from_arrays( tiled_data, schema=plc.interop.to_arrow(reshape_plc_tbl).schema diff --git a/python/cudf/cudf/pylibcudf_tests/test_string_capitalize.py b/python/cudf/cudf/pylibcudf_tests/test_string_capitalize.py index 818d6e6e72a..c4e437fe5d9 100644 --- a/python/cudf/cudf/pylibcudf_tests/test_string_capitalize.py +++ b/python/cudf/cudf/pylibcudf_tests/test_string_capitalize.py @@ -8,39 +8,38 @@ @pytest.fixture(scope="module") -def pa_data(): - data = [ - "leopard", - "Golden Eagle", - "SNAKE", - "", - "!A", - "hello World", - "A B C", - "#", - "AƻB", - "Ⓑⓖ", - "Art of War", - "The quick bRoWn fox juMps over the laze DOG", - '123nr98nv9rev!$#INF4390v03n1243<>?}{:-"', - "accénted", - None, - ] - return pa.array(data) - - -@pytest.fixture(scope="module") -def plc_data(pa_data): - return plc.interop.from_arrow(pa_data) +def str_data(): + pa_data = pa.array( + [ + "leopard", + "Golden Eagle", + "SNAKE", 
+ "", + "!A", + "hello World", + "A B C", + "#", + "AƻB", + "Ⓑⓖ", + "Art of War", + "The quick bRoWn fox juMps over the laze DOG", + '123nr98nv9rev!$#INF4390v03n1243<>?}{:-"', + "accénted", + None, + ] + ) + return pa_data, plc.interop.from_arrow(pa_data) -def test_capitalize(plc_data, pa_data): +def test_capitalize(str_data): + pa_data, plc_data = str_data got = plc.strings.capitalize.capitalize(plc_data) expected = pa.compute.utf8_capitalize(pa_data) assert_column_eq(expected, got) -def test_title(plc_data, pa_data): +def test_title(str_data): + pa_data, plc_data = str_data got = plc.strings.capitalize.title( plc_data, plc.strings.char_types.StringCharacterTypes.CASE_TYPES ) @@ -48,7 +47,8 @@ def test_title(plc_data, pa_data): assert_column_eq(expected, got) -def test_is_title(plc_data, pa_data): +def test_is_title(str_data): + pa_data, plc_data = str_data got = plc.strings.capitalize.is_title(plc_data) expected = pa.compute.utf8_is_title(pa_data) assert_column_eq(expected, got) diff --git a/python/cudf/cudf/pylibcudf_tests/test_string_contains.py b/python/cudf/cudf/pylibcudf_tests/test_string_contains.py index 8cdb6f7c521..fc8c6656b5d 100644 --- a/python/cudf/cudf/pylibcudf_tests/test_string_contains.py +++ b/python/cudf/cudf/pylibcudf_tests/test_string_contains.py @@ -8,15 +8,11 @@ @pytest.fixture(scope="module") -def pa_target_col(): - return pa.array( +def target_col(): + pa_array = pa.array( ["AbC", "de", "FGHI", "j", "kLm", "nOPq", None, "RsT", None, "uVw"] ) - - -@pytest.fixture(scope="module") -def plc_target_col(pa_target_col): - return plc.interop.from_arrow(pa_target_col) + return pa_array, plc.interop.from_arrow(pa_array) @pytest.fixture( @@ -45,9 +41,8 @@ def plc_target_pat(pa_target_scalar): return prog -def test_contains_re( - pa_target_col, plc_target_col, pa_target_scalar, plc_target_pat -): +def test_contains_re(target_col, pa_target_scalar, plc_target_pat): + pa_target_col, plc_target_col = target_col got = plc.strings.contains.contains_re(plc_target_col, plc_target_pat) expected = pa.compute.match_substring_regex( pa_target_col, pa_target_scalar.as_py() diff --git a/python/cudf/cudf/pylibcudf_tests/test_string_find.py b/python/cudf/cudf/pylibcudf_tests/test_string_find.py index 44900044184..95a1a3cf731 100644 --- a/python/cudf/cudf/pylibcudf_tests/test_string_find.py +++ b/python/cudf/cudf/pylibcudf_tests/test_string_find.py @@ -8,8 +8,8 @@ @pytest.fixture(scope="module") -def pa_data_col(): - return pa.array( +def data_col(): + pa_array = pa.array( [ "abc123", "ABC123", @@ -53,16 +53,12 @@ def pa_data_col(): None, ] ) + return pa_array, plc.interop.from_arrow(pa_array) @pytest.fixture(scope="module") -def plc_data_col(pa_data_col): - return plc.interop.from_arrow(pa_data_col) - - -@pytest.fixture(scope="module") -def pa_target_col(): - return pa.array( +def target_col(): + pa_array = pa.array( [ "a", "B", @@ -106,24 +102,18 @@ def pa_target_col(): None, # ends_with ] ) - - -@pytest.fixture(scope="module") -def plc_target_col(pa_target_col): - return plc.interop.from_arrow(pa_target_col) + return pa_array, plc.interop.from_arrow(pa_array) @pytest.fixture(params=["a", " ", "A", "Ab", "23"], scope="module") -def pa_target_scalar(request): - return pa.scalar(request.param, type=pa.string()) - - -@pytest.fixture(scope="module") -def plc_target_scalar(pa_target_scalar): - return plc.interop.from_arrow(pa_target_scalar) +def target_scalar(request): + pa_scalar = pa.scalar(request.param, type=pa.string()) + return pa_scalar, plc.interop.from_arrow(pa_scalar) -def 
test_find(pa_data_col, plc_data_col, pa_target_scalar, plc_target_scalar): +def test_find(data_col, target_scalar): + pa_data_col, plc_data_col = data_col + pa_target_scalar, plc_target_scalar = target_scalar got = plc.strings.find.find(plc_data_col, plc_target_scalar, 0, -1) expected = pa.array( @@ -161,7 +151,9 @@ def handle_none(st, target): return expected -def test_find_column(pa_data_col, pa_target_col, plc_data_col, plc_target_col): +def test_find_column(data_col, target_col): + pa_data_col, plc_data_col = data_col + pa_target_col, plc_target_col = target_col expected = pa.array( [ elem.find(target) if not (elem is None or target is None) else None @@ -177,7 +169,9 @@ def test_find_column(pa_data_col, pa_target_col, plc_data_col, plc_target_col): assert_column_eq(expected, got) -def test_rfind(pa_data_col, plc_data_col, pa_target_scalar, plc_target_scalar): +def test_rfind(data_col, target_scalar): + pa_data_col, plc_data_col = data_col + pa_target_scalar, plc_target_scalar = target_scalar py_target = pa_target_scalar.as_py() got = plc.strings.find.rfind(plc_data_col, plc_target_scalar, 0, -1) @@ -195,9 +189,9 @@ def test_rfind(pa_data_col, plc_data_col, pa_target_scalar, plc_target_scalar): assert_column_eq(expected, got) -def test_contains( - pa_data_col, plc_data_col, pa_target_scalar, plc_target_scalar -): +def test_contains(data_col, target_scalar): + pa_data_col, plc_data_col = data_col + pa_target_scalar, plc_target_scalar = target_scalar py_target = pa_target_scalar.as_py() got = plc.strings.find.contains(plc_data_col, plc_target_scalar) @@ -214,9 +208,9 @@ def test_contains( assert_column_eq(expected, got) -def test_contains_column( - pa_data_col, pa_target_col, plc_data_col, plc_target_col -): +def test_contains_column(data_col, target_col): + pa_data_col, plc_data_col = data_col + pa_target_col, plc_target_col = target_col expected = colwise_apply( pa_data_col, pa_target_col, lambda st, target: target in st ) @@ -224,18 +218,18 @@ def test_contains_column( assert_column_eq(expected, got) -def test_starts_with( - pa_data_col, plc_data_col, pa_target_scalar, plc_target_scalar -): +def test_starts_with(data_col, target_scalar): + pa_data_col, plc_data_col = data_col + pa_target_scalar, plc_target_scalar = target_scalar py_target = pa_target_scalar.as_py() got = plc.strings.find.starts_with(plc_data_col, plc_target_scalar) expected = pa.compute.starts_with(pa_data_col, py_target) assert_column_eq(expected, got) -def test_starts_with_column( - pa_data_col, pa_target_col, plc_data_col, plc_target_col -): +def test_starts_with_column(data_col, target_col): + pa_data_col, plc_data_col = data_col + pa_target_col, plc_target_col = target_col expected = colwise_apply( pa_data_col, pa_target_col, lambda st, target: st.startswith(target) ) @@ -243,18 +237,18 @@ def test_starts_with_column( assert_column_eq(expected, got) -def test_ends_with( - pa_data_col, plc_data_col, pa_target_scalar, plc_target_scalar -): +def test_ends_with(data_col, target_scalar): + pa_data_col, plc_data_col = data_col + pa_target_scalar, plc_target_scalar = target_scalar py_target = pa_target_scalar.as_py() got = plc.strings.find.ends_with(plc_data_col, plc_target_scalar) expected = pa.compute.ends_with(pa_data_col, py_target) assert_column_eq(expected, got) -def test_ends_with_column( - pa_data_col, pa_target_col, plc_data_col, plc_target_col -): +def test_ends_with_column(data_col, target_col): + pa_data_col, plc_data_col = data_col + pa_target_col, plc_target_col = target_col expected = colwise_apply( 
pa_data_col, pa_target_col, lambda st, target: st.endswith(target) ) From 22ac996dea6f297736c9fd8cda735c0e7a5dbe43 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Tue, 11 Jun 2024 16:30:09 +0100 Subject: [PATCH 45/48] Remove `Scalar` container type from polars interpreter (#15953) Now we always return columns and, where usage of a scalar might be correct (for example broadcasting in binops), we check if the column is "actually" a scalar and extract it. This is slightly annoying because we have to introspect things in various places. But without changing libcudf to treat length-1 columns as always broadcastable like scalars this is, I think, the best we can do. Authors: - Lawrence Mitchell (https://github.com/wence-) Approvers: - https://github.com/brandon-b-miller - James Lamb (https://github.com/jameslamb) - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/15953 --- python/cudf_polars/cudf_polars/__init__.py | 8 +- .../cudf_polars/containers/__init__.py | 3 +- .../cudf_polars/containers/column.py | 28 ++++- .../cudf_polars/containers/dataframe.py | 6 +- .../cudf_polars/containers/scalar.py | 23 ---- python/cudf_polars/cudf_polars/dsl/expr.py | 114 +++++++++++------- python/cudf_polars/cudf_polars/dsl/ir.py | 75 +++++++++--- .../cudf_polars/cudf_polars/dsl/translate.py | 4 +- .../cudf_polars/cudf_polars/utils/sorting.py | 2 +- python/cudf_polars/pyproject.toml | 3 - python/cudf_polars/tests/utils/__init__.py | 6 + .../cudf_polars/tests/utils/test_broadcast.py | 74 ++++++++++++ 12 files changed, 249 insertions(+), 97 deletions(-) delete mode 100644 python/cudf_polars/cudf_polars/containers/scalar.py create mode 100644 python/cudf_polars/tests/utils/__init__.py create mode 100644 python/cudf_polars/tests/utils/test_broadcast.py diff --git a/python/cudf_polars/cudf_polars/__init__.py b/python/cudf_polars/cudf_polars/__init__.py index b19a282129a..41d06f8631b 100644 --- a/python/cudf_polars/cudf_polars/__init__.py +++ b/python/cudf_polars/cudf_polars/__init__.py @@ -10,7 +10,13 @@ from __future__ import annotations +from cudf_polars._version import __git_commit__, __version__ from cudf_polars.callback import execute_with_cudf from cudf_polars.dsl.translate import translate_ir -__all__: list[str] = ["execute_with_cudf", "translate_ir"] +__all__: list[str] = [ + "execute_with_cudf", + "translate_ir", + "__git_commit__", + "__version__", +] diff --git a/python/cudf_polars/cudf_polars/containers/__init__.py b/python/cudf_polars/cudf_polars/containers/__init__.py index ee69e748eb5..06bb08953f1 100644 --- a/python/cudf_polars/cudf_polars/containers/__init__.py +++ b/python/cudf_polars/cudf_polars/containers/__init__.py @@ -5,8 +5,7 @@ from __future__ import annotations -__all__: list[str] = ["DataFrame", "Column", "NamedColumn", "Scalar"] +__all__: list[str] = ["DataFrame", "Column", "NamedColumn"] from cudf_polars.containers.column import Column, NamedColumn from cudf_polars.containers.dataframe import DataFrame -from cudf_polars.containers.scalar import Scalar diff --git a/python/cudf_polars/cudf_polars/containers/column.py b/python/cudf_polars/cudf_polars/containers/column.py index 575d15d3ece..156dd395d64 100644 --- a/python/cudf_polars/cudf_polars/containers/column.py +++ b/python/cudf_polars/cudf_polars/containers/column.py @@ -17,12 +17,13 @@ class Column: - """A column with sortedness metadata.""" + """An immutable column with sortedness metadata.""" obj: plc.Column is_sorted: plc.types.Sorted order: plc.types.Order null_order: 
plc.types.NullOrder + is_scalar: bool def __init__( self, @@ -33,10 +34,33 @@ def __init__( null_order: plc.types.NullOrder = plc.types.NullOrder.BEFORE, ): self.obj = column + self.is_scalar = self.obj.size() == 1 + if self.obj.size() <= 1: + is_sorted = plc.types.Sorted.YES self.is_sorted = is_sorted self.order = order self.null_order = null_order + @functools.cached_property + def obj_scalar(self) -> plc.Scalar: + """ + A copy of the column object as a pylibcudf Scalar. + + Returns + ------- + pylibcudf Scalar object. + + Raises + ------ + ValueError + If the column is not length-1. + """ + if not self.is_scalar: + raise ValueError( + f"Cannot convert a column of length {self.obj.size()} to scalar" + ) + return plc.copying.get_element(self.obj, 0) + def sorted_like(self, like: Column, /) -> Self: """ Copy sortedness properties from a column onto self. @@ -81,6 +105,8 @@ def set_sorted( ------- Self with metadata set. """ + if self.obj.size() <= 1: + is_sorted = plc.types.Sorted.YES self.is_sorted = is_sorted self.order = order self.null_order = null_order diff --git a/python/cudf_polars/cudf_polars/containers/dataframe.py b/python/cudf_polars/cudf_polars/containers/dataframe.py index ac7e748095e..7039fcaf077 100644 --- a/python/cudf_polars/cudf_polars/containers/dataframe.py +++ b/python/cudf_polars/cudf_polars/containers/dataframe.py @@ -32,7 +32,7 @@ class DataFrame: """A representation of a dataframe.""" columns: list[NamedColumn] - table: plc.Table | None + table: plc.Table def __init__(self, columns: Sequence[NamedColumn]) -> None: self.columns = list(columns) @@ -41,7 +41,7 @@ def __init__(self, columns: Sequence[NamedColumn]) -> None: def copy(self) -> Self: """Return a shallow copy of self.""" - return type(self)(self.columns) + return type(self)([c.copy() for c in self.columns]) def to_polars(self) -> pl.DataFrame: """Convert to a polars DataFrame.""" @@ -70,8 +70,6 @@ def num_columns(self) -> int: @cached_property def num_rows(self) -> int: """Number of rows.""" - if self.table is None: - raise ValueError("Number of rows of frame with scalars makes no sense") return self.table.num_rows() @classmethod diff --git a/python/cudf_polars/cudf_polars/containers/scalar.py b/python/cudf_polars/cudf_polars/containers/scalar.py deleted file mode 100644 index fc97d0fd9c2..00000000000 --- a/python/cudf_polars/cudf_polars/containers/scalar.py +++ /dev/null @@ -1,23 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. -# SPDX-License-Identifier: Apache-2.0 - -"""A scalar, with some properties.""" - -from __future__ import annotations - -from typing import TYPE_CHECKING - -if TYPE_CHECKING: - import cudf._lib.pylibcudf as plc - -__all__: list[str] = ["Scalar"] - - -class Scalar: - """A scalar, and a name.""" - - __slots__ = ("obj", "name") - obj: plc.Scalar - - def __init__(self, scalar: plc.Scalar): - self.obj = scalar diff --git a/python/cudf_polars/cudf_polars/dsl/expr.py b/python/cudf_polars/cudf_polars/dsl/expr.py index 6d9435ce373..a81cdcbf0c3 100644 --- a/python/cudf_polars/cudf_polars/dsl/expr.py +++ b/python/cudf_polars/cudf_polars/dsl/expr.py @@ -5,7 +5,7 @@ """ DSL nodes for the polars expression language. -An expression node is a function, `DataFrame -> Column` or `DataFrame -> Scalar`. +An expression node is a function, `DataFrame -> Column`. The evaluation context is provided by a LogicalPlan node, and can affect the evaluation rule as well as providing the dataframe input. 
@@ -26,7 +26,7 @@ import cudf._lib.pylibcudf as plc -from cudf_polars.containers import Column, NamedColumn, Scalar +from cudf_polars.containers import Column, NamedColumn from cudf_polars.utils import sorting if TYPE_CHECKING: @@ -165,7 +165,7 @@ def do_evaluate( *, context: ExecutionContext = ExecutionContext.FRAME, mapping: Mapping[Expr, Column] | None = None, - ) -> Column: # TODO: return type is a lie for Literal + ) -> Column: """ Evaluate this expression given a dataframe for context. @@ -187,8 +187,7 @@ def do_evaluate( Returns ------- - Column representing the evaluation of the expression (or maybe - a scalar). + Column representing the evaluation of the expression. Raises ------ @@ -205,7 +204,7 @@ def evaluate( *, context: ExecutionContext = ExecutionContext.FRAME, mapping: Mapping[Expr, Column] | None = None, - ) -> Column: # TODO: return type is a lie for Literal + ) -> Column: """ Evaluate this expression given a dataframe for context. @@ -222,23 +221,13 @@ def evaluate( Notes ----- - Individual subclasses should implement :meth:`do_allocate`, + Individual subclasses should implement :meth:`do_evaluate`, this method provides logic to handle lookups in the substitution mapping. - The typed return value of :class:`Column` is not true when - evaluating :class:`Literal` nodes (which instead produce - :class:`Scalar` objects). However, these duck-type to having a - pylibcudf container object inside them, and usually they end - up appearing in binary expressions which pylibcudf handles - appropriately since there are overloads for (column, scalar) - pairs. We don't have to handle (scalar, scalar) in binops - since the polars optimizer has a constant-folding pass. - Returns ------- - Column representing the evaluation of the expression (or maybe - a scalar). + Column representing the evaluation of the expression. Raises ------ @@ -319,24 +308,35 @@ def evaluate( context: ExecutionContext = ExecutionContext.FRAME, mapping: Mapping[Expr, Column] | None = None, ) -> NamedColumn: - """Evaluate this expression given a dataframe for context.""" + """ + Evaluate this expression given a dataframe for context. + + Parameters + ---------- + df + DataFrame providing context + context + Execution context + mapping + Substitution mapping + + Returns + ------- + NamedColumn attaching a name to an evaluated Column + + See Also + -------- + :meth:`Expr.evaluate` for details, this function just adds the + name to a column produced from an expression. + """ obj = self.value.evaluate(df, context=context, mapping=mapping) - if isinstance(obj, Scalar): - return NamedColumn( - plc.Column.from_scalar(obj.obj, 1), - self.name, - is_sorted=plc.types.Sorted.YES, - order=plc.types.Order.ASCENDING, - null_order=plc.types.NullOrder.BEFORE, - ) - else: - return NamedColumn( - obj.obj, - self.name, - is_sorted=obj.is_sorted, - order=obj.order, - null_order=obj.null_order, - ) + return NamedColumn( + obj.obj, + self.name, + is_sorted=obj.is_sorted, + order=obj.order, + null_order=obj.null_order, + ) def collect_agg(self, *, depth: int) -> AggInfo: """Collect information about aggregations in groupbys.""" @@ -363,7 +363,7 @@ def do_evaluate( ) -> Column: """Evaluate this expression given a dataframe for context.""" # datatype of pyarrow scalar is correct by construction. 
- return Scalar(plc.interop.from_arrow(self.value)) # type: ignore + return Column(plc.Column.from_scalar(plc.interop.from_arrow(self.value), 1)) class Col(Expr): @@ -402,8 +402,14 @@ def do_evaluate( mapping: Mapping[Expr, Column] | None = None, ) -> Column: """Evaluate this expression given a dataframe for context.""" - # TODO: type is wrong, and dtype - return df.num_rows # type: ignore + return Column( + plc.Column.from_scalar( + plc.interop.from_arrow( + pa.scalar(df.num_rows, type=plc.interop.to_arrow(self.dtype)) + ), + 1, + ) + ) def collect_agg(self, *, depth: int) -> AggInfo: """Collect information about aggregations in groupbys.""" @@ -664,10 +670,24 @@ def do_evaluate( return Column(plc.strings.case.to_upper(column.obj)) elif self.name == pl_expr.StringFunction.EndsWith: column, suffix = columns - return Column(plc.strings.find.ends_with(column.obj, suffix.obj)) + return Column( + plc.strings.find.ends_with( + column.obj, + suffix.obj_scalar + if column.obj.size() != suffix.obj.size() and suffix.is_scalar + else suffix.obj, + ) + ) elif self.name == pl_expr.StringFunction.StartsWith: - column, suffix = columns - return Column(plc.strings.find.starts_with(column.obj, suffix.obj)) + column, prefix = columns + return Column( + plc.strings.find.starts_with( + column.obj, + prefix.obj_scalar + if column.obj.size() != prefix.obj.size() and prefix.is_scalar + else prefix.obj, + ) + ) else: raise NotImplementedError(f"StringFunction {self.name}") @@ -875,9 +895,6 @@ def __init__( self, dtype: plc.DataType, name: str, options: Any, value: Expr ) -> None: super().__init__(dtype) - # TODO: fix polars name - if name == "nunique": - name = "n_unique" self.name = name self.options = options self.children = (value,) @@ -1092,8 +1109,15 @@ def do_evaluate( child.evaluate(df, context=context, mapping=mapping) for child in self.children ) + lop = left.obj + rop = right.obj + if left.obj.size() != right.obj.size(): + if left.is_scalar: + lop = left.obj_scalar + elif right.is_scalar: + rop = right.obj_scalar return Column( - plc.binaryop.binary_operation(left.obj, right.obj, self.op, self.dtype), + plc.binaryop.binary_operation(lop, rop, self.op, self.dtype), ) def collect_agg(self, *, depth: int) -> AggInfo: diff --git a/python/cudf_polars/cudf_polars/dsl/ir.py b/python/cudf_polars/cudf_polars/dsl/ir.py index 665bbe5be41..0a6deb5698c 100644 --- a/python/cudf_polars/cudf_polars/dsl/ir.py +++ b/python/cudf_polars/cudf_polars/dsl/ir.py @@ -63,26 +63,58 @@ def broadcast( *columns: NamedColumn, target_length: int | None = None ) -> list[NamedColumn]: - lengths = {column.obj.size() for column in columns} - if len(lengths - {1}) > 1: - raise RuntimeError("Mismatching column lengths") + """ + Broadcast a sequence of columns to a common length. + + Parameters + ---------- + columns + Columns to broadcast. + target_length + Optional length to broadcast to. If not provided, uses the + non-unit length of existing columns. + + Returns + ------- + List of broadcasted columns all of the same length. + + Raises + ------ + RuntimeError + If broadcasting is not possible. + + Notes + ----- + In evaluation of a set of expressions, polars type-puns length-1 + columns with scalars. When we insert these into a DataFrame + object, we need to ensure they are of equal length. This function + takes some columns, some of which may be length-1 and ensures that + all length-1 columns are broadcast to the length of the others. 
+ + Broadcasting is only possible if the set of lengths of the input + columns is a subset of ``{1, n}`` for some (fixed) ``n``. If + ``target_length`` is provided and not all columns are length-1 + (i.e. ``n != 1``), then ``target_length`` must be equal to ``n``. + """ + lengths: set[int] = {column.obj.size() for column in columns} if lengths == {1}: if target_length is None: return list(columns) nrows = target_length - elif len(lengths) == 1: - if target_length is not None: - assert target_length in lengths - return list(columns) else: - (nrows,) = lengths - {1} - if target_length is not None: - assert target_length == nrows + try: + (nrows,) = lengths.difference([1]) + except ValueError as e: + raise RuntimeError("Mismatching column lengths") from e + if target_length is not None and nrows != target_length: + raise RuntimeError( + f"Cannot broadcast columns of length {nrows=} to {target_length=}" + ) return [ column if column.obj.size() != 1 else NamedColumn( - plc.Column.from_scalar(plc.copying.get_element(column.obj, 0), nrows), + plc.Column.from_scalar(column.obj_scalar, nrows), column.name, is_sorted=plc.types.Sorted.YES, order=plc.types.Order.ASCENDING, @@ -279,12 +311,16 @@ class Select(IR): """Input dataframe.""" expr: list[expr.NamedExpr] """List of expressions to evaluate to form the new dataframe.""" + should_broadcast: bool + """Should columns be broadcast?""" def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: """Evaluate and return a dataframe.""" df = self.df.evaluate(cache=cache) # Handle any broadcasting - columns = broadcast(*(e.evaluate(df) for e in self.expr)) + columns = [e.evaluate(df) for e in self.expr] + if self.should_broadcast: + columns = broadcast(*columns) return DataFrame(columns) @@ -587,15 +623,24 @@ class HStack(IR): """Input dataframe.""" columns: list[expr.NamedExpr] """List of expressions to produce new columns.""" + should_broadcast: bool + """Should columns be broadcast?""" def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: """Evaluate and return a dataframe.""" df = self.df.evaluate(cache=cache) columns = [c.evaluate(df) for c in self.columns] - # TODO: a bit of a hack, should inherit the should_broadcast - # property of polars' ProjectionOptions on the hstack node. - if not any(e.name.startswith("__POLARS_CSER_0x") for e in self.columns): + if self.should_broadcast: columns = broadcast(*columns, target_length=df.num_rows) + else: + # Polars ensures this is true, but let's make sure nothing + # went wrong. In this case, the parent node is a + # guaranteed to be a Select which will take care of making + # sure that everything is the same length. The result + # table that might have mismatching column lengths will + # never be turned into a pylibcudf Table with all columns + # by the Select, which is why this is safe. 
+ assert all(e.name.startswith("__POLARS_CSER_0x") for e in self.columns) return df.with_columns(columns) diff --git a/python/cudf_polars/cudf_polars/dsl/translate.py b/python/cudf_polars/cudf_polars/dsl/translate.py index 38107023365..adde3b1a9dc 100644 --- a/python/cudf_polars/cudf_polars/dsl/translate.py +++ b/python/cudf_polars/cudf_polars/dsl/translate.py @@ -122,7 +122,7 @@ def _( with set_node(visitor, node.input): inp = translate_ir(visitor, n=None) exprs = [translate_named_expr(visitor, n=e) for e in node.expr] - return ir.Select(schema, inp, exprs) + return ir.Select(schema, inp, exprs, node.should_broadcast) @_translate_ir.register @@ -166,7 +166,7 @@ def _( with set_node(visitor, node.input): inp = translate_ir(visitor, n=None) exprs = [translate_named_expr(visitor, n=e) for e in node.exprs] - return ir.HStack(schema, inp, exprs) + return ir.HStack(schema, inp, exprs, node.should_broadcast) @_translate_ir.register diff --git a/python/cudf_polars/cudf_polars/utils/sorting.py b/python/cudf_polars/cudf_polars/utils/sorting.py index d35459db20d..24fd449dd88 100644 --- a/python/cudf_polars/cudf_polars/utils/sorting.py +++ b/python/cudf_polars/cudf_polars/utils/sorting.py @@ -30,7 +30,7 @@ def sort_order( Returns ------- - tuple of column_order and null_precendence + tuple of column_order and null_precedence suitable for passing to sort routines """ # Mimicking polars broadcast handling of descending diff --git a/python/cudf_polars/pyproject.toml b/python/cudf_polars/pyproject.toml index 2faf8c3193f..11178a3be74 100644 --- a/python/cudf_polars/pyproject.toml +++ b/python/cudf_polars/pyproject.toml @@ -49,9 +49,6 @@ license-files = ["LICENSE"] [tool.setuptools.dynamic] version = {file = "cudf_polars/VERSION"} -[tool.setuptools.packages.find] -exclude = ["*tests*"] - [tool.pytest.ini_options] xfail_strict = true diff --git a/python/cudf_polars/tests/utils/__init__.py b/python/cudf_polars/tests/utils/__init__.py new file mode 100644 index 00000000000..4611d642f14 --- /dev/null +++ b/python/cudf_polars/tests/utils/__init__.py @@ -0,0 +1,6 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +# SPDX-License-Identifier: Apache-2.0 + +from __future__ import annotations + +__all__: list[str] = [] diff --git a/python/cudf_polars/tests/utils/test_broadcast.py b/python/cudf_polars/tests/utils/test_broadcast.py new file mode 100644 index 00000000000..69ad1e519e2 --- /dev/null +++ b/python/cudf_polars/tests/utils/test_broadcast.py @@ -0,0 +1,74 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. 
+# SPDX-License-Identifier: Apache-2.0 + +from __future__ import annotations + +import pytest + +import cudf._lib.pylibcudf as plc + +from cudf_polars.containers import NamedColumn +from cudf_polars.dsl.ir import broadcast + + +@pytest.mark.parametrize("target", [4, None]) +def test_broadcast_all_scalar(target): + columns = [ + NamedColumn( + plc.column_factories.make_numeric_column( + plc.DataType(plc.TypeId.INT8), 1, plc.MaskState.ALL_VALID + ), + f"col{i}", + ) + for i in range(3) + ] + result = broadcast(*columns, target_length=target) + expected = 1 if target is None else target + + assert all(column.obj.size() == expected for column in result) + + +def test_invalid_target_length(): + columns = [ + NamedColumn( + plc.column_factories.make_numeric_column( + plc.DataType(plc.TypeId.INT8), 4, plc.MaskState.ALL_VALID + ), + f"col{i}", + ) + for i in range(3) + ] + with pytest.raises(RuntimeError): + _ = broadcast(*columns, target_length=8) + + +def test_broadcast_mismatching_column_lengths(): + columns = [ + NamedColumn( + plc.column_factories.make_numeric_column( + plc.DataType(plc.TypeId.INT8), i + 1, plc.MaskState.ALL_VALID + ), + f"col{i}", + ) + for i in range(3) + ] + with pytest.raises(RuntimeError): + _ = broadcast(*columns) + + +@pytest.mark.parametrize("nrows", [0, 5]) +def test_broadcast_with_scalars(nrows): + columns = [ + NamedColumn( + plc.column_factories.make_numeric_column( + plc.DataType(plc.TypeId.INT8), + nrows if i == 0 else 1, + plc.MaskState.ALL_VALID, + ), + f"col{i}", + ) + for i in range(3) + ] + + result = broadcast(*columns) + assert all(column.obj.size() == nrows for column in result) From 8efa64ea61905969423bbfcc11353817c7cc1bca Mon Sep 17 00:00:00 2001 From: "Richard (Rick) Zamora" Date: Tue, 11 Jun 2024 11:31:20 -0500 Subject: [PATCH 46/48] Fix `dask_cudf.read_parquet` regression for legacy timestamp data (#15929) cudf does not currently support timezone-aware datetime columns. For example: ```python pdf = pd.DataFrame( { "time": pd.to_datetime( ["1996-01-02", "1996-12-01"], utc=True, ), "x": [1, 2], } ) cudf.DataFrame.from_pandas(pdf) ``` ``` NotImplementedError: cuDF does not yet support timezone-aware datetimes ``` However, `cudf.read_parquet` **does** allow you to read this same data from a Parquet file. This PR adds a simple fix to allow the same data to be read with `dask_cudf`. The dask_cudf version was previously "broken" because it relies on upstream pyarrow logic to construct `meta` as a pandas DataFrame (and then we just convert `meta` from pandas to cudf). As illustrated in the example above, this direct conversion is not allowed when one or more columns contain timezone information. **Important Context** The actual motivation for this PR is to fix a **regression** in 24.06+ for older parquet files containing "legacy" timestamp types (e.g. `TIMESTAMP_MILLIS` and `TIMESTAMP_MICROS`). In `pyarrow 14.0.2` (used by cudf-24.04), these legacy types were not automatically translated to timezone-aware dtypes by pyarrow. In `pyarrow 16.1.0` (used by cudf-24.06+), the legacy types **ARE** automatically translated. Therefore, in moving from cudf-24.04 to cudf-24.06+, some `dask_cudf` users will find that they can no longer read the same parquet file containing legacy timestamp data. I'm not entirely sure if cudf should always allow users to read Parquet data with timezone-aware dtypes (e.g. if the timezone is **not** utc), but it definitely makes sense for cudf to ignore automatic/unnecessary timezone translations. 
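To make the fix concrete, here is a minimal standalone sketch of the same idea (an illustration only, not the actual `dask_cudf` code path): stripping the timezone from any timezone-aware columns of the pandas `meta` frame before conversion lets `cudf.from_pandas` succeed:

```python
import pandas as pd
import cudf

pdf = pd.DataFrame(
    {
        "time": pd.to_datetime(["1996-01-02", "1996-12-01"], utc=True),
        "x": [1, 2],
    }
)

# Drop the timezone from every timezone-aware datetime column; the values
# keep their wall-clock times, only the tz metadata is removed.
for name, dtype in pdf.dtypes.items():
    if isinstance(dtype, pd.DatetimeTZDtype):
        pdf[name] = pdf[name].dt.tz_localize(None)

gdf = cudf.DataFrame.from_pandas(pdf)  # no longer raises NotImplementedError
```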
Authors: - Richard (Rick) Zamora (https://github.com/rjzamora) Approvers: - Matthew Roeschke (https://github.com/mroeschke) - Lawrence Mitchell (https://github.com/wence-) URL: https://github.com/rapidsai/cudf/pull/15929 --- python/dask_cudf/dask_cudf/io/parquet.py | 5 +++++ python/dask_cudf/dask_cudf/io/tests/test_parquet.py | 9 ++++----- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/python/dask_cudf/dask_cudf/io/parquet.py b/python/dask_cudf/dask_cudf/io/parquet.py index fc962670c47..ba8b1e89721 100644 --- a/python/dask_cudf/dask_cudf/io/parquet.py +++ b/python/dask_cudf/dask_cudf/io/parquet.py @@ -6,6 +6,7 @@ from io import BufferedWriter, BytesIO, IOBase import numpy as np +import pandas as pd from pyarrow import dataset as pa_ds, parquet as pq from dask import dataframe as dd @@ -41,6 +42,10 @@ def _create_dd_meta(cls, dataset_info, **kwargs): meta_pd = super()._create_dd_meta(dataset_info, **kwargs) # Convert to cudf + # (drop unsupported timezone information) + for k, v in meta_pd.dtypes.items(): + if isinstance(v, pd.DatetimeTZDtype) and v.tz is not None: + meta_pd[k] = meta_pd[k].dt.tz_localize(None) meta_cudf = cudf.from_pandas(meta_pd) # Re-set "object" dtypes to align with pa schema diff --git a/python/dask_cudf/dask_cudf/io/tests/test_parquet.py b/python/dask_cudf/dask_cudf/io/tests/test_parquet.py index f3e3911e6c7..620a917109e 100644 --- a/python/dask_cudf/dask_cudf/io/tests/test_parquet.py +++ b/python/dask_cudf/dask_cudf/io/tests/test_parquet.py @@ -610,9 +610,8 @@ def test_timezone_column(tmpdir): } ) pdf.to_parquet(path) + + # Check that `cudf` and `dask_cudf` results match got = dask_cudf.read_parquet(path) - # cudf.read_parquet does not support reading timezone aware types yet - assert got["time"].dtype == pd.DatetimeTZDtype("ns", "UTC") - got["time"] = got["time"].astype("datetime64[ns]") - expected = cudf.read_parquet(path) - dd.assert_eq(got, expected) + expect = cudf.read_parquet(path) + dd.assert_eq(got, expect) From d844d670dfbfcbaeb673253f762bed7fbebf6c86 Mon Sep 17 00:00:00 2001 From: Ben Jarmak <104460670+jarmak-nv@users.noreply.github.com> Date: Tue, 11 Jun 2024 13:05:01 -0400 Subject: [PATCH 47/48] Project automation bug fixes (#15971) ## Description This PR resolves two bugs in the recent pr #15945 ## external issue labeling Recent runs show that it is labeling [issues created](https://github.com/rapidsai/cudf/issues/15967) by team members as `External` Using graphQL to explore the authorAssociation shows `"authorAssociation": "MEMBER"` - I've updated the permissions to be specific to the job in an attempt to ensure that we have the permissions we need. Testing this action in personal repos shows it works as expected so not 100% on what's going on. A PR was also unable to run due to the token only having read permissions, so hopefully this is a two birds one stone fix. It may be beneficial to re-run https://github.com/rapidsai/cudf/actions/runs/9462546964/job/26065765728 with debug mode on to see if `author_association` is different to the action (which would be concerning) *edit test* ## project automation This fixes the workflow incorrectly calling my personal workflows for testing. ## Checklist - [x] I am familiar with the [Contributing Guidelines](https://github.com/rapidsai/cudf/blob/HEAD/CONTRIBUTING.md). 
- [ ] ~New or existing tests cover these changes.~ - [ ] ~The documentation is up to date with these changes.~ --- .github/workflows/external_issue_labeler.yml | 25 +++++++++++-------- .../workflows/pr_issue_status_automation.yml | 2 +- 2 files changed, 16 insertions(+), 11 deletions(-) diff --git a/.github/workflows/external_issue_labeler.yml b/.github/workflows/external_issue_labeler.yml index e6d987e9f34..81bc9b18296 100644 --- a/.github/workflows/external_issue_labeler.yml +++ b/.github/workflows/external_issue_labeler.yml @@ -20,36 +20,41 @@ on: types: - opened - pull_request: + pull_request_target: types: - opened env: GITHUB_TOKEN: ${{ github.token }} -permissions: - issues: write - pull-requests: write - jobs: Label-Issue: runs-on: ubuntu-latest - # Only run if the issue author is not part of RAPIDS - if: ${{ ! contains(fromJSON('["OWNER", "MEMBER", "COLLABORATOR"]'), github.event.issue.author_association)}} + permissions: + issues: write + if: github.event_name == 'issues' steps: - name: add-external-labels + # Only run if the issue author is not part of RAPIDS + if: ${{ ! contains(fromJSON('["OWNER", "MEMBER", "COLLABORATOR"]'), github.event.issue.author_association)}} run: | + echo ${{ github.event.issue.author_association }} issue_url=${{ github.event.issue.html_url }} gh issue edit ${issue_url} --add-label "External" continue-on-error: true Label-PR: runs-on: ubuntu-latest - # Only run if the issue author is not part of RAPIDS - if: ${{ ! contains(fromJSON('["OWNER", "MEMBER", "COLLABORATOR"]'), github.event.pull_request.author_association)}} + permissions: + pull-requests: write + issues: write + if: github.event_name == 'pull_request_target' steps: - name: add-external-labels + # Only run if the issue author is not part of RAPIDS + if: ${{ ! 
contains(fromJSON('["OWNER", "MEMBER", "COLLABORATOR"]'), github.event.pull_request.author_association)}} run: | + echo ${{ github.event.pull_request.author_association }} pr_url=${{ github.event.pull_request.html_url }} gh issue edit ${pr_url} --add-label "External" - continue-on-error: true + continue-on-error: true diff --git a/.github/workflows/pr_issue_status_automation.yml b/.github/workflows/pr_issue_status_automation.yml index aaece1bfa3e..837963c3286 100644 --- a/.github/workflows/pr_issue_status_automation.yml +++ b/.github/workflows/pr_issue_status_automation.yml @@ -50,7 +50,7 @@ jobs: update-sprint: # This job sets the PR and its linked issues to the current "Weekly Sprint" - uses: jarmak-nv/shared-workflows/.github/workflows/project-get-set-iteration-field.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/project-get-set-iteration-field.yaml@branch-24.08 if: github.event.pull_request.state == 'open' needs: get-project-id with: From dfa79d457138dcb9a70410e06c77c45a63ae0b25 Mon Sep 17 00:00:00 2001 From: Matthew Murray <41342305+Matt711@users.noreply.github.com> Date: Tue, 11 Jun 2024 14:58:06 -0400 Subject: [PATCH 48/48] Add a developer check for proxy objects (#15956) Closes #15864 Authors: - Matthew Murray (https://github.com/Matt711) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/15956 --- docs/cudf/source/developer_guide/cudf_pandas.md | 9 +++++++++ python/cudf/cudf/pandas/__init__.py | 5 +++-- python/cudf/cudf/pandas/fast_slow_proxy.py | 14 ++++++++++++++ .../cudf/cudf_pandas_tests/test_cudf_pandas.py | 16 +++++++++++++++- 4 files changed, 41 insertions(+), 3 deletions(-) diff --git a/docs/cudf/source/developer_guide/cudf_pandas.md b/docs/cudf/source/developer_guide/cudf_pandas.md index 827ba18a4a4..a8a6d81d6fb 100644 --- a/docs/cudf/source/developer_guide/cudf_pandas.md +++ b/docs/cudf/source/developer_guide/cudf_pandas.md @@ -20,6 +20,7 @@ The "wrapped" types/classes are the Pandas and cuDF specific types that have bee Wrapped objects and proxy objects are instances of wrapped types and proxy types, respectively. In the snippet below `s1` and `s2` are wrapped objects and `s3` is a fast-slow proxy object. Also note that the module `xpd` is a wrapped module and contains cuDF and Pandas modules as attributes. +To check if an object is a proxy type, we can use `cudf.pandas.is_proxy_object`. ```python import cudf.pandas cudf.pandas.install() @@ -31,6 +32,14 @@ Also note that the module `xpd` is a wrapped module and contains cuDF and Pandas s1 = cudf.Series([1,2]) s2 = pd.Series([1,2]) s3 = xpd.Series([1,2]) + + from cudf.pandas import is_proxy_object + + is_proxy_object(s1) # returns False + + is_proxy_object(s2) # returns False + + is_proxy_object(s3) # returns True ``` ```{note} diff --git a/python/cudf/cudf/pandas/__init__.py b/python/cudf/cudf/pandas/__init__.py index f2e855ae55c..5b3785531d3 100644 --- a/python/cudf/cudf/pandas/__init__.py +++ b/python/cudf/cudf/pandas/__init__.py @@ -1,11 +1,12 @@ -# SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. +# SPDX-FileCopyrightText: Copyright (c) 2023-2024, NVIDIA CORPORATION & AFFILIATES. # All rights reserved. 
# SPDX-License-Identifier: Apache-2.0 +from .fast_slow_proxy import is_proxy_object from .magics import load_ipython_extension from .profiler import Profiler -__all__ = ["Profiler", "load_ipython_extension", "install"] +__all__ = ["Profiler", "load_ipython_extension", "install", "is_proxy_object"] LOADED = False diff --git a/python/cudf/cudf/pandas/fast_slow_proxy.py b/python/cudf/cudf/pandas/fast_slow_proxy.py index 5f4cf2e6cc6..128913e5746 100644 --- a/python/cudf/cudf/pandas/fast_slow_proxy.py +++ b/python/cudf/cudf/pandas/fast_slow_proxy.py @@ -1185,6 +1185,20 @@ def _replace_closurevars( ) +def is_proxy_object(obj: Any) -> bool: + """Determine if an object is proxy object + + Parameters + ---------- + obj : object + Any python object. + + """ + if _FastSlowProxyMeta in type(type(obj)).__mro__: + return True + return False + + NUMPY_TYPES: Set[str] = set(np.sctypeDict.values()) diff --git a/python/cudf/cudf_pandas_tests/test_cudf_pandas.py b/python/cudf/cudf_pandas_tests/test_cudf_pandas.py index 72e9ad5fca3..515a4714a5a 100644 --- a/python/cudf/cudf_pandas_tests/test_cudf_pandas.py +++ b/python/cudf/cudf_pandas_tests/test_cudf_pandas.py @@ -20,7 +20,7 @@ from pytz import utc from cudf.pandas import LOADED, Profiler -from cudf.pandas.fast_slow_proxy import _Unusable +from cudf.pandas.fast_slow_proxy import _Unusable, is_proxy_object if not LOADED: raise ImportError("These tests must be run with cudf.pandas loaded") @@ -1488,3 +1488,17 @@ def mock_mean_none(self, *args, **kwargs): def test_excelwriter_pathlike(): assert isinstance(pd.ExcelWriter("foo.xlsx"), os.PathLike) + + +def test_is_proxy_object(): + np_arr = np.array([1]) + + s1 = xpd.Series([1]) + s2 = pd.Series([1]) + + np_arr_proxy = s1.to_numpy() + + assert not is_proxy_object(np_arr) + assert is_proxy_object(np_arr_proxy) + assert is_proxy_object(s1) + assert not is_proxy_object(s2)
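As a closing note on the `is_proxy_object` check added above: proxy classes are created with `_FastSlowProxyMeta` as their metaclass, so an instance can be recognized by inspecting the metaclass MRO of its type. The toy sketch below shows the same pattern in isolation (the names here are made up for illustration and are not cudf's):

```python
class ProxyMeta(type):
    """Stand-in for a proxy-creating metaclass (hypothetical name)."""


class FastSlowProxy(metaclass=ProxyMeta):
    """Base class whose subclasses all inherit the ProxyMeta metaclass."""


class SeriesProxy(FastSlowProxy):
    pass


def looks_like_proxy(obj) -> bool:
    # type(obj) is the instance's class; type(type(obj)) is that class's
    # metaclass, so checking the metaclass MRO also catches subclasses.
    return ProxyMeta in type(type(obj)).__mro__


assert looks_like_proxy(SeriesProxy())
assert not looks_like_proxy([1, 2, 3])
```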