From 5f45803b2a68b49d330d94e2f701791a7590612a Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Thu, 6 Jun 2024 13:00:12 -0700 Subject: [PATCH] Migrate quantile.pxd to pylibcudf (#15874) xref #15162 Migrate quantile.pxd to use pylibcudf APIs. Authors: - Thomas Li (https://github.com/lithomas1) Approvers: - Lawrence Mitchell (https://github.com/wence-) - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/15874 --- cpp/src/quantiles/quantiles.cu | 4 +- cpp/tests/quantiles/quantiles_test.cpp | 9 +- .../user_guide/api_docs/pylibcudf/index.rst | 1 + .../api_docs/pylibcudf/quantiles.rst | 6 + .../cudf/cudf/_lib/pylibcudf/CMakeLists.txt | 1 + python/cudf/cudf/_lib/pylibcudf/__init__.pxd | 2 + python/cudf/cudf/_lib/pylibcudf/__init__.py | 2 + python/cudf/cudf/_lib/pylibcudf/quantiles.pxd | 25 ++ python/cudf/cudf/_lib/pylibcudf/quantiles.pyx | 152 ++++++++++++ python/cudf/cudf/_lib/quantiles.pyx | 102 ++------ python/cudf/cudf/pylibcudf_tests/conftest.py | 29 +++ .../cudf/pylibcudf_tests/test_quantiles.py | 234 ++++++++++++++++++ 12 files changed, 486 insertions(+), 81 deletions(-) create mode 100644 docs/cudf/source/user_guide/api_docs/pylibcudf/quantiles.rst create mode 100644 python/cudf/cudf/_lib/pylibcudf/quantiles.pxd create mode 100644 python/cudf/cudf/_lib/pylibcudf/quantiles.pyx create mode 100644 python/cudf/cudf/pylibcudf_tests/test_quantiles.py diff --git a/cpp/src/quantiles/quantiles.cu b/cpp/src/quantiles/quantiles.cu index c0f536536ce..af3bda2e62e 100644 --- a/cpp/src/quantiles/quantiles.cu +++ b/cpp/src/quantiles/quantiles.cu @@ -34,6 +34,7 @@ #include #include +#include #include namespace cudf { @@ -78,7 +79,8 @@ std::unique_ptr quantiles(table_view const& input, CUDF_EXPECTS(interp == interpolation::HIGHER || interp == interpolation::LOWER || interp == interpolation::NEAREST, - "multi-column quantiles require a non-arithmetic interpolation strategy."); + "multi-column quantiles require a non-arithmetic interpolation strategy.", + std::invalid_argument); CUDF_EXPECTS(input.num_rows() > 0, "multi-column quantiles require at least one input row."); diff --git a/cpp/tests/quantiles/quantiles_test.cpp b/cpp/tests/quantiles/quantiles_test.cpp index 5b7b6dd2718..b7faa20e8c1 100644 --- a/cpp/tests/quantiles/quantiles_test.cpp +++ b/cpp/tests/quantiles/quantiles_test.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -25,6 +25,8 @@ #include #include +#include + template struct QuantilesTest : public cudf::test::BaseFixture {}; @@ -104,9 +106,10 @@ TYPED_TEST(QuantilesTest, TestMultiColumnArithmeticInterpolation) cudf::test::fixed_width_column_wrapper input_b({}); auto input = cudf::table_view({input_a}); - EXPECT_THROW(cudf::quantiles(input, {0.0f}, cudf::interpolation::LINEAR), cudf::logic_error); + EXPECT_THROW(cudf::quantiles(input, {0.0f}, cudf::interpolation::LINEAR), std::invalid_argument); - EXPECT_THROW(cudf::quantiles(input, {0.0f}, cudf::interpolation::MIDPOINT), cudf::logic_error); + EXPECT_THROW(cudf::quantiles(input, {0.0f}, cudf::interpolation::MIDPOINT), + std::invalid_argument); } TYPED_TEST(QuantilesTest, TestMultiColumnUnsorted) diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst index 870ed8856d1..1e03fa80bb5 100644 --- a/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst @@ -21,6 +21,7 @@ This page provides API documentation for pylibcudf. join lists merge + quantiles reduce reshape rolling diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/quantiles.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/quantiles.rst new file mode 100644 index 00000000000..3417c1ff59d --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/quantiles.rst @@ -0,0 +1,6 @@ +========= +quantiles +========= + +.. automodule:: cudf._lib.pylibcudf.quantiles + :members: diff --git a/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt b/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt index 6beb7b0f506..ed396208f98 100644 --- a/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt +++ b/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt @@ -26,6 +26,7 @@ set(cython_sources join.pyx lists.pyx merge.pyx + quantiles.pyx reduce.pyx replace.pyx reshape.pyx diff --git a/python/cudf/cudf/_lib/pylibcudf/__init__.pxd b/python/cudf/cudf/_lib/pylibcudf/__init__.pxd index b289d112a90..a628ecdb038 100644 --- a/python/cudf/cudf/_lib/pylibcudf/__init__.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/__init__.pxd @@ -12,6 +12,7 @@ from . cimport ( join, lists, merge, + quantiles, reduce, replace, reshape, @@ -48,6 +49,7 @@ __all__ = [ "join", "lists", "merge", + "quantiles", "reduce", "replace", "rolling", diff --git a/python/cudf/cudf/_lib/pylibcudf/__init__.py b/python/cudf/cudf/_lib/pylibcudf/__init__.py index 2565332f3ed..46d0fe13cd1 100644 --- a/python/cudf/cudf/_lib/pylibcudf/__init__.py +++ b/python/cudf/cudf/_lib/pylibcudf/__init__.py @@ -12,6 +12,7 @@ join, lists, merge, + quantiles, reduce, replace, reshape, @@ -48,6 +49,7 @@ "join", "lists", "merge", + "quantiles", "reduce", "replace", "rolling", diff --git a/python/cudf/cudf/_lib/pylibcudf/quantiles.pxd b/python/cudf/cudf/_lib/pylibcudf/quantiles.pxd new file mode 100644 index 00000000000..70ff135ca77 --- /dev/null +++ b/python/cudf/cudf/_lib/pylibcudf/quantiles.pxd @@ -0,0 +1,25 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +from libcpp.vector cimport vector + +from cudf._lib.pylibcudf.libcudf.types cimport interpolation, sorted + +from .column cimport Column +from .table cimport Table + + +cpdef Column quantile( + Column input, + vector[double] q, + interpolation interp = *, + Column ordered_indices = *, + bint exact = * +) + +cpdef Table quantiles( + Table input, + vector[double] q, + interpolation interp = *, + sorted is_input_sorted = *, + list column_order = *, + list null_precedence = *, +) diff --git a/python/cudf/cudf/_lib/pylibcudf/quantiles.pyx b/python/cudf/cudf/_lib/pylibcudf/quantiles.pyx new file mode 100644 index 00000000000..c1f0e30ccd3 --- /dev/null +++ b/python/cudf/cudf/_lib/pylibcudf/quantiles.pyx @@ -0,0 +1,152 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from libcpp cimport bool +from libcpp.memory cimport unique_ptr +from libcpp.utility cimport move +from libcpp.vector cimport vector + +from cudf._lib.pylibcudf.libcudf.column.column cimport column +from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view +from cudf._lib.pylibcudf.libcudf.quantiles cimport ( + quantile as cpp_quantile, + quantiles as cpp_quantiles, +) +from cudf._lib.pylibcudf.libcudf.table.table cimport table +from cudf._lib.pylibcudf.libcudf.types cimport null_order, order, sorted + +from .column cimport Column +from .table cimport Table +from .types cimport interpolation + + +cpdef Column quantile( + Column input, + vector[double] q, + interpolation interp = interpolation.LINEAR, + Column ordered_indices = None, + bool exact=True +): + """Computes quantiles with interpolation. + + Computes the specified quantiles by interpolating values between which they lie, + using the interpolation strategy specified in interp. + + Parameters + ---------- + input: Column + The Column to calculate quantiles on. + q: array-like that implements buffer-protocol + The quantiles to calculate in range [0,1] + interp: Interpolation, default Interpolation.LINEAR + The strategy used to select between values adjacent to a specified quantile. + ordered_indices: Column, default empty column + The column containing the sorted order of input. + + If empty, all input values are used in existing order. + Indices must be in range [0, input.size()), but are not required to be unique. + Values not indexed by this column will be ignored. + exact: bool, default True + Returns doubles if True. Otherwise, returns same type as input + + For details, see :cpp:func:`quantile`. + + Returns + ------- + Column + A Column containing specified quantiles, with nulls for indeterminable values + """ + cdef: + unique_ptr[column] c_result + column_view ordered_indices_view + + if ordered_indices is None: + ordered_indices_view = column_view() + else: + ordered_indices_view = ordered_indices.view() + + with nogil: + c_result = move( + cpp_quantile( + input.view(), + q, + interp, + ordered_indices_view, + exact, + ) + ) + + return Column.from_libcudf(move(c_result)) + + +cpdef Table quantiles( + Table input, + vector[double] q, + interpolation interp = interpolation.NEAREST, + sorted is_input_sorted = sorted.NO, + list column_order = None, + list null_precedence = None, +): + """Computes row quantiles with interpolation. + + Computes the specified quantiles by retrieving the row corresponding to the + specified quantiles. In the event a quantile lies in between rows, the specified + interpolation strategy is used to pick between the rows. + + Parameters + ---------- + input: Table + The Table to calculate row quantiles on. + q: array-like + The quantiles to calculate in range [0,1] + interp: Interpolation, default Interpolation.NEAREST + The strategy used to select between values adjacent to a specified quantile. + + Must be a non-arithmetic interpolation strategy + (i.e. one of + {`Interpolation.HIGHER`, `Interpolation.LOWER`, `Interpolation.NEAREST`}) + is_input_sorted: Sorted, default Sorted.NO + Whether the input table has been pre-sorted or not. + column_order: list, default None + A list of `Order` enums, + indicating the desired sort order for each column. + By default, will sort all columns so that they are in ascending order. + + Ignored if `is_input_sorted` is `Sorted.YES` + null_precedence: list, default None + A list of `NullOrder` enums, + indicating how nulls should be sorted. + By default, will sort all columns so that nulls appear before + all other elements. + + Ignored if `is_input_sorted` is `Sorted.YES` + + For details, see :cpp:func:`quantiles`. + + Returns + ------- + Column + A Column containing specified quantiles, with nulls for indeterminable values + """ + cdef: + unique_ptr[table] c_result + vector[order] column_order_vec + vector[null_order] null_precedence_vec + + if column_order is not None: + column_order_vec = column_order + if null_precedence is not None: + null_precedence_vec = null_precedence + + with nogil: + c_result = move( + cpp_quantiles( + input.view(), + q, + interp, + is_input_sorted, + column_order_vec, + null_precedence_vec, + ) + ) + + return Table.from_libcudf(move(c_result)) diff --git a/python/cudf/cudf/_lib/quantiles.pyx b/python/cudf/cudf/_lib/quantiles.pyx index 3d20454a7ce..7b50c00919a 100644 --- a/python/cudf/cudf/_lib/quantiles.pyx +++ b/python/cudf/cudf/_lib/quantiles.pyx @@ -3,76 +3,43 @@ from cudf.core.buffer import acquire_spill_lock from libcpp cimport bool -from libcpp.memory cimport unique_ptr -from libcpp.utility cimport move from libcpp.vector cimport vector from cudf._lib.column cimport Column from cudf._lib.types cimport ( underlying_type_t_interpolation, - underlying_type_t_null_order, - underlying_type_t_order, underlying_type_t_sorted, ) from cudf._lib.types import Interpolation -from cudf._lib.pylibcudf.libcudf.column.column cimport column -from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view -from cudf._lib.pylibcudf.libcudf.quantiles cimport ( - quantile as cpp_quantile, - quantiles as cpp_quantile_table, -) -from cudf._lib.pylibcudf.libcudf.table.table cimport table -from cudf._lib.pylibcudf.libcudf.table.table_view cimport table_view -from cudf._lib.pylibcudf.libcudf.types cimport ( - interpolation, - null_order, - order, - sorted, -) -from cudf._lib.utils cimport columns_from_unique_ptr, table_view_from_columns +from cudf._lib.pylibcudf.libcudf.types cimport interpolation, sorted +from cudf._lib.utils cimport columns_from_pylibcudf_table + +import cudf._lib.pylibcudf as plc @acquire_spill_lock() def quantile( Column input, - object q, + vector[double] q, str interp, Column ordered_indices, bool exact, - ): - cdef column_view c_input = input.view() - cdef column_view c_ordered_indices = ( - column_view() if ordered_indices is None - else ordered_indices.view() - ) cdef interpolation c_interp = ( Interpolation[interp.upper()] ) - cdef bool c_exact = exact - - cdef vector[double] c_q - c_q.reserve(len(q)) - - for value in q: - c_q.push_back(value) - cdef unique_ptr[column] c_result - - with nogil: - c_result = move( - cpp_quantile( - c_input, - c_q, - c_interp, - c_ordered_indices, - c_exact, - ) + return Column.from_pylibcudf( + plc.quantiles.quantile( + input.to_pylibcudf(mode="read"), + q, + c_interp, + ordered_indices.to_pylibcudf(mode="read"), + exact ) - - return Column.from_unique_ptr(move(c_result)) + ) def quantile_table( @@ -83,42 +50,23 @@ def quantile_table( list column_order, list null_precedence, ): - cdef table_view c_input = table_view_from_columns(source_columns) - cdef vector[double] c_q = q + cdef interpolation c_interp = ( interp ) cdef sorted c_is_input_sorted = ( is_input_sorted ) - cdef vector[order] c_column_order - cdef vector[null_order] c_null_precedence - - c_column_order.reserve(len(column_order)) - c_null_precedence.reserve(len(null_precedence)) - - for value in column_order: - c_column_order.push_back( - ( value) - ) - for value in null_precedence: - c_null_precedence.push_back( - ( value) + return columns_from_pylibcudf_table( + plc.quantiles.quantiles( + plc.Table([ + c.to_pylibcudf(mode="read") for c in source_columns + ]), + q, + c_interp, + c_is_input_sorted, + column_order, + null_precedence ) - - cdef unique_ptr[table] c_result - - with nogil: - c_result = move( - cpp_quantile_table( - c_input, - c_q, - c_interp, - c_is_input_sorted, - c_column_order, - c_null_precedence, - ) - ) - - return columns_from_unique_ptr(move(c_result)) + ) diff --git a/python/cudf/cudf/pylibcudf_tests/conftest.py b/python/cudf/cudf/pylibcudf_tests/conftest.py index 6d8284fb3db..f3c6584ef8c 100644 --- a/python/cudf/cudf/pylibcudf_tests/conftest.py +++ b/python/cudf/cudf/pylibcudf_tests/conftest.py @@ -7,6 +7,8 @@ import pyarrow as pa import pytest +import cudf._lib.pylibcudf as plc + sys.path.insert(0, os.path.join(os.path.dirname(__file__), "common")) from utils import DEFAULT_STRUCT_TESTING_TYPE @@ -29,3 +31,30 @@ ) def pa_type(request): return request.param + + +@pytest.fixture( + scope="session", + params=[ + pa.int64(), + pa.float64(), + pa.uint64(), + ], +) +def numeric_pa_type(request): + return request.param + + +@pytest.fixture( + scope="session", params=[opt for opt in plc.types.Interpolation] +) +def interp_opt(request): + return request.param + + +@pytest.fixture( + scope="session", + params=[opt for opt in plc.types.Sorted], +) +def sorted_opt(request): + return request.param diff --git a/python/cudf/cudf/pylibcudf_tests/test_quantiles.py b/python/cudf/cudf/pylibcudf_tests/test_quantiles.py new file mode 100644 index 00000000000..a5d332a7795 --- /dev/null +++ b/python/cudf/cudf/pylibcudf_tests/test_quantiles.py @@ -0,0 +1,234 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +import numpy as np +import pyarrow as pa +import pyarrow.compute as pc +import pytest +from utils import assert_column_eq, assert_table_eq + +import cudf._lib.pylibcudf as plc + +# Map pylibcudf interpolation options to pyarrow options +interp_mapping = { + plc.types.Interpolation.LINEAR: "linear", + plc.types.Interpolation.LOWER: "lower", + plc.types.Interpolation.HIGHER: "higher", + plc.types.Interpolation.MIDPOINT: "midpoint", + plc.types.Interpolation.NEAREST: "nearest", +} + + +@pytest.fixture(scope="module", params=[[1, 2, 3, 4, 5], [5, 4, 3, 2, 1]]) +def pa_col_data(request, numeric_pa_type): + return pa.array(request.param, type=numeric_pa_type) + + +@pytest.fixture(scope="module") +def plc_col_data(pa_col_data): + return plc.interop.from_arrow(pa_col_data) + + +@pytest.fixture( + scope="module", + params=[ + { + "arrays": [[1, 2, 3, 5, 4], [5.0, 6.0, 8.0, 7.0, 9.0]], + "schema": pa.schema( + [ + ("a", pa.int64()), + ("b", pa.int64()), + ] + ), + }, + { + "arrays": [ + [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], + [1, 2.0, 2.2, 2.3, 2.4, None, None, 3.5, 4.5, 5.5], + ], + "schema": pa.schema( + [ + ("a", pa.int64()), + ("b", pa.float64()), + ] + ), + }, + ], +) +def plc_tbl_data(request): + return plc.interop.from_arrow(pa.Table.from_arrays(**request.param)) + + +@pytest.mark.parametrize("q", [[], [0], [0.5], [0.1, 0.5, 0.7, 0.9]]) +@pytest.mark.parametrize("exact", [True, False]) +def test_quantile(pa_col_data, plc_col_data, interp_opt, q, exact): + ordered_indices = plc.interop.from_arrow( + pc.cast(pc.sort_indices(pa_col_data), pa.int32()) + ) + res = plc.quantiles.quantile( + plc_col_data, q, interp_opt, ordered_indices, exact + ) + + pa_interp_opt = interp_mapping[interp_opt] + + if exact: + pa_col_data = pc.cast(pa_col_data, pa.float64()) + + if len(q) > 0: + # pyarrow quantile doesn't support empty q + exp = pc.quantile(pa_col_data, q=q, interpolation=pa_interp_opt) + else: + exp = pa.array([], type=pa.float64()) + + if not exact: + exp = pc.cast(exp, pa_col_data.type, safe=False) + + assert_column_eq(exp, res) + + +def _pyarrow_quantiles( + pa_tbl_data, + q, + interp_opt=plc.types.Interpolation.NEAREST, + sorted_opt=plc.types.Sorted.NO, + column_order=None, + null_precedence=None, +): + """ + The pyarrow equivalent of plc.quantiles.quantiles + + Takes the same arguments (except input should be a pyarrow table instead of + of a pylibcudf table) + + NOTE: This function doesn't support having different null precedences because of + a lack of support in pyarrow. + """ + if len(q) > 0: + # pyarrow quantile doesn't support empty q + pa_interp_opt = interp_mapping[interp_opt] + + if sorted_opt == plc.types.Sorted.NO: + order_mapper = { + plc.types.Order.ASCENDING: "ascending", + plc.types.Order.DESCENDING: "descending", + } + if null_precedence is None: + null_precedence = [plc.types.NullOrder.BEFORE] * len( + pa_tbl_data.columns + ) + if column_order is None: + column_order = [plc.types.Order.ASCENDING] * len( + pa_tbl_data.columns + ) + + if not all( + [ + null_prec == null_precedence[0] + for null_prec in null_precedence + ] + ): + raise NotImplementedError( + "Having varying null precendences is not implemented!" + ) + + pa_tbl_data = pa_tbl_data.sort_by( + [ + (name, order_mapper[order]) + for name, order in zip( + pa_tbl_data.column_names, column_order + ) + ], + null_placement="at_start" + if null_precedence[0] == plc.types.NullOrder.BEFORE + else "at_end", + ) + row_idxs = pc.quantile( + np.arange(0, len(pa_tbl_data)), q=q, interpolation=pa_interp_opt + ) + exp = pa_tbl_data.take(row_idxs) + else: + exp = pa.Table.from_arrays( + [[] for _ in range(len(pa_tbl_data.schema))], + schema=pa_tbl_data.schema, + ) + return exp + + +@pytest.mark.parametrize( + "q", [[], [0.1], [0.2], [0.3], [0.4], [0.5], [0.1, 0.5, 0.7, 0.9]] +) +@pytest.mark.parametrize( + "column_order", [[plc.types.Order.ASCENDING, plc.types.Order.ASCENDING]] +) +@pytest.mark.parametrize( + "null_precedence", + [ + [plc.types.NullOrder.BEFORE, plc.types.NullOrder.BEFORE], + [plc.types.NullOrder.AFTER, plc.types.NullOrder.AFTER], + ], +) +def test_quantiles( + plc_tbl_data, interp_opt, q, sorted_opt, column_order, null_precedence +): + if interp_opt in { + plc.types.Interpolation.LINEAR, + plc.types.Interpolation.MIDPOINT, + }: + pytest.skip( + "interp cannot be an arithmetic interpolation strategy for quantiles" + ) + + pa_tbl_data = plc.interop.to_arrow(plc_tbl_data, ["a", "b"]) + + exp = _pyarrow_quantiles( + pa_tbl_data, + q=q, + interp_opt=interp_opt, + sorted_opt=sorted_opt, + column_order=column_order, + null_precedence=null_precedence, + ) + + res = plc.quantiles.quantiles( + plc_tbl_data, q, interp_opt, sorted_opt, column_order, null_precedence + ) + + assert_table_eq(exp, res) + + +@pytest.mark.parametrize( + "invalid_interp", + [plc.types.Interpolation.LINEAR, plc.types.Interpolation.MIDPOINT], +) +def test_quantiles_invalid_interp(plc_tbl_data, invalid_interp): + with pytest.raises(ValueError): + plc.quantiles.quantiles( + plc_tbl_data, q=np.array([0.1]), interp=invalid_interp + ) + + +@pytest.mark.parametrize( + "q", + [[0.1], (0.1,), np.array([0.1])], +) +def test_quantile_q_array_like(pa_col_data, plc_col_data, q): + ordered_indices = plc.interop.from_arrow( + pc.cast(pc.sort_indices(pa_col_data), pa.int32()) + ) + res = plc.quantiles.quantile( + plc_col_data, + q=q, + ordered_indices=ordered_indices, + ) + exp = pc.quantile(pa_col_data, q=q) + assert_column_eq(exp, res) + + +@pytest.mark.parametrize( + "q", + [[0.1], (0.1,), np.array([0.1])], +) +def test_quantiles_q_array_like(plc_tbl_data, q): + res = plc.quantiles.quantiles(plc_tbl_data, q=q) + pa_tbl_data = plc.interop.to_arrow(plc_tbl_data, ["a", "b"]) + exp = _pyarrow_quantiles(pa_tbl_data, q=q) + assert_table_eq(exp, res)