From e42b91bfca834c55f1b1c77bd4d6b1542523fd5e Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Wed, 25 Sep 2024 16:21:36 -0500 Subject: [PATCH 01/26] Add polars to "all" dependency list. (#16875) This adds Polars to the "all" dependency list, ensuring that devcontainers and developers using the conda environment can use the Polars GPU backend provided by cudf. Authors: - Bradley Dice (https://github.com/bdice) - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) - GALI PREM SAGAR (https://github.com/galipremsagar) URL: https://github.com/rapidsai/cudf/pull/16875 --- conda/environments/all_cuda-118_arch-x86_64.yaml | 1 + conda/environments/all_cuda-125_arch-x86_64.yaml | 1 + dependencies.yaml | 1 + 3 files changed, 3 insertions(+) diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index 16b3d112992..5a05dfd0530 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -65,6 +65,7 @@ dependencies: - pandas - pandas>=2.0,<2.2.3dev0 - pandoc +- polars>=1.8,<1.9 - pre-commit - ptxcompiler - pyarrow>=14.0.0,<18.0.0a0 diff --git a/conda/environments/all_cuda-125_arch-x86_64.yaml b/conda/environments/all_cuda-125_arch-x86_64.yaml index cce2e0eea84..8490296233d 100644 --- a/conda/environments/all_cuda-125_arch-x86_64.yaml +++ b/conda/environments/all_cuda-125_arch-x86_64.yaml @@ -63,6 +63,7 @@ dependencies: - pandas - pandas>=2.0,<2.2.3dev0 - pandoc +- polars>=1.8,<1.9 - pre-commit - pyarrow>=14.0.0,<18.0.0a0 - pydata-sphinx-theme!=0.14.2 diff --git a/dependencies.yaml b/dependencies.yaml index 339adbc5ff9..6909eb7168d 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -25,6 +25,7 @@ files: - rapids_build_setuptools - run_common - run_cudf + - run_cudf_polars - run_pylibcudf - run_dask_cudf - run_custreamz From c1f377ab911748700c032465d0b237c6a792d984 Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Wed, 25 Sep 2024 17:51:44 -0400 Subject: [PATCH 02/26] Migrate ORC reader to pylibcudf (#16042) xref #15162 Authors: - Thomas Li (https://github.com/lithomas1) - Vyas Ramasubramani (https://github.com/vyasr) - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) - Lawrence Mitchell (https://github.com/wence-) URL: https://github.com/rapidsai/cudf/pull/16042 --- python/cudf/cudf/_lib/io/utils.pxd | 4 + python/cudf/cudf/_lib/orc.pyx | 313 ++++-------------- python/cudf/cudf/_lib/utils.pxd | 2 +- python/cudf/cudf/_lib/utils.pyx | 8 +- python/cudf/cudf/io/orc.py | 11 +- python/cudf/cudf/tests/test_orc.py | 34 +- python/cudf/cudf/utils/ioutils.py | 4 +- python/pylibcudf/pylibcudf/io/CMakeLists.txt | 2 +- python/pylibcudf/pylibcudf/io/__init__.pxd | 2 +- python/pylibcudf/pylibcudf/io/__init__.py | 2 +- python/pylibcudf/pylibcudf/io/orc.pxd | 50 +++ python/pylibcudf/pylibcudf/io/orc.pyx | 302 +++++++++++++++++ python/pylibcudf/pylibcudf/io/types.pyx | 1 + python/pylibcudf/pylibcudf/libcudf/io/orc.pxd | 1 + .../pylibcudf/libcudf/io/orc_metadata.pxd | 2 +- .../pylibcudf/pylibcudf/tests/common/utils.py | 37 ++- .../pylibcudf/pylibcudf/tests/io/test_csv.py | 8 +- .../pylibcudf/pylibcudf/tests/io/test_orc.py | 53 +++ 18 files changed, 537 insertions(+), 299 deletions(-) create mode 100644 python/pylibcudf/pylibcudf/io/orc.pxd create mode 100644 python/pylibcudf/pylibcudf/io/orc.pyx create mode 100644 
python/pylibcudf/pylibcudf/tests/io/test_orc.py diff --git a/python/cudf/cudf/_lib/io/utils.pxd b/python/cudf/cudf/_lib/io/utils.pxd index 1938f00c179..76a6e32fde0 100644 --- a/python/cudf/cudf/_lib/io/utils.pxd +++ b/python/cudf/cudf/_lib/io/utils.pxd @@ -21,6 +21,10 @@ cdef add_df_col_struct_names( df, child_names_dict ) +cdef update_col_struct_field_names( + Column col, + child_names +) cdef update_struct_field_names( table, vector[column_name_info]& schema_info) diff --git a/python/cudf/cudf/_lib/orc.pyx b/python/cudf/cudf/_lib/orc.pyx index adeba6fffb1..f88c48ce989 100644 --- a/python/cudf/cudf/_lib/orc.pyx +++ b/python/cudf/cudf/_lib/orc.pyx @@ -1,8 +1,5 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. -import cudf -from cudf.core.buffer import acquire_spill_lock - from libc.stdint cimport int64_t from libcpp cimport bool, int from libcpp.map cimport map @@ -11,187 +8,43 @@ from libcpp.string cimport string from libcpp.utility cimport move from libcpp.vector cimport vector -import datetime from collections import OrderedDict -cimport pylibcudf.libcudf.lists.lists_column_view as cpp_lists_column_view - try: import ujson as json except ImportError: import json cimport pylibcudf.libcudf.io.types as cudf_io_types +cimport pylibcudf.libcudf.lists.lists_column_view as cpp_lists_column_view from pylibcudf.libcudf.io.data_sink cimport data_sink from pylibcudf.libcudf.io.orc cimport ( chunked_orc_writer_options, orc_chunked_writer, - orc_reader_options, orc_writer_options, - read_orc as libcudf_read_orc, write_orc as libcudf_write_orc, ) -from pylibcudf.libcudf.io.orc_metadata cimport ( - binary_statistics, - bucket_statistics, - column_statistics, - date_statistics, - decimal_statistics, - double_statistics, - integer_statistics, - no_statistics, - parsed_orc_statistics, - read_parsed_orc_statistics as libcudf_read_parsed_orc_statistics, - statistics_type, - string_statistics, - timestamp_statistics, -) from pylibcudf.libcudf.io.types cimport ( column_in_metadata, compression_type, sink_info, - source_info, table_input_metadata, - table_with_metadata, ) from pylibcudf.libcudf.table.table_view cimport table_view -from pylibcudf.libcudf.types cimport data_type, size_type, type_id -from pylibcudf.variant cimport get_if as std_get_if, holds_alternative from cudf._lib.column cimport Column -from cudf._lib.io.utils cimport ( - make_sink_info, - make_source_info, - update_column_struct_field_names, -) +from cudf._lib.io.utils cimport make_sink_info, update_col_struct_field_names +from cudf._lib.utils cimport data_from_pylibcudf_io, table_view_from_table -from cudf._lib.types import SUPPORTED_NUMPY_TO_LIBCUDF_TYPES - -from cudf._lib.types cimport underlying_type_t_type_id -from cudf._lib.utils cimport data_from_unique_ptr, table_view_from_table +import pylibcudf as plc +import cudf +from cudf._lib.types import SUPPORTED_NUMPY_TO_PYLIBCUDF_TYPES from cudf._lib.utils import _index_level_name, generate_pandas_metadata +from cudf.core.buffer import acquire_spill_lock -cdef _parse_column_type_statistics(column_statistics stats): - # Initialize stats to return and parse stats blob - column_stats = {} - - if stats.number_of_values.has_value(): - column_stats["number_of_values"] = stats.number_of_values.value() - - if stats.has_null.has_value(): - column_stats["has_null"] = stats.has_null.value() - - cdef statistics_type type_specific_stats = stats.type_specific_stats - - cdef integer_statistics* int_stats - cdef double_statistics* dbl_stats - cdef string_statistics* str_stats - cdef bucket_statistics* 
bucket_stats - cdef decimal_statistics* dec_stats - cdef date_statistics* date_stats - cdef binary_statistics* bin_stats - cdef timestamp_statistics* ts_stats - - if holds_alternative[no_statistics](type_specific_stats): - return column_stats - elif int_stats := std_get_if[integer_statistics](&type_specific_stats): - if int_stats.minimum.has_value(): - column_stats["minimum"] = int_stats.minimum.value() - else: - column_stats["minimum"] = None - if int_stats.maximum.has_value(): - column_stats["maximum"] = int_stats.maximum.value() - else: - column_stats["maximum"] = None - if int_stats.sum.has_value(): - column_stats["sum"] = int_stats.sum.value() - else: - column_stats["sum"] = None - elif dbl_stats := std_get_if[double_statistics](&type_specific_stats): - if dbl_stats.minimum.has_value(): - column_stats["minimum"] = dbl_stats.minimum.value() - else: - column_stats["minimum"] = None - if dbl_stats.maximum.has_value(): - column_stats["maximum"] = dbl_stats.maximum.value() - else: - column_stats["maximum"] = None - if dbl_stats.sum.has_value(): - column_stats["sum"] = dbl_stats.sum.value() - else: - column_stats["sum"] = None - elif str_stats := std_get_if[string_statistics](&type_specific_stats): - if str_stats.minimum.has_value(): - column_stats["minimum"] = str_stats.minimum.value().decode("utf-8") - else: - column_stats["minimum"] = None - if str_stats.maximum.has_value(): - column_stats["maximum"] = str_stats.maximum.value().decode("utf-8") - else: - column_stats["maximum"] = None - if str_stats.sum.has_value(): - column_stats["sum"] = str_stats.sum.value() - else: - column_stats["sum"] = None - elif bucket_stats := std_get_if[bucket_statistics](&type_specific_stats): - column_stats["true_count"] = bucket_stats.count[0] - column_stats["false_count"] = ( - column_stats["number_of_values"] - - column_stats["true_count"] - ) - elif dec_stats := std_get_if[decimal_statistics](&type_specific_stats): - if dec_stats.minimum.has_value(): - column_stats["minimum"] = dec_stats.minimum.value().decode("utf-8") - else: - column_stats["minimum"] = None - if dec_stats.maximum.has_value(): - column_stats["maximum"] = dec_stats.maximum.value().decode("utf-8") - else: - column_stats["maximum"] = None - if dec_stats.sum.has_value(): - column_stats["sum"] = dec_stats.sum.value().decode("utf-8") - else: - column_stats["sum"] = None - elif date_stats := std_get_if[date_statistics](&type_specific_stats): - if date_stats.minimum.has_value(): - column_stats["minimum"] = datetime.datetime.fromtimestamp( - datetime.timedelta(date_stats.minimum.value()).total_seconds(), - datetime.timezone.utc, - ) - else: - column_stats["minimum"] = None - if date_stats.maximum.has_value(): - column_stats["maximum"] = datetime.datetime.fromtimestamp( - datetime.timedelta(date_stats.maximum.value()).total_seconds(), - datetime.timezone.utc, - ) - else: - column_stats["maximum"] = None - elif bin_stats := std_get_if[binary_statistics](&type_specific_stats): - if bin_stats.sum.has_value(): - column_stats["sum"] = bin_stats.sum.value() - else: - column_stats["sum"] = None - elif ts_stats := std_get_if[timestamp_statistics](&type_specific_stats): - # Before ORC-135, the local timezone offset was included and they were - # stored as minimum and maximum. After ORC-135, the timestamp is - # adjusted to UTC before being converted to milliseconds and stored - # in minimumUtc and maximumUtc. 
- # TODO: Support minimum and maximum by reading writer's local timezone - if ts_stats.minimum_utc.has_value() and ts_stats.maximum_utc.has_value(): - column_stats["minimum"] = datetime.datetime.fromtimestamp( - ts_stats.minimum_utc.value() / 1000, datetime.timezone.utc - ) - column_stats["maximum"] = datetime.datetime.fromtimestamp( - ts_stats.maximum_utc.value() / 1000, datetime.timezone.utc - ) - else: - raise ValueError("Unsupported statistics type") - return column_stats - - +# TODO: Consider inlining this function since it seems to only be used in one place. cpdef read_parsed_orc_statistics(filepath_or_buffer): """ Cython function to call into libcudf API, see `read_parsed_orc_statistics`. @@ -201,25 +54,13 @@ cpdef read_parsed_orc_statistics(filepath_or_buffer): cudf.io.orc.read_orc_statistics """ - cdef parsed_orc_statistics parsed = ( - libcudf_read_parsed_orc_statistics(make_source_info([filepath_or_buffer])) + parsed = ( + plc.io.orc.read_parsed_orc_statistics( + plc.io.SourceInfo([filepath_or_buffer]) + ) ) - cdef vector[column_statistics] file_stats = parsed.file_stats - cdef vector[vector[column_statistics]] stripes_stats = parsed.stripes_stats - - parsed_file_stats = [ - _parse_column_type_statistics(file_stats[column_index]) - for column_index in range(file_stats.size()) - ] - - parsed_stripes_stats = [ - [_parse_column_type_statistics(stripes_stats[stripe_index][column_index]) - for column_index in range(stripes_stats[stripe_index].size())] - for stripe_index in range(stripes_stats.size()) - ] - - return parsed.column_names, parsed_file_stats, parsed_stripes_stats + return parsed.column_names, parsed.file_stats, parsed.stripes_stats cpdef read_orc(object filepaths_or_buffers, @@ -235,36 +76,34 @@ cpdef read_orc(object filepaths_or_buffers, See Also -------- cudf.read_orc + + Notes + ----- + Currently this function only considers the metadata of the first file in the list of + filepaths_or_buffers. """ - cdef orc_reader_options c_orc_reader_options = make_orc_reader_options( - filepaths_or_buffers, + + if columns is not None: + columns = [str(col) for col in columns] + + tbl_w_meta = plc.io.orc.read_orc( + plc.io.SourceInfo(filepaths_or_buffers), columns, - stripes or [], + stripes, get_skiprows_arg(skip_rows), get_num_rows_arg(num_rows), - ( - type_id.EMPTY - if timestamp_type is None else - ( - ( - SUPPORTED_NUMPY_TO_LIBCUDF_TYPES[ - cudf.dtype(timestamp_type) - ] - ) - ) - ), use_index, + plc.types.DataType( + SUPPORTED_NUMPY_TO_PYLIBCUDF_TYPES[ + cudf.dtype(timestamp_type) + ] + ) ) - cdef table_with_metadata c_result - cdef size_type nrows + names = tbl_w_meta.column_names(include_children=False) - with nogil: - c_result = move(libcudf_read_orc(c_orc_reader_options)) - - names = [info.name.decode() for info in c_result.metadata.schema_info] actual_index_names, col_names, is_range_index, reset_index_name, \ - range_idx = _get_index_from_metadata(c_result.metadata.user_data, + range_idx = _get_index_from_metadata(tbl_w_meta.per_file_user_data, names, skip_rows, num_rows) @@ -272,11 +111,11 @@ cpdef read_orc(object filepaths_or_buffers, if columns is not None and (isinstance(columns, list) and len(columns) == 0): # When `columns=[]`, index needs to be # established, but not the columns. 
-        nrows = c_result.tbl.get()[0].view().num_rows()
+        nrows = tbl_w_meta.tbl.num_rows()
         return {}, cudf.RangeIndex(nrows)

-    data, index = data_from_unique_ptr(
-        move(c_result.tbl),
+    data, index = data_from_pylibcudf_io(
+        tbl_w_meta,
        col_names if columns is None else names,
        actual_index_names
    )
@@ -286,11 +125,13 @@
    elif reset_index_name:
        index.names = [None] * len(index.names)

+    child_name_values = tbl_w_meta.child_names.values()
+
    data = {
-        name: update_column_struct_field_names(
-            col, c_result.metadata.schema_info[i]
+        name: update_col_struct_field_names(
+            col, child_names
        )
-        for i, (name, col) in enumerate(data.items())
+        for (name, col), child_names in zip(data.items(), child_name_values)
    }

    return data, index
@@ -313,32 +154,35 @@ cdef compression_type _get_comp_type(object compression):
    raise ValueError(f"Unsupported `compression` type {compression}")

cdef tuple _get_index_from_metadata(
-        map[string, string] user_data,
+        vector[map[string, string]] user_data,
        object names,
        object skip_rows,
        object num_rows):
-    json_str = user_data[b'pandas'].decode('utf-8')
+    meta = None
    index_col = None
    is_range_index = False
    reset_index_name = False
    range_idx = None
-    if json_str != "":
-        meta = json.loads(json_str)
-        if 'index_columns' in meta and len(meta['index_columns']) > 0:
-            index_col = meta['index_columns']
-            if isinstance(index_col[0], dict) and \
-                    index_col[0]['kind'] == 'range':
-                is_range_index = True
-            else:
-                index_col_names = OrderedDict()
-                for idx_col in index_col:
-                    for c in meta['columns']:
-                        if c['field_name'] == idx_col:
-                            index_col_names[idx_col] = \
-                                c['name'] or c['field_name']
-                            if c['name'] is None:
-                                reset_index_name = True
+
+    if user_data.size() > 0:
+        json_str = user_data[0][b'pandas'].decode('utf-8')
+        if json_str != "":
+            meta = json.loads(json_str)
+            if 'index_columns' in meta and len(meta['index_columns']) > 0:
+                index_col = meta['index_columns']
+                if isinstance(index_col[0], dict) and \
+                        index_col[0]['kind'] == 'range':
+                    is_range_index = True
+                else:
+                    index_col_names = OrderedDict()
+                    for idx_col in index_col:
+                        for c in meta['columns']:
+                            if c['field_name'] == idx_col:
+                                index_col_names[idx_col] = \
+                                    c['name'] or c['field_name']
+                                if c['name'] is None:
+                                    reset_index_name = True

    actual_index_names = None
    if index_col is not None and len(index_col) > 0:
@@ -473,41 +317,6 @@ cdef int64_t get_num_rows_arg(object arg) except*:
    return arg

-cdef orc_reader_options make_orc_reader_options(
-    object filepaths_or_buffers,
-    object column_names,
-    object stripes,
-    int64_t skip_rows,
-    int64_t num_rows,
-    type_id timestamp_type,
-    bool use_index
-) except*:
-
-    cdef vector[vector[size_type]] strps = stripes
-    cdef orc_reader_options opts
-    cdef source_info src = make_source_info(filepaths_or_buffers)
-    opts = move(
-        orc_reader_options.builder(src)
-        .stripes(strps)
-        .skip_rows(skip_rows)
-        .timestamp_type(data_type(timestamp_type))
-        .use_index(use_index)
-        .build()
-    )
-    if num_rows >= 0:
-        opts.set_num_rows(num_rows)
-
-    cdef vector[string] c_column_names
-    if column_names is not None:
-        c_column_names.reserve(len(column_names))
-        for col in column_names:
-            c_column_names.push_back(str(col).encode())
-        if len(column_names) > 0:
-            opts.set_columns(c_column_names)
-
-    return opts
-
-
cdef class ORCWriter:
    """
    ORCWriter lets you incrementally write out an ORC file from a series
diff --git a/python/cudf/cudf/_lib/utils.pxd b/python/cudf/cudf/_lib/utils.pxd
index ff97fe80310..7254db5c43d 100644
--- 
a/python/cudf/cudf/_lib/utils.pxd +++ b/python/cudf/cudf/_lib/utils.pxd @@ -11,7 +11,7 @@ from pylibcudf.libcudf.table.table cimport table, table_view cdef data_from_unique_ptr( unique_ptr[table] c_tbl, column_names, index_names=*) cdef data_from_pylibcudf_table(tbl, column_names, index_names=*) -cdef data_from_pylibcudf_io(tbl_with_meta) +cdef data_from_pylibcudf_io(tbl_with_meta, column_names = *, index_names = *) cdef data_from_table_view( table_view tv, object owner, object column_names, object index_names=*) cdef table_view table_view_from_columns(columns) except * diff --git a/python/cudf/cudf/_lib/utils.pyx b/python/cudf/cudf/_lib/utils.pyx index 8660cca9322..9e5b99f64eb 100644 --- a/python/cudf/cudf/_lib/utils.pyx +++ b/python/cudf/cudf/_lib/utils.pyx @@ -316,15 +316,17 @@ cdef data_from_pylibcudf_table(tbl, column_names, index_names=None): index_names ) -cdef data_from_pylibcudf_io(tbl_with_meta): +cdef data_from_pylibcudf_io(tbl_with_meta, column_names=None, index_names=None): """ Unpacks the TableWithMetadata from libcudf I/O into a dict of columns and an Index (cuDF format) """ + if column_names is None: + column_names = tbl_with_meta.column_names(include_children=False) return _data_from_columns( columns=[Column.from_pylibcudf(plc) for plc in tbl_with_meta.columns], - column_names=tbl_with_meta.column_names(include_children=False), - index_names=None + column_names=column_names, + index_names=index_names ) cdef columns_from_table_view( diff --git a/python/cudf/cudf/io/orc.py b/python/cudf/cudf/io/orc.py index c54293badbe..68b60809bb9 100644 --- a/python/cudf/cudf/io/orc.py +++ b/python/cudf/cudf/io/orc.py @@ -181,11 +181,6 @@ def read_orc_statistics( parsed_stripes_statistics, ) = liborc.read_parsed_orc_statistics(path_or_buf) - # Parse column names - column_names = [ - column_name.decode("utf-8") for column_name in column_names - ] - # Parse file statistics file_statistics = { column_name: column_stats @@ -248,9 +243,9 @@ def _filter_stripes( num_rows_scanned = 0 for i, stripe_statistics in enumerate(stripes_statistics): num_rows_before_stripe = num_rows_scanned - num_rows_scanned += next(iter(stripe_statistics.values()))[ - "number_of_values" - ] + num_rows_scanned += next( + iter(stripe_statistics.values()) + ).number_of_values if stripes is not None and i not in stripes: continue if skip_rows is not None and num_rows_scanned <= skip_rows: diff --git a/python/cudf/cudf/tests/test_orc.py b/python/cudf/cudf/tests/test_orc.py index c2a30b76bea..1dd732c7191 100644 --- a/python/cudf/cudf/tests/test_orc.py +++ b/python/cudf/cudf/tests/test_orc.py @@ -184,25 +184,25 @@ def test_orc_read_statistics(datadir): pytest.skip(".orc file is not found: %s" % e) # Check numberOfValues - assert_eq(file_statistics[0]["int1"]["number_of_values"], 11_000) + assert_eq(file_statistics[0]["int1"].number_of_values, 11_000) assert_eq( - file_statistics[0]["int1"]["number_of_values"], + file_statistics[0]["int1"].number_of_values, sum( [ - stripes_statistics[0]["int1"]["number_of_values"], - stripes_statistics[1]["int1"]["number_of_values"], - stripes_statistics[2]["int1"]["number_of_values"], + stripes_statistics[0]["int1"].number_of_values, + stripes_statistics[1]["int1"].number_of_values, + stripes_statistics[2]["int1"].number_of_values, ] ), ) assert_eq( - stripes_statistics[1]["int1"]["number_of_values"], - stripes_statistics[1]["string1"]["number_of_values"], + stripes_statistics[1]["int1"].number_of_values, + stripes_statistics[1]["string1"].number_of_values, ) - 
assert_eq(stripes_statistics[2]["string1"]["number_of_values"], 1_000) + assert_eq(stripes_statistics[2]["string1"].number_of_values, 1_000) # Check other statistics - assert_eq(stripes_statistics[2]["string1"]["has_null"], False) + assert_eq(stripes_statistics[2]["string1"].has_null, False) assert_eq( file_statistics[0]["int1"]["minimum"], min( @@ -1538,8 +1538,8 @@ def test_empty_statistics(): for stats in got: # Similar expected stats for the first 6 columns in this case for col_name in ascii_lowercase[:6]: - assert stats[0][col_name].get("number_of_values") == 0 - assert stats[0][col_name].get("has_null") is True + assert stats[0][col_name].number_of_values == 0 + assert stats[0][col_name].has_null is True assert stats[0][col_name].get("minimum") is None assert stats[0][col_name].get("maximum") is None for col_name in ascii_lowercase[:3]: @@ -1547,17 +1547,17 @@ def test_empty_statistics(): # Sum for decimal column is a string assert stats[0]["d"].get("sum") == "0" - assert stats[0]["g"].get("number_of_values") == 0 - assert stats[0]["g"].get("has_null") is True + assert stats[0]["g"].number_of_values == 0 + assert stats[0]["g"].has_null is True assert stats[0]["g"].get("true_count") == 0 assert stats[0]["g"].get("false_count") == 0 - assert stats[0]["h"].get("number_of_values") == 0 - assert stats[0]["h"].get("has_null") is True + assert stats[0]["h"].number_of_values == 0 + assert stats[0]["h"].has_null is True assert stats[0]["h"].get("sum") == 0 - assert stats[0]["i"].get("number_of_values") == 1 - assert stats[0]["i"].get("has_null") is False + assert stats[0]["i"].number_of_values == 1 + assert stats[0]["i"].has_null is False assert stats[0]["i"].get("minimum") == 1 assert stats[0]["i"].get("maximum") == 1 assert stats[0]["i"].get("sum") == 1 diff --git a/python/cudf/cudf/utils/ioutils.py b/python/cudf/cudf/utils/ioutils.py index 1180da321e6..d636f36f282 100644 --- a/python/cudf/cudf/utils/ioutils.py +++ b/python/cudf/cudf/utils/ioutils.py @@ -1873,7 +1873,7 @@ def _apply_filter_bool_eq(val, col_stats): return False elif val is False: if (col_stats["false_count"] == 0) or ( - col_stats["true_count"] == col_stats["number_of_values"] + col_stats["true_count"] == col_stats.number_of_values ): return False return True @@ -1900,7 +1900,7 @@ def _apply_predicate(op, val, col_stats): return False # TODO: Replace pd.isnull with # cudf.isnull once it is implemented - if pd.isnull(val) and not col_stats["has_null"]: + if pd.isnull(val) and not col_stats.has_null: return False if not _apply_filter_bool_eq(val, col_stats): return False diff --git a/python/pylibcudf/pylibcudf/io/CMakeLists.txt b/python/pylibcudf/pylibcudf/io/CMakeLists.txt index bcc2151f5b6..529a71a48ce 100644 --- a/python/pylibcudf/pylibcudf/io/CMakeLists.txt +++ b/python/pylibcudf/pylibcudf/io/CMakeLists.txt @@ -12,7 +12,7 @@ # the License. # ============================================================================= -set(cython_sources avro.pyx csv.pyx datasource.pyx json.pyx parquet.pyx types.pyx) +set(cython_sources avro.pyx csv.pyx datasource.pyx json.pyx orc.pyx parquet.pyx types.pyx) set(linked_libraries cudf::cudf) rapids_cython_create_modules( diff --git a/python/pylibcudf/pylibcudf/io/__init__.pxd b/python/pylibcudf/pylibcudf/io/__init__.pxd index 62820048584..5927a19dc69 100644 --- a/python/pylibcudf/pylibcudf/io/__init__.pxd +++ b/python/pylibcudf/pylibcudf/io/__init__.pxd @@ -1,5 +1,5 @@ # Copyright (c) 2024, NVIDIA CORPORATION. # CSV is removed since it is def not cpdef (to force kw-only arguments) -from . 
cimport avro, datasource, json, parquet, types +from . cimport avro, datasource, json, orc, parquet, types from .types cimport SourceInfo, TableWithMetadata diff --git a/python/pylibcudf/pylibcudf/io/__init__.py b/python/pylibcudf/pylibcudf/io/__init__.py index 27640f7d955..5d899ee0808 100644 --- a/python/pylibcudf/pylibcudf/io/__init__.py +++ b/python/pylibcudf/pylibcudf/io/__init__.py @@ -1,4 +1,4 @@ # Copyright (c) 2024, NVIDIA CORPORATION. -from . import avro, csv, datasource, json, parquet, types +from . import avro, csv, datasource, json, orc, parquet, types from .types import SinkInfo, SourceInfo, TableWithMetadata diff --git a/python/pylibcudf/pylibcudf/io/orc.pxd b/python/pylibcudf/pylibcudf/io/orc.pxd new file mode 100644 index 00000000000..b111d617b1b --- /dev/null +++ b/python/pylibcudf/pylibcudf/io/orc.pxd @@ -0,0 +1,50 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +from libc.stdint cimport uint64_t +from libcpp cimport bool +from libcpp.optional cimport optional +from libcpp.string cimport string +from libcpp.vector cimport vector +from pylibcudf.io.types cimport SourceInfo, TableWithMetadata +from pylibcudf.libcudf.io.orc_metadata cimport ( + column_statistics, + parsed_orc_statistics, + statistics_type, +) +from pylibcudf.libcudf.types cimport size_type +from pylibcudf.types cimport DataType + + +cpdef TableWithMetadata read_orc( + SourceInfo source_info, + list columns = *, + list stripes = *, + size_type skip_rows = *, + size_type nrows = *, + bool use_index = *, + bool use_np_dtypes = *, + DataType timestamp_type = *, + list decimal128_columns = * +) + +cdef class OrcColumnStatistics: + cdef optional[uint64_t] number_of_values_c + cdef optional[bool] has_null_c + cdef statistics_type type_specific_stats_c + cdef dict column_stats + + cdef void _init_stats_dict(self) + + @staticmethod + cdef OrcColumnStatistics from_libcudf(column_statistics& col_stats) + + +cdef class ParsedOrcStatistics: + cdef parsed_orc_statistics c_obj + + @staticmethod + cdef ParsedOrcStatistics from_libcudf(parsed_orc_statistics& orc_stats) + + +cpdef ParsedOrcStatistics read_parsed_orc_statistics( + SourceInfo source_info +) diff --git a/python/pylibcudf/pylibcudf/io/orc.pyx b/python/pylibcudf/pylibcudf/io/orc.pyx new file mode 100644 index 00000000000..01a5e4b04a1 --- /dev/null +++ b/python/pylibcudf/pylibcudf/io/orc.pyx @@ -0,0 +1,302 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +from libcpp cimport bool +from libcpp.string cimport string +from libcpp.utility cimport move +from libcpp.vector cimport vector + +import datetime + +from pylibcudf.io.types cimport SourceInfo, TableWithMetadata +from pylibcudf.libcudf.io.orc cimport ( + orc_reader_options, + read_orc as cpp_read_orc, +) +from pylibcudf.libcudf.io.orc_metadata cimport ( + binary_statistics, + bucket_statistics, + column_statistics, + date_statistics, + decimal_statistics, + double_statistics, + integer_statistics, + no_statistics, + read_parsed_orc_statistics as cpp_read_parsed_orc_statistics, + statistics_type, + string_statistics, + timestamp_statistics, +) +from pylibcudf.libcudf.io.types cimport table_with_metadata +from pylibcudf.libcudf.types cimport size_type +from pylibcudf.types cimport DataType +from pylibcudf.variant cimport get_if, holds_alternative + + +cdef class OrcColumnStatistics: + def __init__(self): + raise TypeError( + "OrcColumnStatistics should not be instantiated by users. If it is " + "being constructed in Cython from a preexisting libcudf object, " + "use `OrcColumnStatistics.from_libcudf` instead." 
+ ) + + @property + def number_of_values(self): + if self.number_of_values_c.has_value(): + return self.number_of_values_c.value() + return None + + @property + def has_null(self): + if self.has_null_c.has_value(): + return self.has_null_c.value() + return None + + cdef void _init_stats_dict(self): + # Initialize stats to return and parse stats blob + self.column_stats = {} + + cdef statistics_type type_specific_stats = self.type_specific_stats_c + + cdef integer_statistics* int_stats + cdef double_statistics* dbl_stats + cdef string_statistics* str_stats + cdef bucket_statistics* bucket_stats + cdef decimal_statistics* dec_stats + cdef date_statistics* date_stats + cdef binary_statistics* bin_stats + cdef timestamp_statistics* ts_stats + + if holds_alternative[no_statistics](type_specific_stats): + pass + elif int_stats := get_if[integer_statistics](&type_specific_stats): + if int_stats.minimum.has_value(): + self.column_stats["minimum"] = int_stats.minimum.value() + else: + self.column_stats["minimum"] = None + if int_stats.maximum.has_value(): + self.column_stats["maximum"] = int_stats.maximum.value() + else: + self.column_stats["maximum"] = None + if int_stats.sum.has_value(): + self.column_stats["sum"] = int_stats.sum.value() + else: + self.column_stats["sum"] = None + elif dbl_stats := get_if[double_statistics](&type_specific_stats): + if dbl_stats.minimum.has_value(): + self.column_stats["minimum"] = dbl_stats.minimum.value() + else: + self.column_stats["minimum"] = None + if dbl_stats.maximum.has_value(): + self.column_stats["maximum"] = dbl_stats.maximum.value() + else: + self.column_stats["maximum"] = None + if dbl_stats.sum.has_value(): + self.column_stats["sum"] = dbl_stats.sum.value() + else: + self.column_stats["sum"] = None + elif str_stats := get_if[string_statistics](&type_specific_stats): + if str_stats.minimum.has_value(): + self.column_stats["minimum"] = str_stats.minimum.value().decode("utf-8") + else: + self.column_stats["minimum"] = None + if str_stats.maximum.has_value(): + self.column_stats["maximum"] = str_stats.maximum.value().decode("utf-8") + else: + self.column_stats["maximum"] = None + if str_stats.sum.has_value(): + self.column_stats["sum"] = str_stats.sum.value() + else: + self.column_stats["sum"] = None + elif bucket_stats := get_if[bucket_statistics](&type_specific_stats): + self.column_stats["true_count"] = bucket_stats.count[0] + self.column_stats["false_count"] = ( + self.number_of_values + - self.column_stats["true_count"] + ) + elif dec_stats := get_if[decimal_statistics](&type_specific_stats): + if dec_stats.minimum.has_value(): + self.column_stats["minimum"] = dec_stats.minimum.value().decode("utf-8") + else: + self.column_stats["minimum"] = None + if dec_stats.maximum.has_value(): + self.column_stats["maximum"] = dec_stats.maximum.value().decode("utf-8") + else: + self.column_stats["maximum"] = None + if dec_stats.sum.has_value(): + self.column_stats["sum"] = dec_stats.sum.value().decode("utf-8") + else: + self.column_stats["sum"] = None + elif date_stats := get_if[date_statistics](&type_specific_stats): + if date_stats.minimum.has_value(): + self.column_stats["minimum"] = datetime.datetime.fromtimestamp( + datetime.timedelta(date_stats.minimum.value()).total_seconds(), + datetime.timezone.utc, + ) + else: + self.column_stats["minimum"] = None + if date_stats.maximum.has_value(): + self.column_stats["maximum"] = datetime.datetime.fromtimestamp( + datetime.timedelta(date_stats.maximum.value()).total_seconds(), + datetime.timezone.utc, + ) + else: + 
self.column_stats["maximum"] = None
+        elif bin_stats := get_if[binary_statistics](&type_specific_stats):
+            if bin_stats.sum.has_value():
+                self.column_stats["sum"] = bin_stats.sum.value()
+            else:
+                self.column_stats["sum"] = None
+        elif ts_stats := get_if[timestamp_statistics](&type_specific_stats):
+            # Before ORC-135, the local timezone offset was included and they were
+            # stored as minimum and maximum. After ORC-135, the timestamp is
+            # adjusted to UTC before being converted to milliseconds and stored
+            # in minimumUtc and maximumUtc.
+            # TODO: Support minimum and maximum by reading writer's local timezone
+            if ts_stats.minimum_utc.has_value() and ts_stats.maximum_utc.has_value():
+                self.column_stats["minimum"] = datetime.datetime.fromtimestamp(
+                    ts_stats.minimum_utc.value() / 1000, datetime.timezone.utc
+                )
+                self.column_stats["maximum"] = datetime.datetime.fromtimestamp(
+                    ts_stats.maximum_utc.value() / 1000, datetime.timezone.utc
+                )
+        else:
+            raise ValueError("Unsupported statistics type")
+
+    def __getitem__(self, item):
+        return self.column_stats[item]
+
+    def __contains__(self, item):
+        return item in self.column_stats
+
+    def get(self, item, default=None):
+        return self.column_stats.get(item, default)
+
+    @staticmethod
+    cdef OrcColumnStatistics from_libcudf(column_statistics& col_stats):
+        cdef OrcColumnStatistics out = OrcColumnStatistics.__new__(OrcColumnStatistics)
+        out.number_of_values_c = col_stats.number_of_values
+        out.has_null_c = col_stats.has_null
+        out.type_specific_stats_c = col_stats.type_specific_stats
+        out._init_stats_dict()
+        return out
+
+
+cdef class ParsedOrcStatistics:
+
+    @property
+    def column_names(self):
+        return [name.decode() for name in self.c_obj.column_names]
+
+    @property
+    def file_stats(self):
+        return [
+            OrcColumnStatistics.from_libcudf(self.c_obj.file_stats[i])
+            for i in range(self.c_obj.file_stats.size())
+        ]
+
+    @property
+    def stripes_stats(self):
+        return [
+            [
+                OrcColumnStatistics.from_libcudf(stripe_stats_c[i])
+                for i in range(stripe_stats_c.size())
+            ]
+            for stripe_stats_c in self.c_obj.stripes_stats
+        ]
+
+    @staticmethod
+    cdef ParsedOrcStatistics from_libcudf(parsed_orc_statistics& orc_stats):
+        cdef ParsedOrcStatistics out = ParsedOrcStatistics.__new__(ParsedOrcStatistics)
+        out.c_obj = move(orc_stats)
+        return out
+
+
+cpdef TableWithMetadata read_orc(
+    SourceInfo source_info,
+    list columns = None,
+    list stripes = None,
+    size_type skip_rows = 0,
+    size_type nrows = -1,
+    bool use_index = True,
+    bool use_np_dtypes = True,
+    DataType timestamp_type = None,
+    list decimal128_columns = None,
+):
+    """Reads an ORC file into a :py:class:`~.types.TableWithMetadata`.
+
+    Parameters
+    ----------
+    source_info : SourceInfo
+        The SourceInfo object to read the ORC file from.
+    columns : list, default None
+        The string names of the columns to be read.
+    stripes : list[list[size_type]], default None
+        List of stripes to be read.
+    skip_rows : size_type, default 0
+        The number of rows to skip from the start of the file.
+    nrows : size_type, default -1
+        The number of rows to read. By default, read the entire file.
+    use_index : bool, default True
+        Whether to use the row index to speed up reading.
+    use_np_dtypes : bool, default True
+        Whether to use numpy compatible dtypes.
+    timestamp_type : DataType, default None
+        The timestamp type to use for the timestamp columns.
+    decimal128_columns : list, default None
+        List of column names to be read as 128-bit decimals. 
+ + Returns + ------- + TableWithMetadata + The Table and its corresponding metadata (column names) that were read in. + """ + cdef orc_reader_options opts + cdef vector[vector[size_type]] c_stripes + opts = move( + orc_reader_options.builder(source_info.c_obj) + .use_index(use_index) + .build() + ) + if nrows >= 0: + opts.set_num_rows(nrows) + if skip_rows >= 0: + opts.set_skip_rows(skip_rows) + if stripes is not None: + c_stripes = stripes + opts.set_stripes(c_stripes) + if timestamp_type is not None: + opts.set_timestamp_type(timestamp_type.c_obj) + + cdef vector[string] c_decimal128_columns + if decimal128_columns is not None and len(decimal128_columns) > 0: + c_decimal128_columns.reserve(len(decimal128_columns)) + for col in decimal128_columns: + if not isinstance(col, str): + raise TypeError("Decimal 128 column names must be strings!") + c_decimal128_columns.push_back(col.encode()) + opts.set_decimal128_columns(c_decimal128_columns) + + cdef vector[string] c_column_names + if columns is not None and len(columns) > 0: + c_column_names.reserve(len(columns)) + for col in columns: + if not isinstance(col, str): + raise TypeError("Column names must be strings!") + c_column_names.push_back(col.encode()) + opts.set_columns(c_column_names) + + cdef table_with_metadata c_result + + with nogil: + c_result = move(cpp_read_orc(opts)) + + return TableWithMetadata.from_libcudf(c_result) + + +cpdef ParsedOrcStatistics read_parsed_orc_statistics( + SourceInfo source_info +): + cdef parsed_orc_statistics parsed = ( + cpp_read_parsed_orc_statistics(source_info.c_obj) + ) + return ParsedOrcStatistics.from_libcudf(parsed) diff --git a/python/pylibcudf/pylibcudf/io/types.pyx b/python/pylibcudf/pylibcudf/io/types.pyx index 1600a805b37..563a02761da 100644 --- a/python/pylibcudf/pylibcudf/io/types.pyx +++ b/python/pylibcudf/pylibcudf/io/types.pyx @@ -130,6 +130,7 @@ cdef class TableWithMetadata: """ return self.metadata.per_file_user_data + cdef class SourceInfo: """A class containing details on a source to read from. diff --git a/python/pylibcudf/pylibcudf/libcudf/io/orc.pxd b/python/pylibcudf/pylibcudf/libcudf/io/orc.pxd index e4a09b8feb2..dca24c7f665 100644 --- a/python/pylibcudf/pylibcudf/libcudf/io/orc.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/io/orc.pxd @@ -35,6 +35,7 @@ cdef extern from "cudf/io/orc.hpp" \ void enable_use_index(bool val) except + void enable_use_np_dtypes(bool val) except + void set_timestamp_type(data_type type) except + + void set_decimal128_columns(vector[string] val) except + @staticmethod orc_reader_options_builder builder( diff --git a/python/pylibcudf/pylibcudf/libcudf/io/orc_metadata.pxd b/python/pylibcudf/pylibcudf/libcudf/io/orc_metadata.pxd index db6cb0cdfa5..9302ffe2f80 100644 --- a/python/pylibcudf/pylibcudf/libcudf/io/orc_metadata.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/io/orc_metadata.pxd @@ -1,11 +1,11 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. 
-cimport pylibcudf.libcudf.io.types as cudf_io_types from libc.stdint cimport int32_t, int64_t, uint32_t, uint64_t from libcpp cimport bool from libcpp.optional cimport optional from libcpp.string cimport string from libcpp.vector cimport vector +from pylibcudf.libcudf.io cimport types as cudf_io_types from pylibcudf.variant cimport monostate, variant diff --git a/python/pylibcudf/pylibcudf/tests/common/utils.py b/python/pylibcudf/pylibcudf/tests/common/utils.py index babe6634318..9f389fa42c4 100644 --- a/python/pylibcudf/pylibcudf/tests/common/utils.py +++ b/python/pylibcudf/pylibcudf/tests/common/utils.py @@ -9,6 +9,7 @@ import pyarrow.compute as pc import pylibcudf as plc import pytest +from pyarrow.orc import write_table as orc_write_table from pyarrow.parquet import write_table as pq_write_table from pylibcudf.io.types import CompressionType @@ -242,13 +243,21 @@ def is_nested_list(typ): return nesting_level(typ)[0] > 1 -def _convert_numeric_types_to_floating(pa_table): +def _convert_types(pa_table, input_pred, result_type): """ - Useful little helper for testing the - dtypes option in I/O readers. + Useful little helper for testing the dtypes option in I/O readers. - Returns a tuple containing the pylibcudf dtypes - and the new pyarrow schema + Returns a tuple containing the pylibcudf dtypes and the new pyarrow schema based on + the data in the table. + + Parameters + ---------- + pa_table : pyarrow.Table + The table from which to extract the dtypes + input_pred : function + Predicate that evaluates to true for types to replace + result_type : pa.DataType + The type to cast to """ dtypes = [] new_fields = [] @@ -257,11 +266,9 @@ def _convert_numeric_types_to_floating(pa_table): child_types = [] plc_type = plc.interop.from_arrow(field.type) - if pa.types.is_integer(field.type) or pa.types.is_unsigned_integer( - field.type - ): - plc_type = plc.interop.from_arrow(pa.float64()) - field = field.with_type(pa.float64()) + if input_pred(field.type): + plc_type = plc.interop.from_arrow(result_type) + field = field.with_type(result_type) dtypes.append((field.name, plc_type, child_types)) @@ -332,6 +339,16 @@ def make_source(path_or_buf, pa_table, format, **kwargs): if isinstance(path_or_buf, io.IOBase) else path_or_buf, ) + elif format == "orc": + # The conversion to pandas is lossy (doesn't preserve + # nested types) so we + # will just use pyarrow directly to write this + orc_write_table( + pa_table, + pa.PythonFile(path_or_buf) + if isinstance(path_or_buf, io.IOBase) + else path_or_buf, + ) if isinstance(path_or_buf, io.IOBase): path_or_buf.seek(0) return path_or_buf diff --git a/python/pylibcudf/pylibcudf/tests/io/test_csv.py b/python/pylibcudf/pylibcudf/tests/io/test_csv.py index ccd7eef54f3..ab26f23418d 100644 --- a/python/pylibcudf/pylibcudf/tests/io/test_csv.py +++ b/python/pylibcudf/pylibcudf/tests/io/test_csv.py @@ -9,7 +9,7 @@ import pytest from pylibcudf.io.types import CompressionType from utils import ( - _convert_numeric_types_to_floating, + _convert_types, assert_table_and_meta_eq, make_source, write_source_str, @@ -148,7 +148,11 @@ def test_read_csv_dtypes(csv_table_data, source_or_sink, usecols): if usecols is not None: pa_table = pa_table.select(usecols) - dtypes, new_fields = _convert_numeric_types_to_floating(pa_table) + dtypes, new_fields = _convert_types( + pa_table, + lambda t: (pa.types.is_unsigned_integer(t) or pa.types.is_integer(t)), + pa.float64(), + ) # Extract the dtype out of the (name, type, child_types) tuple # (read_csv doesn't support this format since it 
doesn't support nested columns) dtypes = {name: dtype for name, dtype, _ in dtypes} diff --git a/python/pylibcudf/pylibcudf/tests/io/test_orc.py b/python/pylibcudf/pylibcudf/tests/io/test_orc.py new file mode 100644 index 00000000000..42b14b1feff --- /dev/null +++ b/python/pylibcudf/pylibcudf/tests/io/test_orc.py @@ -0,0 +1,53 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +import pyarrow as pa +import pylibcudf as plc +import pytest +from utils import _convert_types, assert_table_and_meta_eq, make_source + +# Shared kwargs to pass to make_source +_COMMON_ORC_SOURCE_KWARGS = {"format": "orc"} + + +@pytest.mark.parametrize("columns", [None, ["col_int64", "col_bool"]]) +def test_read_orc_basic( + table_data, binary_source_or_sink, nrows_skiprows, columns +): + _, pa_table = table_data + nrows, skiprows = nrows_skiprows + + # ORC reader doesn't support skip_rows for nested columns + if skiprows > 0: + colnames_to_drop = [] + for i in range(len(pa_table.schema)): + field = pa_table.schema.field(i) + + if pa.types.is_nested(field.type): + colnames_to_drop.append(field.name) + pa_table = pa_table.drop(colnames_to_drop) + # ORC doesn't support unsigned ints + # let's cast to int64 + _, new_fields = _convert_types( + pa_table, pa.types.is_unsigned_integer, pa.int64() + ) + pa_table = pa_table.cast(pa.schema(new_fields)) + + source = make_source( + binary_source_or_sink, pa_table, **_COMMON_ORC_SOURCE_KWARGS + ) + + res = plc.io.orc.read_orc( + plc.io.SourceInfo([source]), + nrows=nrows, + skip_rows=skiprows, + columns=columns, + ) + + if columns is not None: + pa_table = pa_table.select(columns) + + # Adapt to nrows/skiprows + pa_table = pa_table.slice( + offset=skiprows, length=nrows if nrows != -1 else None + ) + + assert_table_and_meta_eq(pa_table, res, check_field_nullability=False) From 503ce030f9523eda83677caafdd221385348a69c Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Wed, 25 Sep 2024 12:11:03 -1000 Subject: [PATCH 03/26] Add transpose API to pylibcudf (#16749) Contributes to https://github.com/rapidsai/cudf/issues/15162 Authors: - Matthew Roeschke (https://github.com/mroeschke) - Matthew Murray (https://github.com/Matt711) Approvers: - Matthew Murray (https://github.com/Matt711) - GALI PREM SAGAR (https://github.com/galipremsagar) URL: https://github.com/rapidsai/cudf/pull/16749 --- .../user_guide/api_docs/pylibcudf/index.rst | 1 + .../api_docs/pylibcudf/transpose.rst | 6 +++ python/cudf/cudf/_lib/transpose.pyx | 30 ++++----------- python/pylibcudf/pylibcudf/CMakeLists.txt | 1 + python/pylibcudf/pylibcudf/__init__.pxd | 2 + python/pylibcudf/pylibcudf/__init__.py | 2 + .../pylibcudf/tests/test_transpose.py | 32 ++++++++++++++++ python/pylibcudf/pylibcudf/transpose.pxd | 5 +++ python/pylibcudf/pylibcudf/transpose.pyx | 38 +++++++++++++++++++ 9 files changed, 95 insertions(+), 22 deletions(-) create mode 100644 docs/cudf/source/user_guide/api_docs/pylibcudf/transpose.rst create mode 100644 python/pylibcudf/pylibcudf/tests/test_transpose.py create mode 100644 python/pylibcudf/pylibcudf/transpose.pxd create mode 100644 python/pylibcudf/pylibcudf/transpose.pyx diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst index d6f8cd2a1ff..edb0963ed29 100644 --- a/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst @@ -38,6 +38,7 @@ This page provides API documentation for pylibcudf. 
table traits transform + transpose types unary diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/transpose.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/transpose.rst new file mode 100644 index 00000000000..6241295e770 --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/transpose.rst @@ -0,0 +1,6 @@ +========= +transpose +========= + +.. automodule:: pylibcudf.transpose + :members: diff --git a/python/cudf/cudf/_lib/transpose.pyx b/python/cudf/cudf/_lib/transpose.pyx index f78fbd4c844..995d278cb88 100644 --- a/python/cudf/cudf/_lib/transpose.pyx +++ b/python/cudf/cudf/_lib/transpose.pyx @@ -1,32 +1,18 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. -from libcpp.memory cimport unique_ptr -from libcpp.pair cimport pair -from libcpp.utility cimport move - -from pylibcudf.libcudf.column.column cimport column -from pylibcudf.libcudf.table.table_view cimport table_view -from pylibcudf.libcudf.transpose cimport transpose as cpp_transpose +import pylibcudf as plc from cudf._lib.column cimport Column -from cudf._lib.utils cimport columns_from_table_view, table_view_from_columns def transpose(list source_columns): """Transpose m n-row columns into n m-row columns """ - cdef pair[unique_ptr[column], table_view] c_result - cdef table_view c_input = table_view_from_columns(source_columns) - - with nogil: - c_result = move(cpp_transpose(c_input)) - - # Notice, the data pointer of `result_owner` has been exposed - # through `c_result.second` at this point. - result_owner = Column.from_unique_ptr( - move(c_result.first), data_ptr_exposed=True - ) - return columns_from_table_view( - c_result.second, - owners=[result_owner] * c_result.second.num_columns() + input_table = plc.table.Table( + [col.to_pylibcudf(mode="read") for col in source_columns] ) + result_table = plc.transpose.transpose(input_table) + return [ + Column.from_pylibcudf(col, data_ptr_exposed=True) + for col in result_table.columns() + ] diff --git a/python/pylibcudf/pylibcudf/CMakeLists.txt b/python/pylibcudf/pylibcudf/CMakeLists.txt index f07c8897e34..fb3a6c13a70 100644 --- a/python/pylibcudf/pylibcudf/CMakeLists.txt +++ b/python/pylibcudf/pylibcudf/CMakeLists.txt @@ -44,6 +44,7 @@ set(cython_sources table.pyx traits.pyx transform.pyx + transpose.pyx types.pyx unary.pyx utils.pyx diff --git a/python/pylibcudf/pylibcudf/__init__.pxd b/python/pylibcudf/pylibcudf/__init__.pxd index b7cf6413c05..66d9c3d6165 100644 --- a/python/pylibcudf/pylibcudf/__init__.pxd +++ b/python/pylibcudf/pylibcudf/__init__.pxd @@ -29,6 +29,7 @@ from . cimport ( strings, traits, transform, + transpose, types, unary, ) @@ -72,6 +73,7 @@ __all__ = [ "sorting", "traits", "transform", + "transpose", "types", "unary", ] diff --git a/python/pylibcudf/pylibcudf/__init__.py b/python/pylibcudf/pylibcudf/__init__.py index 84b1c29f791..0a3615fa941 100644 --- a/python/pylibcudf/pylibcudf/__init__.py +++ b/python/pylibcudf/pylibcudf/__init__.py @@ -40,6 +40,7 @@ strings, traits, transform, + transpose, types, unary, ) @@ -86,6 +87,7 @@ "sorting", "traits", "transform", + "transpose", "types", "unary", ] diff --git a/python/pylibcudf/pylibcudf/tests/test_transpose.py b/python/pylibcudf/pylibcudf/tests/test_transpose.py new file mode 100644 index 00000000000..ac11123f680 --- /dev/null +++ b/python/pylibcudf/pylibcudf/tests/test_transpose.py @@ -0,0 +1,32 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+ +import pyarrow as pa +import pylibcudf as plc +import pytest +from packaging.version import parse + + +@pytest.mark.skipif( + parse(pa.__version__) < parse("16.0.0"), + reason="https://github.com/apache/arrow/pull/40070", +) +@pytest.mark.parametrize( + "arr", + [ + [], + [1, 2, 3], + [1, 2], + [1], + ], +) +def test_transpose(arr): + data = {"a": arr, "b": arr} + arrow_tbl = pa.table(data) + plc_tbl = plc.interop.from_arrow(arrow_tbl) + plc_result = plc.transpose.transpose(plc_tbl) + result = plc.interop.to_arrow(plc_result) + expected = pa.Table.from_pandas( + arrow_tbl.to_pandas().T, preserve_index=False + ).rename_columns([""] * len(arr)) + expected = pa.table(expected, schema=result.schema) + assert result.equals(expected) diff --git a/python/pylibcudf/pylibcudf/transpose.pxd b/python/pylibcudf/pylibcudf/transpose.pxd new file mode 100644 index 00000000000..7b5a7676b49 --- /dev/null +++ b/python/pylibcudf/pylibcudf/transpose.pxd @@ -0,0 +1,5 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +from .table cimport Table + + +cpdef Table transpose(Table input_table) diff --git a/python/pylibcudf/pylibcudf/transpose.pyx b/python/pylibcudf/pylibcudf/transpose.pyx new file mode 100644 index 00000000000..a708f6cc37f --- /dev/null +++ b/python/pylibcudf/pylibcudf/transpose.pyx @@ -0,0 +1,38 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +from libcpp.memory cimport unique_ptr +from libcpp.pair cimport pair +from libcpp.utility cimport move +from pylibcudf.libcudf cimport transpose as cpp_transpose +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.table.table_view cimport table_view + +from .column cimport Column +from .table cimport Table + + +cpdef Table transpose(Table input_table): + """Transpose a Table. + + For details, see :cpp:func:`transpose`. + + Parameters + ---------- + input_table : Table + Table to transpose + + Returns + ------- + Table + Transposed table. + """ + cdef pair[unique_ptr[column], table_view] c_result + cdef Table owner_table + + with nogil: + c_result = move(cpp_transpose.transpose(input_table.view())) + + owner_table = Table( + [Column.from_libcudf(move(c_result.first))] * c_result.second.num_columns() + ) + + return Table.from_table_view(c_result.second, owner_table) From 0425963e14570fc723e3804f0bd7de7460d295f2 Mon Sep 17 00:00:00 2001 From: "Richard (Rick) Zamora" Date: Wed, 25 Sep 2024 17:43:07 -0500 Subject: [PATCH 04/26] Add experimental `filesystem="arrow"` support in `dask_cudf.read_parquet` (#16684) This PR piggybacks on the existing CPU/Arrow Parquet infrastructure in dask-expr. With this PR, ```python df = dask_cudf.read_parquet(path, filesystem="arrow") ``` will produce a `cudf`-backed collection using PyArrow for IO (i.e. disk->`pa.Table`->`cudf.DataFrame`). Before this PR, passing `filesystem="arrow"` will simply result in an error. Although this code path is not ideal for fast/local storage, it can be **very** efficient for remote storage (e.g. S3). 
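As a usage illustration (a minimal sketch, not part of this diff — the bucket path is hypothetical; per the changes below, `filesystem` may be either the string `"arrow"` or an explicit `pyarrow.fs.FileSystem` instance, and pyarrow>=15 is required):

```python
import pyarrow.fs as pa_fs

import dask_cudf

# Read a remote Parquet dataset into a cudf-backed Dask collection,
# letting PyArrow (rather than fsspec) perform the IO.
df = dask_cudf.read_parquet(
    "s3://my-bucket/dataset/",        # hypothetical path
    filesystem=pa_fs.S3FileSystem(),  # or simply filesystem="arrow"
)

# With this code path, file aggregation is controlled by the
# "dataframe.parquet.minimum-partition-size" config rather than the
# blocksize/aggregate_files arguments.
```
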
Authors: - Richard (Rick) Zamora (https://github.com/rjzamora) - Matthew Murray (https://github.com/Matt711) - David Wendt (https://github.com/davidwendt) - Tianyu Liu (https://github.com/kingcrimsontianyu) - Matthew Roeschke (https://github.com/mroeschke) - GALI PREM SAGAR (https://github.com/galipremsagar) - https://github.com/brandon-b-miller - https://github.com/nvdbaranec Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) - Lawrence Mitchell (https://github.com/wence-) URL: https://github.com/rapidsai/cudf/pull/16684 --- docs/dask_cudf/source/best_practices.rst | 9 ++ docs/dask_cudf/source/index.rst | 7 +- python/dask_cudf/dask_cudf/backends.py | 142 +++++++++++++++++- python/dask_cudf/dask_cudf/expr/_expr.py | 89 +++++++++++ .../dask_cudf/dask_cudf/io/tests/test_s3.py | 41 +++-- 5 files changed, 267 insertions(+), 21 deletions(-) diff --git a/docs/dask_cudf/source/best_practices.rst b/docs/dask_cudf/source/best_practices.rst index 142124163af..83039f86fed 100644 --- a/docs/dask_cudf/source/best_practices.rst +++ b/docs/dask_cudf/source/best_practices.rst @@ -252,6 +252,15 @@ result in a simple 1-to-1 mapping between files and output partitions. correspond to a reasonable partition size, use ``blocksize=None`` to avoid unnecessary metadata collection. +.. note:: + When reading from remote storage (e.g. S3 and GCS), performance will + likely improve with ``filesystem="arrow"``. When this option is set, + PyArrow will be used to perform IO on multiple CPU threads. Please be + aware that this feature is experimental, and behavior may change in + the future (without deprecation). Do not pass in ``blocksize`` or + ``aggregate_files`` when this feature is used. Instead, set the + ``"dataframe.parquet.minimum-partition-size"`` config to control + file aggregation. Use :func:`from_map` ~~~~~~~~~~~~~~~~~~~~ diff --git a/docs/dask_cudf/source/index.rst b/docs/dask_cudf/source/index.rst index 23ca7e49753..6eb755d7854 100644 --- a/docs/dask_cudf/source/index.rst +++ b/docs/dask_cudf/source/index.rst @@ -40,9 +40,10 @@ Using Dask cuDF The Dask DataFrame API (Recommended) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Simply use the `Dask configuration `__ system to -set the ``"dataframe.backend"`` option to ``"cudf"``. From Python, -this can be achieved like so:: +Simply use the `Dask configuration +`__ +system to set the ``"dataframe.backend"`` option to ``"cudf"``. 
+From Python, this can be achieved like so:: import dask diff --git a/python/dask_cudf/dask_cudf/backends.py b/python/dask_cudf/dask_cudf/backends.py index 9347ebba5de..bead964a0ef 100644 --- a/python/dask_cudf/dask_cudf/backends.py +++ b/python/dask_cudf/dask_cudf/backends.py @@ -8,6 +8,7 @@ import numpy as np import pandas as pd import pyarrow as pa +from packaging.version import Version from pandas.api.types import is_scalar import dask.dataframe as dd @@ -52,6 +53,10 @@ get_parallel_type.register(cudf.BaseIndex, lambda _: Index) +# Required for Arrow filesystem support in read_parquet +PYARROW_GE_15 = Version(pa.__version__) >= Version("15.0.0") + + @meta_nonempty.register(cudf.BaseIndex) @_dask_cudf_performance_tracking def _nonempty_index(idx): @@ -695,15 +700,140 @@ def from_dict( ) @staticmethod - def read_parquet(*args, engine=None, **kwargs): + def read_parquet(path, *args, filesystem="fsspec", engine=None, **kwargs): import dask_expr as dx + import fsspec - from dask_cudf.io.parquet import CudfEngine + if ( + isinstance(filesystem, fsspec.AbstractFileSystem) + or isinstance(filesystem, str) + and filesystem.lower() == "fsspec" + ): + # Default "fsspec" filesystem + from dask_cudf.io.parquet import CudfEngine - _raise_unsupported_parquet_kwargs(**kwargs) - return _default_backend( - dx.read_parquet, *args, engine=CudfEngine, **kwargs - ) + _raise_unsupported_parquet_kwargs(**kwargs) + return _default_backend( + dx.read_parquet, + path, + *args, + filesystem=filesystem, + engine=CudfEngine, + **kwargs, + ) + + else: + # EXPERIMENTAL filesystem="arrow" support. + # This code path uses PyArrow for IO, which is only + # beneficial for remote storage (e.g. S3) + + from fsspec.utils import stringify_path + from pyarrow import fs as pa_fs + + # CudfReadParquetPyarrowFS requires import of distributed beforehand + # (See: https://github.com/dask/dask/issues/11352) + import distributed # noqa: F401 + from dask.core import flatten + from dask.dataframe.utils import pyarrow_strings_enabled + + from dask_cudf.expr._expr import CudfReadParquetPyarrowFS + + if args: + raise ValueError(f"Unexpected positional arguments: {args}") + + if not ( + isinstance(filesystem, pa_fs.FileSystem) + or isinstance(filesystem, str) + and filesystem.lower() in ("arrow", "pyarrow") + ): + raise ValueError(f"Unexpected filesystem value: {filesystem}.") + + if not PYARROW_GE_15: + raise NotImplementedError( + "Experimental Arrow filesystem support requires pyarrow>=15" + ) + + if not isinstance(path, str): + path = stringify_path(path) + + # Extract kwargs + columns = kwargs.pop("columns", None) + filters = kwargs.pop("filters", None) + categories = kwargs.pop("categories", None) + index = kwargs.pop("index", None) + storage_options = kwargs.pop("storage_options", None) + dtype_backend = kwargs.pop("dtype_backend", None) + calculate_divisions = kwargs.pop("calculate_divisions", False) + ignore_metadata_file = kwargs.pop("ignore_metadata_file", False) + metadata_task_size = kwargs.pop("metadata_task_size", None) + split_row_groups = kwargs.pop("split_row_groups", "infer") + blocksize = kwargs.pop("blocksize", "default") + aggregate_files = kwargs.pop("aggregate_files", None) + parquet_file_extension = kwargs.pop( + "parquet_file_extension", (".parq", ".parquet", ".pq") + ) + arrow_to_pandas = kwargs.pop("arrow_to_pandas", None) + open_file_options = kwargs.pop("open_file_options", None) + + # Validate and normalize kwargs + kwargs["dtype_backend"] = dtype_backend + if arrow_to_pandas is not None: + raise ValueError( 
+ "arrow_to_pandas not supported for the 'cudf' backend." + ) + if open_file_options is not None: + raise ValueError( + "The open_file_options argument is no longer supported " + "by the 'cudf' backend." + ) + if filters is not None: + for filter in flatten(filters, container=list): + _, op, val = filter + if op == "in" and not isinstance(val, (set, list, tuple)): + raise TypeError( + "Value of 'in' filter must be a list, set or tuple." + ) + if metadata_task_size is not None: + raise NotImplementedError( + "metadata_task_size is not supported when using the pyarrow filesystem." + ) + if split_row_groups != "infer": + raise NotImplementedError( + "split_row_groups is not supported when using the pyarrow filesystem." + ) + if parquet_file_extension != (".parq", ".parquet", ".pq"): + raise NotImplementedError( + "parquet_file_extension is not supported when using the pyarrow filesystem." + ) + if blocksize is not None and blocksize != "default": + warnings.warn( + "blocksize is not supported when using the pyarrow filesystem." + "blocksize argument will be ignored." + ) + if aggregate_files is not None: + warnings.warn( + "aggregate_files is not supported when using the pyarrow filesystem. " + "Please use the 'dataframe.parquet.minimum-partition-size' config." + "aggregate_files argument will be ignored." + ) + + return dx.new_collection( + CudfReadParquetPyarrowFS( + path, + columns=dx._util._convert_to_list(columns), + filters=filters, + categories=categories, + index=index, + calculate_divisions=calculate_divisions, + storage_options=storage_options, + filesystem=filesystem, + ignore_metadata_file=ignore_metadata_file, + arrow_to_pandas=arrow_to_pandas, + pyarrow_strings_enabled=pyarrow_strings_enabled(), + kwargs=kwargs, + _series=isinstance(columns, str), + ) + ) @staticmethod def read_csv( diff --git a/python/dask_cudf/dask_cudf/expr/_expr.py b/python/dask_cudf/dask_cudf/expr/_expr.py index b284ab3774d..af83a01da98 100644 --- a/python/dask_cudf/dask_cudf/expr/_expr.py +++ b/python/dask_cudf/dask_cudf/expr/_expr.py @@ -2,10 +2,13 @@ import functools import dask_expr._shuffle as _shuffle_module +import pandas as pd from dask_expr import new_collection from dask_expr._cumulative import CumulativeBlockwise from dask_expr._expr import Elemwise, Expr, RenameAxis, VarColumns from dask_expr._reductions import Reduction, Var +from dask_expr.io.io import FusedParquetIO +from dask_expr.io.parquet import ReadParquetPyarrowFS from dask.dataframe.core import is_dataframe_like, make_meta, meta_nonempty from dask.dataframe.dispatch import is_categorical_dtype @@ -18,6 +21,92 @@ ## +class CudfFusedParquetIO(FusedParquetIO): + @staticmethod + def _load_multiple_files( + frag_filters, + columns, + schema, + *to_pandas_args, + ): + import pyarrow as pa + + from dask.base import apply, tokenize + from dask.threaded import get + + token = tokenize(frag_filters, columns, schema) + name = f"pq-file-{token}" + dsk = { + (name, i): ( + CudfReadParquetPyarrowFS._fragment_to_table, + frag, + filter, + columns, + schema, + ) + for i, (frag, filter) in enumerate(frag_filters) + } + dsk[name] = ( + apply, + pa.concat_tables, + [list(dsk.keys())], + {"promote_options": "permissive"}, + ) + return CudfReadParquetPyarrowFS._table_to_pandas( + get(dsk, name), + *to_pandas_args, + ) + + +class CudfReadParquetPyarrowFS(ReadParquetPyarrowFS): + @functools.cached_property + def _dataset_info(self): + from dask_cudf.io.parquet import set_object_dtypes_from_pa_schema + + dataset_info = super()._dataset_info + meta_pd = 
dataset_info["base_meta"] + if isinstance(meta_pd, cudf.DataFrame): + return dataset_info + + # Convert to cudf + # (drop unsupported timezone information) + for k, v in meta_pd.dtypes.items(): + if isinstance(v, pd.DatetimeTZDtype) and v.tz is not None: + meta_pd[k] = meta_pd[k].dt.tz_localize(None) + meta_cudf = cudf.from_pandas(meta_pd) + + # Re-set "object" dtypes to align with pa schema + kwargs = dataset_info.get("kwargs", {}) + set_object_dtypes_from_pa_schema( + meta_cudf, + kwargs.get("schema", None), + ) + + dataset_info["base_meta"] = meta_cudf + self.operands[type(self)._parameters.index("_dataset_info_cache")] = ( + dataset_info + ) + return dataset_info + + @staticmethod + def _table_to_pandas( + table, + index_name, + *args, + ): + df = cudf.DataFrame.from_arrow(table) + if index_name is not None: + df = df.set_index(index_name) + return df + + def _tune_up(self, parent): + if self._fusion_compression_factor >= 1: + return + if isinstance(parent, CudfFusedParquetIO): + return + return parent.substitute(self, CudfFusedParquetIO(self)) + + class RenameAxisCudf(RenameAxis): # TODO: Remove this after rename_axis is supported in cudf # (See: https://github.com/rapidsai/cudf/issues/16895) diff --git a/python/dask_cudf/dask_cudf/io/tests/test_s3.py b/python/dask_cudf/dask_cudf/io/tests/test_s3.py index a14ffbc37dc..cf8af82e112 100644 --- a/python/dask_cudf/dask_cudf/io/tests/test_s3.py +++ b/python/dask_cudf/dask_cudf/io/tests/test_s3.py @@ -12,6 +12,7 @@ from dask.dataframe import assert_eq import dask_cudf +from dask_cudf.tests.utils import QUERY_PLANNING_ON moto = pytest.importorskip("moto", minversion="3.1.6") boto3 = pytest.importorskip("boto3") @@ -127,7 +128,20 @@ def test_read_parquet_open_file_options_raises(): ) -def test_read_parquet_filesystem(s3_base, s3so, pdf): +@pytest.mark.parametrize( + "filesystem", + [ + pytest.param( + "arrow", + marks=pytest.mark.skipif( + not QUERY_PLANNING_ON or not dask_cudf.backends.PYARROW_GE_15, + reason="Not supported", + ), + ), + "fsspec", + ], +) +def test_read_parquet_filesystem(s3_base, s3so, pdf, filesystem): fname = "test_parquet_filesystem.parquet" bucket = "parquet" buffer = BytesIO() @@ -135,21 +149,24 @@ def test_read_parquet_filesystem(s3_base, s3so, pdf): buffer.seek(0) with s3_context(s3_base=s3_base, bucket=bucket, files={fname: buffer}): path = f"s3://{bucket}/{fname}" + if filesystem == "arrow": + # This feature requires arrow >= 15 + pytest.importorskip("pyarrow", minversion="15.0.0") - # Cannot pass filesystem="arrow" - with pytest.raises(ValueError): - dask_cudf.read_parquet( + import pyarrow.fs as pa_fs + + df = dask_cudf.read_parquet( + path, + filesystem=pa_fs.S3FileSystem( + endpoint_override=s3so["client_kwargs"]["endpoint_url"], + ), + ) + else: + df = dask_cudf.read_parquet( path, storage_options=s3so, - filesystem="arrow", + filesystem=filesystem, ) - - # Can pass filesystem="fsspec" - df = dask_cudf.read_parquet( - path, - storage_options=s3so, - filesystem="fsspec", - ) assert df.b.sum().compute() == 9 From c7f6a22bb3edd3cea377d5405ca48a9eee353bc4 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Wed, 25 Sep 2024 12:59:58 -1000 Subject: [PATCH 05/26] Add string.attributes APIs to pylibcudf (#16785) Contributes to https://github.com/rapidsai/cudf/issues/15162 Authors: - Matthew Roeschke (https://github.com/mroeschke) - Matthew Murray (https://github.com/Matt711) Approvers: - Matthew Murray (https://github.com/Matt711) - Vyas Ramasubramani 
(https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/16785 --- python/cudf/cudf/_lib/strings/attributes.pyx | 46 ++++------- .../pylibcudf/strings/CMakeLists.txt | 17 ++++- .../pylibcudf/pylibcudf/strings/__init__.pxd | 19 +++++ .../pylibcudf/pylibcudf/strings/__init__.py | 19 +++++ .../pylibcudf/strings/attributes.pxd | 10 +++ .../pylibcudf/strings/attributes.pyx | 76 +++++++++++++++++++ .../pylibcudf/tests/test_string_attributes.py | 32 ++++++++ 7 files changed, 185 insertions(+), 34 deletions(-) create mode 100644 python/pylibcudf/pylibcudf/strings/attributes.pxd create mode 100644 python/pylibcudf/pylibcudf/strings/attributes.pyx create mode 100644 python/pylibcudf/pylibcudf/tests/test_string_attributes.py diff --git a/python/cudf/cudf/_lib/strings/attributes.pyx b/python/cudf/cudf/_lib/strings/attributes.pyx index fe8c17c9e31..df81b3942b4 100644 --- a/python/cudf/cudf/_lib/strings/attributes.pyx +++ b/python/cudf/cudf/_lib/strings/attributes.pyx @@ -2,19 +2,10 @@ from cudf.core.buffer import acquire_spill_lock -from libcpp.memory cimport unique_ptr -from libcpp.utility cimport move - -from pylibcudf.libcudf.column.column cimport column -from pylibcudf.libcudf.column.column_view cimport column_view -from pylibcudf.libcudf.strings.attributes cimport ( - code_points as cpp_code_points, - count_bytes as cpp_count_bytes, - count_characters as cpp_count_characters, -) - from cudf._lib.column cimport Column +import pylibcudf as plc + @acquire_spill_lock() def count_characters(Column source_strings): @@ -22,13 +13,10 @@ def count_characters(Column source_strings): Returns an integer numeric column containing the length of each string in characters. """ - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - - with nogil: - c_result = move(cpp_count_characters(source_view)) - - return Column.from_unique_ptr(move(c_result)) + plc_column = plc.strings.attributes.count_characters( + source_strings.to_pylibcudf(mode="read") + ) + return Column.from_pylibcudf(plc_column) @acquire_spill_lock() @@ -37,13 +25,10 @@ def count_bytes(Column source_strings): Returns an integer numeric column containing the number of bytes of each string. """ - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - - with nogil: - c_result = move(cpp_count_bytes(source_view)) - - return Column.from_unique_ptr(move(c_result)) + plc_column = plc.strings.attributes.count_bytes( + source_strings.to_pylibcudf(mode="read") + ) + return Column.from_pylibcudf(plc_column) @acquire_spill_lock() @@ -52,10 +37,7 @@ def code_points(Column source_strings): Creates a numeric column with code point values (integers) for each character of each string. 
""" - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - - with nogil: - c_result = move(cpp_code_points(source_view)) - - return Column.from_unique_ptr(move(c_result)) + plc_column = plc.strings.attributes.code_points( + source_strings.to_pylibcudf(mode="read") + ) + return Column.from_pylibcudf(plc_column) diff --git a/python/pylibcudf/pylibcudf/strings/CMakeLists.txt b/python/pylibcudf/pylibcudf/strings/CMakeLists.txt index 77f20b0b917..142bc124ca2 100644 --- a/python/pylibcudf/pylibcudf/strings/CMakeLists.txt +++ b/python/pylibcudf/pylibcudf/strings/CMakeLists.txt @@ -13,8 +13,21 @@ # ============================================================================= set(cython_sources - capitalize.pyx case.pyx char_types.pyx contains.pyx extract.pyx find.pyx findall.pyx - regex_flags.pyx regex_program.pyx repeat.pyx replace.pyx side_type.pyx slice.pyx strip.pyx + attributes.pyx + capitalize.pyx + case.pyx + char_types.pyx + contains.pyx + extract.pyx + find.pyx + findall.pyx + regex_flags.pyx + regex_program.pyx + repeat.pyx + replace.pyx + side_type.pyx + slice.pyx + strip.pyx ) set(linked_libraries cudf::cudf) diff --git a/python/pylibcudf/pylibcudf/strings/__init__.pxd b/python/pylibcudf/pylibcudf/strings/__init__.pxd index 91d884b294b..d8afccc7336 100644 --- a/python/pylibcudf/pylibcudf/strings/__init__.pxd +++ b/python/pylibcudf/pylibcudf/strings/__init__.pxd @@ -1,6 +1,7 @@ # Copyright (c) 2024, NVIDIA CORPORATION. from . cimport ( + attributes, capitalize, case, char_types, @@ -16,3 +17,21 @@ from . cimport ( strip, ) from .side_type cimport side_type + +__all__ = [ + "attributes", + "capitalize", + "case", + "char_types", + "contains", + "convert", + "extract", + "find", + "findall", + "regex_flags", + "regex_program", + "replace", + "slice", + "strip", + "side_type", +] diff --git a/python/pylibcudf/pylibcudf/strings/__init__.py b/python/pylibcudf/pylibcudf/strings/__init__.py index b4856784390..22452812e42 100644 --- a/python/pylibcudf/pylibcudf/strings/__init__.py +++ b/python/pylibcudf/pylibcudf/strings/__init__.py @@ -1,6 +1,7 @@ # Copyright (c) 2024, NVIDIA CORPORATION. from . import ( + attributes, capitalize, case, char_types, @@ -17,3 +18,21 @@ strip, ) from .side_type import SideType + +__all__ = [ + "attributes", + "capitalize", + "case", + "char_types", + "contains", + "convert", + "extract", + "find", + "findall", + "regex_flags", + "regex_program", + "replace", + "slice", + "strip", + "SideType", +] diff --git a/python/pylibcudf/pylibcudf/strings/attributes.pxd b/python/pylibcudf/pylibcudf/strings/attributes.pxd new file mode 100644 index 00000000000..27398766924 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/attributes.pxd @@ -0,0 +1,10 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column cimport Column + + +cpdef Column count_characters(Column source_strings) + +cpdef Column count_bytes(Column source_strings) + +cpdef Column code_points(Column source_strings) diff --git a/python/pylibcudf/pylibcudf/strings/attributes.pyx b/python/pylibcudf/pylibcudf/strings/attributes.pyx new file mode 100644 index 00000000000..36bee7bd1d9 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/attributes.pyx @@ -0,0 +1,76 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+
+from libcpp.memory cimport unique_ptr
+from libcpp.utility cimport move
+from pylibcudf.column cimport Column
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.strings cimport attributes as cpp_attributes
+
+
+cpdef Column count_characters(Column source_strings):
+    """
+    Returns a column containing character lengths of each string
+    in the given column.
+
+    Parameters
+    ----------
+    source_strings : Column
+        Column of strings.
+
+    Returns
+    -------
+    Column
+        New column with lengths for each string
+    """
+    cdef unique_ptr[column] c_result
+
+    with nogil:
+        c_result = move(cpp_attributes.count_characters(source_strings.view()))
+
+    return Column.from_libcudf(move(c_result))
+
+
+cpdef Column count_bytes(Column source_strings):
+    """
+    Returns a column containing byte lengths of each string
+    in the given column.
+
+    Parameters
+    ----------
+    source_strings : Column
+        Column of strings.
+
+    Returns
+    -------
+    Column
+        New column with the number of bytes for each string
+    """
+    cdef unique_ptr[column] c_result
+
+    with nogil:
+        c_result = move(cpp_attributes.count_bytes(source_strings.view()))
+
+    return Column.from_libcudf(move(c_result))
+
+
+cpdef Column code_points(Column source_strings):
+    """
+    Creates a numeric column with code point values (integers)
+    for each character of each string.
+
+    Parameters
+    ----------
+    source_strings : Column
+        Column of strings.
+
+    Returns
+    -------
+    Column
+        New column with code point integer values for each character
+    """
+    cdef unique_ptr[column] c_result
+
+    with nogil:
+        c_result = move(cpp_attributes.code_points(source_strings.view()))
+
+    return Column.from_libcudf(move(c_result))
diff --git a/python/pylibcudf/pylibcudf/tests/test_string_attributes.py b/python/pylibcudf/pylibcudf/tests/test_string_attributes.py
new file mode 100644
index 00000000000..a1820def0b1
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/tests/test_string_attributes.py
@@ -0,0 +1,32 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+import pyarrow as pa
+import pyarrow.compute as pc
+import pylibcudf as plc
+import pytest
+from utils import assert_column_eq
+
+
+@pytest.fixture()
+def str_data():
+    pa_data = pa.array(["A", None])
+    return pa_data, plc.interop.from_arrow(pa_data)
+
+
+def test_count_characters(str_data):
+    result = plc.strings.attributes.count_characters(str_data[1])
+    expected = pc.utf8_length(str_data[0])
+    assert_column_eq(expected, result)
+
+
+def test_count_bytes(str_data):
+    result = plc.strings.attributes.count_bytes(str_data[1])
+    expected = pc.binary_length(str_data[0])
+    assert_column_eq(expected, result)
+
+
+def test_code_points(str_data):
+    result = plc.strings.attributes.code_points(str_data[1])
+    exp_data = [ord(str_data[0].to_pylist()[0])]
+    expected = pa.chunked_array([exp_data], type=pa.int32())
+    assert_column_eq(expected, result)

From 12ee360048473ddd06019090c7d19c67d6959f7a Mon Sep 17 00:00:00 2001
From: Shruti Shivakumar
Date: Wed, 25 Sep 2024 20:13:45 -0400
Subject: [PATCH 06/26] [REVIEW] JSON host tree algorithms (#16545)

Depends on #16836

This change adds a new host tree-building algorithm for the JSON reader,
along with utf8 field name support. It constructs the device_column_tree
using an adjacency list created from parent information. The adjacency list
is pruned based on the input schema, and types are enforced per the schema
(`mark_is_pruned`). The tree is then constructed from the pruned adjacency
list, with mixed-type handling (`construct_tree`).
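As a minimal usage sketch of opting into the new code path (the host
`buffer` source is assumed context; only the reader options added in this
patch are relied on):

    auto opts = cudf::io::json_reader_options::builder(cudf::io::source_info{buffer})
                  .lines(true)
                  .experimental(true)  // enables the new column tree construction
                  .build();
    auto result = cudf::io::read_json(opts);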
Utf8 field name support added (requested by Spark): field names are
utf8-decoded while hashing field nodes, so that differently escaped but
equivalent field names match the same column.

All unit tests pass; one unit test was added where the old algorithm fails.
This code is kept under an experimental flag.

Authors:
  - Shruti Shivakumar (https://github.com/shrshi)
  - Karthikeyan (https://github.com/karthikeyann)

Approvers:
  - Robert (Bobby) Evans (https://github.com/revans2)
  - Vukasin Milovanovic (https://github.com/vuule)
  - Karthikeyan (https://github.com/karthikeyann)

URL: https://github.com/rapidsai/cudf/pull/16545
---
 cpp/include/cudf/io/json.hpp                  |  36 +
 cpp/src/io/json/host_tree_algorithms.cu       | 776 ++++++++++++++++--
 cpp/src/io/json/json_column.cu                |  46 +-
 cpp/src/io/json/json_tree.cu                  | 153 +++-
 cpp/src/io/json/nested_json.hpp               |  29 +-
 cpp/tests/io/json/json_test.cpp               |  53 ++
 cpp/tests/io/json/json_tree.cpp               |   1 +
 cpp/tests/io/json/json_tree_csr.cu            |   1 +
 .../main/java/ai/rapids/cudf/JSONOptions.java |  15 +
 java/src/main/java/ai/rapids/cudf/Table.java  |   9 +
 java/src/main/native/src/TableJni.cpp         |  12 +-
 11 files changed, 1011 insertions(+), 120 deletions(-)

diff --git a/cpp/include/cudf/io/json.hpp b/cpp/include/cudf/io/json.hpp
index ff25a5bacae..6798557e14e 100644
--- a/cpp/include/cudf/io/json.hpp
+++ b/cpp/include/cudf/io/json.hpp
@@ -105,6 +105,8 @@ class json_reader_options {
   char _delimiter = '\n';
   // Prune columns on read, selected based on the _dtypes option
   bool _prune_columns = false;
+  // Experimental features: new column tree construction
+  bool _experimental = false;

   // Bytes to skip from the start
   size_t _byte_range_offset = 0;
@@ -277,6 +279,15 @@ class json_reader_options {
    */
   [[nodiscard]] bool is_enabled_prune_columns() const { return _prune_columns; }

+  /**
+   * @brief Whether to enable experimental features.
+   *
+   * When set to true, experimental features, such as the new column tree construction
+   * and utf-8 matching of field names, will be enabled.
+   * @return true if experimental features are enabled
+   */
+  [[nodiscard]] bool is_enabled_experimental() const { return _experimental; }
+
   /**
    * @brief Whether to parse dates as DD/MM versus MM/DD.
    *
@@ -453,6 +464,16 @@ class json_reader_options {
    */
   void enable_prune_columns(bool val) { _prune_columns = val; }

+  /**
+   * @brief Set whether to enable experimental features.
+   *
+   * When set to true, experimental features, such as the new column tree construction
+   * and utf-8 matching of field names, will be enabled.
+   *
+   * @param val Boolean value to enable/disable experimental features
+   */
+  void enable_experimental(bool val) { _experimental = val; }
+
   /**
    * @brief Set whether to parse dates as DD/MM versus MM/DD.
    *
@@ -695,6 +716,21 @@ class json_reader_options_builder {
     return *this;
   }

+  /**
+   * @brief Set whether to enable experimental features.
+   *
+   * When set to true, experimental features, such as the new column tree construction
+   * and utf-8 matching of field names, will be enabled.
+   *
+   * @param val Boolean value to enable/disable experimental features
+   * @return this for chaining
+   */
+  json_reader_options_builder& experimental(bool val)
+  {
+    options._experimental = val;
+    return *this;
+  }
+
   /**
    * @brief Set whether to parse dates as DD/MM versus MM/DD.
* diff --git a/cpp/src/io/json/host_tree_algorithms.cu b/cpp/src/io/json/host_tree_algorithms.cu index 70d61132b42..5855f1b5a5f 100644 --- a/cpp/src/io/json/host_tree_algorithms.cu +++ b/cpp/src/io/json/host_tree_algorithms.cu @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include @@ -43,6 +44,7 @@ #include #include +#include namespace cudf::io::json::detail { @@ -58,16 +60,15 @@ namespace cudf::io::json::detail { */ rmm::device_uvector get_values_column_indices(TreeDepthT const row_array_children_level, tree_meta_t const& d_tree, - device_span col_ids, + device_span col_ids, size_type const num_columns, rmm::cuda_stream_view stream) { - CUDF_FUNC_RANGE(); auto [level2_nodes, level2_indices] = get_array_children_indices( row_array_children_level, d_tree.node_levels, d_tree.parent_node_ids, stream); auto col_id_location = thrust::make_permutation_iterator(col_ids.begin(), level2_nodes.begin()); rmm::device_uvector values_column_indices(num_columns, stream); - thrust::scatter(rmm::exec_policy(stream), + thrust::scatter(rmm::exec_policy_nosync(stream), level2_indices.begin(), level2_indices.end(), col_id_location, @@ -90,12 +91,11 @@ std::vector copy_strings_to_host_sync( device_span node_range_end, rmm::cuda_stream_view stream) { - CUDF_FUNC_RANGE(); auto const num_strings = node_range_begin.size(); rmm::device_uvector string_offsets(num_strings, stream); rmm::device_uvector string_lengths(num_strings, stream); auto d_offset_pairs = thrust::make_zip_iterator(node_range_begin.begin(), node_range_end.begin()); - thrust::transform(rmm::exec_policy(stream), + thrust::transform(rmm::exec_policy_nosync(stream), d_offset_pairs, d_offset_pairs + num_strings, thrust::make_zip_iterator(string_offsets.begin(), string_lengths.begin()), @@ -161,18 +161,18 @@ std::vector copy_strings_to_host_sync( rmm::device_uvector is_all_nulls_each_column(device_span input, tree_meta_t const& d_column_tree, tree_meta_t const& tree, - device_span col_ids, + device_span col_ids, cudf::io::json_reader_options const& options, rmm::cuda_stream_view stream) { auto const num_nodes = col_ids.size(); auto const num_cols = d_column_tree.node_categories.size(); rmm::device_uvector is_all_nulls(num_cols, stream); - thrust::fill(rmm::exec_policy(stream), is_all_nulls.begin(), is_all_nulls.end(), true); + thrust::fill(rmm::exec_policy_nosync(stream), is_all_nulls.begin(), is_all_nulls.end(), true); auto parse_opt = parsing_options(options, stream); thrust::for_each_n( - rmm::exec_policy(stream), + rmm::exec_policy_nosync(stream), thrust::counting_iterator(0), num_nodes, [options = parse_opt.view(), @@ -193,7 +193,7 @@ rmm::device_uvector is_all_nulls_each_column(device_span return is_all_nulls; } -NodeIndexT get_row_array_parent_col_id(device_span col_ids, +NodeIndexT get_row_array_parent_col_id(device_span col_ids, bool is_enabled_lines, rmm::cuda_stream_view stream) { @@ -221,33 +221,34 @@ struct json_column_data { bitmask_type* validity; }; -std::pair, - std::unordered_map>> -build_tree(device_json_column& root, - std::vector const& is_str_column_all_nulls, - tree_meta_t& d_column_tree, - device_span d_unique_col_ids, - device_span d_max_row_offsets, - std::vector const& column_names, - NodeIndexT row_array_parent_col_id, - bool is_array_of_arrays, - cudf::io::json_reader_options const& options, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr); -void scatter_offsets( - tree_meta_t& tree, - device_span col_ids, - device_span row_offsets, - device_span node_ids, - device_span 
sorted_col_ids, // Reuse this for parent_col_ids +using hashmap_of_device_columns = + std::unordered_map>; + +std::pair, hashmap_of_device_columns> build_tree( + device_json_column& root, + host_span is_str_column_all_nulls, tree_meta_t& d_column_tree, - host_span ignore_vals, - std::unordered_map>& columns, - rmm::cuda_stream_view stream); + device_span d_unique_col_ids, + device_span d_max_row_offsets, + std::vector const& column_names, + NodeIndexT row_array_parent_col_id, + bool is_array_of_arrays, + cudf::io::json_reader_options const& options, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr); +void scatter_offsets(tree_meta_t const& tree, + device_span col_ids, + device_span row_offsets, + device_span node_ids, + device_span sorted_col_ids, // Reuse this for parent_col_ids + tree_meta_t const& d_column_tree, + host_span ignore_vals, + hashmap_of_device_columns const& columns, + rmm::cuda_stream_view stream); /** * @brief Constructs `d_json_column` from node tree representation - * Newly constructed columns are insert into `root`'s children. + * Newly constructed columns are inserted into `root`'s children. * `root` must be a list type. * * @param input Input JSON string device data @@ -265,28 +266,28 @@ void scatter_offsets( * of child_offets and validity members of `d_json_column` */ void make_device_json_column(device_span input, - tree_meta_t& tree, - device_span col_ids, - device_span row_offsets, + tree_meta_t const& tree, + device_span col_ids, + device_span row_offsets, device_json_column& root, bool is_array_of_arrays, cudf::io::json_reader_options const& options, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { - CUDF_FUNC_RANGE(); - bool const is_enabled_lines = options.is_enabled_lines(); bool const is_enabled_mixed_types_as_string = options.is_enabled_mixed_types_as_string(); - auto const num_nodes = col_ids.size(); - rmm::device_uvector sorted_col_ids(col_ids.size(), stream); // make a copy - thrust::copy(rmm::exec_policy(stream), col_ids.begin(), col_ids.end(), sorted_col_ids.begin()); + // make a copy + auto sorted_col_ids = cudf::detail::make_device_uvector_async( + col_ids, stream, cudf::get_current_device_resource_ref()); // sort by {col_id} on {node_ids} stable rmm::device_uvector node_ids(col_ids.size(), stream); - thrust::sequence(rmm::exec_policy(stream), node_ids.begin(), node_ids.end()); - thrust::stable_sort_by_key( - rmm::exec_policy(stream), sorted_col_ids.begin(), sorted_col_ids.end(), node_ids.begin()); + thrust::sequence(rmm::exec_policy_nosync(stream), node_ids.begin(), node_ids.end()); + thrust::stable_sort_by_key(rmm::exec_policy_nosync(stream), + sorted_col_ids.begin(), + sorted_col_ids.end(), + node_ids.begin()); NodeIndexT const row_array_parent_col_id = get_row_array_parent_col_id(col_ids, is_enabled_lines, stream); @@ -316,7 +317,7 @@ void make_device_json_column(device_span input, cudf::detail::make_host_vector_sync(values_column_indices, stream); std::transform(unique_col_ids.begin(), unique_col_ids.end(), - column_names.begin(), + column_names.cbegin(), column_names.begin(), [&h_values_column_indices, &column_parent_ids, row_array_parent_col_id]( auto col_id, auto name) mutable { @@ -333,17 +334,17 @@ void make_device_json_column(device_span input, } return std::vector(); }(); - auto [ignore_vals, columns] = build_tree(root, - is_str_column_all_nulls, - d_column_tree, - d_unique_col_ids, - d_max_row_offsets, - column_names, - row_array_parent_col_id, - is_array_of_arrays, - options, - stream, - mr); + auto 
const [ignore_vals, columns] = build_tree(root, + is_str_column_all_nulls, + d_column_tree, + d_unique_col_ids, + d_max_row_offsets, + column_names, + row_array_parent_col_id, + is_array_of_arrays, + options, + stream, + mr); scatter_offsets(tree, col_ids, @@ -356,19 +357,18 @@ void make_device_json_column(device_span input, stream); } -std::pair, - std::unordered_map>> -build_tree(device_json_column& root, - std::vector const& is_str_column_all_nulls, - tree_meta_t& d_column_tree, - device_span d_unique_col_ids, - device_span d_max_row_offsets, - std::vector const& column_names, - NodeIndexT row_array_parent_col_id, - bool is_array_of_arrays, - cudf::io::json_reader_options const& options, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) +std::pair, hashmap_of_device_columns> build_tree( + device_json_column& root, + host_span is_str_column_all_nulls, + tree_meta_t& d_column_tree, + device_span d_unique_col_ids, + device_span d_max_row_offsets, + std::vector const& column_names, + NodeIndexT row_array_parent_col_id, + bool is_array_of_arrays, + cudf::io::json_reader_options const& options, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) { bool const is_enabled_mixed_types_as_string = options.is_enabled_mixed_types_as_string(); auto unique_col_ids = cudf::detail::make_host_vector_async(d_unique_col_ids, stream); @@ -380,6 +380,7 @@ build_tree(device_json_column& root, cudf::detail::make_host_vector_async(d_column_tree.node_range_begin, stream); auto const max_row_offsets = cudf::detail::make_host_vector_async(d_max_row_offsets, stream); auto num_columns = d_unique_col_ids.size(); + stream.synchronize(); auto to_json_col_type = [](auto category) { switch (category) { @@ -439,11 +440,12 @@ build_tree(device_json_column& root, }); // use hash map because we may skip field name's col_ids - std::unordered_map> columns; + hashmap_of_device_columns columns; // map{parent_col_id, child_col_name}> = child_col_id, used for null value column tracking std::map, NodeIndexT> mapped_columns; // find column_ids which are values, but should be ignored in validity - auto ignore_vals = cudf::detail::make_host_vector(num_columns, stream); + auto ignore_vals = cudf::detail::make_host_vector(num_columns, stream); + std::fill(ignore_vals.begin(), ignore_vals.end(), false); std::vector is_mixed_type_column(num_columns, 0); std::vector is_pruned(num_columns, 0); // for columns that are not mixed type but have been forced as string @@ -452,7 +454,7 @@ build_tree(device_json_column& root, std::function remove_child_columns = [&](NodeIndexT this_col_id, device_json_column& col) { - for (auto col_name : col.column_order) { + for (auto const& col_name : col.column_order) { auto child_id = mapped_columns[{this_col_id, col_name}]; is_mixed_type_column[child_id] = 1; remove_child_columns(child_id, col.child_columns.at(col_name)); @@ -523,7 +525,7 @@ build_tree(device_json_column& root, if (parent_col_id != parent_node_sentinel && (is_mixed_type_column[parent_col_id] || is_pruned[this_col_id]) || forced_as_string_column[parent_col_id]) { - ignore_vals[this_col_id] = 1; + ignore_vals[this_col_id] = true; if (is_mixed_type_column[parent_col_id]) { is_mixed_type_column[this_col_id] = 1; } if (forced_as_string_column[parent_col_id]) { forced_as_string_column[this_col_id] = true; } continue; @@ -569,12 +571,12 @@ build_tree(device_json_column& root, } if (column_categories[this_col_id] == NC_VAL || column_categories[this_col_id] == NC_STR) { - ignore_vals[this_col_id] = 1; + 
ignore_vals[this_col_id] = true; continue; } if (column_categories[old_col_id] == NC_VAL || column_categories[old_col_id] == NC_STR) { // remap - ignore_vals[old_col_id] = 1; + ignore_vals[old_col_id] = true; mapped_columns.erase({parent_col_id, name}); columns.erase(old_col_id); parent_col.child_columns.erase(name); @@ -624,7 +626,7 @@ build_tree(device_json_column& root, auto parent_col_id = column_parent_ids[this_col_id]; if (parent_col_id != parent_node_sentinel and is_mixed_type_column[parent_col_id] == 1) { is_mixed_type_column[this_col_id] = 1; - ignore_vals[this_col_id] = 1; + ignore_vals[this_col_id] = true; columns.erase(this_col_id); } // Convert only mixed type columns as string (so to copy), but not its children @@ -644,7 +646,7 @@ build_tree(device_json_column& root, auto parent_col_id = column_parent_ids[this_col_id]; if (parent_col_id != parent_node_sentinel and forced_as_string_column[parent_col_id]) { forced_as_string_column[this_col_id] = true; - ignore_vals[this_col_id] = 1; + ignore_vals[this_col_id] = true; } // Convert only mixed type columns as string (so to copy), but not its children if (parent_col_id != parent_node_sentinel and not forced_as_string_column[parent_col_id] and @@ -664,16 +666,15 @@ build_tree(device_json_column& root, return {ignore_vals, columns}; } -void scatter_offsets( - tree_meta_t& tree, - device_span col_ids, - device_span row_offsets, - device_span node_ids, - device_span sorted_col_ids, // Reuse this for parent_col_ids - tree_meta_t& d_column_tree, - host_span ignore_vals, - std::unordered_map>& columns, - rmm::cuda_stream_view stream) +void scatter_offsets(tree_meta_t const& tree, + device_span col_ids, + device_span row_offsets, + device_span node_ids, + device_span sorted_col_ids, // Reuse this for parent_col_ids + tree_meta_t const& d_column_tree, + host_span ignore_vals, + hashmap_of_device_columns const& columns, + rmm::cuda_stream_view stream) { auto const num_nodes = col_ids.size(); auto const num_columns = d_column_tree.node_categories.size(); @@ -695,7 +696,7 @@ void scatter_offsets( // 3. 
scatter string offsets to respective columns, set validity bits thrust::for_each_n( - rmm::exec_policy(stream), + rmm::exec_policy_nosync(stream), thrust::counting_iterator(0), num_nodes, [column_categories = d_column_tree.node_categories.begin(), @@ -739,7 +740,7 @@ void scatter_offsets( : col_ids[parent_node_ids[node_id]]; })); auto const list_children_end = thrust::copy_if( - rmm::exec_policy(stream), + rmm::exec_policy_nosync(stream), thrust::make_zip_iterator(thrust::make_counting_iterator(0), parent_col_id), thrust::make_zip_iterator(thrust::make_counting_iterator(0), parent_col_id) + num_nodes, @@ -757,12 +758,12 @@ void scatter_offsets( auto const num_list_children = list_children_end - thrust::make_zip_iterator(node_ids.begin(), parent_col_ids.begin()); - thrust::stable_sort_by_key(rmm::exec_policy(stream), + thrust::stable_sort_by_key(rmm::exec_policy_nosync(stream), parent_col_ids.begin(), parent_col_ids.begin() + num_list_children, node_ids.begin()); thrust::for_each_n( - rmm::exec_policy(stream), + rmm::exec_policy_nosync(stream), thrust::make_counting_iterator(0), num_list_children, [node_ids = node_ids.begin(), @@ -805,4 +806,599 @@ void scatter_offsets( stream.synchronize(); } +namespace experimental { + +std::map unified_schema(cudf::io::json_reader_options const& options) +{ + return std::visit( + cudf::detail::visitor_overload{ + [](std::vector const& user_dtypes) { + std::map dnew; + std::transform(thrust::counting_iterator(0), + thrust::counting_iterator(user_dtypes.size()), + std::inserter(dnew, dnew.end()), + [&user_dtypes](auto i) { + return std::pair(std::to_string(i), schema_element{user_dtypes[i]}); + }); + return dnew; + }, + [](std::map const& user_dtypes) { + std::map dnew; + std::transform(user_dtypes.begin(), + user_dtypes.end(), + std::inserter(dnew, dnew.end()), + [](auto key_dtype) { + return std::pair(key_dtype.first, schema_element{key_dtype.second}); + }); + return dnew; + }, + [](std::map const& user_dtypes) { return user_dtypes; }}, + options.get_dtypes()); +} + +std::pair, hashmap_of_device_columns> build_tree( + device_json_column& root, + host_span is_str_column_all_nulls, + tree_meta_t& d_column_tree, + device_span d_unique_col_ids, + device_span d_max_row_offsets, + std::vector const& column_names, + NodeIndexT row_array_parent_col_id, + bool is_array_of_arrays, + cudf::io::json_reader_options const& options, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr); + +/** + * @brief Constructs `d_json_column` from node tree representation + * Newly constructed columns are inserted into `root`'s children. + * `root` must be a list type. 
+ * + * @param input Input JSON string device data + * @param tree Node tree representation of the JSON string + * @param col_ids Column ids of the nodes in the tree + * @param row_offsets Row offsets of the nodes in the tree + * @param root Root node of the `d_json_column` tree + * @param is_array_of_arrays Whether the tree is an array of arrays + * @param options Parsing options specifying the parsing behaviour + * options affecting behaviour are + * is_enabled_lines: Whether the input is a line-delimited JSON + * is_enabled_mixed_types_as_string: Whether to enable reading mixed types as string + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the device memory + * of child_offets and validity members of `d_json_column` + */ +void make_device_json_column(device_span input, + tree_meta_t const& tree, + device_span col_ids, + device_span row_offsets, + device_json_column& root, + bool is_array_of_arrays, + cudf::io::json_reader_options const& options, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + bool const is_enabled_lines = options.is_enabled_lines(); + bool const is_enabled_mixed_types_as_string = options.is_enabled_mixed_types_as_string(); + // make a copy + auto sorted_col_ids = cudf::detail::make_device_uvector_async( + col_ids, stream, cudf::get_current_device_resource_ref()); + + // sort by {col_id} on {node_ids} stable + rmm::device_uvector node_ids(col_ids.size(), stream); + thrust::sequence(rmm::exec_policy_nosync(stream), node_ids.begin(), node_ids.end()); + thrust::stable_sort_by_key(rmm::exec_policy_nosync(stream), + sorted_col_ids.begin(), + sorted_col_ids.end(), + node_ids.begin()); + + NodeIndexT const row_array_parent_col_id = + get_row_array_parent_col_id(col_ids, is_enabled_lines, stream); + + // 1. gather column information. + auto [d_column_tree, d_unique_col_ids, d_max_row_offsets] = + reduce_to_column_tree(tree, + col_ids, + sorted_col_ids, + node_ids, + row_offsets, + is_array_of_arrays, + row_array_parent_col_id, + stream); + + auto num_columns = d_unique_col_ids.size(); + std::vector column_names = copy_strings_to_host_sync( + input, d_column_tree.node_range_begin, d_column_tree.node_range_end, stream); + // array of arrays column names + if (is_array_of_arrays) { + auto const unique_col_ids = cudf::detail::make_host_vector_async(d_unique_col_ids, stream); + auto const column_parent_ids = + cudf::detail::make_host_vector_async(d_column_tree.parent_node_ids, stream); + TreeDepthT const row_array_children_level = is_enabled_lines ? 1 : 2; + auto values_column_indices = + get_values_column_indices(row_array_children_level, tree, col_ids, num_columns, stream); + auto h_values_column_indices = + cudf::detail::make_host_vector_sync(values_column_indices, stream); + std::transform(unique_col_ids.begin(), + unique_col_ids.end(), + column_names.cbegin(), + column_names.begin(), + [&h_values_column_indices, &column_parent_ids, row_array_parent_col_id]( + auto col_id, auto name) mutable { + return column_parent_ids[col_id] == row_array_parent_col_id + ? 
std::to_string(h_values_column_indices[col_id]) + : name; + }); + } + + auto const is_str_column_all_nulls = [&, &column_tree = d_column_tree]() { + if (is_enabled_mixed_types_as_string) { + return cudf::detail::make_std_vector_sync( + is_all_nulls_each_column(input, column_tree, tree, col_ids, options, stream), stream); + } + return std::vector(); + }(); + auto const [ignore_vals, columns] = build_tree(root, + is_str_column_all_nulls, + d_column_tree, + d_unique_col_ids, + d_max_row_offsets, + column_names, + row_array_parent_col_id, + is_array_of_arrays, + options, + stream, + mr); + if (ignore_vals.empty()) return; + scatter_offsets(tree, + col_ids, + row_offsets, + node_ids, + sorted_col_ids, + d_column_tree, + ignore_vals, + columns, + stream); +} + +std::pair, hashmap_of_device_columns> build_tree( + device_json_column& root, + host_span is_str_column_all_nulls, + tree_meta_t& d_column_tree, + device_span d_unique_col_ids, + device_span d_max_row_offsets, + std::vector const& column_names, + NodeIndexT row_array_parent_col_id, + bool is_array_of_arrays, + cudf::io::json_reader_options const& options, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + bool const is_enabled_lines = options.is_enabled_lines(); + bool const is_enabled_mixed_types_as_string = options.is_enabled_mixed_types_as_string(); + auto unique_col_ids = cudf::detail::make_host_vector_async(d_unique_col_ids, stream); + auto column_categories = + cudf::detail::make_host_vector_async(d_column_tree.node_categories, stream); + auto const column_parent_ids = + cudf::detail::make_host_vector_async(d_column_tree.parent_node_ids, stream); + auto column_range_beg = + cudf::detail::make_host_vector_async(d_column_tree.node_range_begin, stream); + auto const max_row_offsets = cudf::detail::make_host_vector_async(d_max_row_offsets, stream); + auto num_columns = d_unique_col_ids.size(); + stream.synchronize(); + + auto to_json_col_type = [](auto category) { + switch (category) { + case NC_STRUCT: return json_col_t::StructColumn; + case NC_LIST: return json_col_t::ListColumn; + case NC_STR: [[fallthrough]]; + case NC_VAL: return json_col_t::StringColumn; + default: return json_col_t::Unknown; + } + }; + + auto initialize_json_columns = [&](auto i, auto& col_ref, auto column_category) { + auto& col = col_ref.get(); + if (col.type != json_col_t::Unknown) { return; } + if (column_category == NC_ERR || column_category == NC_FN) { + return; + } else if (column_category == NC_VAL || column_category == NC_STR) { + col.string_offsets.resize(max_row_offsets[i] + 1, stream); + col.string_lengths.resize(max_row_offsets[i] + 1, stream); + thrust::fill( + rmm::exec_policy_nosync(stream), + thrust::make_zip_iterator(col.string_offsets.begin(), col.string_lengths.begin()), + thrust::make_zip_iterator(col.string_offsets.end(), col.string_lengths.end()), + thrust::make_tuple(0, 0)); + } else if (column_category == NC_LIST) { + col.child_offsets.resize(max_row_offsets[i] + 2, stream); + thrust::uninitialized_fill( + rmm::exec_policy_nosync(stream), col.child_offsets.begin(), col.child_offsets.end(), 0); + } + col.num_rows = max_row_offsets[i] + 1; + col.validity = + cudf::detail::create_null_mask(col.num_rows, cudf::mask_state::ALL_NULL, stream, mr); + col.type = to_json_col_type(column_category); + }; + + // 2. generate nested columns tree and its device_memory + // reorder unique_col_ids w.r.t. column_range_begin for order of column to be in field order. 
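+  // Sorting the (range_begin, col_id) pairs keeps columns in input field order, so the
+  // adjacency list built just below records each parent's children in document order.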
+ auto h_range_col_id_it = + thrust::make_zip_iterator(column_range_beg.begin(), unique_col_ids.begin()); + std::sort(h_range_col_id_it, h_range_col_id_it + num_columns, [](auto const& a, auto const& b) { + return thrust::get<0>(a) < thrust::get<0>(b); + }); + // adjacency list construction + std::map> adj; + for (auto const this_col_id : unique_col_ids) { + auto parent_col_id = column_parent_ids[this_col_id]; + adj[parent_col_id].push_back(this_col_id); + } + + // Pruning + auto is_pruned = cudf::detail::make_host_vector(num_columns, stream); + std::fill_n(is_pruned.begin(), num_columns, options.is_enabled_prune_columns()); + + // prune all children of a column, but not self. + auto ignore_all_children = [&](auto parent_col_id) { + std::deque offspring; + if (adj.count(parent_col_id)) { + for (auto const& child : adj[parent_col_id]) { + offspring.push_back(child); + } + } + while (!offspring.empty()) { + auto this_id = offspring.front(); + offspring.pop_front(); + is_pruned[this_id] = true; + if (adj.count(this_id)) { + for (auto const& child : adj[this_id]) { + offspring.push_back(child); + } + } + } + }; + + // Pruning: iterate through schema and mark only those columns and enforce type. + // NoPruning: iterate through schema and enforce type. + + if (adj[parent_node_sentinel].empty()) + return {cudf::detail::make_host_vector(0, stream), {}}; // for empty file + CUDF_EXPECTS(adj[parent_node_sentinel].size() == 1, "Should be 1"); + auto expected_types = cudf::detail::make_host_vector(num_columns, stream); + std::fill_n(expected_types.begin(), num_columns, NUM_NODE_CLASSES); + + auto lookup_names = [&column_names](auto child_ids, auto name) { + for (auto const& child_id : child_ids) { + if (column_names[child_id] == name) return child_id; + } + return -1; + }; + // recursive lambda on schema to mark columns as pruned. + std::function mark_is_pruned; + mark_is_pruned = [&is_pruned, + &mark_is_pruned, + &adj, + &lookup_names, + &column_categories, + &expected_types, + &ignore_all_children](NodeIndexT root, schema_element const& schema) -> void { + if (root == -1) return; + bool pass = + (schema.type == data_type{type_id::STRUCT} and column_categories[root] == NC_STRUCT) or + (schema.type == data_type{type_id::LIST} and column_categories[root] == NC_LIST) or + (schema.type != data_type{type_id::STRUCT} and schema.type != data_type{type_id::LIST} and + column_categories[root] != NC_FN); + if (!pass) { + // ignore all children of this column and prune this column. + is_pruned[root] = true; + ignore_all_children(root); + return; + } + is_pruned[root] = false; + auto expected_type = [](auto type, auto cat) { + if (type == data_type{type_id::STRUCT} and cat == NC_STRUCT) return NC_STRUCT; + if (type == data_type{type_id::LIST} and cat == NC_LIST) return NC_LIST; + if (type != data_type{type_id::STRUCT} and type != data_type{type_id::LIST}) return NC_STR; + return NC_ERR; + }(schema.type, column_categories[root]); + expected_types[root] = expected_type; // forced type. + // ignore children of nested columns, but not self. + if (expected_type == NC_STR and + (column_categories[root] == NC_STRUCT or column_categories[root] == NC_LIST)) + ignore_all_children(root); + if (not(schema.type == data_type{type_id::STRUCT} or schema.type == data_type{type_id::LIST})) + return; // no children to mark for non-nested. + auto child_ids = adj.count(root) ? 
adj[root] : std::vector{}; + if (schema.type == data_type{type_id::STRUCT}) { + for (auto const& key_pair : schema.child_types) { + auto col_id = lookup_names(child_ids, key_pair.first); + if (col_id == -1) continue; + is_pruned[col_id] = false; + for (auto const& child_id : adj[col_id]) // children of field (>1 if mixed) + mark_is_pruned(child_id, key_pair.second); + } + } else if (schema.type == data_type{type_id::LIST}) { + // partial solution for list children to have any name. + auto this_list_child_name = + schema.child_types.size() == 1 ? schema.child_types.begin()->first : list_child_name; + if (schema.child_types.count(this_list_child_name) == 0) return; + auto list_child = schema.child_types.at(this_list_child_name); + for (auto const& child_id : child_ids) + mark_is_pruned(child_id, list_child); + } + }; + if (is_array_of_arrays) { + if (adj[adj[parent_node_sentinel][0]].empty()) + return {cudf::detail::make_host_vector(0, stream), {}}; + auto root_list_col_id = + is_enabled_lines ? adj[parent_node_sentinel][0] : adj[adj[parent_node_sentinel][0]][0]; + // mark root and row array col_id as not pruned. + if (!is_enabled_lines) { + auto top_level_list_id = adj[parent_node_sentinel][0]; + is_pruned[top_level_list_id] = false; + } + is_pruned[root_list_col_id] = false; + std::visit(cudf::detail::visitor_overload{ + [&root_list_col_id, &adj, &mark_is_pruned, &column_names]( + std::vector const& user_dtypes) -> void { + for (size_t i = 0; i < adj[root_list_col_id].size() && i < user_dtypes.size(); + i++) { + NodeIndexT const first_child_id = adj[root_list_col_id][i]; + auto name = column_names[first_child_id]; + auto value_id = std::stol(name); + if (value_id >= 0 and value_id < static_cast(user_dtypes.size())) + mark_is_pruned(first_child_id, schema_element{user_dtypes[value_id]}); + // Note: mixed type - forced type, will work here. + } + }, + [&root_list_col_id, &adj, &mark_is_pruned, &column_names]( + std::map const& user_dtypes) -> void { + for (size_t i = 0; i < adj[root_list_col_id].size(); i++) { + auto const first_child_id = adj[root_list_col_id][i]; + auto name = column_names[first_child_id]; + if (user_dtypes.count(name)) + mark_is_pruned(first_child_id, schema_element{user_dtypes.at(name)}); + } + }, + [&root_list_col_id, &adj, &mark_is_pruned, &column_names]( + std::map const& user_dtypes) -> void { + for (size_t i = 0; i < adj[root_list_col_id].size(); i++) { + auto const first_child_id = adj[root_list_col_id][i]; + auto name = column_names[first_child_id]; + if (user_dtypes.count(name)) + mark_is_pruned(first_child_id, user_dtypes.at(name)); + } + }}, + options.get_dtypes()); + } else { + auto root_struct_col_id = + is_enabled_lines + ? adj[parent_node_sentinel][0] + : (adj[adj[parent_node_sentinel][0]].empty() ? -1 : adj[adj[parent_node_sentinel][0]][0]); + // mark root and row struct col_id as not pruned. 
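+  // For whole-file input, the outer list wrapping the records is kept as well,
+  // since the row struct sits one level below it.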
+ if (!is_enabled_lines) { + auto top_level_list_id = adj[parent_node_sentinel][0]; + is_pruned[top_level_list_id] = false; + } + is_pruned[root_struct_col_id] = false; + schema_element u_schema{data_type{type_id::STRUCT}}; + u_schema.child_types = unified_schema(options); + std::visit( + cudf::detail::visitor_overload{ + [&is_pruned, &root_struct_col_id, &adj, &mark_is_pruned]( + std::vector const& user_dtypes) -> void { + for (size_t i = 0; i < adj[root_struct_col_id].size() && i < user_dtypes.size(); i++) { + NodeIndexT const first_field_id = adj[root_struct_col_id][i]; + is_pruned[first_field_id] = false; + for (auto const& child_id : adj[first_field_id]) // children of field (>1 if mixed) + mark_is_pruned(child_id, schema_element{user_dtypes[i]}); + } + }, + [&root_struct_col_id, &adj, &mark_is_pruned, &u_schema]( + std::map const& user_dtypes) -> void { + mark_is_pruned(root_struct_col_id, u_schema); + }, + [&root_struct_col_id, &adj, &mark_is_pruned, &u_schema]( + std::map const& user_dtypes) -> void { + mark_is_pruned(root_struct_col_id, u_schema); + }}, + options.get_dtypes()); + } + // Useful for array of arrays + auto named_level = + is_enabled_lines + ? adj[parent_node_sentinel][0] + : (adj[adj[parent_node_sentinel][0]].empty() ? -1 : adj[adj[parent_node_sentinel][0]][0]); + + auto handle_mixed_types = [&column_categories, + &is_str_column_all_nulls, + &is_pruned, + &expected_types, + &is_enabled_mixed_types_as_string, + &ignore_all_children](std::vector& child_ids) { + // do these on unpruned columns only. + // when mixed types is disabled, ignore string sibling of nested column. + // when mixed types is disabled, and both list and struct columns are siblings, error out. + // when mixed types is enabled, force string type on all columns + + // Remove pruned children (forced type will not clash here because other types are already + // pruned) + child_ids.erase( + std::remove_if(child_ids.begin(), + child_ids.end(), + [&is_pruned](NodeIndexT child_id) { return is_pruned[child_id]; }), + child_ids.end()); + // find string id, struct id, list id. + NodeIndexT str_col_id{-1}, struct_col_id{-1}, list_col_id{-1}; + for (auto const& child_id : child_ids) { + if (column_categories[child_id] == NC_VAL || column_categories[child_id] == NC_STR) + str_col_id = child_id; + else if (column_categories[child_id] == NC_STRUCT) + struct_col_id = child_id; + else if (column_categories[child_id] == NC_LIST) + list_col_id = child_id; + } + // conditions for handling mixed types. + if (is_enabled_mixed_types_as_string) { + if (struct_col_id != -1 and list_col_id != -1) { + expected_types[struct_col_id] = NC_STR; + expected_types[list_col_id] = NC_STR; + // ignore children of nested columns. + ignore_all_children(struct_col_id); + ignore_all_children(list_col_id); + } + if ((struct_col_id != -1 or list_col_id != -1) and str_col_id != -1) { + if (is_str_column_all_nulls[str_col_id]) + is_pruned[str_col_id] = true; + else { + // ignore children of nested columns. + if (struct_col_id != -1) { + expected_types[struct_col_id] = NC_STR; + ignore_all_children(struct_col_id); + } + if (list_col_id != -1) { + expected_types[list_col_id] = NC_STR; + ignore_all_children(list_col_id); + } + } + } + } else { + // if both are present, error out. + CUDF_EXPECTS(struct_col_id == -1 or list_col_id == -1, + "A mix of lists and structs within the same column is not supported"); + // either one only: so ignore str column. 
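+      // With mixed types disabled, a string sibling of a nested column is pruned
+      // outright instead of forcing the nested column to string.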
+ if ((struct_col_id != -1 or list_col_id != -1) and str_col_id != -1) { + is_pruned[str_col_id] = true; + } + } + }; + + using dev_ref = std::reference_wrapper; + std::unordered_map columns; + columns.try_emplace(parent_node_sentinel, std::ref(root)); + // convert adjaceny list to tree. + dev_ref parent_ref = std::ref(root); + // creates children column + std::function construct_tree; + construct_tree = [&](NodeIndexT root, dev_ref ref) -> void { + if (is_pruned[root]) return; + auto expected_category = + expected_types[root] == NUM_NODE_CLASSES ? column_categories[root] : expected_types[root]; + initialize_json_columns(root, ref, expected_category); + auto child_ids = adj.count(root) ? adj[root] : std::vector{}; + if (expected_category == NC_STRUCT) { + // find field column ids, and its children and create columns. + for (auto const& field_id : child_ids) { + auto name = column_names[field_id]; + if (is_pruned[field_id]) continue; + auto inserted = + ref.get().child_columns.try_emplace(name, device_json_column(stream, mr)).second; + ref.get().column_order.emplace_back(name); + CUDF_EXPECTS(inserted, + "struct child column insertion failed, duplicate column name in the parent"); + auto this_ref = std::ref(ref.get().child_columns.at(name)); + // Mixed type handling + auto& value_col_ids = adj[field_id]; + handle_mixed_types(value_col_ids); + if (value_col_ids.empty()) { + // If no column is present, remove the uninitialized column. + ref.get().child_columns.erase(name); + ref.get().column_order.pop_back(); + continue; + } + for (auto const& child_id : value_col_ids) // children of field (>1 if mixed) + { + if (is_pruned[child_id]) continue; + columns.try_emplace(child_id, this_ref); + construct_tree(child_id, this_ref); + } + } + } else if (expected_category == NC_LIST) { + // array of arrays interpreted as array of structs. + if (is_array_of_arrays and root == named_level) { + // create column names + std::map> array_values; + for (auto const& child_id : child_ids) { + if (is_pruned[child_id]) continue; + auto name = column_names[child_id]; + array_values[std::stoi(name)].push_back(child_id); + } + // + for (auto const& value_id_pair : array_values) { + auto [value_id, value_col_ids] = value_id_pair; + auto name = std::to_string(value_id); + auto inserted = + ref.get().child_columns.try_emplace(name, device_json_column(stream, mr)).second; + ref.get().column_order.emplace_back(name); + CUDF_EXPECTS(inserted, + "list child column insertion failed, duplicate column name in the parent"); + auto this_ref = std::ref(ref.get().child_columns.at(name)); + handle_mixed_types(value_col_ids); + if (value_col_ids.empty()) { + // If no column is present, remove the uninitialized column. 
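+          // This undoes the try_emplace above, so the parent keeps neither an empty
+          // child entry nor a stale column_order slot.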
+ ref.get().child_columns.erase(name); + ref.get().column_order.pop_back(); + continue; + } + for (auto const& child_id : value_col_ids) // children of field (>1 if mixed) + { + if (is_pruned[child_id]) continue; + columns.try_emplace(child_id, this_ref); + construct_tree(child_id, this_ref); + } + } + } else { + if (child_ids.empty()) return; + auto inserted = + ref.get() + .child_columns.try_emplace(list_child_name, device_json_column(stream, mr)) + .second; + CUDF_EXPECTS(inserted, + "list child column insertion failed, duplicate column name in the parent"); + ref.get().column_order.emplace_back(list_child_name); + auto this_ref = std::ref(ref.get().child_columns.at(list_child_name)); + // Mixed type handling + handle_mixed_types(child_ids); + if (child_ids.empty()) { + // If no column is present, remove the uninitialized column. + ref.get().child_columns.erase(list_child_name); + } + for (auto const& child_id : child_ids) { + if (is_pruned[child_id]) continue; + columns.try_emplace(child_id, this_ref); + construct_tree(child_id, this_ref); + } + } + } + }; + auto inserted = parent_ref.get() + .child_columns.try_emplace(list_child_name, device_json_column(stream, mr)) + .second; + CUDF_EXPECTS(inserted, "child column insertion failed, duplicate column name in the parent"); + parent_ref = std::ref(parent_ref.get().child_columns.at(list_child_name)); + columns.try_emplace(adj[parent_node_sentinel][0], parent_ref); + construct_tree(adj[parent_node_sentinel][0], parent_ref); + + // Forced string type due to input schema and mixed type as string. + for (size_t i = 0; i < expected_types.size(); i++) { + if (expected_types[i] == NC_STR) { + if (columns.count(i)) { columns.at(i).get().forced_as_string_column = true; } + } + } + std::transform(expected_types.cbegin(), + expected_types.cend(), + column_categories.cbegin(), + expected_types.begin(), + [](auto exp, auto cat) { return exp == NUM_NODE_CLASSES ? 
cat : exp; }); + cudaMemcpyAsync(d_column_tree.node_categories.begin(), + expected_types.data(), + expected_types.size() * sizeof(column_categories[0]), + cudaMemcpyDefault, + stream.value()); + + return {is_pruned, columns}; +} +} // namespace experimental + } // namespace cudf::io::json::detail diff --git a/cpp/src/io/json/json_column.cu b/cpp/src/io/json/json_column.cu index dfd9285f682..912e93d52ae 100644 --- a/cpp/src/io/json/json_column.cu +++ b/cpp/src/io/json/json_column.cu @@ -104,7 +104,7 @@ void print_tree(host_span input, * max row offsets of columns */ std::tuple, rmm::device_uvector> -reduce_to_column_tree(tree_meta_t& tree, +reduce_to_column_tree(tree_meta_t const& tree, device_span original_col_ids, device_span sorted_col_ids, device_span ordered_node_ids, @@ -317,7 +317,7 @@ std::pair, std::vector> device_json_co // Note: json_col modified here, moves this memory }; - auto get_child_schema = [schema](auto child_name) -> std::optional { + auto get_child_schema = [&schema](auto child_name) -> std::optional { if (schema.has_value()) { auto const result = schema.value().child_types.find(child_name); if (result != std::end(schema.value().child_types)) { return result->second; } @@ -325,6 +325,13 @@ std::pair, std::vector> device_json_co return {}; }; + auto get_list_child_schema = [&schema]() -> std::optional { + if (schema.has_value()) { + if (schema.value().child_types.size() > 0) return schema.value().child_types.begin()->second; + } + return {}; + }; + switch (json_col.type) { case json_col_t::StringColumn: { // move string_offsets to GPU and transform to string column @@ -439,9 +446,8 @@ std::pair, std::vector> device_json_co rmm::device_buffer{}, 0); // Create children column - auto child_schema_element = json_col.child_columns.empty() - ? std::optional{} - : get_child_schema(json_col.child_columns.begin()->first); + auto child_schema_element = + json_col.child_columns.empty() ? std::optional{} : get_list_child_schema(); auto [child_column, names] = json_col.child_columns.empty() or (prune_columns and !child_schema_element.has_value()) ? std::pair, @@ -479,6 +485,16 @@ std::pair, std::vector> device_json_co } } +template +auto make_device_json_column_dispatch(bool experimental, Args&&... 
args) +{ + if (experimental) { + return experimental::make_device_json_column(std::forward(args)...); + } else { + return make_device_json_column(std::forward(args)...); + } +} + table_with_metadata device_parse_nested_json(device_span d_input, cudf::io::json_reader_options const& options, rmm::cuda_stream_view stream, @@ -524,6 +540,7 @@ table_with_metadata device_parse_nested_json(device_span d_input, gpu_tree, is_array_of_arrays, options.is_enabled_lines(), + options.is_enabled_experimental(), stream, cudf::get_current_device_resource_ref()); @@ -536,15 +553,16 @@ table_with_metadata device_parse_nested_json(device_span d_input, 0); // Get internal JSON column - make_device_json_column(d_input, - gpu_tree, - gpu_col_id, - gpu_row_offsets, - root_column, - is_array_of_arrays, - options, - stream, - mr); + make_device_json_column_dispatch(options.is_enabled_experimental(), + d_input, + gpu_tree, + gpu_col_id, + gpu_row_offsets, + root_column, + is_array_of_arrays, + options, + stream, + mr); // data_root refers to the root column of the data represented by the given JSON string auto& data_root = diff --git a/cpp/src/io/json/json_tree.cu b/cpp/src/io/json/json_tree.cu index 4d0dc010c57..d949635c1cc 100644 --- a/cpp/src/io/json/json_tree.cu +++ b/cpp/src/io/json/json_tree.cu @@ -14,17 +14,18 @@ * limitations under the License. */ -#include "io/utilities/hostdevice_vector.hpp" +#include "io/utilities/parsing_utils.cuh" +#include "io/utilities/string_parsing.hpp" #include "nested_json.hpp" #include #include -#include #include #include #include #include #include +#include #include #include #include @@ -34,12 +35,14 @@ #include #include +#include #include #include #include #include #include #include +#include #include #include #include @@ -492,6 +495,85 @@ tree_meta_t get_tree_representation(device_span tokens, std::move(node_range_end)}; } +// Return field node ids after unicode decoding of field names and matching them to same field names +std::pair> remapped_field_nodes_after_unicode_decode( + device_span d_input, + tree_meta_t const& d_tree, + device_span keys, + rmm::cuda_stream_view stream) +{ + size_t num_keys = keys.size(); + if (num_keys == 0) { return {num_keys, rmm::device_uvector(num_keys, stream)}; } + rmm::device_uvector offsets(num_keys, stream); + rmm::device_uvector lengths(num_keys, stream); + auto offset_length_it = thrust::make_zip_iterator(offsets.begin(), lengths.begin()); + thrust::transform(rmm::exec_policy_nosync(stream), + keys.begin(), + keys.end(), + offset_length_it, + [node_range_begin = d_tree.node_range_begin.data(), + node_range_end = d_tree.node_range_end.data()] __device__(auto key) { + return thrust::make_tuple(node_range_begin[key], + node_range_end[key] - node_range_begin[key]); + }); + cudf::io::parse_options_view opt{',', '\n', '\0', '.'}; + opt.keepquotes = true; + + auto utf8_decoded_fields = parse_data(d_input.data(), + offset_length_it, + num_keys, + data_type{type_id::STRING}, + rmm::device_buffer{}, + 0, + opt, + stream, + cudf::get_current_device_resource_ref()); + // hash using iter, create a hashmap for 0-num_keys. + // insert and find. 
-> array + // store to static_map with keys as field key[index], and values as key[array[index]] + + auto str_view = strings_column_view{utf8_decoded_fields->view()}; + auto const char_ptr = str_view.chars_begin(stream); + auto const offset_ptr = str_view.offsets().begin(); + + // String hasher + auto const d_hasher = cuda::proclaim_return_type< + typename cudf::hashing::detail::default_hash::result_type>( + [char_ptr, offset_ptr] __device__(auto node_id) { + auto const field_name = cudf::string_view(char_ptr + offset_ptr[node_id], + offset_ptr[node_id + 1] - offset_ptr[node_id]); + return cudf::hashing::detail::default_hash{}(field_name); + }); + auto const d_equal = [char_ptr, offset_ptr] __device__(auto node_id1, auto node_id2) { + auto const field_name1 = cudf::string_view(char_ptr + offset_ptr[node_id1], + offset_ptr[node_id1 + 1] - offset_ptr[node_id1]); + auto const field_name2 = cudf::string_view(char_ptr + offset_ptr[node_id2], + offset_ptr[node_id2 + 1] - offset_ptr[node_id2]); + return field_name1 == field_name2; + }; + + using hasher_type = decltype(d_hasher); + constexpr size_type empty_node_index_sentinel = -1; + auto key_set = cuco::static_set{ + cuco::extent{compute_hash_table_size(num_keys)}, + cuco::empty_key{empty_node_index_sentinel}, + d_equal, + cuco::linear_probing<1, hasher_type>{d_hasher}, + {}, + {}, + cudf::detail::cuco_allocator{rmm::mr::polymorphic_allocator{}, stream}, + stream.value()}; + auto const counting_iter = thrust::make_counting_iterator(0); + rmm::device_uvector found_keys(num_keys, stream); + key_set.insert_and_find_async(counting_iter, + counting_iter + num_keys, + found_keys.begin(), + thrust::make_discard_iterator(), + stream.value()); + // set.size will synchronize the stream before return. + return {key_set.size(stream), std::move(found_keys)}; +} + /** * @brief Generates unique node_type id for each node. * Field nodes with the same name are assigned the same node_type id. @@ -500,11 +582,14 @@ tree_meta_t get_tree_representation(device_span tokens, * All inputs and outputs are in node_id order. * @param d_input JSON string in device memory * @param d_tree Tree representation of the JSON + * @param is_enabled_experimental Whether to enable experimental features such as + * utf8 field name support * @param stream CUDA stream used for device memory operations and kernel launches. * @return Vector of node_type ids */ rmm::device_uvector hash_node_type_with_field_name(device_span d_input, tree_meta_t const& d_tree, + bool is_enabled_experimental, rmm::cuda_stream_view stream) { CUDF_FUNC_RANGE(); @@ -536,7 +621,7 @@ rmm::device_uvector hash_node_type_with_field_name(device_span(0); + auto const counting_iter = thrust::make_counting_iterator(0); auto const is_field_name_node = [node_categories = d_tree.node_categories.data()] __device__(auto node_id) { @@ -554,15 +639,61 @@ rmm::device_uvector hash_node_type_with_field_name(device_span{rmm::mr::polymorphic_allocator{}, stream}, stream.value()}; - key_set.insert_if_async(iter, - iter + num_nodes, + key_set.insert_if_async(counting_iter, + counting_iter + num_nodes, thrust::counting_iterator(0), // stencil is_field_name_node, stream.value()); + // experimental feature: utf8 field name support + // parse_data on field names, + // rehash it using another map, + // reassign the reverse map values to new matched node indices. 
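+  // Why the remap is needed: the first hashing pass keys field nodes on their raw JSON bytes, so escaped and unescaped spellings of one name (e.g. "a" vs "\u0061") initially receive distinct ids; after decoding, every alias is redirected to one canonical field node id through the reverse map built below.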
+ auto get_utf8_matched_field_nodes = [&]() { + auto make_map = [&stream](auto num_keys) { + using hasher_type3 = cudf::hashing::detail::default_hash; + return cuco::static_map{ + cuco::extent{compute_hash_table_size(num_keys, 100)}, // 100% occupancy + cuco::empty_key{empty_node_index_sentinel}, + cuco::empty_value{empty_node_index_sentinel}, + {}, + cuco::linear_probing<1, hasher_type3>{hasher_type3{}}, + {}, + {}, + cudf::detail::cuco_allocator{rmm::mr::polymorphic_allocator{}, stream}, + stream.value()}; + }; + if (!is_enabled_experimental) { return std::pair{false, make_map(0)}; } + // get all unique field node ids for utf8 decoding + auto num_keys = key_set.size(stream); + rmm::device_uvector keys(num_keys, stream); + key_set.retrieve_all(keys.data(), stream.value()); + + auto [num_unique_fields, found_keys] = + remapped_field_nodes_after_unicode_decode(d_input, d_tree, keys, stream); + + auto is_need_remap = num_unique_fields != num_keys; + if (!is_need_remap) { return std::pair{false, make_map(0)}; } + + // store to static_map with keys as field keys[index], and values as keys[found_keys[index]] + auto reverse_map = make_map(num_keys); + auto matching_keys_iter = thrust::make_permutation_iterator(keys.begin(), found_keys.begin()); + auto pair_iter = + thrust::make_zip_iterator(thrust::make_tuple(keys.begin(), matching_keys_iter)); + reverse_map.insert_async(pair_iter, pair_iter + num_keys, stream); + return std::pair{is_need_remap, std::move(reverse_map)}; + }; + auto [is_need_remap, reverse_map] = get_utf8_matched_field_nodes(); + auto const get_hash_value = - [key_set = key_set.ref(cuco::op::find)] __device__(auto node_id) -> size_type { + [key_set = key_set.ref(cuco::op::find), + is_need_remap = is_need_remap, + rm = reverse_map.ref(cuco::op::find)] __device__(auto node_id) -> size_type { auto const it = key_set.find(node_id); + if (it != key_set.end() and is_need_remap) { + auto const it2 = rm.find(*it); + return (it2 == rm.end()) ? size_type{0} : it2->second; + } return (it == key_set.end()) ? size_type{0} : *it; }; @@ -771,6 +902,8 @@ std::pair, rmm::device_uvector> hash_n * @param d_tree Tree representation of the JSON * @param is_array_of_arrays Whether the tree is an array of arrays * @param is_enabled_lines Whether the input is a line-delimited JSON + * @param is_enabled_experimental Whether the experimental feature is enabled such as + * utf8 field name support * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory * @return column_id, parent_column_id @@ -780,6 +913,7 @@ std::pair, rmm::device_uvector> gene tree_meta_t const& d_tree, bool is_array_of_arrays, bool is_enabled_lines, + bool is_enabled_experimental, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { @@ -793,7 +927,7 @@ std::pair, rmm::device_uvector> gene auto [col_id, unique_keys] = [&]() { // Convert node_category + field_name to node_type. rmm::device_uvector node_type = - hash_node_type_with_field_name(d_input, d_tree, stream); + hash_node_type_with_field_name(d_input, d_tree, is_enabled_experimental, stream); // hash entire path from node to root. 
return hash_node_path(d_tree.node_levels, @@ -948,12 +1082,13 @@ records_orient_tree_traversal(device_span d_input, tree_meta_t const& d_tree, bool is_array_of_arrays, bool is_enabled_lines, + bool is_enabled_experimental, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - auto [new_col_id, new_parent_col_id] = - generate_column_id(d_input, d_tree, is_array_of_arrays, is_enabled_lines, stream, mr); + auto [new_col_id, new_parent_col_id] = generate_column_id( + d_input, d_tree, is_array_of_arrays, is_enabled_lines, is_enabled_experimental, stream, mr); auto row_offsets = compute_row_offsets( std::move(new_parent_col_id), d_tree, is_array_of_arrays, is_enabled_lines, stream, mr); diff --git a/cpp/src/io/json/nested_json.hpp b/cpp/src/io/json/nested_json.hpp index 93ef2b46be1..3d9a51833e0 100644 --- a/cpp/src/io/json/nested_json.hpp +++ b/cpp/src/io/json/nested_json.hpp @@ -316,6 +316,8 @@ tree_meta_t get_tree_representation(device_span tokens, * index, level, begin index, and end index in the input JSON string * @param is_array_of_arrays Whether the tree is an array of arrays * @param is_enabled_lines Whether the input is a line-delimited JSON + * @param is_enabled_experimental Whether to enable experimental features such as utf-8 field name + * support * @param stream The CUDA stream to which kernels are dispatched * @param mr Optional, resource with which to allocate * @return A tuple of the output column indices and the row offsets within each column for each node @@ -326,6 +328,7 @@ records_orient_tree_traversal(device_span d_input, tree_meta_t const& d_tree, bool is_array_of_arrays, bool is_enabled_lines, + bool is_enabled_experimental, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); @@ -352,7 +355,7 @@ get_array_children_indices(TreeDepthT row_array_children_level, /** * @brief Reduces node tree representation to column tree representation. 
* - * @param node_tree Node tree representation of JSON string + * @param tree Node tree representation of JSON string * @param original_col_ids Column ids of nodes * @param sorted_col_ids Sorted column ids of nodes * @param ordered_node_ids Node ids of nodes sorted by column ids @@ -365,7 +368,7 @@ get_array_children_indices(TreeDepthT row_array_children_level, */ CUDF_EXPORT std::tuple, rmm::device_uvector> -reduce_to_column_tree(tree_meta_t& node_tree, +reduce_to_column_tree(tree_meta_t const& tree, device_span original_col_ids, device_span sorted_col_ids, device_span ordered_node_ids, @@ -393,14 +396,30 @@ reduce_to_column_tree(tree_meta_t& node_tree, * of child_offets and validity members of `d_json_column` */ void make_device_json_column(device_span input, - tree_meta_t& tree, - device_span col_ids, - device_span row_offsets, + tree_meta_t const& tree, + device_span col_ids, + device_span row_offsets, device_json_column& root, bool is_array_of_arrays, cudf::io::json_reader_options const& options, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); + +namespace experimental { +/** + * @copydoc cudf::io::json::detail::make_device_json_column + */ +void make_device_json_column(device_span input, + tree_meta_t const& tree, + device_span col_ids, + device_span row_offsets, + device_json_column& root, + bool is_array_of_arrays, + cudf::io::json_reader_options const& options, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr); +} // namespace experimental + /** * @brief Retrieves the parse_options to be used for type inference and type casting * diff --git a/cpp/tests/io/json/json_test.cpp b/cpp/tests/io/json/json_test.cpp index 48bc982d0e3..68ec255b39d 100644 --- a/cpp/tests/io/json/json_test.cpp +++ b/cpp/tests/io/json/json_test.cpp @@ -2856,6 +2856,59 @@ TEST_F(JsonReaderTest, JSONMixedTypeChildren) } } +TEST_F(JsonReaderTest, MixedTypesWithSchema) +{ + std::string data = "{\"data\": {\"A\": 0, \"B\": 1}}\n{\"data\": [1,0]}\n"; + + std::map data_types; + std::map child_types; + child_types.insert( + std::pair{"element", cudf::io::schema_element{cudf::data_type{cudf::type_id::STRING, 0}, {}}}); + data_types.insert(std::pair{ + "data", cudf::io::schema_element{cudf::data_type{cudf::type_id::LIST, 0}, child_types}}); + + cudf::io::json_reader_options in_options = + cudf::io::json_reader_options::builder(cudf::io::source_info{data.data(), data.size()}) + .dtypes(data_types) + .recovery_mode(cudf::io::json_recovery_mode_t::RECOVER_WITH_NULL) + .normalize_single_quotes(true) + .normalize_whitespace(true) + .mixed_types_as_string(true) + .experimental(true) + .keep_quotes(true) + .lines(true); + cudf::io::table_with_metadata result = cudf::io::read_json(in_options); + EXPECT_EQ(result.tbl->num_columns(), 1); + EXPECT_EQ(result.tbl->num_rows(), 2); + EXPECT_EQ(result.tbl->get_column(0).type().id(), cudf::type_id::LIST); + EXPECT_EQ(result.tbl->get_column(0).child(1).type().id(), cudf::type_id::STRING); +} + +TEST_F(JsonReaderTest, UnicodeFieldname) +{ + // unicode at nested and leaf levels + std::string data = R"({"data": {"a": 0, "b c": 1}} + {"data": {"\u0061": 2, "\u0062\tc": 3}} + {"d\u0061ta": {"a": 4}})"; + + cudf::io::json_reader_options in_options = + cudf::io::json_reader_options::builder(cudf::io::source_info{data.data(), data.size()}) + .recovery_mode(cudf::io::json_recovery_mode_t::RECOVER_WITH_NULL) + .experimental(true) + .lines(true); + cudf::io::table_with_metadata result = cudf::io::read_json(in_options); + EXPECT_EQ(result.tbl->num_columns(), 1); 
+ EXPECT_EQ(result.tbl->num_rows(), 3); + EXPECT_EQ(result.tbl->get_column(0).type().id(), cudf::type_id::STRUCT); + EXPECT_EQ(result.tbl->get_column(0).num_children(), 2); + EXPECT_EQ(result.tbl->get_column(0).child(0).type().id(), cudf::type_id::INT64); + EXPECT_EQ(result.tbl->get_column(0).child(1).type().id(), cudf::type_id::INT64); + EXPECT_EQ(result.metadata.schema_info.at(0).name, "data"); + EXPECT_EQ(result.metadata.schema_info.at(0).children.at(0).name, "a"); + EXPECT_EQ(result.metadata.schema_info.at(0).children.at(1).name, "b\tc"); + EXPECT_EQ(result.metadata.schema_info.at(0).children.size(), 2); +} + TEST_F(JsonReaderTest, JsonDtypeSchema) { std::string data = R"( diff --git a/cpp/tests/io/json/json_tree.cpp b/cpp/tests/io/json/json_tree.cpp index 875cc467b6a..15682c6ae6b 100644 --- a/cpp/tests/io/json/json_tree.cpp +++ b/cpp/tests/io/json/json_tree.cpp @@ -889,6 +889,7 @@ TEST_P(JsonTreeTraversalTest, CPUvsGPUTraversal) gpu_tree, is_array_of_arrays, json_lines, + false, stream, cudf::get_current_device_resource_ref()); #if LIBCUDF_JSON_DEBUG_DUMP diff --git a/cpp/tests/io/json/json_tree_csr.cu b/cpp/tests/io/json/json_tree_csr.cu index a336b327732..f988ae24b38 100644 --- a/cpp/tests/io/json/json_tree_csr.cu +++ b/cpp/tests/io/json/json_tree_csr.cu @@ -168,6 +168,7 @@ void run_test(std::string const& input, bool enable_lines = true) gpu_tree, is_array_of_arrays, options.is_enabled_lines(), + false, stream, rmm::mr::get_current_device_resource()); auto& gpu_col_id = std::get<0>(tup); diff --git a/java/src/main/java/ai/rapids/cudf/JSONOptions.java b/java/src/main/java/ai/rapids/cudf/JSONOptions.java index 2bb74c3e3b1..e41cc15712f 100644 --- a/java/src/main/java/ai/rapids/cudf/JSONOptions.java +++ b/java/src/main/java/ai/rapids/cudf/JSONOptions.java @@ -39,6 +39,7 @@ public final class JSONOptions extends ColumnFilterOptions { private final boolean allowNonNumericNumbers; private final boolean allowUnquotedControlChars; private final boolean cudfPruneSchema; + private final boolean experimental; private final byte lineDelimiter; private JSONOptions(Builder builder) { @@ -55,6 +56,7 @@ private JSONOptions(Builder builder) { allowNonNumericNumbers = builder.allowNonNumericNumbers; allowUnquotedControlChars = builder.allowUnquotedControlChars; cudfPruneSchema = builder.cudfPruneSchema; + experimental = builder.experimental; lineDelimiter = builder.lineDelimiter; } @@ -111,6 +113,10 @@ public boolean unquotedControlChars() { return allowUnquotedControlChars; } + public boolean experimental() { + return experimental; + } + @Override String[] getIncludeColumnNames() { throw new UnsupportedOperationException("JSON reader didn't support column prune"); @@ -136,6 +142,7 @@ public static final class Builder extends ColumnFilterOptions.Builder(line_delimiter)) .strict_validation(strict_validation) + .experimental(experimental) .keep_quotes(keep_quotes) .prune_columns(false); if (strict_validation) { @@ -1680,6 +1682,7 @@ Java_ai_rapids_cudf_Table_readAndInferJSON(JNIEnv* env, jboolean allow_leading_zeros, jboolean allow_nonnumeric_numbers, jboolean allow_unquoted_control, + jboolean experimental, jbyte line_delimiter) { JNI_NULL_CHECK(env, buffer, "buffer cannot be null", 0); @@ -1705,6 +1708,7 @@ Java_ai_rapids_cudf_Table_readAndInferJSON(JNIEnv* env, .strict_validation(strict_validation) .mixed_types_as_string(mixed_types_as_string) .prune_columns(false) + .experimental(experimental) .delimiter(static_cast(line_delimiter)) .keep_quotes(keep_quotes); if (strict_validation) { @@ -1821,6 
+1825,7 @@ Java_ai_rapids_cudf_Table_readJSONFromDataSource(JNIEnv* env, jboolean allow_nonnumeric_numbers, jboolean allow_unquoted_control, jboolean prune_columns, + jboolean experimental, jbyte line_delimiter, jlong ds_handle) { @@ -1859,7 +1864,8 @@ Java_ai_rapids_cudf_Table_readJSONFromDataSource(JNIEnv* env, .delimiter(static_cast(line_delimiter)) .strict_validation(strict_validation) .keep_quotes(keep_quotes) - .prune_columns(prune_columns); + .prune_columns(prune_columns) + .experimental(experimental); if (strict_validation) { opts.numeric_leading_zeros(allow_leading_zeros) .nonnumeric_numbers(allow_nonnumeric_numbers) @@ -1920,6 +1926,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readJSON(JNIEnv* env, jboolean allow_nonnumeric_numbers, jboolean allow_unquoted_control, jboolean prune_columns, + jboolean experimental, jbyte line_delimiter) { bool read_buffer = true; @@ -1972,7 +1979,8 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readJSON(JNIEnv* env, .delimiter(static_cast(line_delimiter)) .strict_validation(strict_validation) .keep_quotes(keep_quotes) - .prune_columns(prune_columns); + .prune_columns(prune_columns) + .experimental(experimental); if (strict_validation) { opts.numeric_leading_zeros(allow_leading_zeros) .nonnumeric_numbers(allow_nonnumeric_numbers) From 61af76978e97d94c1c9c7297fc73900d7827b433 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Wed, 25 Sep 2024 16:48:51 -1000 Subject: [PATCH 07/26] Add io/timezone APIs to pylibcudf (#16771) Contributes to https://github.com/rapidsai/cudf/issues/15162 Authors: - Matthew Roeschke (https://github.com/mroeschke) - Vyas Ramasubramani (https://github.com/vyasr) - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) - GALI PREM SAGAR (https://github.com/galipremsagar) URL: https://github.com/rapidsai/cudf/pull/16771 --- .../api_docs/pylibcudf/io/index.rst | 1 + .../api_docs/pylibcudf/io/timezone.rst | 6 +++ python/cudf/cudf/_lib/timezone.pyx | 27 ++---------- python/pylibcudf/pylibcudf/io/CMakeLists.txt | 4 +- python/pylibcudf/pylibcudf/io/__init__.pxd | 2 +- python/pylibcudf/pylibcudf/io/__init__.py | 2 +- python/pylibcudf/pylibcudf/io/timezone.pxd | 6 +++ python/pylibcudf/pylibcudf/io/timezone.pyx | 43 +++++++++++++++++++ .../pylibcudf/tests/io/test_timezone.py | 16 +++++++ 9 files changed, 81 insertions(+), 26 deletions(-) create mode 100644 docs/cudf/source/user_guide/api_docs/pylibcudf/io/timezone.rst create mode 100644 python/pylibcudf/pylibcudf/io/timezone.pxd create mode 100644 python/pylibcudf/pylibcudf/io/timezone.pyx create mode 100644 python/pylibcudf/pylibcudf/tests/io/test_timezone.py diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/io/index.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/io/index.rst index c8933981736..53638f071cc 100644 --- a/docs/cudf/source/user_guide/api_docs/pylibcudf/io/index.rst +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/io/index.rst @@ -19,3 +19,4 @@ I/O Functions csv json parquet + timezone diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/io/timezone.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/io/timezone.rst new file mode 100644 index 00000000000..20c1ffc2e93 --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/io/timezone.rst @@ -0,0 +1,6 @@ +======== +Timezone +======== + +.. 
automodule:: pylibcudf.io.timezone + :members: diff --git a/python/cudf/cudf/_lib/timezone.pyx b/python/cudf/cudf/_lib/timezone.pyx index bff3b2c4ce4..54624a5a2fd 100644 --- a/python/cudf/cudf/_lib/timezone.pyx +++ b/python/cudf/cudf/_lib/timezone.pyx @@ -1,29 +1,10 @@ # Copyright (c) 2023-2024, NVIDIA CORPORATION. -from libcpp.memory cimport unique_ptr -from libcpp.optional cimport make_optional -from libcpp.string cimport string -from libcpp.utility cimport move +import pylibcudf as plc -from pylibcudf.libcudf.io.timezone cimport ( - make_timezone_transition_table as cpp_make_timezone_transition_table, -) -from pylibcudf.libcudf.table.table cimport table - -from cudf._lib.utils cimport columns_from_unique_ptr +from cudf._lib.column cimport Column def make_timezone_transition_table(tzdir, tzname): - cdef unique_ptr[table] c_result - cdef string c_tzdir = tzdir.encode() - cdef string c_tzname = tzname.encode() - - with nogil: - c_result = move( - cpp_make_timezone_transition_table( - make_optional[string](c_tzdir), - c_tzname - ) - ) - - return columns_from_unique_ptr(move(c_result)) + plc_table = plc.io.timezone.make_timezone_transition_table(tzdir, tzname) + return [Column.from_pylibcudf(col) for col in plc_table.columns()] diff --git a/python/pylibcudf/pylibcudf/io/CMakeLists.txt b/python/pylibcudf/pylibcudf/io/CMakeLists.txt index 529a71a48ce..965724a47b1 100644 --- a/python/pylibcudf/pylibcudf/io/CMakeLists.txt +++ b/python/pylibcudf/pylibcudf/io/CMakeLists.txt @@ -12,7 +12,9 @@ # the License. # ============================================================================= -set(cython_sources avro.pyx csv.pyx datasource.pyx json.pyx orc.pyx parquet.pyx types.pyx) +set(cython_sources avro.pyx csv.pyx datasource.pyx json.pyx orc.pyx parquet.pyx timezone.pyx + types.pyx +) set(linked_libraries cudf::cudf) rapids_cython_create_modules( diff --git a/python/pylibcudf/pylibcudf/io/__init__.pxd b/python/pylibcudf/pylibcudf/io/__init__.pxd index 5927a19dc69..1bcc0a3f963 100644 --- a/python/pylibcudf/pylibcudf/io/__init__.pxd +++ b/python/pylibcudf/pylibcudf/io/__init__.pxd @@ -1,5 +1,5 @@ # Copyright (c) 2024, NVIDIA CORPORATION. # CSV is removed since it is def not cpdef (to force kw-only arguments) -from . cimport avro, datasource, json, orc, parquet, types +from . cimport avro, datasource, json, orc, parquet, timezone, types from .types cimport SourceInfo, TableWithMetadata diff --git a/python/pylibcudf/pylibcudf/io/__init__.py b/python/pylibcudf/pylibcudf/io/__init__.py index 5d899ee0808..2e4f215b12c 100644 --- a/python/pylibcudf/pylibcudf/io/__init__.py +++ b/python/pylibcudf/pylibcudf/io/__init__.py @@ -1,4 +1,4 @@ # Copyright (c) 2024, NVIDIA CORPORATION. -from . import avro, csv, datasource, json, orc, parquet, types +from . import avro, csv, datasource, json, orc, parquet, timezone, types from .types import SinkInfo, SourceInfo, TableWithMetadata diff --git a/python/pylibcudf/pylibcudf/io/timezone.pxd b/python/pylibcudf/pylibcudf/io/timezone.pxd new file mode 100644 index 00000000000..2aa755dbbd8 --- /dev/null +++ b/python/pylibcudf/pylibcudf/io/timezone.pxd @@ -0,0 +1,6 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
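+# Declaration for building a timezone transition table (used to convert ORC timestamps to UTC) from TZif data on disk.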
+ +from ..table cimport Table + + +cpdef Table make_timezone_transition_table(str tzif_dir, str timezone_name) diff --git a/python/pylibcudf/pylibcudf/io/timezone.pyx b/python/pylibcudf/pylibcudf/io/timezone.pyx new file mode 100644 index 00000000000..e02239d7252 --- /dev/null +++ b/python/pylibcudf/pylibcudf/io/timezone.pyx @@ -0,0 +1,43 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from libcpp.memory cimport unique_ptr +from libcpp.optional cimport make_optional +from libcpp.string cimport string +from libcpp.utility cimport move +from pylibcudf.libcudf.io.timezone cimport ( + make_timezone_transition_table as cpp_make_timezone_transition_table, +) +from pylibcudf.libcudf.table.table cimport table + +from ..table cimport Table + + +cpdef Table make_timezone_transition_table(str tzif_dir, str timezone_name): + """ + Creates a transition table to convert ORC timestamps to UTC. + + Parameters + ---------- + tzif_dir : str + The directory where the TZif files are located + timezone_name : str + standard timezone name + + Returns + ------- + Table + The transition table for the given timezone. + """ + cdef unique_ptr[table] c_result + cdef string c_tzdir = tzif_dir.encode() + cdef string c_tzname = timezone_name.encode() + + with nogil: + c_result = move( + cpp_make_timezone_transition_table( + make_optional[string](c_tzdir), + c_tzname + ) + ) + + return Table.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/tests/io/test_timezone.py b/python/pylibcudf/pylibcudf/tests/io/test_timezone.py new file mode 100644 index 00000000000..76b0424b2af --- /dev/null +++ b/python/pylibcudf/pylibcudf/tests/io/test_timezone.py @@ -0,0 +1,16 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +import zoneinfo + +import pylibcudf as plc +import pytest + + +def test_make_timezone_transition_table(): + if len(zoneinfo.TZPATH) == 0: + pytest.skip("No TZPATH available.") + tz_path = zoneinfo.TZPATH[0] + result = plc.io.timezone.make_timezone_transition_table( + tz_path, "America/Los_Angeles" + ) + assert isinstance(result, plc.Table) + assert result.num_rows() > 0 From b00a718a7980fadc91c8b37d6bbe829e4b8549e8 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Wed, 25 Sep 2024 16:51:18 -1000 Subject: [PATCH 08/26] Add partitioning APIs to pylibcudf (#16781) Contributes to https://github.com/rapidsai/cudf/issues/15162 Authors: - Matthew Roeschke (https://github.com/mroeschke) - Matthew Murray (https://github.com/Matt711) - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Matthew Murray (https://github.com/Matt711) - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/16781 --- .../user_guide/api_docs/pylibcudf/index.rst | 1 + .../api_docs/pylibcudf/partitioning.rst | 6 + python/cudf/cudf/_lib/hash.pyx | 35 ++--- python/cudf/cudf/_lib/partitioning.pyx | 35 +---- python/pylibcudf/pylibcudf/CMakeLists.txt | 1 + python/pylibcudf/pylibcudf/__init__.pxd | 2 + python/pylibcudf/pylibcudf/__init__.py | 2 + .../pylibcudf/libcudf/partitioning.pxd | 7 + python/pylibcudf/pylibcudf/partitioning.pxd | 19 +++ python/pylibcudf/pylibcudf/partitioning.pyx | 120 ++++++++++++++++++ .../pylibcudf/tests/test_partitioning.py | 55 ++++++++ 11 files changed, 229 insertions(+), 54 deletions(-) create mode 100644 docs/cudf/source/user_guide/api_docs/pylibcudf/partitioning.rst create mode 100644 python/pylibcudf/pylibcudf/partitioning.pxd create mode 100644 python/pylibcudf/pylibcudf/partitioning.pyx create mode 100644 
python/pylibcudf/pylibcudf/tests/test_partitioning.py diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst index edb0963ed29..e21536e2e97 100644 --- a/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst @@ -25,6 +25,7 @@ This page provides API documentation for pylibcudf. lists merge null_mask + partitioning quantiles reduce replace diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/partitioning.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/partitioning.rst new file mode 100644 index 00000000000..6951dbecca0 --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/partitioning.rst @@ -0,0 +1,6 @@ +============ +partitioning +============ + +.. automodule:: pylibcudf.partitioning + :members: diff --git a/python/cudf/cudf/_lib/hash.pyx b/python/cudf/cudf/_lib/hash.pyx index 48f75b12a73..9b7ab0888d2 100644 --- a/python/cudf/cudf/_lib/hash.pyx +++ b/python/cudf/cudf/_lib/hash.pyx @@ -3,11 +3,8 @@ from cudf.core.buffer import acquire_spill_lock from libcpp.memory cimport unique_ptr -from libcpp.pair cimport pair from libcpp.utility cimport move -from libcpp.vector cimport vector -cimport pylibcudf.libcudf.types as libcudf_types from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.hash cimport ( md5, @@ -19,37 +16,23 @@ from pylibcudf.libcudf.hash cimport ( sha512, xxhash_64, ) -from pylibcudf.libcudf.partitioning cimport ( - hash_partition as cpp_hash_partition, -) -from pylibcudf.libcudf.table.table cimport table from pylibcudf.libcudf.table.table_view cimport table_view from cudf._lib.column cimport Column -from cudf._lib.utils cimport columns_from_unique_ptr, table_view_from_columns +from cudf._lib.utils cimport table_view_from_columns + +import pylibcudf as plc @acquire_spill_lock() -def hash_partition(list source_columns, object columns_to_hash, +def hash_partition(list source_columns, list columns_to_hash, int num_partitions): - cdef vector[libcudf_types.size_type] c_columns_to_hash = columns_to_hash - cdef int c_num_partitions = num_partitions - cdef table_view c_source_view = table_view_from_columns(source_columns) - - cdef pair[unique_ptr[table], vector[libcudf_types.size_type]] c_result - with nogil: - c_result = move( - cpp_hash_partition( - c_source_view, - c_columns_to_hash, - c_num_partitions - ) - ) - - return ( - columns_from_unique_ptr(move(c_result.first)), - list(c_result.second) + plc_table, offsets = plc.partitioning.hash_partition( + plc.Table([col.to_pylibcudf(mode="read") for col in source_columns]), + columns_to_hash, + num_partitions ) + return [Column.from_pylibcudf(col) for col in plc_table.columns()], offsets @acquire_spill_lock() diff --git a/python/cudf/cudf/_lib/partitioning.pyx b/python/cudf/cudf/_lib/partitioning.pyx index d94f0e1b564..13997da8403 100644 --- a/python/cudf/cudf/_lib/partitioning.pyx +++ b/python/cudf/cudf/_lib/partitioning.pyx @@ -2,24 +2,13 @@ from cudf.core.buffer import acquire_spill_lock -from libcpp.memory cimport unique_ptr -from libcpp.pair cimport pair -from libcpp.utility cimport move -from libcpp.vector cimport vector - -from pylibcudf.libcudf.column.column_view cimport column_view -from pylibcudf.libcudf.partitioning cimport partition as cpp_partition -from pylibcudf.libcudf.table.table cimport table -from pylibcudf.libcudf.table.table_view cimport table_view - from cudf._lib.column cimport Column -from cudf._lib.utils cimport 
columns_from_unique_ptr, table_view_from_columns + +import pylibcudf as plc from cudf._lib.reduce import minmax from cudf._lib.stream_compaction import distinct_count as cpp_distinct_count -cimport pylibcudf.libcudf.types as libcudf_types - @acquire_spill_lock() def partition(list source_columns, Column partition_map, @@ -50,25 +39,15 @@ def partition(list source_columns, Column partition_map, if num_partitions is None: num_partitions = cpp_distinct_count(partition_map, ignore_nulls=True) - cdef int c_num_partitions = num_partitions - cdef table_view c_source_view = table_view_from_columns(source_columns) - - cdef column_view c_partition_map_view = partition_map.view() - cdef pair[unique_ptr[table], vector[libcudf_types.size_type]] c_result if partition_map.size > 0: lo, hi = minmax(partition_map) if lo < 0 or hi >= num_partitions: raise ValueError("Partition map has invalid values") - with nogil: - c_result = move( - cpp_partition( - c_source_view, - c_partition_map_view, - c_num_partitions - ) - ) - return ( - columns_from_unique_ptr(move(c_result.first)), list(c_result.second) + plc_table, offsets = plc.partitioning.partition( + plc.Table([col.to_pylibcudf(mode="read") for col in source_columns]), + partition_map.to_pylibcudf(mode="read"), + num_partitions ) + return [Column.from_pylibcudf(col) for col in plc_table.columns()], offsets diff --git a/python/pylibcudf/pylibcudf/CMakeLists.txt b/python/pylibcudf/pylibcudf/CMakeLists.txt index fb3a6c13a70..a7cb66d7b16 100644 --- a/python/pylibcudf/pylibcudf/CMakeLists.txt +++ b/python/pylibcudf/pylibcudf/CMakeLists.txt @@ -31,6 +31,7 @@ set(cython_sources lists.pyx merge.pyx null_mask.pyx + partitioning.pyx quantiles.pyx reduce.pyx replace.pyx diff --git a/python/pylibcudf/pylibcudf/__init__.pxd b/python/pylibcudf/pylibcudf/__init__.pxd index 66d9c3d6165..a384edd456d 100644 --- a/python/pylibcudf/pylibcudf/__init__.pxd +++ b/python/pylibcudf/pylibcudf/__init__.pxd @@ -17,6 +17,7 @@ from . cimport ( lists, merge, null_mask, + partitioning, quantiles, reduce, replace, @@ -61,6 +62,7 @@ __all__ = [ "lists", "merge", "null_mask", + "partitioning", "quantiles", "reduce", "replace", diff --git a/python/pylibcudf/pylibcudf/__init__.py b/python/pylibcudf/pylibcudf/__init__.py index 0a3615fa941..2a5365e8fad 100644 --- a/python/pylibcudf/pylibcudf/__init__.py +++ b/python/pylibcudf/pylibcudf/__init__.py @@ -28,6 +28,7 @@ lists, merge, null_mask, + partitioning, quantiles, reduce, replace, @@ -75,6 +76,7 @@ "lists", "merge", "null_mask", + "partitioning", "quantiles", "reduce", "replace", diff --git a/python/pylibcudf/pylibcudf/libcudf/partitioning.pxd b/python/pylibcudf/pylibcudf/libcudf/partitioning.pxd index 1ea10e8a194..89bddbffab5 100644 --- a/python/pylibcudf/pylibcudf/libcudf/partitioning.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/partitioning.pxd @@ -25,3 +25,10 @@ cdef extern from "cudf/partitioning.hpp" namespace "cudf" nogil: const column_view& partition_map, int num_partitions ) except + + + cdef pair[unique_ptr[table], vector[libcudf_types.size_type]] \ + round_robin_partition "cudf::round_robin_partition" ( + const table_view& input, + int num_partitions, + int start_partition + ) except + diff --git a/python/pylibcudf/pylibcudf/partitioning.pxd b/python/pylibcudf/pylibcudf/partitioning.pxd new file mode 100644 index 00000000000..aad60149fc4 --- /dev/null +++ b/python/pylibcudf/pylibcudf/partitioning.pxd @@ -0,0 +1,19 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
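+# Declarations for the pylibcudf partitioning APIs; each function returns the partitioned Table together with a list of per-partition row offsets.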
+ +from .column cimport Column +from .table cimport Table + + +cpdef tuple[Table, list] hash_partition( + Table input, + list columns_to_hash, + int num_partitions +) + +cpdef tuple[Table, list] partition(Table t, Column partition_map, int num_partitions) + +cpdef tuple[Table, list] round_robin_partition( + Table input, + int num_partitions, + int start_partition=* +) diff --git a/python/pylibcudf/pylibcudf/partitioning.pyx b/python/pylibcudf/pylibcudf/partitioning.pyx new file mode 100644 index 00000000000..8fa70daab5a --- /dev/null +++ b/python/pylibcudf/pylibcudf/partitioning.pyx @@ -0,0 +1,120 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +cimport pylibcudf.libcudf.types as libcudf_types +from libcpp.memory cimport unique_ptr +from libcpp.pair cimport pair +from libcpp.utility cimport move +from libcpp.vector cimport vector +from pylibcudf.libcudf cimport partitioning as cpp_partitioning +from pylibcudf.libcudf.table.table cimport table + +from .column cimport Column +from .table cimport Table + + +cpdef tuple[Table, list] hash_partition( + Table input, + list columns_to_hash, + int num_partitions +): + """ + Partitions rows from the input table into multiple output tables. + + For details, see :cpp:func:`hash_partition`. + + Parameters + ---------- + input : Table + The table to partition + columns_to_hash : list[int] + Indices of input columns to hash + num_partitions : int + The number of partitions to use + + Returns + ------- + tuple[Table, list[int]] + An output table and a list of row offsets to each partition + """ + cdef pair[unique_ptr[table], vector[libcudf_types.size_type]] c_result + cdef vector[libcudf_types.size_type] c_columns_to_hash = columns_to_hash + cdef int c_num_partitions = num_partitions + + with nogil: + c_result = move( + cpp_partitioning.hash_partition( + input.view(), c_columns_to_hash, c_num_partitions + ) + ) + + return Table.from_libcudf(move(c_result.first)), list(c_result.second) + +cpdef tuple[Table, list] partition(Table t, Column partition_map, int num_partitions): + """ + Partitions rows of `t` according to the mapping specified by `partition_map`. + + For details, see :cpp:func:`partition`. + + Parameters + ---------- + t : Table + The table to partition + partition_map : Column + Non-nullable column of integer values that map each row + in `t` to its partition. + num_partitions : int + The total number of partitions + + Returns + ------- + tuple[Table, list[int]] + An output table and a list of row offsets to each partition + """ + cdef pair[unique_ptr[table], vector[libcudf_types.size_type]] c_result + cdef int c_num_partitions = num_partitions + + with nogil: + c_result = move( + cpp_partitioning.partition(t.view(), partition_map.view(), c_num_partitions) + ) + + return Table.from_libcudf(move(c_result.first)), list(c_result.second) + + +cpdef tuple[Table, list] round_robin_partition( + Table input, + int num_partitions, + int start_partition=0 +): + """ + Round-robin partition. + + For details, see :cpp:func:`round_robin_partition`. + + Parameters + ---------- + input : Table + The input table to be round-robin partitioned + num_partitions : int + Number of partitions for the table + start_partition : int, default 0 + Index of the 1st partition + + Returns + ------- + tuple[Table, list[int]] + The partitioned table and the partition offsets + for each partition within the table.
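+ +        For example, dealing a 5-row table into 3 partitions with ``start_partition=0`` sends rows 0 and 3 to partition 0, rows 1 and 4 to partition 1, and row 2 to partition 2, so the returned offsets are [0, 2, 4].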
+ """ + cdef pair[unique_ptr[table], vector[libcudf_types.size_type]] c_result + cdef int c_num_partitions = num_partitions + cdef int c_start_partition = start_partition + + with nogil: + c_result = move( + cpp_partitioning.round_robin_partition( + input.view(), c_num_partitions, c_start_partition + ) + ) + + return Table.from_libcudf(move(c_result.first)), list(c_result.second) diff --git a/python/pylibcudf/pylibcudf/tests/test_partitioning.py b/python/pylibcudf/pylibcudf/tests/test_partitioning.py new file mode 100644 index 00000000000..444d0089d2c --- /dev/null +++ b/python/pylibcudf/pylibcudf/tests/test_partitioning.py @@ -0,0 +1,55 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +import pyarrow as pa +import pylibcudf as plc +import pytest +from utils import assert_table_eq + + +@pytest.fixture(scope="module") +def partitioning_data(): + data = {"a": [1, 2, 3], "b": [1, 2, 5], "c": [1, 2, 10]} + pa_table = pa.table(data) + plc_table = plc.interop.from_arrow(pa_table) + return data, plc_table, pa_table + + +def test_partition(partitioning_data): + raw_data, plc_table, pa_table = partitioning_data + result, result_offsets = plc.partitioning.partition( + plc_table, + plc.interop.from_arrow(pa.array([0, 0, 0])), + 1, + ) + expected = pa.table( + list(raw_data.values()), + schema=pa.schema([pa.field("", pa.int64(), nullable=False)] * 3), + ) + assert_table_eq(expected, result) + assert result_offsets == [0, 3] + + +def test_hash_partition(partitioning_data): + raw_data, plc_table, pa_table = partitioning_data + result, result_offsets = plc.partitioning.hash_partition( + plc_table, [0, 1], 1 + ) + expected = pa.table( + list(raw_data.values()), + schema=pa.schema([pa.field("", pa.int64(), nullable=False)] * 3), + ) + assert_table_eq(expected, result) + assert result_offsets == [0] + + +def test_round_robin_partition(partitioning_data): + raw_data, plc_table, pa_table = partitioning_data + result, result_offsets = plc.partitioning.round_robin_partition( + plc_table, 1, 0 + ) + expected = pa.table( + list(raw_data.values()), + schema=pa.schema([pa.field("", pa.int64(), nullable=False)] * 3), + ) + assert_table_eq(expected, result) + assert result_offsets == [0] From b1e1c9c060cc6b4b35b8590209177584336444bc Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com> Date: Thu, 26 Sep 2024 11:00:00 -0700 Subject: [PATCH 09/26] Reapply `mixed_semi_join` refactoring and bug fixes (#16859) This PR reapplies changes from #16230 and adds bug fixes and performance improvements for mixed_semi_join. 
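In outline, the refactor drops the legacy `semi_map_type` (a `cuco::legacy::static_map` whose mapped values were never used) in favor of a `cuco::static_set` keyed directly on build-table row indices, with an equality that composes the equality-table comparator and the conditional-table comparator. A rough CPU analogy of the resulting semantics, in plain Python over hypothetical toy tables (a sketch of the semi-join membership test only, not the CUDA implementation):

def mixed_left_semi_join(left_rows, right_rows, key, predicate):
    # "Build" phase: bucket right-table row indices by their equality key.
    buckets = {}
    for j, row in enumerate(right_rows):
        buckets.setdefault(key(row), []).append(j)
    # "Probe" phase: keep a left row if any bucketed right row also
    # satisfies the conditional predicate (two comparisons composed).
    return [
        i for i, row in enumerate(left_rows)
        if any(predicate(row, right_rows[j]) for j in buckets.get(key(row), []))
    ]

left = [(0, 5), (1, 2), (2, 9)]
right = [(0, 3), (1, 4), (3, 0)]
# Equality on column 0; condition: left column 1 > right column 1.
print(mixed_left_semi_join(left, right, key=lambda r: r[0],
                           predicate=lambda l, r: l[1] > r[1]))  # -> [0]

Answering "does any matching build row exist?" this way is the membership question the device kernel now resolves with a single composed `contains()` query per left row.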
Authors: - Muhammad Haseeb (https://github.com/mhaseeb123) Approvers: - Yunsong Wang (https://github.com/PointKernel) - MithunR (https://github.com/mythrocks) - Nghia Truong (https://github.com/ttnghia) URL: https://github.com/rapidsai/cudf/pull/16859 --- cpp/src/join/join_common_utils.hpp | 6 - cpp/src/join/mixed_join_common_utils.cuh | 34 ++++++ cpp/src/join/mixed_join_kernels_semi.cu | 51 ++++---- cpp/src/join/mixed_join_kernels_semi.cuh | 6 +- cpp/src/join/mixed_join_semi.cu | 92 +++++--------- cpp/tests/join/mixed_join_tests.cu | 147 +++++++++++++++++++++++ 6 files changed, 239 insertions(+), 97 deletions(-) diff --git a/cpp/src/join/join_common_utils.hpp b/cpp/src/join/join_common_utils.hpp index 86402a0e7de..573101cefd9 100644 --- a/cpp/src/join/join_common_utils.hpp +++ b/cpp/src/join/join_common_utils.hpp @@ -22,7 +22,6 @@ #include #include -#include #include #include @@ -51,11 -50,6 @@ using mixed_multimap_type = cudf::detail::cuco_allocator, cuco::legacy::double_hashing<1, hash_type, hash_type>>; -using semi_map_type = cuco::legacy::static_map>; - using row_hash_legacy = cudf::row_hasher; diff --git a/cpp/src/join/mixed_join_common_utils.cuh b/cpp/src/join/mixed_join_common_utils.cuh index 19701816867..4a52cfe098a 100644 --- a/cpp/src/join/mixed_join_common_utils.cuh +++ b/cpp/src/join/mixed_join_common_utils.cuh @@ -25,6 +25,7 @@ #include #include +#include namespace cudf { namespace detail { @@ -160,6 +161,39 @@ struct pair_expression_equality : public expression_equality { } }; +/** + * @brief Equality comparator that composes two row_equality comparators. + */ +struct double_row_equality_comparator { + row_equality const equality_comparator; + row_equality const conditional_comparator; + + __device__ bool operator()(size_type lhs_row_index, size_type rhs_row_index) const noexcept + { + using experimental::row::lhs_index_type; + using experimental::row::rhs_index_type; + + return equality_comparator(lhs_index_type{lhs_row_index}, rhs_index_type{rhs_row_index}) && + conditional_comparator(lhs_index_type{lhs_row_index}, rhs_index_type{rhs_row_index}); + } +}; + +// A CUDA Cooperative Group of 1 thread for the hash set for mixed semi. +auto constexpr DEFAULT_MIXED_SEMI_JOIN_CG_SIZE = 1; + +// The hash set type used by mixed_semi_join with the build_table. +using hash_set_type = + cuco::static_set, + cuda::thread_scope_device, + double_row_equality_comparator, + cuco::linear_probing, + cudf::detail::cuco_allocator, + cuco::storage<1>>; + +// The hash_set_ref_type used by mixed_semi_join kernels for probing.
+using hash_set_ref_type = hash_set_type::ref_type; + } // namespace detail } // namespace cudf diff --git a/cpp/src/join/mixed_join_kernels_semi.cu b/cpp/src/join/mixed_join_kernels_semi.cu index 7459ac3e99c..bd8c80652a0 100644 --- a/cpp/src/join/mixed_join_kernels_semi.cu +++ b/cpp/src/join/mixed_join_kernels_semi.cu @@ -38,38 +38,48 @@ CUDF_KERNEL void __launch_bounds__(block_size) table_device_view right_table, table_device_view probe, table_device_view build, - row_hash const hash_probe, row_equality const equality_probe, - cudf::detail::semi_map_type::device_view hash_table_view, + hash_set_ref_type set_ref, cudf::device_span left_table_keep_mask, cudf::ast::detail::expression_device_view device_expression_data) { + auto constexpr cg_size = hash_set_ref_type::cg_size; + + auto const tile = cg::tiled_partition(cg::this_thread_block()); + // Normally the casting of a shared memory array is used to create multiple // arrays of different types from the shared memory buffer, but here it is // used to circumvent conflicts between arrays of different types between // different template instantiations due to the extern specifier. extern __shared__ char raw_intermediate_storage[]; - cudf::ast::detail::IntermediateDataType* intermediate_storage = + auto intermediate_storage = reinterpret_cast*>(raw_intermediate_storage); auto thread_intermediate_storage = - &intermediate_storage[threadIdx.x * device_expression_data.num_intermediates]; + intermediate_storage + (tile.meta_group_rank() * device_expression_data.num_intermediates); - cudf::size_type const left_num_rows = left_table.num_rows(); - cudf::size_type const right_num_rows = right_table.num_rows(); - auto const outer_num_rows = left_num_rows; + // Equality evaluator to use + auto const evaluator = cudf::ast::detail::expression_evaluator( + left_table, right_table, device_expression_data); - cudf::size_type outer_row_index = threadIdx.x + blockIdx.x * block_size; + // Make sure to swap_tables here as hash_set will use probe table as the left one + auto constexpr swap_tables = true; + auto const equality = single_expression_equality{ + evaluator, thread_intermediate_storage, swap_tables, equality_probe}; - auto evaluator = cudf::ast::detail::expression_evaluator( - left_table, right_table, device_expression_data); + // Create set ref with the new equality comparator + auto const set_ref_equality = set_ref.with_key_eq(equality); - if (outer_row_index < outer_num_rows) { - // Figure out the number of elements for this key. 
- auto equality = single_expression_equality{ - evaluator, thread_intermediate_storage, false, equality_probe}; + // Total number of rows to query the set + auto const outer_num_rows = left_table.num_rows(); + // Grid stride for the tile + auto const cg_grid_stride = cudf::detail::grid_1d::grid_stride() / cg_size; - left_table_keep_mask[outer_row_index] = - hash_table_view.contains(outer_row_index, hash_probe, equality); + // Find all the rows in the left table that are in the hash table + for (auto outer_row_index = cudf::detail::grid_1d::global_thread_id() / cg_size; + outer_row_index < outer_num_rows; + outer_row_index += cg_grid_stride) { + auto const result = set_ref_equality.contains(tile, outer_row_index); + if (tile.thread_rank() == 0) { left_table_keep_mask[outer_row_index] = result; } } } @@ -78,9 +88,8 @@ void launch_mixed_join_semi(bool has_nulls, table_device_view right_table, table_device_view probe, table_device_view build, - row_hash const hash_probe, row_equality const equality_probe, - cudf::detail::semi_map_type::device_view hash_table_view, + hash_set_ref_type set_ref, cudf::device_span left_table_keep_mask, cudf::ast::detail::expression_device_view device_expression_data, detail::grid_1d const config, @@ -94,9 +103,8 @@ void launch_mixed_join_semi(bool has_nulls, right_table, probe, build, - hash_probe, equality_probe, - hash_table_view, + set_ref, left_table_keep_mask, device_expression_data); } else { @@ -106,9 +114,8 @@ void launch_mixed_join_semi(bool has_nulls, right_table, probe, build, - hash_probe, equality_probe, - hash_table_view, + set_ref, left_table_keep_mask, device_expression_data); } diff --git a/cpp/src/join/mixed_join_kernels_semi.cuh b/cpp/src/join/mixed_join_kernels_semi.cuh index 43714ffb36a..b08298e64e4 100644 --- a/cpp/src/join/mixed_join_kernels_semi.cuh +++ b/cpp/src/join/mixed_join_kernels_semi.cuh @@ -45,9 +45,8 @@ namespace detail { * @param[in] right_table The right table * @param[in] probe The table with which to probe the hash table for matches. * @param[in] build The table with which the hash table was built. - * @param[in] hash_probe The hasher used for the probe table. * @param[in] equality_probe The equality comparator used when probing the hash table. - * @param[in] hash_table_view The hash table built from `build`. + * @param[in] set_ref The hash table device view built from `build`. * @param[out] left_table_keep_mask The result of the join operation with "true" element indicating * the corresponding index from left table is present in output * @param[in] device_expression_data Container of device data required to evaluate the desired @@ -58,9 +57,8 @@ void launch_mixed_join_semi(bool has_nulls, table_device_view right_table, table_device_view probe, table_device_view build, - row_hash const hash_probe, row_equality const equality_probe, - cudf::detail::semi_map_type::device_view hash_table_view, + hash_set_ref_type set_ref, cudf::device_span left_table_keep_mask, cudf::ast::detail::expression_device_view device_expression_data, detail::grid_1d const config, diff --git a/cpp/src/join/mixed_join_semi.cu b/cpp/src/join/mixed_join_semi.cu index aa4fa281159..83a55eca50f 100644 --- a/cpp/src/join/mixed_join_semi.cu +++ b/cpp/src/join/mixed_join_semi.cu @@ -45,45 +45,6 @@ namespace cudf { namespace detail { -namespace { -/** - * @brief Device functor to create a pair of hash value and index for a given row. 
- */ -struct make_pair_function_semi { - __device__ __forceinline__ cudf::detail::pair_type operator()(size_type i) const noexcept - { - // The value is irrelevant since we only ever use the hash map to check for - // membership of a particular row index. - return cuco::make_pair(static_cast(i), 0); - } -}; - -/** - * @brief Equality comparator that composes two row_equality comparators. - */ -class double_row_equality { - public: - double_row_equality(row_equality equality_comparator, row_equality conditional_comparator) - : _equality_comparator{equality_comparator}, _conditional_comparator{conditional_comparator} - { - } - - __device__ bool operator()(size_type lhs_row_index, size_type rhs_row_index) const noexcept - { - using experimental::row::lhs_index_type; - using experimental::row::rhs_index_type; - - return _equality_comparator(lhs_index_type{lhs_row_index}, rhs_index_type{rhs_row_index}) && - _conditional_comparator(lhs_index_type{lhs_row_index}, rhs_index_type{rhs_row_index}); - } - - private: - row_equality _equality_comparator; - row_equality _conditional_comparator; -}; - -} // namespace - std::unique_ptr> mixed_join_semi( table_view const& left_equality, table_view const& right_equality, @@ -95,7 +56,7 @@ std::unique_ptr> mixed_join_semi( rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { - CUDF_EXPECTS((join_type != join_kind::INNER_JOIN) && (join_type != join_kind::LEFT_JOIN) && + CUDF_EXPECTS((join_type != join_kind::INNER_JOIN) and (join_type != join_kind::LEFT_JOIN) and (join_type != join_kind::FULL_JOIN), "Inner, left, and full joins should use mixed_join."); @@ -136,7 +97,7 @@ std::unique_ptr> mixed_join_semi( // output column and follow the null-supporting expression evaluation code // path. auto const has_nulls = cudf::nullate::DYNAMIC{ - cudf::has_nulls(left_equality) || cudf::has_nulls(right_equality) || + cudf::has_nulls(left_equality) or cudf::has_nulls(right_equality) or binary_predicate.may_evaluate_null(left_conditional, right_conditional, stream)}; auto const parser = ast::detail::expression_parser{ @@ -155,27 +116,20 @@ std::unique_ptr> mixed_join_semi( auto right_conditional_view = table_device_view::create(right_conditional, stream); auto const preprocessed_build = - experimental::row::equality::preprocessed_table::create(build, stream); + cudf::experimental::row::equality::preprocessed_table::create(build, stream); auto const preprocessed_probe = - experimental::row::equality::preprocessed_table::create(probe, stream); + cudf::experimental::row::equality::preprocessed_table::create(probe, stream); auto const row_comparator = - cudf::experimental::row::equality::two_table_comparator{preprocessed_probe, preprocessed_build}; + cudf::experimental::row::equality::two_table_comparator{preprocessed_build, preprocessed_probe}; auto const equality_probe = row_comparator.equal_to(has_nulls, compare_nulls); - semi_map_type hash_table{ - compute_hash_table_size(build.num_rows()), - cuco::empty_key{std::numeric_limits::max()}, - cuco::empty_value{cudf::detail::JoinNoneValue}, - cudf::detail::cuco_allocator{rmm::mr::polymorphic_allocator{}, stream}, - stream.value()}; - // Create hash table containing all keys found in right table // TODO: To add support for nested columns we will need to flatten in many // places. However, this probably isn't worth adding any time soon since we // won't be able to support AST conditions for those types anyway. 
auto const build_nulls = cudf::nullate::DYNAMIC{cudf::has_nulls(build)}; auto const row_hash_build = cudf::experimental::row::hash::row_hasher{preprocessed_build}; - auto const hash_build = row_hash_build.device_hasher(build_nulls); + // Since we may see multiple rows that are identical in the equality tables // but differ in the conditional tables, the equality comparator used for // insertion must account for both sets of tables. An alternative solution @@ -190,20 +144,28 @@ std::unique_ptr> mixed_join_semi( auto const equality_build_equality = row_comparator_build.equal_to(build_nulls, compare_nulls); auto const preprocessed_build_condtional = - experimental::row::equality::preprocessed_table::create(right_conditional, stream); + cudf::experimental::row::equality::preprocessed_table::create(right_conditional, stream); auto const row_comparator_conditional_build = cudf::experimental::row::equality::two_table_comparator{preprocessed_build_condtional, preprocessed_build_condtional}; auto const equality_build_conditional = row_comparator_conditional_build.equal_to(build_nulls, compare_nulls); - double_row_equality equality_build{equality_build_equality, equality_build_conditional}; - make_pair_function_semi pair_func_build{}; - auto iter = cudf::detail::make_counting_transform_iterator(0, pair_func_build); + hash_set_type row_set{ + {compute_hash_table_size(build.num_rows())}, + cuco::empty_key{JoinNoneValue}, + {equality_build_equality, equality_build_conditional}, + {row_hash_build.device_hasher(build_nulls)}, + {}, + {}, + cudf::detail::cuco_allocator{rmm::mr::polymorphic_allocator{}, stream}, + {stream.value()}}; + + auto iter = thrust::make_counting_iterator(0); // skip rows that are null here. if ((compare_nulls == null_equality::EQUAL) or (not nullable(build))) { - hash_table.insert(iter, iter + right_num_rows, hash_build, equality_build, stream.value()); + row_set.insert_async(iter, iter + right_num_rows, stream.value()); } else { thrust::counting_iterator stencil(0); auto const [row_bitmask, _] = @@ -211,18 +173,19 @@ std::unique_ptr> mixed_join_semi( row_is_valid pred{static_cast(row_bitmask.data())}; // insert valid rows - hash_table.insert_if( - iter, iter + right_num_rows, stencil, pred, hash_build, equality_build, stream.value()); + row_set.insert_if_async(iter, iter + right_num_rows, stencil, pred, stream.value()); } - auto hash_table_view = hash_table.get_device_view(); - - detail::grid_1d const config(outer_num_rows, DEFAULT_JOIN_BLOCK_SIZE); - auto const shmem_size_per_block = parser.shmem_per_thread * config.num_threads_per_block; + detail::grid_1d const config(outer_num_rows * hash_set_type::cg_size, DEFAULT_JOIN_BLOCK_SIZE); + auto const shmem_size_per_block = + parser.shmem_per_thread * + cuco::detail::int_div_ceil(config.num_threads_per_block, hash_set_type::cg_size); auto const row_hash = cudf::experimental::row::hash::row_hasher{preprocessed_probe}; auto const hash_probe = row_hash.device_hasher(has_nulls); + hash_set_ref_type const row_set_ref = row_set.ref(cuco::contains).with_hash_function(hash_probe); + // Vector used to indicate indices from left/probe table which are present in output auto left_table_keep_mask = rmm::device_uvector(probe.num_rows(), stream); @@ -231,9 +194,8 @@ std::unique_ptr> mixed_join_semi( *right_conditional_view, *probe_view, *build_view, - hash_probe, equality_probe, - hash_table_view, + row_set_ref, cudf::device_span(left_table_keep_mask), parser.device_expression_data, config, diff --git a/cpp/tests/join/mixed_join_tests.cu 
b/cpp/tests/join/mixed_join_tests.cu index 6c147c8a128..9041969bec7 100644 --- a/cpp/tests/join/mixed_join_tests.cu +++ b/cpp/tests/join/mixed_join_tests.cu @@ -778,6 +778,138 @@ TYPED_TEST(MixedLeftSemiJoinTest, BasicEquality) {1}); } +TYPED_TEST(MixedLeftSemiJoinTest, MixedLeftSemiJoinGatherMap) +{ + auto const col_ref_left_1 = cudf::ast::column_reference(0, cudf::ast::table_reference::LEFT); + auto const col_ref_right_1 = cudf::ast::column_reference(0, cudf::ast::table_reference::RIGHT); + auto left_one_greater_right_one = + cudf::ast::operation(cudf::ast::ast_operator::GREATER, col_ref_left_1, col_ref_right_1); + + this->test({{2, 3, 9, 0, 1, 7, 4, 6, 5, 8}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 0}}, + {{6, 5, 9, 8, 10, 32}, {0, 1, 2, 3, 4, 5}, {7, 8, 9, 0, 1, 2}}, + {0}, + {1}, + left_one_greater_right_one, + {2, 7, 8}); +} + +TYPED_TEST(MixedLeftSemiJoinTest, MixedLeftSemiJoinGatherMapLarge) +{ + using T1 = double; + + // Number of rows in each column + auto constexpr N = 10000; + + // Generate column data for left and right tables + auto const [left_col0, right_col0] = gen_random_nullable_repeated_columns(N, 200); + auto const [left_col1, right_col1] = gen_random_nullable_repeated_columns(N, 100); + + // Setup data and nulls for the left table + std::vector, std::vector>> lefts = { + {left_col0.first, left_col0.second}, {left_col1.first, left_col1.second}}; + std::vector> left_wrappers; + std::vector left_columns; + for (auto [data, valids] : lefts) { + left_wrappers.emplace_back( + cudf::test::fixed_width_column_wrapper(data.begin(), data.end(), valids.begin())); + left_columns.emplace_back(left_wrappers.back()); + }; + + // Setup data and nulls for the right table + std::vector, std::vector>> rights = { + {right_col0.first, right_col0.second}, {right_col1.first, right_col1.second}}; + std::vector> right_wrappers; + std::vector right_columns; + for (auto [data, valids] : rights) { + right_wrappers.emplace_back( + cudf::test::fixed_width_column_wrapper(data.begin(), data.end(), valids.begin())); + right_columns.emplace_back(right_wrappers.back()); + }; + + // Left and right table views. + auto const left_table = cudf::table_view{left_columns}; + auto const right_table = cudf::table_view{right_columns}; + + // Using the zeroth column for equality. + auto const left_equality = left_table.select({0}); + auto const right_equality = right_table.select({0}); + + // Column references for equality column.
+ auto const col_ref_left_0 = cudf::ast::column_reference(0, cudf::ast::table_reference::LEFT); + auto const col_ref_right_0 = cudf::ast::column_reference(0, cudf::ast::table_reference::RIGHT); + auto left_zero_eq_right_zero = + cudf::ast::operation(cudf::ast::ast_operator::EQUAL, col_ref_left_0, col_ref_right_0); + + // Mixed semi join with zeroth column equality + { + // Expected left_semi_join result + auto const expected_mixed_semi_join = + cudf::conditional_left_semi_join(left_table, right_table, left_zero_eq_right_zero); + + // Actual mixed_left_semi_join result + auto const mixed_semi_join = cudf::mixed_left_semi_join(left_equality, + right_equality, + left_table, + right_table, + left_zero_eq_right_zero, + cudf::null_equality::UNEQUAL); + + // Copy data back to host for comparisons + auto expected_indices = cudf::detail::make_std_vector_async( + cudf::device_span(*expected_mixed_semi_join), cudf::get_default_stream()); + auto result_indices = cudf::detail::make_std_vector_sync( + cudf::device_span(*mixed_semi_join), cudf::get_default_stream()); + + // Sort the indices for 1-1 comparison + std::sort(expected_indices.begin(), expected_indices.end()); + std::sort(result_indices.begin(), result_indices.end()); + + // Expected and actual vectors must match. + EXPECT_EQ(expected_mixed_semi_join->size(), mixed_semi_join->size()); + EXPECT_TRUE( + std::equal(expected_indices.begin(), expected_indices.end(), result_indices.begin())); + } + + // Mixed semi join with zeroth column equality and first column GREATER conditional + { + // Column references for conditional column. + auto const col_ref_left_1 = cudf::ast::column_reference(1, cudf::ast::table_reference::LEFT); + auto const col_ref_right_1 = cudf::ast::column_reference(1, cudf::ast::table_reference::RIGHT); + auto left_one_gt_right_one = + cudf::ast::operation(cudf::ast::ast_operator::GREATER, col_ref_left_1, col_ref_right_1); + + // Expected left_semi_join result + auto const expected_mixed_semi_join = cudf::conditional_left_semi_join( + left_table, + right_table, + cudf::ast::operation( + cudf::ast::ast_operator::LOGICAL_AND, left_zero_eq_right_zero, left_one_gt_right_one)); + + // Actual left_semi_join result + auto const mixed_semi_join = cudf::mixed_left_semi_join(left_equality, + right_equality, + left_table, + right_table, + left_one_gt_right_one, + cudf::null_equality::UNEQUAL); + + // Copy data back to host for comparisons + auto expected_indices = cudf::detail::make_std_vector_async( + cudf::device_span(*expected_mixed_semi_join), cudf::get_default_stream()); + auto result_indices = cudf::detail::make_std_vector_sync( + cudf::device_span(*mixed_semi_join), cudf::get_default_stream()); + + // Sort the indices for 1-1 comparison + std::sort(expected_indices.begin(), expected_indices.end()); + std::sort(result_indices.begin(), result_indices.end()); + + // Expected and actual vectors must match. 
+ EXPECT_EQ(expected_mixed_semi_join->size(), mixed_semi_join->size()); + EXPECT_TRUE( + std::equal(expected_indices.begin(), expected_indices.end(), result_indices.begin())); + } +} + TYPED_TEST(MixedLeftSemiJoinTest, BasicEqualityDuplicates) { this->test({{0, 1, 2, 1}, {3, 4, 5, 6}, {10, 20, 30, 40}}, @@ -900,3 +1032,18 @@ TYPED_TEST(MixedLeftAntiJoinTest, AsymmetricLeftLargerEquality) left_zero_eq_right_zero, {0, 1, 3}); } + +TYPED_TEST(MixedLeftAntiJoinTest, MixedLeftAntiJoinGatherMap) +{ + auto const col_ref_left_1 = cudf::ast::column_reference(0, cudf::ast::table_reference::LEFT); + auto const col_ref_right_1 = cudf::ast::column_reference(0, cudf::ast::table_reference::RIGHT); + auto left_one_greater_right_one = + cudf::ast::operation(cudf::ast::ast_operator::GREATER, col_ref_left_1, col_ref_right_1); + + this->test({{2, 3, 9, 0, 1, 7, 4, 6, 5, 8}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 0}}, + {{6, 5, 9, 8, 10, 32}, {0, 1, 2, 3, 4, 5}, {7, 8, 9, 0, 1, 2}}, + {0}, + {1}, + left_one_greater_right_one, + {0, 1, 3, 4, 5, 6, 9}); +} From d69e4b6fbdff9ad402a37de7940d64ed16b7d329 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 26 Sep 2024 08:07:48 -1000 Subject: [PATCH 10/26] Respect groupby.nunique(dropna=False) (#16921) closes https://github.com/rapidsai/cudf/issues/16861 Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Matthew Murray (https://github.com/Matt711) URL: https://github.com/rapidsai/cudf/pull/16921 --- python/cudf/cudf/_lib/aggregation.pyx | 7 +++++-- python/cudf/cudf/core/groupby/groupby.py | 16 ++++++++++++++++ python/cudf/cudf/tests/test_groupby.py | 17 +++++++++++++++++ 3 files changed, 38 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/_lib/aggregation.pyx b/python/cudf/cudf/_lib/aggregation.pyx index 7c91533cf93..3c96b90f0a1 100644 --- a/python/cudf/cudf/_lib/aggregation.pyx +++ b/python/cudf/cudf/_lib/aggregation.pyx @@ -78,8 +78,11 @@ class Aggregation: ) @classmethod - def nunique(cls): - return cls(pylibcudf.aggregation.nunique(pylibcudf.types.NullPolicy.EXCLUDE)) + def nunique(cls, dropna=True): + return cls(pylibcudf.aggregation.nunique( + pylibcudf.types.NullPolicy.EXCLUDE + if dropna else pylibcudf.types.NullPolicy.INCLUDE + )) @classmethod def nth(cls, size): diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index cb8cd0cd28b..be05075a2cd 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -2232,6 +2232,22 @@ def func(x): return self.agg(func) + @_performance_tracking + def nunique(self, dropna: bool = True): + """ + Return number of unique elements in the group. + + Parameters + ---------- + dropna : bool, default True + Don't include NaN in the counts. 
+ """ + + def func(x): + return getattr(x, "nunique")(dropna=dropna) + + return self.agg(func) + @_performance_tracking def std( self, diff --git a/python/cudf/cudf/tests/test_groupby.py b/python/cudf/cudf/tests/test_groupby.py index 848bc259e7b..14ba9894fd3 100644 --- a/python/cudf/cudf/tests/test_groupby.py +++ b/python/cudf/cudf/tests/test_groupby.py @@ -1940,6 +1940,23 @@ def test_groupby_nunique(agg, by): assert_groupby_results_equal(expect, got, check_dtype=False) +@pytest.mark.parametrize("dropna", [True, False]) +def test_nunique_dropna(dropna): + gdf = cudf.DataFrame( + { + "a": [1, 1, 2], + "b": [4, None, 5], + "c": [None, None, 7], + "d": [1, 1, 3], + } + ) + pdf = gdf.to_pandas() + + result = gdf.groupby("a")["b"].nunique(dropna=dropna) + expected = pdf.groupby("a")["b"].nunique(dropna=dropna) + assert_groupby_results_equal(result, expected, check_dtype=False) + + @pytest.mark.parametrize( "n", [0, 1, 2, 10], From 742eaadb92b0c5159d92be49e647a17e8c1d0b9b Mon Sep 17 00:00:00 2001 From: "Richard (Rick) Zamora" Date: Thu, 26 Sep 2024 14:27:37 -0500 Subject: [PATCH 11/26] Fix links in Dask cuDF documentation (#16929) More follow-up fixes to the recent Dask-cuDF documentation additions. Authors: - Richard (Rick) Zamora (https://github.com/rjzamora) - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/16929 --- docs/dask_cudf/source/best_practices.rst | 15 +++++++++------ docs/dask_cudf/source/conf.py | 1 + docs/dask_cudf/source/index.rst | 11 +++++------ 3 files changed, 15 insertions(+), 12 deletions(-) diff --git a/docs/dask_cudf/source/best_practices.rst b/docs/dask_cudf/source/best_practices.rst index 83039f86fed..41263ebf589 100644 --- a/docs/dask_cudf/source/best_practices.rst +++ b/docs/dask_cudf/source/best_practices.rst @@ -81,7 +81,7 @@ representations, native cuDF spilling may be insufficient. For these cases, `JIT-unspill `__ is likely to produce better protection from out-of-memory (OOM) errors. Please see `Dask-CUDA's spilling documentation -`__ for further details +`__ for further details and guidance. Use RMM @@ -160,7 +160,7 @@ of the underlying task graph to materialize the collection. :func:`sort_values` / :func:`set_index` : These operations both require Dask to eagerly collect quantile information about the column(s) being targeted by the -global sort operation. See `Avoid Sorting`__ for notes on sorting considerations. +global sort operation. See the next section for notes on sorting considerations. .. note:: When using :func:`set_index`, be sure to pass in ``sort=False`` whenever the @@ -297,11 +297,14 @@ bottleneck is typically device-to-host memory spilling. Although every workflow is different, the following guidelines are often recommended: -* `Use a distributed cluster with Dask-CUDA workers `_ -* `Use native cuDF spilling whenever possible `_ +* Use a distributed cluster with `Dask-CUDA `__ workers + +* Use native cuDF spilling whenever possible (`Dask-CUDA spilling documentation `__) + * Avoid shuffling whenever possible - * Use ``split_out=1`` for low-cardinality groupby aggregations - * Use ``broadcast=True`` for joins when at least one collection comprises a small number of partitions (e.g. ``<=5``) + * Use ``split_out=1`` for low-cardinality groupby aggregations + * Use ``broadcast=True`` for joins when at least one collection comprises a small number of partitions (e.g. 
``<=5``) + * `Use UCX `__ if communication is a bottleneck. .. note:: diff --git a/docs/dask_cudf/source/conf.py b/docs/dask_cudf/source/conf.py index dc40254312e..5daa8245695 100644 --- a/docs/dask_cudf/source/conf.py +++ b/docs/dask_cudf/source/conf.py @@ -78,6 +78,7 @@ "cudf": ("https://docs.rapids.ai/api/cudf/stable/", None), "dask": ("https://docs.dask.org/en/stable/", None), "pandas": ("https://pandas.pydata.org/docs/", None), + "dask-cuda": ("https://docs.rapids.ai/api/dask-cuda/stable/", None), } numpydoc_show_inherited_class_members = True diff --git a/docs/dask_cudf/source/index.rst b/docs/dask_cudf/source/index.rst index 6eb755d7854..c2891ebc15e 100644 --- a/docs/dask_cudf/source/index.rst +++ b/docs/dask_cudf/source/index.rst @@ -16,10 +16,9 @@ as the ``"cudf"`` dataframe backend for Neither Dask cuDF nor Dask DataFrame provide support for multi-GPU or multi-node execution on their own. You must also deploy a `dask.distributed `__ cluster - to leverage multiple GPUs. We strongly recommend using `Dask-CUDA - `__ to simplify the - setup of the cluster, taking advantage of all features of the GPU - and networking hardware. + to leverage multiple GPUs. We strongly recommend using :doc:`dask-cuda:index` + to simplify the setup of the cluster, taking advantage of all features + of the GPU and networking hardware. If you are familiar with Dask and `pandas `__ or `cuDF `__, then Dask cuDF @@ -161,7 +160,7 @@ out-of-core computing. This also means that the compute tasks can be executed in parallel over a multi-GPU cluster. In order to execute your Dask workflow on multiple GPUs, you will -typically need to use `Dask-CUDA `__ +typically need to use :doc:`dask-cuda:index` to deploy distributed Dask cluster, and `Distributed `__ to define a client object. For example:: @@ -192,7 +191,7 @@ to define a client object. For example:: `__ for more details. -Please see the `Dask-CUDA `__ +Please see the :doc:`dask-cuda:index` documentation for more information about deploying GPU-aware clusters (including `best practices `__). 
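The setup these docs recommend can be sketched in a few lines. The snippet below is an illustrative sketch only, not part of the patch: it assumes a CUDA-capable machine with `dask-cuda` and `dask_cudf` installed, and the Parquet path and the `key`/`value` column names are hypothetical.

```python
# Minimal sketch: a Dask-CUDA cluster with native cuDF spilling enabled,
# plus a low-cardinality groupby kept to a single output partition
# (split_out=1), per the best practices above.
from dask.distributed import Client
from dask_cuda import LocalCUDACluster

import dask_cudf

if __name__ == "__main__":
    cluster = LocalCUDACluster(enable_cudf_spill=True)  # one worker per GPU
    client = Client(cluster)

    ddf = dask_cudf.read_parquet("data/*.parquet")  # hypothetical input
    result = ddf.groupby("key")["value"].mean(split_out=1).compute()
    print(result)
```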
From 40075f1115ecd82a74b46d98e80e19afbf8a0210 Mon Sep 17 00:00:00 2001 From: Kyle Edwards Date: Thu, 26 Sep 2024 16:32:09 -0400 Subject: [PATCH 12/26] Use `changed-files` shared workflow (#16713) Contributes to https://github.com/rapidsai/build-planning/issues/94 Depends on https://github.com/rapidsai/shared-workflows/pull/239 Authors: - Kyle Edwards (https://github.com/KyleFromNVIDIA) - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Bradley Dice (https://github.com/bdice) - GALI PREM SAGAR (https://github.com/galipremsagar) URL: https://github.com/rapidsai/cudf/pull/16713 --- .github/workflows/pr.yaml | 140 +++++++++++++++----------------------- 1 file changed, 56 insertions(+), 84 deletions(-) diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index a65cae34653..bc237cc73b0 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -43,80 +43,52 @@ jobs: with: needs: ${{ toJSON(needs) }} changed-files: - runs-on: ubuntu-latest - name: "Check changed files" - outputs: - test_cpp: ${{ steps.changed-files.outputs.cpp_any_changed == 'true' }} - test_java: ${{ steps.changed-files.outputs.java_any_changed == 'true' }} - test_notebooks: ${{ steps.changed-files.outputs.notebooks_any_changed == 'true' }} - test_python: ${{ steps.changed-files.outputs.python_any_changed == 'true' }} - test_cudf_pandas: ${{ steps.changed-files.outputs.cudf_pandas_any_changed == 'true' }} - steps: - - name: Get PR info - id: get-pr-info - uses: nv-gha-runners/get-pr-info@main - - name: Checkout code repo - uses: actions/checkout@v4 - with: - fetch-depth: 0 - persist-credentials: false - - name: Calculate merge base - id: calculate-merge-base - env: - PR_SHA: ${{ fromJSON(steps.get-pr-info.outputs.pr-info).head.sha }} - BASE_SHA: ${{ fromJSON(steps.get-pr-info.outputs.pr-info).base.sha }} - run: | - (echo -n "merge-base="; git merge-base "$BASE_SHA" "$PR_SHA") > "$GITHUB_OUTPUT" - - name: Get changed files - id: changed-files - uses: tj-actions/changed-files@v45 - with: - base_sha: ${{ steps.calculate-merge-base.outputs.merge-base }} - sha: ${{ fromJSON(steps.get-pr-info.outputs.pr-info).head.sha }} - files_yaml: | - cpp: - - '**' - - '!CONTRIBUTING.md' - - '!README.md' - - '!docs/**' - - '!img/**' - - '!java/**' - - '!notebooks/**' - - '!python/**' - - '!ci/cudf_pandas_scripts/**' - java: - - '**' - - '!CONTRIBUTING.md' - - '!README.md' - - '!docs/**' - - '!img/**' - - '!notebooks/**' - - '!python/**' - - '!ci/cudf_pandas_scripts/**' - notebooks: - - '**' - - '!CONTRIBUTING.md' - - '!README.md' - - '!java/**' - - '!ci/cudf_pandas_scripts/**' - python: - - '**' - - '!CONTRIBUTING.md' - - '!README.md' - - '!docs/**' - - '!img/**' - - '!java/**' - - '!notebooks/**' - - '!ci/cudf_pandas_scripts/**' - cudf_pandas: - - '**' - - 'ci/cudf_pandas_scripts/**' - - '!CONTRIBUTING.md' - - '!README.md' - - '!docs/**' - - '!img/**' - - '!java/**' - - '!notebooks/**' + secrets: inherit + uses: rapidsai/shared-workflows/.github/workflows/changed-files.yaml@branch-24.12 + with: + files_yaml: | + test_cpp: + - '**' + - '!CONTRIBUTING.md' + - '!README.md' + - '!ci/cudf_pandas_scripts/**' + - '!docs/**' + - '!img/**' + - '!java/**' + - '!notebooks/**' + - '!python/**' + test_cudf_pandas: + - '**' + - '!CONTRIBUTING.md' + - '!README.md' + - '!docs/**' + - '!img/**' + - '!java/**' + - '!notebooks/**' + test_java: + - '**' + - '!CONTRIBUTING.md' + - '!README.md' + - '!ci/cudf_pandas_scripts/**' + - '!docs/**' + - '!img/**' + - '!notebooks/**' + - '!python/**' + test_notebooks: + - '**' + - 
'!CONTRIBUTING.md' + - '!README.md' + - '!ci/cudf_pandas_scripts/**' + - '!java/**' + test_python: + - '**' + - '!CONTRIBUTING.md' + - '!README.md' + - '!ci/cudf_pandas_scripts/**' + - '!docs/**' + - '!img/**' + - '!java/**' + - '!notebooks/**' checks: secrets: inherit uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@branch-24.12 @@ -139,7 +111,7 @@ jobs: needs: [conda-cpp-build, changed-files] secrets: inherit uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-24.12 - if: needs.changed-files.outputs.test_cpp == 'true' + if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_cpp with: build_type: pull-request conda-python-build: @@ -152,7 +124,7 @@ jobs: needs: [conda-python-build, changed-files] secrets: inherit uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.12 - if: needs.changed-files.outputs.test_python == 'true' + if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_python with: build_type: pull-request script: "ci/test_python_cudf.sh" @@ -161,7 +133,7 @@ jobs: needs: [conda-python-build, changed-files] secrets: inherit uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.12 - if: needs.changed-files.outputs.test_python == 'true' + if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_python with: build_type: pull-request script: "ci/test_python_other.sh" @@ -169,7 +141,7 @@ jobs: needs: [conda-cpp-build, changed-files] secrets: inherit uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.12 - if: needs.changed-files.outputs.test_java == 'true' + if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_java with: build_type: pull-request node_type: "gpu-v100-latest-1" @@ -190,7 +162,7 @@ jobs: needs: [conda-python-build, changed-files] secrets: inherit uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.12 - if: needs.changed-files.outputs.test_notebooks == 'true' + if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_notebooks with: build_type: pull-request node_type: "gpu-v100-latest-1" @@ -234,7 +206,7 @@ jobs: needs: [wheel-build-cudf, changed-files] secrets: inherit uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.12 - if: needs.changed-files.outputs.test_python == 'true' + if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_python with: build_type: pull-request script: ci/test_wheel_cudf.sh @@ -251,7 +223,7 @@ jobs: needs: [wheel-build-cudf-polars, changed-files] secrets: inherit uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.12 - if: needs.changed-files.outputs.test_python == 'true' + if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_python with: # This selects "ARCH=amd64 + the latest supported Python + CUDA". matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) @@ -283,7 +255,7 @@ jobs: needs: [wheel-build-dask-cudf, changed-files] secrets: inherit uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.12 - if: needs.changed-files.outputs.test_python == 'true' + if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_python with: # This selects "ARCH=amd64 + the latest supported Python + CUDA". 
matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) @@ -303,7 +275,7 @@ jobs: needs: [wheel-build-cudf, changed-files] secrets: inherit uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.12 - if: needs.changed-files.outputs.test_python == 'true' || needs.changed-files.outputs.test_cudf_pandas == 'true' + if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_python || fromJSON(needs.changed-files.outputs.changed_file_groups).test_cudf_pandas with: # This selects "ARCH=amd64 + the latest supported Python + CUDA". matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) @@ -314,7 +286,7 @@ jobs: needs: [wheel-build-cudf, changed-files] secrets: inherit uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.12 - if: needs.changed-files.outputs.test_python == 'true' || needs.changed-files.outputs.test_cudf_pandas == 'true' + if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_python || fromJSON(needs.changed-files.outputs.changed_file_groups).test_cudf_pandas with: # This selects "ARCH=amd64 + the latest supported Python + CUDA". matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) From fa12901024fcc810fcf7f695d2f2e41f472f2306 Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Thu, 26 Sep 2024 19:07:48 -0400 Subject: [PATCH 13/26] Fix cudf::strings::findall error with empty input (#16928) Fixes `cudf::strings::findall` error when passed an empty input column. Also adds a gtest for empty input and for all-rows do not match case. 
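For context, the two cases covered by the new gtests are also visible through the Python strings API. The snippet below is an illustrative sketch, not part of the patch; the input strings and patterns are made up.

```python
# findall returns one list of matches per row: rows with no match yield
# empty lists, and an empty input column yields an empty lists column
# (the case that previously errored).
import cudf

s = cudf.Series(["abc\nfff\nabc", "fff\nabc\nlll", "abc"])
print(s.str.findall(r"\w+"))  # per-row lists of word matches
print(s.str.findall(r"zzz"))  # no matches -> an empty list per row

empty = cudf.Series([], dtype="str")
print(empty.str.findall(r"\w+"))  # empty input -> empty result column
```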
Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Yunsong Wang (https://github.com/PointKernel) - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/16928 --- cpp/src/strings/search/findall.cu | 10 +++++++--- cpp/tests/strings/findall_tests.cpp | 28 ++++++++++++++++++++++++++++ 2 files changed, 35 insertions(+), 3 deletions(-) diff --git a/cpp/src/strings/search/findall.cu b/cpp/src/strings/search/findall.cu index 067a513af96..d8c1b50a94b 100644 --- a/cpp/src/strings/search/findall.cu +++ b/cpp/src/strings/search/findall.cu @@ -23,6 +23,7 @@ #include #include #include +#include #include #include #include @@ -97,8 +98,11 @@ std::unique_ptr findall(strings_column_view const& input, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { - auto const strings_count = input.size(); - auto const d_strings = column_device_view::create(input.parent(), stream); + if (input.is_empty()) { + return cudf::lists::detail::make_empty_lists_column(input.parent().type(), stream, mr); + } + + auto const d_strings = column_device_view::create(input.parent(), stream); // create device object from regex_program auto d_prog = regex_device_builder::create_prog_device(prog, stream); @@ -113,7 +117,7 @@ std::unique_ptr findall(strings_column_view const& input, auto strings_output = findall_util(*d_strings, *d_prog, total_matches, d_offsets, stream, mr); // Build the lists column from the offsets and the strings - return make_lists_column(strings_count, + return make_lists_column(input.size(), std::move(offsets), std::move(strings_output), input.null_count(), diff --git a/cpp/tests/strings/findall_tests.cpp b/cpp/tests/strings/findall_tests.cpp index 47606b9b3ed..6eea1895fb1 100644 --- a/cpp/tests/strings/findall_tests.cpp +++ b/cpp/tests/strings/findall_tests.cpp @@ -148,3 +148,31 @@ TEST_F(StringsFindallTests, LargeRegex) LCW expected({LCW{large_regex.c_str()}, LCW{}, LCW{}}); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results->view(), expected); } + +TEST_F(StringsFindallTests, NoMatches) +{ + cudf::test::strings_column_wrapper input({"abc\nfff\nabc", "fff\nabc\nlll", "abc", "", "abc\n"}); + auto sv = cudf::strings_column_view(input); + + auto pattern = std::string("(^zzz$)"); + using LCW = cudf::test::lists_column_wrapper; + LCW expected({LCW{}, LCW{}, LCW{}, LCW{}, LCW{}}); + auto prog = cudf::strings::regex_program::create(pattern); + auto results = cudf::strings::findall(sv, *prog); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results->view(), expected); +} + +TEST_F(StringsFindallTests, EmptyTest) +{ + std::string pattern = R"(\w+)"; + + auto prog = cudf::strings::regex_program::create(pattern); + + cudf::test::strings_column_wrapper input; + auto sv = cudf::strings_column_view(input); + auto results = cudf::strings::findall(sv, *prog); + + using LCW = cudf::test::lists_column_wrapper; + LCW expected; + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results->view(), expected); +} From 9125d2f19ecd6a82f29cdb41928737ec73eb491b Mon Sep 17 00:00:00 2001 From: James Lamb Date: Thu, 26 Sep 2024 20:07:39 -0500 Subject: [PATCH 14/26] reduce wheel build verbosity, narrow deprecation warning filter (#16896) Proposes some small changes I've taken as follow-ups from previous work here. * #16745 filtered out all linter warnings about uses of `datetime.utcnow()` ... 
this PR limits that to only the warnings observed from `botocore` (so that the linter will helpfully warn us about such uses directly in `cudf`) - ref https://github.com/rapidsai/cudf/pull/16745#discussion_r1746290952
* reduces the verbosity of logs for wheel builds (`-vvv` to `-v`) - similar to https://github.com/rapidsai/cugraph/pull/4651

## Notes for Reviewers

This is intentionally targeted at `24.12`. No need to rush this into 24.10 before code freeze.

### How I tested this
Locally, in Docker:

```shell
docker run \
  --rm \
  --gpus 1 \
  -v $(pwd):/opt/work \
  -w /opt/work \
  -it rapidsai/citestwheel:latest \
  bash

pip install \
  --prefer-binary \
  'cudf-cu12[test]==24.10.*,>=0.0.0a0' \
  'flask' \
  'flask-cors' \
  'moto>=4.0.8' \
  'boto3' \
  's3fs>=2022.3.0'

cd ./python/cudf

pytest \
  cudf/tests/test_s3.py
```
Authors: - James Lamb (https://github.com/jameslamb) - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/16896 --- ci/build_wheel.sh | 2 +- python/cudf/cudf/tests/pytest.ini | 2 +- python/dask_cudf/pyproject.toml | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/ci/build_wheel.sh b/ci/build_wheel.sh index 7c1fa705faa..bf76f4ed29a 100755 --- a/ci/build_wheel.sh +++ b/ci/build_wheel.sh @@ -12,4 +12,4 @@ rapids-generate-version > ./VERSION cd "${package_dir}" -python -m pip wheel . -w dist -vvv --no-deps --disable-pip-version-check +python -m pip wheel . -w dist -v --no-deps --disable-pip-version-check diff --git a/python/cudf/cudf/tests/pytest.ini b/python/cudf/cudf/tests/pytest.ini index d05ba9aaacc..496a322ff80 100644 --- a/python/cudf/cudf/tests/pytest.ini +++ b/python/cudf/cudf/tests/pytest.ini @@ -9,7 +9,7 @@ filterwarnings = ignore:::.*xdist.* ignore:::.*pytest.* # some third-party dependencies (e.g. 'boto3') still using datetime.datetime.utcnow() - ignore:.*datetime.*utcnow.*scheduled for removal.*:DeprecationWarning + ignore:.*datetime.*utcnow.*scheduled for removal.*:DeprecationWarning:botocore # Deprecation warning from Pyarrow Table.to_pandas() with pandas-2.2+ ignore:Passing a BlockManager to DataFrame is deprecated:DeprecationWarning # PerformanceWarning from cupy warming up the JIT cache diff --git a/python/dask_cudf/pyproject.toml b/python/dask_cudf/pyproject.toml index c64de06338f..336b2d24948 100644 --- a/python/dask_cudf/pyproject.toml +++ b/python/dask_cudf/pyproject.toml @@ -119,7 +119,7 @@ filterwarnings = [ "error::FutureWarning", "error::DeprecationWarning", # some third-party dependencies (e.g. 'boto3') still using datetime.datetime.utcnow() - "ignore:.*datetime.*utcnow.*scheduled for removal:DeprecationWarning", + "ignore:.*datetime.*utcnow.*scheduled for removal:DeprecationWarning:botocore", "ignore:create_block_manager_from_blocks is deprecated and will be removed in a future version. Use public APIs instead.:DeprecationWarning", # https://github.com/dask/partd/blob/main/partd/pandas.py#L198 "ignore:Passing a BlockManager to DataFrame is deprecated and will raise in a future version. Use public APIs instead.:DeprecationWarning", From 0632538a69f55f6d489d306edf2910a111430425 Mon Sep 17 00:00:00 2001 From: Graham Markall <535640+gmarkall@users.noreply.github.com> Date: Fri, 27 Sep 2024 02:36:13 +0100 Subject: [PATCH 15/26] Use numba-cuda>=0.0.13 (#16474) Testing with https://github.com/NVIDIA/numba-cuda on CI. I am not sure if edits in other repos are required (e.g. I used to have to change an "integration" repo) - any pointers appreciated! 
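One reason this swap should be low-risk: the `numba-cuda` package ships the `numba.cuda` module as a drop-in replacement, so user-level code is unchanged. A hypothetical smoke test (assumes a CUDA-capable machine with cudf installed):

```python
# The import path and cudf's Numba-JIT'd UDF path are the same whether
# numba.cuda comes from numba itself or from the numba-cuda package.
import numba.cuda

import cudf

assert numba.cuda.is_available()

s = cudf.Series([1, 2, 3])
print(s.apply(lambda x: x + 1))  # UDF compiled with the Numba CUDA target
```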
Authors: - Graham Markall (https://github.com/gmarkall) - Vyas Ramasubramani (https://github.com/vyasr) - Bradley Dice (https://github.com/bdice) Approvers: - Mike Sarahan (https://github.com/msarahan) - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/16474 --- conda/environments/all_cuda-118_arch-x86_64.yaml | 2 +- conda/environments/all_cuda-125_arch-x86_64.yaml | 2 +- conda/recipes/cudf/meta.yaml | 2 +- dependencies.yaml | 6 +++--- python/cudf/pyproject.toml | 2 +- python/dask_cudf/pyproject.toml | 2 +- 6 files changed, 8 insertions(+), 8 deletions(-) diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index 8db03812a19..8b45d26c367 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -54,7 +54,7 @@ dependencies: - nbsphinx - ninja - notebook -- numba>=0.57 +- numba-cuda>=0.0.13 - numpy>=1.23,<3.0a0 - numpydoc - nvcc_linux-64=11.8 diff --git a/conda/environments/all_cuda-125_arch-x86_64.yaml b/conda/environments/all_cuda-125_arch-x86_64.yaml index fdbe278b66b..354c1360e5a 100644 --- a/conda/environments/all_cuda-125_arch-x86_64.yaml +++ b/conda/environments/all_cuda-125_arch-x86_64.yaml @@ -53,7 +53,7 @@ dependencies: - nbsphinx - ninja - notebook -- numba>=0.57 +- numba-cuda>=0.0.13 - numpy>=1.23,<3.0a0 - numpydoc - nvcomp==4.0.1 diff --git a/conda/recipes/cudf/meta.yaml b/conda/recipes/cudf/meta.yaml index e22b4a4eddc..25e69b89789 100644 --- a/conda/recipes/cudf/meta.yaml +++ b/conda/recipes/cudf/meta.yaml @@ -80,7 +80,7 @@ requirements: - typing_extensions >=4.0.0 - pandas >=2.0,<2.2.3dev0 - cupy >=12.0.0 - - numba >=0.57 + - numba-cuda >=0.0.13 - numpy >=1.23,<3.0a0 - pyarrow>=14.0.0,<18.0.0a0 - libcudf ={{ version }} diff --git a/dependencies.yaml b/dependencies.yaml index bb8635403a4..ed36a23e5c3 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -605,7 +605,7 @@ dependencies: - output_types: [conda, requirements, pyproject] packages: - cachetools - - &numba numba>=0.57 + - &numba-cuda-dep numba-cuda>=0.0.13 - nvtx>=0.2.1 - packaging - rich @@ -720,7 +720,7 @@ dependencies: matrices: - matrix: {dependencies: "oldest"} packages: - - numba==0.57.* + - *numba-cuda-dep - pandas==2.0.* - matrix: packages: @@ -802,7 +802,7 @@ dependencies: - output_types: [conda, requirements, pyproject] packages: - dask-cuda==24.12.*,>=0.0.0a0 - - *numba + - *numba-cuda-dep specific: - output_types: [conda, requirements] matrices: diff --git a/python/cudf/pyproject.toml b/python/cudf/pyproject.toml index f90cb96e189..605f9be5a49 100644 --- a/python/cudf/pyproject.toml +++ b/python/cudf/pyproject.toml @@ -24,7 +24,7 @@ dependencies = [ "cupy-cuda11x>=12.0.0", "fsspec>=0.6.0", "libcudf==24.12.*,>=0.0.0a0", - "numba>=0.57", + "numba-cuda>=0.0.13", "numpy>=1.23,<3.0a0", "nvtx>=0.2.1", "packaging", diff --git a/python/dask_cudf/pyproject.toml b/python/dask_cudf/pyproject.toml index 336b2d24948..76e47b50c3b 100644 --- a/python/dask_cudf/pyproject.toml +++ b/python/dask_cudf/pyproject.toml @@ -46,7 +46,7 @@ cudf = "dask_cudf.backends:CudfDXBackendEntrypoint" [project.optional-dependencies] test = [ "dask-cuda==24.12.*,>=0.0.0a0", - "numba>=0.57", + "numba-cuda>=0.0.13", "pytest-cov", "pytest-xdist", "pytest<8", From 51e8a3fd446f7ef061c4a5d9aa7ea45f1ac3bab6 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Fri, 27 Sep 2024 07:24:41 -0700 Subject: [PATCH 16/26] clang-tidy fixes part 1 (#16937) This PR includes a first set of fixes 
found by applying the latest version of clang-tidy to our code base. To keep things reviewable, I've restricted this PR to a smaller set of changes just to the includes. Authors: - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - David Wendt (https://github.com/davidwendt) URL: https://github.com/rapidsai/cudf/pull/16937 --- .../cudf/column/column_device_view.cuh | 4 +- cpp/include/cudf/column/column_view.hpp | 4 +- .../cudf/detail/aggregation/aggregation.hpp | 4 +- .../cudf/detail/groupby/sort_helper.hpp | 1 - .../cudf/detail/utilities/host_vector.hpp | 2 +- .../dictionary/dictionary_column_view.hpp | 2 +- cpp/include/cudf/groupby.hpp | 2 +- cpp/include/cudf/io/json.hpp | 3 - cpp/include/cudf/lists/lists_column_view.hpp | 2 +- cpp/include/cudf/scalar/scalar.hpp | 6 +- .../cudf/strings/detail/char_tables.hpp | 4 +- .../cudf/strings/regex/regex_program.hpp | 4 +- cpp/include/cudf/strings/string_view.cuh | 13 ++-- .../cudf/strings/strings_column_view.hpp | 2 +- .../cudf/structs/structs_column_view.hpp | 2 +- .../cudf/tdigest/tdigest_column_view.hpp | 2 +- cpp/include/cudf/utilities/span.hpp | 64 +++++++++---------- 17 files changed, 56 insertions(+), 65 deletions(-) diff --git a/cpp/include/cudf/column/column_device_view.cuh b/cpp/include/cudf/column/column_device_view.cuh index c3238cb94fd..35a39ef9758 100644 --- a/cpp/include/cudf/column/column_device_view.cuh +++ b/cpp/include/cudf/column/column_device_view.cuh @@ -1425,13 +1425,13 @@ struct pair_rep_accessor { private: template , void>* = nullptr> - __device__ inline auto get_rep(cudf::size_type i) const + __device__ [[nodiscard]] inline auto get_rep(cudf::size_type i) const { return col.element(i); } template , void>* = nullptr> - __device__ inline auto get_rep(cudf::size_type i) const + __device__ [[nodiscard]] inline auto get_rep(cudf::size_type i) const { return col.element(i).value(); } diff --git a/cpp/include/cudf/column/column_view.hpp b/cpp/include/cudf/column/column_view.hpp index 3ef7bafe727..48f89b8be25 100644 --- a/cpp/include/cudf/column/column_view.hpp +++ b/cpp/include/cudf/column/column_view.hpp @@ -235,7 +235,7 @@ class column_view_base { * * @return Typed pointer to underlying data */ - virtual void const* get_data() const noexcept { return _data; } + [[nodiscard]] virtual void const* get_data() const noexcept { return _data; } data_type _type{type_id::EMPTY}; ///< Element type size_type _size{}; ///< Number of elements @@ -695,7 +695,7 @@ class mutable_column_view : public detail::column_view_base { * * @return Typed pointer to underlying data */ - void const* get_data() const noexcept override; + [[nodiscard]] void const* get_data() const noexcept override; private: friend mutable_column_view bit_cast(mutable_column_view const& input, data_type type); diff --git a/cpp/include/cudf/detail/aggregation/aggregation.hpp b/cpp/include/cudf/detail/aggregation/aggregation.hpp index 4255faea702..6661a461b8b 100644 --- a/cpp/include/cudf/detail/aggregation/aggregation.hpp +++ b/cpp/include/cudf/detail/aggregation/aggregation.hpp @@ -683,7 +683,7 @@ class ewma_aggregation final : public scan_aggregation { { } - std::unique_ptr clone() const override + [[nodiscard]] std::unique_ptr clone() const override { return std::make_unique(*this); } @@ -694,7 +694,7 @@ class ewma_aggregation final : public scan_aggregation { return collector.visit(col_type, *this); } - bool is_equal(aggregation const& _other) const override + [[nodiscard]] bool is_equal(aggregation const& _other) const override { if 
(!this->aggregation::is_equal(_other)) { return false; } auto const& other = dynamic_cast(_other); diff --git a/cpp/include/cudf/detail/groupby/sort_helper.hpp b/cpp/include/cudf/detail/groupby/sort_helper.hpp index ce8783d8b79..d7a42d0eca5 100644 --- a/cpp/include/cudf/detail/groupby/sort_helper.hpp +++ b/cpp/include/cudf/detail/groupby/sort_helper.hpp @@ -211,7 +211,6 @@ struct sort_groupby_helper { */ column_view keys_bitmask_column(rmm::cuda_stream_view stream); - private: column_ptr _key_sorted_order; ///< Indices to produce _keys in sorted order column_ptr _unsorted_keys_labels; ///< Group labels for unsorted _keys column_ptr _keys_bitmask_column; ///< Column representing rows with one or more nulls values diff --git a/cpp/include/cudf/detail/utilities/host_vector.hpp b/cpp/include/cudf/detail/utilities/host_vector.hpp index ecb8f910463..3f6ad7b7b1d 100644 --- a/cpp/include/cudf/detail/utilities/host_vector.hpp +++ b/cpp/include/cudf/detail/utilities/host_vector.hpp @@ -183,7 +183,7 @@ class rmm_host_allocator { */ inline bool operator!=(rmm_host_allocator const& x) const { return !operator==(x); } - bool is_device_accessible() const { return _is_device_accessible; } + [[nodiscard]] bool is_device_accessible() const { return _is_device_accessible; } private: rmm::host_async_resource_ref mr; diff --git a/cpp/include/cudf/dictionary/dictionary_column_view.hpp b/cpp/include/cudf/dictionary/dictionary_column_view.hpp index dc822fee38b..5596f78a90b 100644 --- a/cpp/include/cudf/dictionary/dictionary_column_view.hpp +++ b/cpp/include/cudf/dictionary/dictionary_column_view.hpp @@ -47,7 +47,7 @@ class dictionary_column_view : private column_view { dictionary_column_view(column_view const& dictionary_column); dictionary_column_view(dictionary_column_view&&) = default; ///< Move constructor dictionary_column_view(dictionary_column_view const&) = default; ///< Copy constructor - ~dictionary_column_view() = default; + ~dictionary_column_view() override = default; /** * @brief Move assignment operator diff --git a/cpp/include/cudf/groupby.hpp b/cpp/include/cudf/groupby.hpp index 11c778408fe..c9df02f167a 100644 --- a/cpp/include/cudf/groupby.hpp +++ b/cpp/include/cudf/groupby.hpp @@ -36,7 +36,7 @@ namespace CUDF_EXPORT cudf { namespace groupby { namespace detail { namespace sort { -class sort_groupby_helper; +struct sort_groupby_helper; } // namespace sort } // namespace detail diff --git a/cpp/include/cudf/io/json.hpp b/cpp/include/cudf/io/json.hpp index 6798557e14e..b662b660557 100644 --- a/cpp/include/cudf/io/json.hpp +++ b/cpp/include/cudf/io/json.hpp @@ -116,9 +116,6 @@ class json_reader_options { // Whether to parse dates as DD/MM versus MM/DD bool _dayfirst = false; - // Whether to use the legacy reader - bool _legacy = false; - // Whether to keep the quote characters of string values bool _keep_quotes = false; diff --git a/cpp/include/cudf/lists/lists_column_view.hpp b/cpp/include/cudf/lists/lists_column_view.hpp index b117a871b64..d7057cfea7e 100644 --- a/cpp/include/cudf/lists/lists_column_view.hpp +++ b/cpp/include/cudf/lists/lists_column_view.hpp @@ -48,7 +48,7 @@ class lists_column_view : private column_view { lists_column_view(column_view const& lists_column); lists_column_view(lists_column_view&&) = default; ///< Move constructor lists_column_view(lists_column_view const&) = default; ///< Copy constructor - ~lists_column_view() = default; + ~lists_column_view() override = default; /** * @brief Copy assignment operator * diff --git a/cpp/include/cudf/scalar/scalar.hpp 
b/cpp/include/cudf/scalar/scalar.hpp index e8a498afc09..66be2a12fbe 100644 --- a/cpp/include/cudf/scalar/scalar.hpp +++ b/cpp/include/cudf/scalar/scalar.hpp @@ -47,6 +47,7 @@ namespace CUDF_EXPORT cudf { */ class scalar { public: + scalar() = delete; virtual ~scalar() = default; scalar& operator=(scalar const& other) = delete; scalar& operator=(scalar&& other) = delete; @@ -96,8 +97,6 @@ class scalar { data_type _type{type_id::EMPTY}; ///< Logical type of value in the scalar rmm::device_scalar _is_valid; ///< Device bool signifying validity - scalar() = delete; - /** * @brief Move constructor for scalar. * @param other The other scalar to move from. @@ -145,6 +144,7 @@ class fixed_width_scalar : public scalar { public: using value_type = T; ///< Type of the value held by the scalar. + fixed_width_scalar() = delete; ~fixed_width_scalar() override = default; /** @@ -203,8 +203,6 @@ class fixed_width_scalar : public scalar { protected: rmm::device_scalar _data; ///< device memory containing the value - fixed_width_scalar() = delete; - /** * @brief Construct a new fixed width scalar object. * diff --git a/cpp/include/cudf/strings/detail/char_tables.hpp b/cpp/include/cudf/strings/detail/char_tables.hpp index 5d6aff28826..6460d4f43ff 100644 --- a/cpp/include/cudf/strings/detail/char_tables.hpp +++ b/cpp/include/cudf/strings/detail/char_tables.hpp @@ -74,9 +74,9 @@ character_cases_table_type const* get_character_cases_table(); */ struct special_case_mapping { uint16_t num_upper_chars; - uint16_t upper[3]; + uint16_t upper[3]; // NOLINT uint16_t num_lower_chars; - uint16_t lower[3]; + uint16_t lower[3]; // NOLINT }; /** diff --git a/cpp/include/cudf/strings/regex/regex_program.hpp b/cpp/include/cudf/strings/regex/regex_program.hpp index 9da859d9c87..1bf1c26f471 100644 --- a/cpp/include/cudf/strings/regex/regex_program.hpp +++ b/cpp/include/cudf/strings/regex/regex_program.hpp @@ -54,6 +54,8 @@ struct regex_program { regex_flags flags = regex_flags::DEFAULT, capture_groups capture = capture_groups::EXTRACT); + regex_program() = delete; + /** * @brief Move constructor * @@ -115,8 +117,6 @@ struct regex_program { ~regex_program(); private: - regex_program() = delete; - std::string _pattern; regex_flags _flags; capture_groups _capture; diff --git a/cpp/include/cudf/strings/string_view.cuh b/cpp/include/cudf/strings/string_view.cuh index 14695c3bb27..34ed3c5618e 100644 --- a/cpp/include/cudf/strings/string_view.cuh +++ b/cpp/include/cudf/strings/string_view.cuh @@ -99,7 +99,7 @@ __device__ inline std::pair bytes_to_character_position(st * values. Also, this char pointer serves as valid device pointer of identity * value for minimum operator on string values. 
*/ -static __constant__ char max_string_sentinel[5]{"\xF7\xBF\xBF\xBF"}; +static __constant__ char max_string_sentinel[5]{"\xF7\xBF\xBF\xBF"}; // NOLINT } // namespace detail } // namespace strings @@ -283,14 +283,11 @@ __device__ inline size_type string_view::const_iterator::position() const { retu __device__ inline size_type string_view::const_iterator::byte_offset() const { return byte_pos; } -__device__ inline string_view::const_iterator string_view::begin() const -{ - return const_iterator(*this, 0, 0); -} +__device__ inline string_view::const_iterator string_view::begin() const { return {*this, 0, 0}; } __device__ inline string_view::const_iterator string_view::end() const { - return const_iterator(*this, length(), size_bytes()); + return {*this, length(), size_bytes()}; } // @endcond @@ -411,7 +408,7 @@ __device__ inline size_type string_view::find(char const* str, __device__ inline size_type string_view::find(char_utf8 chr, size_type pos, size_type count) const { - char str[sizeof(char_utf8)]; + char str[sizeof(char_utf8)]; // NOLINT size_type chwidth = strings::detail::from_char_utf8(chr, str); return find(str, chwidth, pos, count); } @@ -433,7 +430,7 @@ __device__ inline size_type string_view::rfind(char const* str, __device__ inline size_type string_view::rfind(char_utf8 chr, size_type pos, size_type count) const { - char str[sizeof(char_utf8)]; + char str[sizeof(char_utf8)]; // NOLINT size_type chwidth = strings::detail::from_char_utf8(chr, str); return rfind(str, chwidth, pos, count); } diff --git a/cpp/include/cudf/strings/strings_column_view.hpp b/cpp/include/cudf/strings/strings_column_view.hpp index 4a2512eb7c5..6ec8d1238d6 100644 --- a/cpp/include/cudf/strings/strings_column_view.hpp +++ b/cpp/include/cudf/strings/strings_column_view.hpp @@ -45,7 +45,7 @@ class strings_column_view : private column_view { strings_column_view(column_view strings_column); strings_column_view(strings_column_view&&) = default; ///< Move constructor strings_column_view(strings_column_view const&) = default; ///< Copy constructor - ~strings_column_view() = default; + ~strings_column_view() override = default; /** * @brief Copy assignment operator * diff --git a/cpp/include/cudf/structs/structs_column_view.hpp b/cpp/include/cudf/structs/structs_column_view.hpp index 19798f51656..91d7ddce955 100644 --- a/cpp/include/cudf/structs/structs_column_view.hpp +++ b/cpp/include/cudf/structs/structs_column_view.hpp @@ -42,7 +42,7 @@ class structs_column_view : public column_view { // Foundation members: structs_column_view(structs_column_view const&) = default; ///< Copy constructor structs_column_view(structs_column_view&&) = default; ///< Move constructor - ~structs_column_view() = default; + ~structs_column_view() override = default; /** * @brief Copy assignment operator * diff --git a/cpp/include/cudf/tdigest/tdigest_column_view.hpp b/cpp/include/cudf/tdigest/tdigest_column_view.hpp index 2f19efa5630..da4954b859c 100644 --- a/cpp/include/cudf/tdigest/tdigest_column_view.hpp +++ b/cpp/include/cudf/tdigest/tdigest_column_view.hpp @@ -59,7 +59,7 @@ class tdigest_column_view : private column_view { tdigest_column_view(column_view const&); ///< Construct tdigest_column_view from a column_view tdigest_column_view(tdigest_column_view&&) = default; ///< Move constructor tdigest_column_view(tdigest_column_view const&) = default; ///< Copy constructor - ~tdigest_column_view() = default; + ~tdigest_column_view() override = default; /** * @brief Copy assignment operator * diff --git 
a/cpp/include/cudf/utilities/span.hpp b/cpp/include/cudf/utilities/span.hpp index 0daebc0dd8d..914731ea417 100644 --- a/cpp/include/cudf/utilities/span.hpp +++ b/cpp/include/cudf/utilities/span.hpp @@ -236,26 +236,26 @@ struct host_span : public cudf::detail::span_base::value && - std::is_convertible_v().data()))> (*)[], - T (*)[]>>* = nullptr> + template ::value && + std::is_convertible_v< + std::remove_pointer_t().data()))> (*)[], + T (*)[]>>* = nullptr> // NOLINT constexpr host_span(C& in) : base(thrust::raw_pointer_cast(in.data()), in.size()) { } /// Constructor from const container /// @param in The container to construct the span from - template < - typename C, - // Only supported containers of types convertible to T - std::enable_if_t::value && - std::is_convertible_v().data()))> (*)[], - T (*)[]>>* = nullptr> + template ::value && + std::is_convertible_v< + std::remove_pointer_t().data()))> (*)[], + T (*)[]>>* = nullptr> // NOLINT constexpr host_span(C const& in) : base(thrust::raw_pointer_cast(in.data()), in.size()) { } @@ -264,7 +264,7 @@ struct host_span : public cudf::detail::span_base>* = nullptr> + std::enable_if_t>* = nullptr> // NOLINT constexpr host_span(cudf::detail::host_vector& in) : base(in.data(), in.size()), _is_device_accessible{in.get_allocator().is_device_accessible()} { @@ -274,7 +274,7 @@ struct host_span : public cudf::detail::span_base>* = nullptr> + std::enable_if_t>* = nullptr> // NOLINT constexpr host_span(cudf::detail::host_vector const& in) : base(in.data(), in.size()), _is_device_accessible{in.get_allocator().is_device_accessible()} { @@ -285,7 +285,7 @@ struct host_span : public cudf::detail::span_base, + std::is_convertible_v, // NOLINT void>* = nullptr> constexpr host_span(host_span const& other) noexcept : base(other.data(), other.size()) @@ -333,26 +333,26 @@ struct device_span : public cudf::detail::span_base::value && - std::is_convertible_v().data()))> (*)[], - T (*)[]>>* = nullptr> + template ::value && + std::is_convertible_v< + std::remove_pointer_t().data()))> (*)[], + T (*)[]>>* = nullptr> // NOLINT constexpr device_span(C& in) : base(thrust::raw_pointer_cast(in.data()), in.size()) { } /// Constructor from const container /// @param in The container to construct the span from - template < - typename C, - // Only supported containers of types convertible to T - std::enable_if_t::value && - std::is_convertible_v().data()))> (*)[], - T (*)[]>>* = nullptr> + template ::value && + std::is_convertible_v< + std::remove_pointer_t().data()))> (*)[], + T (*)[]>>* = nullptr> // NOLINT constexpr device_span(C const& in) : base(thrust::raw_pointer_cast(in.data()), in.size()) { } @@ -362,7 +362,7 @@ struct device_span : public cudf::detail::span_base, + std::is_convertible_v, // NOLINT void>* = nullptr> constexpr device_span(device_span const& other) noexcept : base(other.data(), other.size()) From 1f25d7a24c5d58e6c1acdb3d3fbabc6a5a39ebe6 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Fri, 27 Sep 2024 09:13:43 -0700 Subject: [PATCH 17/26] clang-tidy fixes part 3 (#16939) Subset of improvements to the code base proposed by the latest version of clang-tidy. **Note to reviewers**: The changeset looks deceptively large. Almost all of the change are really just switching from raw C-style arrays to C++ std::arrays. 
Authors: - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - David Wendt (https://github.com/davidwendt) - Basit Ayantunde (https://github.com/lamarrr) URL: https://github.com/rapidsai/cudf/pull/16939 --- cpp/tests/copying/copy_tests.cpp | 174 ++++++++-------- cpp/tests/filling/sequence_tests.cpp | 13 +- cpp/tests/groupby/collect_list_tests.cpp | 7 +- cpp/tests/interop/dlpack_test.cpp | 22 +-- cpp/tests/io/orc_test.cpp | 77 ++++---- cpp/tests/io/parquet_chunked_writer_test.cpp | 72 +++---- cpp/tests/io/parquet_common.cpp | 18 +- cpp/tests/io/parquet_misc_test.cpp | 9 +- cpp/tests/io/parquet_reader_test.cpp | 187 +++++++++--------- cpp/tests/io/parquet_v2_test.cpp | 44 +++-- cpp/tests/io/parquet_writer_test.cpp | 88 ++++----- cpp/tests/json/json_tests.cpp | 12 +- cpp/tests/reductions/reduction_tests.cpp | 7 +- cpp/tests/reductions/scan_tests.cpp | 4 +- cpp/tests/rolling/nth_element_test.cpp | 4 +- cpp/tests/streams/transform_test.cpp | 2 +- cpp/tests/strings/chars_types_tests.cpp | 29 +-- cpp/tests/strings/contains_tests.cpp | 69 +++---- cpp/tests/strings/durations_tests.cpp | 143 +++++++------- cpp/tests/strings/extract_tests.cpp | 6 +- cpp/tests/strings/findall_tests.cpp | 6 +- .../integration/unary_transform_test.cpp | 2 +- 22 files changed, 483 insertions(+), 512 deletions(-) diff --git a/cpp/tests/copying/copy_tests.cpp b/cpp/tests/copying/copy_tests.cpp index 7c8729b6a77..4124f749012 100644 --- a/cpp/tests/copying/copy_tests.cpp +++ b/cpp/tests/copying/copy_tests.cpp @@ -73,44 +73,45 @@ TYPED_TEST(CopyTest, CopyIfElseTestLong) using T = TypeParam; // make sure we span at least 2 warps - int num_els = 64; - - bool mask[] = {true, false, true, false, true, true, true, true, true, true, true, true, true, - true, true, true, true, true, true, false, false, false, false, true, true, true, - true, true, true, true, true, true, false, false, false, false, true, true, true, - true, true, true, true, true, true, true, true, true, true, true, true, true, - true, true, true, true, true, true, true, true, true, true, true, true}; - cudf::test::fixed_width_column_wrapper mask_w(mask, mask + num_els); - - bool lhs_v[] = {true, true, true, true, false, false, true, true, true, true, true, true, true, - true, true, true, true, true, true, true, true, true, true, true, true, true, - true, true, true, true, true, true, true, true, true, true, true, true, true, - true, true, true, true, true, true, true, true, true, true, true, true, true, - true, true, true, true, true, true, true, true, true, true, true, true}; - wrapper lhs_w({5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5}, - lhs_v); - - bool rhs_v[] = {true, true, true, true, true, true, false, false, true, true, true, true, true, - true, true, true, true, true, true, true, true, true, true, true, true, true, - true, true, true, true, true, true, true, true, true, true, true, true, true, - true, true, true, true, true, true, true, true, true, true, true, true, true, - true, true, true, true, true, true, true, true, true, true, true, true}; - wrapper rhs_w({6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, - 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, - 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6}, - rhs_v); - - bool exp_v[] = {true, true, true, true, false, false, true, true, true, true, true, true, true, - true, true, true, true, true, true, 
true, true, true, true, true, true, true, - true, true, true, true, true, true, true, true, true, true, true, true, true, - true, true, true, true, true, true, true, true, true, true, true, true, true, - true, true, true, true, true, true, true, true, true, true, true, true}; - wrapper expected_w({5, 6, 5, 6, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, - 6, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5}, - exp_v); + constexpr int num_els = 64; + + std::array mask{ + true, false, true, false, true, true, true, true, true, true, true, true, true, + true, true, true, true, true, true, false, false, false, false, true, true, true, + true, true, true, true, true, true, false, false, false, false, true, true, true, + true, true, true, true, true, true, true, true, true, true, true, true, true, + true, true, true, true, true, true, true, true, true, true, true, true}; + cudf::test::fixed_width_column_wrapper mask_w(mask.begin(), mask.end()); + + wrapper lhs_w( + {5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5}, + {true, true, true, true, false, false, true, true, true, true, true, true, true, + true, true, true, true, true, true, true, true, true, true, true, true, true, + true, true, true, true, true, true, true, true, true, true, true, true, true, + true, true, true, true, true, true, true, true, true, true, true, true, true, + true, true, true, true, true, true, true, true, true, true, true, true}); + + wrapper rhs_w( + {6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6}, + {true, true, true, true, true, true, false, false, true, true, true, true, true, + true, true, true, true, true, true, true, true, true, true, true, true, true, + true, true, true, true, true, true, true, true, true, true, true, true, true, + true, true, true, true, true, true, true, true, true, true, true, true, true, + true, true, true, true, true, true, true, true, true, true, true, true}); + + wrapper expected_w( + {5, 6, 5, 6, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, + 6, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5}, + {true, true, true, true, false, false, true, true, true, true, true, true, true, + true, true, true, true, true, true, true, true, true, true, true, true, true, + true, true, true, true, true, true, true, true, true, true, true, true, true, + true, true, true, true, true, true, true, true, true, true, true, true, true, + true, true, true, true, true, true, true, true, true, true, true, true}); auto out = cudf::copy_if_else(lhs_w, rhs_w, mask_w); CUDF_TEST_EXPECT_COLUMNS_EQUAL(out->view(), expected_w); @@ -318,19 +319,17 @@ TYPED_TEST(CopyTestNumeric, CopyIfElseTestScalarColumn) { using T = TypeParam; - int num_els = 4; - - bool mask[] = {true, false, false, true}; - cudf::test::fixed_width_column_wrapper mask_w(mask, mask + num_els); + std::array mask{true, false, false, true}; + cudf::test::fixed_width_column_wrapper mask_w(mask.begin(), mask.end()); cudf::numeric_scalar lhs_w(5); auto const rhs = cudf::test::make_type_param_vector({6, 6, 6, 6}); - bool rhs_v[] = {true, false, true, true}; - wrapper rhs_w(rhs.begin(), rhs.end(), rhs_v); + 
std::array rhs_v{true, false, true, true}; + wrapper rhs_w(rhs.begin(), rhs.end(), rhs_v.begin()); auto const expected = cudf::test::make_type_param_vector({5, 6, 6, 5}); - wrapper expected_w(expected.begin(), expected.end(), rhs_v); + wrapper expected_w(expected.begin(), expected.end(), rhs_v.begin()); auto out = cudf::copy_if_else(lhs_w, rhs_w, mask_w); CUDF_TEST_EXPECT_COLUMNS_EQUAL(out->view(), expected_w); @@ -340,20 +339,18 @@ TYPED_TEST(CopyTestNumeric, CopyIfElseTestColumnScalar) { using T = TypeParam; - int num_els = 4; - - bool mask[] = {true, false, false, true}; - bool mask_v[] = {true, true, true, false}; - cudf::test::fixed_width_column_wrapper mask_w(mask, mask + num_els, mask_v); + std::array mask{true, false, false, true}; + std::array mask_v{true, true, true, false}; + cudf::test::fixed_width_column_wrapper mask_w(mask.begin(), mask.end(), mask_v.begin()); auto const lhs = cudf::test::make_type_param_vector({5, 5, 5, 5}); - bool lhs_v[] = {false, true, true, true}; - wrapper lhs_w(lhs.begin(), lhs.end(), lhs_v); + std::array lhs_v{false, true, true, true}; + wrapper lhs_w(lhs.begin(), lhs.end(), lhs_v.begin()); cudf::numeric_scalar rhs_w(6); auto const expected = cudf::test::make_type_param_vector({5, 6, 6, 6}); - wrapper expected_w(expected.begin(), expected.end(), lhs_v); + wrapper expected_w(expected.begin(), expected.end(), lhs_v.begin()); auto out = cudf::copy_if_else(lhs_w, rhs_w, mask_w); CUDF_TEST_EXPECT_COLUMNS_EQUAL(out->view(), expected_w); @@ -363,16 +360,14 @@ TYPED_TEST(CopyTestNumeric, CopyIfElseTestScalarScalar) { using T = TypeParam; - int num_els = 4; - - bool mask[] = {true, false, false, true}; - cudf::test::fixed_width_column_wrapper mask_w(mask, mask + num_els); + std::array mask{true, false, false, true}; + cudf::test::fixed_width_column_wrapper mask_w(mask.begin(), mask.end()); cudf::numeric_scalar lhs_w(5); cudf::numeric_scalar rhs_w(6, false); auto const expected = cudf::test::make_type_param_vector({5, 6, 6, 5}); - wrapper expected_w(expected.begin(), expected.end(), mask); + wrapper expected_w(expected.begin(), expected.end(), mask.begin()); auto out = cudf::copy_if_else(lhs_w, rhs_w, mask_w); CUDF_TEST_EXPECT_COLUMNS_EQUAL(out->view(), expected_w); @@ -405,17 +400,15 @@ TYPED_TEST(CopyTestChrono, CopyIfElseTestScalarColumn) { using T = TypeParam; - int num_els = 4; - - bool mask[] = {true, false, false, true}; - cudf::test::fixed_width_column_wrapper mask_w(mask, mask + num_els); + std::array mask{true, false, false, true}; + cudf::test::fixed_width_column_wrapper mask_w(mask.begin(), mask.end()); auto lhs_w = create_chrono_scalar{}(cudf::test::make_type_param_scalar(5), true); - bool rhs_v[] = {true, false, true, true}; - wrapper rhs_w({6, 6, 6, 6}, rhs_v); + std::array rhs_v{true, false, true, true}; + wrapper rhs_w({6, 6, 6, 6}, rhs_v.begin()); - wrapper expected_w({5, 6, 6, 5}, rhs_v); + wrapper expected_w({5, 6, 6, 5}, rhs_v.begin()); auto out = cudf::copy_if_else(lhs_w, rhs_w, mask_w); CUDF_TEST_EXPECT_COLUMNS_EQUAL(out->view(), expected_w); @@ -425,17 +418,15 @@ TYPED_TEST(CopyTestChrono, CopyIfElseTestColumnScalar) { using T = TypeParam; - int num_els = 4; - - bool mask[] = {true, false, false, true}; - cudf::test::fixed_width_column_wrapper mask_w(mask, mask + num_els); + std::array mask{true, false, false, true}; + cudf::test::fixed_width_column_wrapper mask_w(mask.begin(), mask.end()); - bool lhs_v[] = {false, true, true, true}; - wrapper lhs_w({5, 5, 5, 5}, lhs_v); + std::array lhs_v{false, true, true, true}; + wrapper lhs_w({5, 5, 
5, 5}, lhs_v.begin()); auto rhs_w = create_chrono_scalar{}(cudf::test::make_type_param_scalar(6), true); - wrapper expected_w({5, 6, 6, 5}, lhs_v); + wrapper expected_w({5, 6, 6, 5}, lhs_v.begin()); auto out = cudf::copy_if_else(lhs_w, rhs_w, mask_w); CUDF_TEST_EXPECT_COLUMNS_EQUAL(out->view(), expected_w); @@ -445,15 +436,13 @@ TYPED_TEST(CopyTestChrono, CopyIfElseTestScalarScalar) { using T = TypeParam; - int num_els = 4; - - bool mask[] = {true, false, false, true}; - cudf::test::fixed_width_column_wrapper mask_w(mask, mask + num_els); + std::array mask{true, false, false, true}; + cudf::test::fixed_width_column_wrapper mask_w(mask.begin(), mask.end()); auto lhs_w = create_chrono_scalar{}(cudf::test::make_type_param_scalar(5), true); auto rhs_w = create_chrono_scalar{}(cudf::test::make_type_param_scalar(6), false); - wrapper expected_w({5, 6, 6, 5}, mask); + wrapper expected_w({5, 6, 6, 5}, mask.begin()); auto out = cudf::copy_if_else(lhs_w, rhs_w, mask_w); CUDF_TEST_EXPECT_COLUMNS_EQUAL(out->view(), expected_w); @@ -483,9 +472,9 @@ TEST_F(StringsCopyIfElseTest, CopyIfElse) std::vector h_strings2{"zz", "", "yyy", "w", "ééé", "ooo"}; cudf::test::strings_column_wrapper strings2(h_strings2.begin(), h_strings2.end(), valids); - bool mask[] = {true, true, false, true, false, true}; - bool mask_v[] = {true, true, true, true, true, false}; - cudf::test::fixed_width_column_wrapper mask_w(mask, mask + 6, mask_v); + std::array mask{true, true, false, true, false, true}; + std::array mask_v{true, true, true, true, true, false}; + cudf::test::fixed_width_column_wrapper mask_w(mask.begin(), mask.end(), mask_v.begin()); auto results = cudf::copy_if_else(strings1, strings2, mask_w); @@ -510,9 +499,9 @@ TEST_F(StringsCopyIfElseTest, CopyIfElseScalarColumn) std::vector h_strings2{"zz", "", "yyy", "w", "ééé", "ooo"}; cudf::test::strings_column_wrapper strings2(h_strings2.begin(), h_strings2.end(), valids); - bool mask[] = {true, false, true, false, true, false}; - bool mask_v[] = {true, true, true, true, true, false}; - cudf::test::fixed_width_column_wrapper mask_w(mask, mask + 6, mask_v); + std::array mask{true, false, true, false, true, false}; + std::array mask_v{true, true, true, true, true, false}; + cudf::test::fixed_width_column_wrapper mask_w(mask.begin(), mask.end(), mask_v.begin()); auto results = cudf::copy_if_else(strings1, strings2, mask_w); @@ -538,8 +527,8 @@ TEST_F(StringsCopyIfElseTest, CopyIfElseColumnScalar) std::vector h_strings2{"zz", "", "yyy", "w", "ééé", "ooo"}; cudf::test::strings_column_wrapper strings2(h_strings2.begin(), h_strings2.end(), valids); - bool mask[] = {false, true, true, true, false, true}; - cudf::test::fixed_width_column_wrapper mask_w(mask, mask + 6); + std::array mask{false, true, true, true, false, true}; + cudf::test::fixed_width_column_wrapper mask_w(mask.begin(), mask.end()); auto results = cudf::copy_if_else(strings2, strings1, mask_w); @@ -565,9 +554,8 @@ TEST_F(StringsCopyIfElseTest, CopyIfElseScalarScalar) std::vector h_string2{"aaa"}; cudf::string_scalar string2{h_string2[0], false}; - constexpr cudf::size_type mask_size = 6; - bool mask[] = {true, false, true, false, true, false}; - cudf::test::fixed_width_column_wrapper mask_w(mask, mask + mask_size); + std::array mask{true, false, true, false, true, false}; + cudf::test::fixed_width_column_wrapper mask_w(mask.begin(), mask.end()); auto results = cudf::copy_if_else(string1, string2, mask_w); @@ -652,9 +640,9 @@ TEST_F(DictionaryCopyIfElseTest, ColumnColumn) cudf::test::dictionary_column_wrapper 
input2( h_strings2.begin(), h_strings2.end(), valids); - bool mask[] = {true, true, false, true, false, true}; - bool mask_v[] = {true, true, true, true, true, false}; - cudf::test::fixed_width_column_wrapper mask_w(mask, mask + 6, mask_v); + std::array mask{true, true, false, true, false, true}; + std::array mask_v{true, true, true, true, true, false}; + cudf::test::fixed_width_column_wrapper mask_w(mask.begin(), mask.end(), mask_v.begin()); auto results = cudf::copy_if_else(input1, input2, mask_w); auto decoded = cudf::dictionary::decode(cudf::dictionary_column_view(results->view())); @@ -679,8 +667,8 @@ TEST_F(DictionaryCopyIfElseTest, ColumnScalar) cudf::test::dictionary_column_wrapper input2( h_strings.begin(), h_strings.end(), valids); - bool mask[] = {false, true, true, true, false, true}; - cudf::test::fixed_width_column_wrapper mask_w(mask, mask + 6); + std::array mask{false, true, true, true, false, true}; + cudf::test::fixed_width_column_wrapper mask_w(mask.begin(), mask.end()); auto results = cudf::copy_if_else(input2, input1, mask_w); auto decoded = cudf::dictionary::decode(cudf::dictionary_column_view(results->view())); diff --git a/cpp/tests/filling/sequence_tests.cpp b/cpp/tests/filling/sequence_tests.cpp index 5651a26f192..0783b4e5bbb 100644 --- a/cpp/tests/filling/sequence_tests.cpp +++ b/cpp/tests/filling/sequence_tests.cpp @@ -41,8 +41,7 @@ TYPED_TEST(SequenceTypedTestFixture, Incrementing) cudf::size_type num_els = 10; - T expected[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}; - cudf::test::fixed_width_column_wrapper expected_w(expected, expected + num_els); + cudf::test::fixed_width_column_wrapper expected_w({0, 1, 2, 3, 4, 5, 6, 7, 8, 9}); auto result = cudf::sequence(num_els, init, step); @@ -58,8 +57,8 @@ TYPED_TEST(SequenceTypedTestFixture, Decrementing) cudf::size_type num_els = 10; - T expected[] = {0, -5, -10, -15, -20, -25, -30, -35, -40, -45}; - cudf::test::fixed_width_column_wrapper expected_w(expected, expected + num_els); + cudf::test::fixed_width_column_wrapper expected_w( + {0, -5, -10, -15, -20, -25, -30, -35, -40, -45}); auto result = cudf::sequence(num_els, init, step); @@ -75,8 +74,7 @@ TYPED_TEST(SequenceTypedTestFixture, EmptyOutput) cudf::size_type num_els = 0; - T expected[] = {}; - cudf::test::fixed_width_column_wrapper expected_w(expected, expected + num_els); + cudf::test::fixed_width_column_wrapper expected_w({}); auto result = cudf::sequence(num_els, init, step); @@ -121,8 +119,7 @@ TYPED_TEST(SequenceTypedTestFixture, DefaultStep) cudf::size_type num_els = 10; - T expected[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}; - cudf::test::fixed_width_column_wrapper expected_w(expected, expected + num_els); + cudf::test::fixed_width_column_wrapper expected_w({0, 1, 2, 3, 4, 5, 6, 7, 8, 9}); auto result = cudf::sequence(num_els, init); diff --git a/cpp/tests/groupby/collect_list_tests.cpp b/cpp/tests/groupby/collect_list_tests.cpp index 749f4013013..a79b6a32916 100644 --- a/cpp/tests/groupby/collect_list_tests.cpp +++ b/cpp/tests/groupby/collect_list_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
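
A note on the pattern that recurs in every hunk of this patch: a C-style "bool mask[]" decays implicitly to "bool*", while std::array never decays, so each converted call site must name the pointer or iterator it wants via .data() or .begin(). Below is a minimal editorial sketch of the before/after call shapes, assuming only the (begin, end, validity-begin) constructor of cudf::test::fixed_width_column_wrapper; the function name sketch is illustrative and not part of the patch itself:

// Illustrative sketch only; not part of this patch.
#include <algorithm>
#include <array>

#include <cudf_test/column_wrapper.hpp>

void sketch()
{
  bool legacy_validity[] = {true, false, true, true};     // decays to bool* implicitly
  std::array<bool, 4> validity{true, false, true, true};  // carries its size, never decays

  std::array<int32_t, 4> values{};
  std::fill(values.begin(), values.end(), 5);

  // Old call shape: the raw array decayed to a pointer at the call site.
  cudf::test::fixed_width_column_wrapper<int32_t> before(
    values.begin(), values.end(), legacy_validity);

  // New call shape: request the iterator (or .data() for pointer parameters) explicitly.
  cudf::test::fixed_width_column_wrapper<int32_t> after(
    values.begin(), values.end(), validity.begin());
}

The two wrappers behave identically; the std::array form exists to satisfy clang-tidy's modernize-avoid-c-arrays check, which is what the surrounding hunks apply file by file.
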
@@ -127,8 +127,9 @@ TYPED_TEST(groupby_collect_list_test, CollectListsWithNullExclusion) using LCW = cudf::test::lists_column_wrapper; cudf::test::fixed_width_column_wrapper keys{1, 1, 2, 2, 3, 3, 4, 4}; - bool const validity_mask[] = {true, false, false, true, true, true, false, false}; - LCW values{{{1, 2}, {3, 4}, {5, 6, 7}, LCW{}, {9, 10}, {11}, {20, 30, 40}, LCW{}}, validity_mask}; + std::array const validity_mask{true, false, false, true, true, true, false, false}; + LCW values{{{1, 2}, {3, 4}, {5, 6, 7}, LCW{}, {9, 10}, {11}, {20, 30, 40}, LCW{}}, + validity_mask.data()}; cudf::test::fixed_width_column_wrapper expect_keys{1, 2, 3, 4}; diff --git a/cpp/tests/interop/dlpack_test.cpp b/cpp/tests/interop/dlpack_test.cpp index 330f07ac8e2..ef4b9dd9b8a 100644 --- a/cpp/tests/interop/dlpack_test.cpp +++ b/cpp/tests/interop/dlpack_test.cpp @@ -225,8 +225,8 @@ TEST_F(DLPackUntypedTests, UnsupportedBroadcast1DTensorFromDlpack) constexpr int ndim = 1; // Broadcasted (stride-0) 1D tensor auto const data = cudf::test::make_type_param_vector({1}); - int64_t shape[ndim] = {5}; - int64_t strides[ndim] = {0}; + int64_t shape[ndim] = {5}; // NOLINT + int64_t strides[ndim] = {0}; // NOLINT DLManagedTensor tensor{}; tensor.dl_tensor.device.device_type = kDLCPU; @@ -248,8 +248,8 @@ TEST_F(DLPackUntypedTests, UnsupportedStrided1DTensorFromDlpack) constexpr int ndim = 1; // Strided 1D tensor auto const data = cudf::test::make_type_param_vector({1, 2, 3, 4}); - int64_t shape[ndim] = {2}; - int64_t strides[ndim] = {2}; + int64_t shape[ndim] = {2}; // NOLINT + int64_t strides[ndim] = {2}; // NOLINT DLManagedTensor tensor{}; tensor.dl_tensor.device.device_type = kDLCPU; @@ -271,7 +271,7 @@ TEST_F(DLPackUntypedTests, UnsupportedImplicitRowMajor2DTensorFromDlpack) constexpr int ndim = 2; // Row major 2D tensor auto const data = cudf::test::make_type_param_vector({1, 2, 3, 4}); - int64_t shape[ndim] = {2, 2}; + int64_t shape[ndim] = {2, 2}; // NOLINT DLManagedTensor tensor{}; tensor.dl_tensor.device.device_type = kDLCPU; @@ -293,8 +293,8 @@ TEST_F(DLPackUntypedTests, UnsupportedExplicitRowMajor2DTensorFromDlpack) constexpr int ndim = 2; // Row major 2D tensor with explicit strides auto const data = cudf::test::make_type_param_vector({1, 2, 3, 4}); - int64_t shape[ndim] = {2, 2}; - int64_t strides[ndim] = {2, 1}; + int64_t shape[ndim] = {2, 2}; // NOLINT + int64_t strides[ndim] = {2, 1}; // NOLINT DLManagedTensor tensor{}; tensor.dl_tensor.device.device_type = kDLCPU; @@ -316,8 +316,8 @@ TEST_F(DLPackUntypedTests, UnsupportedStridedColMajor2DTensorFromDlpack) constexpr int ndim = 2; // Column major, but strided in fastest dimension auto const data = cudf::test::make_type_param_vector({1, 2, 3, 4, 5, 6, 7, 8}); - int64_t shape[ndim] = {2, 2}; - int64_t strides[ndim] = {2, 4}; + int64_t shape[ndim] = {2, 2}; // NOLINT + int64_t strides[ndim] = {2, 4}; // NOLINT DLManagedTensor tensor{}; tensor.dl_tensor.device.device_type = kDLCPU; @@ -465,8 +465,8 @@ TYPED_TEST(DLPackNumericTests, FromDlpackCpu) using T = TypeParam; auto const data = cudf::test::make_type_param_vector({0, 1, 2, 3, 4, 0, 5, 6, 7, 8, 0}); uint64_t const offset{sizeof(T)}; - int64_t shape[2] = {4, 2}; - int64_t strides[2] = {1, 5}; + int64_t shape[2] = {4, 2}; // NOLINT + int64_t strides[2] = {1, 5}; // NOLINT DLManagedTensor tensor{}; tensor.dl_tensor.device.device_type = kDLCPU; diff --git a/cpp/tests/io/orc_test.cpp b/cpp/tests/io/orc_test.cpp index 39ba62952b4..89e704f3ed3 100644 --- a/cpp/tests/io/orc_test.cpp +++ b/cpp/tests/io/orc_test.cpp @@ 
-38,6 +38,7 @@
 #include 
+#include <array>
 #include 

 template <typename T>
@@ -767,14 +768,14 @@ TEST_F(OrcChunkedWriterTest, Metadata)

 TEST_F(OrcChunkedWriterTest, Strings)
 {
-  bool mask1[] = {true, true, false, true, true, true, true};
+  std::array mask1{true, true, false, true, true, true, true};
   std::vector h_strings1{"four", "score", "and", "seven", "years", "ago", "abcdefgh"};
-  str_col strings1(h_strings1.begin(), h_strings1.end(), mask1);
+  str_col strings1(h_strings1.begin(), h_strings1.end(), mask1.data());
   table_view tbl1({strings1});

-  bool mask2[] = {false, true, true, true, true, true, true};
+  std::array mask2{false, true, true, true, true, true, true};
   std::vector h_strings2{"ooooo", "ppppppp", "fff", "j", "cccc", "bbb", "zzzzzzzzzzz"};
-  str_col strings2(h_strings2.begin(), h_strings2.end(), mask2);
+  str_col strings2(h_strings2.begin(), h_strings2.end(), mask2.data());
   table_view tbl2({strings2});

   auto expected = cudf::concatenate(std::vector<table_view>({tbl1, tbl2}));
@@ -877,26 +878,26 @@ TYPED_TEST(OrcChunkedWriterNumericTypeTest, UnalignedSize)

   using T = TypeParam;

-  int num_els = 31;
+  constexpr int num_els{31};

-  bool mask[] = {false, true, true, true, true, true, true, true, true, true, true,
-                 true, true, true, true, true, true, true, true, true, true, true,
-                 true, true, true, true, true, true, true, true, true};
+  std::array mask{false, true, true, true, true, true, true, true, true, true, true,
+                  true, true, true, true, true, true, true, true, true, true, true,
+                  true, true, true, true, true, true, true, true, true};

-  T c1a[num_els];
-  std::fill(c1a, c1a + num_els, static_cast<T>(5));
-  T c1b[num_els];
-  std::fill(c1b, c1b + num_els, static_cast<T>(6));
-  column_wrapper<T> c1a_w(c1a, c1a + num_els, mask);
-  column_wrapper<T> c1b_w(c1b, c1b + num_els, mask);
+  std::array<T, num_els> c1a;
+  std::fill(c1a.begin(), c1a.end(), static_cast<T>(5));
+  std::array<T, num_els> c1b;
+  std::fill(c1b.begin(), c1b.end(), static_cast<T>(6));
+  column_wrapper<T> c1a_w(c1a.begin(), c1a.end(), mask.begin());
+  column_wrapper<T> c1b_w(c1b.begin(), c1b.end(), mask.begin());
   table_view tbl1({c1a_w, c1b_w});

-  T c2a[num_els];
-  std::fill(c2a, c2a + num_els, static_cast<T>(8));
-  T c2b[num_els];
-  std::fill(c2b, c2b + num_els, static_cast<T>(9));
-  column_wrapper<T> c2a_w(c2a, c2a + num_els, mask);
-  column_wrapper<T> c2b_w(c2b, c2b + num_els, mask);
+  std::array<T, num_els> c2a;
+  std::fill(c2a.begin(), c2a.end(), static_cast<T>(8));
+  std::array<T, num_els> c2b;
+  std::fill(c2b.begin(), c2b.end(), static_cast<T>(9));
+  column_wrapper<T> c2a_w(c2a.begin(), c2a.end(), mask.begin());
+  column_wrapper<T> c2b_w(c2b.begin(), c2b.end(), mask.begin());
   table_view tbl2({c2a_w, c2b_w});

   auto expected = cudf::concatenate(std::vector<table_view>({tbl1, tbl2}));
@@ -920,26 +921,26 @@ TYPED_TEST(OrcChunkedWriterNumericTypeTest, UnalignedSize2)

   using T = TypeParam;

-  int num_els = 33;
+  constexpr int num_els = 33;

-  bool mask[] = {false, true, true, true, true, true, true, true, true, true, true,
-                 true, true, true, true, true, true, true, true, true, true, true,
-                 true, true, true, true, true, true, true, true, true, true, true};
+  std::array mask{false, true, true, true, true, true, true, true, true, true, true,
+                  true, true, true, true, true, true, true, true, true, true, true,
+                  true, true, true, true, true, true, true, true, true, true, true};

-  T c1a[num_els];
-  std::fill(c1a, c1a + num_els, static_cast<T>(5));
-  T c1b[num_els];
-  std::fill(c1b, c1b + num_els, static_cast<T>(6));
-  column_wrapper<T> c1a_w(c1a, c1a + num_els, mask);
-  column_wrapper<T> c1b_w(c1b, c1b + num_els, mask);
+  std::array<T, num_els> c1a;
+  std::fill(c1a.begin(), c1a.end(), static_cast<T>(5));
+  std::array<T, num_els> c1b;
+  std::fill(c1b.begin(), c1b.end(), static_cast<T>(6));
+  column_wrapper<T> c1a_w(c1a.begin(), c1a.end(), mask.begin());
+  column_wrapper<T> c1b_w(c1b.begin(), c1b.end(), mask.begin());
   table_view tbl1({c1a_w, c1b_w});

-  T c2a[num_els];
-  std::fill(c2a, c2a + num_els, static_cast<T>(8));
-  T c2b[num_els];
-  std::fill(c2b, c2b + num_els, static_cast<T>(9));
-  column_wrapper<T> c2a_w(c2a, c2a + num_els, mask);
-  column_wrapper<T> c2b_w(c2b, c2b + num_els, mask);
+  std::array<T, num_els> c2a;
+  std::fill(c2a.begin(), c2a.end(), static_cast<T>(8));
+  std::array<T, num_els> c2b;
+  std::fill(c2b.begin(), c2b.end(), static_cast<T>(9));
+  column_wrapper<T> c2a_w(c2a.begin(), c2a.end(), mask.begin());
+  column_wrapper<T> c2b_w(c2b.begin(), c2b.end(), mask.begin());
   table_view tbl2({c2a_w, c2b_w});

   auto expected = cudf::concatenate(std::vector<table_view>({tbl1, tbl2}));
@@ -1140,7 +1141,7 @@ TEST_F(OrcReaderTest, zstdCompressionRegression)
   }

   // Test with zstd compressed orc file with high compression ratio.
-  constexpr uint8_t input_buffer[] = {
+  constexpr std::array input_buffer{
     0x4f, 0x52, 0x43, 0x5a, 0x00, 0x00, 0x28, 0xb5, 0x2f, 0xfd, 0xa4, 0x34, 0xc7, 0x03, 0x00, 0x74,
     0x00, 0x00, 0x18, 0x41, 0xff, 0xaa, 0x02, 0x00, 0xbb, 0xff, 0x45, 0xc8, 0x01, 0x25, 0x30, 0x04,
     0x65, 0x00, 0x00, 0x10, 0xaa, 0x1f, 0x02, 0x00, 0x01, 0x29, 0x0b, 0xc7, 0x39, 0xb8, 0x02, 0xcb,
@@ -1154,7 +1155,7 @@ TEST_F(OrcReaderTest, zstdCompressionRegression)
     0x30, 0x09, 0x82, 0xf4, 0x03, 0x03, 0x4f, 0x52, 0x43, 0x17};

   auto source =
-    cudf::io::source_info(reinterpret_cast<char const*>(input_buffer), sizeof(input_buffer));
+    cudf::io::source_info(reinterpret_cast<char const*>(input_buffer.data()), input_buffer.size());
   cudf::io::orc_reader_options in_opts =
     cudf::io::orc_reader_options::builder(source).use_index(false);
diff --git a/cpp/tests/io/parquet_chunked_writer_test.cpp b/cpp/tests/io/parquet_chunked_writer_test.cpp
index 282c6f3adad..810fee89c48 100644
--- a/cpp/tests/io/parquet_chunked_writer_test.cpp
+++ b/cpp/tests/io/parquet_chunked_writer_test.cpp
@@ -124,15 +124,15 @@ TEST_F(ParquetChunkedWriterTest, Strings)
 {
   std::vector<std::unique_ptr<cudf::column>> cols;

-  bool mask1[] = {true, true, false, true, true, true, true};
+  std::array mask1{true, true, false, true, true, true, true};
   std::vector h_strings1{"four", "score", "and", "seven", "years", "ago", "abcdefgh"};
-  cudf::test::strings_column_wrapper strings1(h_strings1.begin(), h_strings1.end(), mask1);
+  cudf::test::strings_column_wrapper strings1(h_strings1.begin(), h_strings1.end(), mask1.data());
   cols.push_back(strings1.release());
   cudf::table tbl1(std::move(cols));

-  bool mask2[] = {false, true, true, true, true, true, true};
+  std::array mask2{false, true, true, true, true, true, true};
   std::vector h_strings2{"ooooo", "ppppppp", "fff", "j", "cccc", "bbb", "zzzzzzzzzzz"};
-  cudf::test::strings_column_wrapper strings2(h_strings2.begin(), h_strings2.end(), mask2);
+  cudf::test::strings_column_wrapper strings2(h_strings2.begin(), h_strings2.end(), mask2.data());
   cols.push_back(strings2.release());
   cudf::table tbl2(std::move(cols));
@@ -771,29 +771,29 @@ TYPED_TEST(ParquetChunkedWriterNumericTypeTest, UnalignedSize)

   using T = TypeParam;

-  int num_els = 31;
+  constexpr int num_els = 31;

   std::vector<std::unique_ptr<cudf::column>> cols;

-  bool mask[] = {false, true, true, true, true, true, true, true, true, true, true,
-                 true, true, true, true, true, true, true, true, true, true, true,
-                 true, true, true, true, true, true, true, true, true};
+  std::array mask{false, true, true, true, true, true, true, true, true, true, true,
+                  true, true, true, true, true, true, true, true, true, true, true,
+                  true, true, true, true, true, true, true, true, true};

-  T c1a[num_els];
-  std::fill(c1a, c1a + num_els, static_cast<T>(5));
-  T c1b[num_els];
-  std::fill(c1b, c1b + num_els, static_cast<T>(6));
-  column_wrapper<T> c1a_w(c1a, c1a + num_els, mask);
-  column_wrapper<T> c1b_w(c1b, c1b + num_els, mask);
+  std::array<T, num_els> c1a;
+  std::fill(c1a.begin(), c1a.end(), static_cast<T>(5));
+  std::array<T, num_els> c1b;
+  std::fill(c1b.begin(), c1b.end(), static_cast<T>(6));
+  column_wrapper<T> c1a_w(c1a.begin(), c1a.end(), mask.begin());
+  column_wrapper<T> c1b_w(c1b.begin(), c1b.end(), mask.begin());
   cols.push_back(c1a_w.release());
   cols.push_back(c1b_w.release());
   cudf::table tbl1(std::move(cols));

-  T c2a[num_els];
-  std::fill(c2a, c2a + num_els, static_cast<T>(8));
-  T c2b[num_els];
-  std::fill(c2b, c2b + num_els, static_cast<T>(9));
-  column_wrapper<T> c2a_w(c2a, c2a + num_els, mask);
-  column_wrapper<T> c2b_w(c2b, c2b + num_els, mask);
+  std::array<T, num_els> c2a;
+  std::fill(c2a.begin(), c2a.end(), static_cast<T>(8));
+  std::array<T, num_els> c2b;
+  std::fill(c2b.begin(), c2b.end(), static_cast<T>(9));
+  column_wrapper<T> c2a_w(c2a.begin(), c2a.end(), mask.begin());
+  column_wrapper<T> c2b_w(c2b.begin(), c2b.end(), mask.begin());
   cols.push_back(c2a_w.release());
   cols.push_back(c2b_w.release());
   cudf::table tbl2(std::move(cols));
@@ -819,29 +819,29 @@ TYPED_TEST(ParquetChunkedWriterNumericTypeTest, UnalignedSize2)

   using T = TypeParam;

-  int num_els = 33;
+  constexpr int num_els = 33;

   std::vector<std::unique_ptr<cudf::column>> cols;

-  bool mask[] = {false, true, true, true, true, true, true, true, true, true, true,
-                 true, true, true, true, true, true, true, true, true, true, true,
-                 true, true, true, true, true, true, true, true, true, true, true};
+  std::array mask{false, true, true, true, true, true, true, true, true, true, true,
+                  true, true, true, true, true, true, true, true, true, true, true,
+                  true, true, true, true, true, true, true, true, true, true, true};

-  T c1a[num_els];
-  std::fill(c1a, c1a + num_els, static_cast<T>(5));
-  T c1b[num_els];
-  std::fill(c1b, c1b + num_els, static_cast<T>(6));
-  column_wrapper<T> c1a_w(c1a, c1a + num_els, mask);
-  column_wrapper<T> c1b_w(c1b, c1b + num_els, mask);
+  std::array<T, num_els> c1a;
+  std::fill(c1a.begin(), c1a.end(), static_cast<T>(5));
+  std::array<T, num_els> c1b;
+  std::fill(c1b.begin(), c1b.end(), static_cast<T>(6));
+  column_wrapper<T> c1a_w(c1a.begin(), c1a.end(), mask.begin());
+  column_wrapper<T> c1b_w(c1b.begin(), c1b.end(), mask.begin());
   cols.push_back(c1a_w.release());
   cols.push_back(c1b_w.release());
   cudf::table tbl1(std::move(cols));

-  T c2a[num_els];
-  std::fill(c2a, c2a + num_els, static_cast<T>(8));
-  T c2b[num_els];
-  std::fill(c2b, c2b + num_els, static_cast<T>(9));
-  column_wrapper<T> c2a_w(c2a, c2a + num_els, mask);
-  column_wrapper<T> c2b_w(c2b, c2b + num_els, mask);
+  std::array<T, num_els> c2a;
+  std::fill(c2a.begin(), c2a.end(), static_cast<T>(8));
+  std::array<T, num_els> c2b;
+  std::fill(c2b.begin(), c2b.end(), static_cast<T>(9));
+  column_wrapper<T> c2a_w(c2a.begin(), c2a.end(), mask.begin());
+  column_wrapper<T> c2b_w(c2b.begin(), c2b.end(), mask.begin());
   cols.push_back(c2a_w.release());
   cols.push_back(c2b_w.release());
   cudf::table tbl2(std::move(cols));
diff --git a/cpp/tests/io/parquet_common.cpp b/cpp/tests/io/parquet_common.cpp
index 3dd5ad145ea..6141a40bc95 100644
--- a/cpp/tests/io/parquet_common.cpp
+++ b/cpp/tests/io/parquet_common.cpp
@@ -483,10 +483,10 @@ template <typename T>
 std::enable_if_t<std::is_same_v<T, cudf::string_view>, cudf::test::strings_column_wrapper> ascending()
 {
-  char buf[10];
+  std::array<char, 10> buf;
   auto elements = cudf::detail::make_counting_transform_iterator(0, [&buf](auto i) {
-    sprintf(buf, "%09d", i);
-    return
std::string(buf); + sprintf(buf.data(), "%09d", i); + return std::string(buf.data()); }); return cudf::test::strings_column_wrapper(elements, elements + num_ordered_rows); } @@ -495,10 +495,10 @@ template std::enable_if_t, cudf::test::strings_column_wrapper> descending() { - char buf[10]; + std::array buf; auto elements = cudf::detail::make_counting_transform_iterator(0, [&buf](auto i) { - sprintf(buf, "%09d", num_ordered_rows - i); - return std::string(buf); + sprintf(buf.data(), "%09d", static_cast(num_ordered_rows - i)); + return std::string(buf.data()); }); return cudf::test::strings_column_wrapper(elements, elements + num_ordered_rows); } @@ -507,10 +507,10 @@ template std::enable_if_t, cudf::test::strings_column_wrapper> unordered() { - char buf[10]; + std::array buf; auto elements = cudf::detail::make_counting_transform_iterator(0, [&buf](auto i) { - sprintf(buf, "%09d", (i % 2 == 0) ? i : (num_ordered_rows - i)); - return std::string(buf); + sprintf(buf.data(), "%09d", (i % 2 == 0) ? i : (num_ordered_rows - i)); + return std::string(buf.data()); }); return cudf::test::strings_column_wrapper(elements, elements + num_ordered_rows); } diff --git a/cpp/tests/io/parquet_misc_test.cpp b/cpp/tests/io/parquet_misc_test.cpp index 01027d04658..8b03e94191e 100644 --- a/cpp/tests/io/parquet_misc_test.cpp +++ b/cpp/tests/io/parquet_misc_test.cpp @@ -23,6 +23,8 @@ #include #include +#include + //////////////////////////////// // delta encoding writer tests @@ -225,10 +227,9 @@ TYPED_TEST(ParquetWriterComparableTypeTest, ThreeColumnSorted) // now check that the boundary order for chunk 1 is ascending, // chunk 2 is descending, and chunk 3 is unordered - cudf::io::parquet::detail::BoundaryOrder expected_orders[] = { - cudf::io::parquet::detail::BoundaryOrder::ASCENDING, - cudf::io::parquet::detail::BoundaryOrder::DESCENDING, - cudf::io::parquet::detail::BoundaryOrder::UNORDERED}; + std::array expected_orders{cudf::io::parquet::detail::BoundaryOrder::ASCENDING, + cudf::io::parquet::detail::BoundaryOrder::DESCENDING, + cudf::io::parquet::detail::BoundaryOrder::UNORDERED}; for (std::size_t i = 0; i < columns.size(); i++) { auto const ci = read_column_index(source, columns[i]); diff --git a/cpp/tests/io/parquet_reader_test.cpp b/cpp/tests/io/parquet_reader_test.cpp index 6c61535359f..dc8e68b3a15 100644 --- a/cpp/tests/io/parquet_reader_test.cpp +++ b/cpp/tests/io/parquet_reader_test.cpp @@ -29,6 +29,8 @@ #include #include +#include + TEST_F(ParquetReaderTest, UserBounds) { // trying to read more rows than there are should result in @@ -569,7 +571,8 @@ TEST_F(ParquetReaderTest, DecimalRead) This test is a temporary test until python gains the ability to write decimal, so we're embedding a parquet file directly into the code here to prevent issues with finding the file */ - unsigned char const decimals_parquet[] = { + constexpr unsigned int decimals_parquet_len = 2366; + std::array const decimals_parquet{ 0x50, 0x41, 0x52, 0x31, 0x15, 0x00, 0x15, 0xb0, 0x03, 0x15, 0xb8, 0x03, 0x2c, 0x15, 0x6a, 0x15, 0x00, 0x15, 0x06, 0x15, 0x08, 0x1c, 0x36, 0x02, 0x28, 0x04, 0x7f, 0x96, 0x98, 0x00, 0x18, 0x04, 0x81, 0x69, 0x67, 0xff, 0x00, 0x00, 0x00, 0xd8, 0x01, 0xf0, 0xd7, 0x04, 0x00, @@ -728,10 +731,10 @@ TEST_F(ParquetReaderTest, DecimalRead) 0x30, 0x36, 0x30, 0x36, 0x39, 0x65, 0x35, 0x30, 0x63, 0x39, 0x62, 0x37, 0x39, 0x37, 0x30, 0x62, 0x65, 0x62, 0x64, 0x31, 0x29, 0x19, 0x3c, 0x1c, 0x00, 0x00, 0x1c, 0x00, 0x00, 0x1c, 0x00, 0x00, 0x00, 0xd3, 0x02, 0x00, 0x00, 0x50, 0x41, 0x52, 0x31}; - unsigned int 
decimals_parquet_len = 2366; - cudf::io::parquet_reader_options read_opts = cudf::io::parquet_reader_options::builder( - cudf::io::source_info{reinterpret_cast(decimals_parquet), decimals_parquet_len}); + cudf::io::parquet_reader_options read_opts = + cudf::io::parquet_reader_options::builder(cudf::io::source_info{ + reinterpret_cast(decimals_parquet.data()), decimals_parquet_len}); auto result = cudf::io::read_parquet(read_opts); auto validity = @@ -739,7 +742,7 @@ TEST_F(ParquetReaderTest, DecimalRead) EXPECT_EQ(result.tbl->view().num_columns(), 3); - int32_t col0_data[] = { + std::array col0_data{ -2354584, -190275, 8393572, 6446515, -5687920, -1843550, -6897687, -6780385, 3428529, 5842056, -4312278, -4450603, -7516141, 2974667, -4288640, 1065090, -9410428, 7891355, 1076244, -1975984, 6999466, 2666959, 9262967, 7931374, -1370640, 451074, 8799111, @@ -753,29 +756,28 @@ TEST_F(ParquetReaderTest, DecimalRead) std::begin(col0_data), std::end(col0_data), validity, numeric::scale_type{-4}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->view().column(0), col0); - int64_t col1_data[] = {29274040266581, -17210335917753, -58420730139037, - 68073792696254, 2236456014294, 13704555677045, - -70797090469548, -52248605513407, -68976081919961, - -34277313883112, 97774730521689, 21184241014572, - -670882460254, -40862944054399, -24079852370612, - -88670167797498, -84007574359403, -71843004533519, - -55538016554201, 3491435293032, -29085437167297, - 36901882672273, -98622066122568, -13974902998457, - 86712597643378, -16835133643735, -94759096142232, - 30708340810940, 79086853262082, 78923696440892, - -76316597208589, 37247268714759, 80303592631774, - 57790350050889, 19387319851064, -33186875066145, - 69701203023404, -7157433049060, -7073790423437, - 92769171617714, -75127120182184, -951893180618, - 64927618310150, -53875897154023, -16168039035569, - -24273449166429, -30359781249192, 35639397345991, - 45844829680593, 71401416837149, 0, - -99999999999999, 99999999999999}; - - EXPECT_EQ(static_cast(result.tbl->view().column(1).size()), - sizeof(col1_data) / sizeof(col1_data[0])); + std::array col1_data{29274040266581, -17210335917753, -58420730139037, + 68073792696254, 2236456014294, 13704555677045, + -70797090469548, -52248605513407, -68976081919961, + -34277313883112, 97774730521689, 21184241014572, + -670882460254, -40862944054399, -24079852370612, + -88670167797498, -84007574359403, -71843004533519, + -55538016554201, 3491435293032, -29085437167297, + 36901882672273, -98622066122568, -13974902998457, + 86712597643378, -16835133643735, -94759096142232, + 30708340810940, 79086853262082, 78923696440892, + -76316597208589, 37247268714759, 80303592631774, + 57790350050889, 19387319851064, -33186875066145, + 69701203023404, -7157433049060, -7073790423437, + 92769171617714, -75127120182184, -951893180618, + 64927618310150, -53875897154023, -16168039035569, + -24273449166429, -30359781249192, 35639397345991, + 45844829680593, 71401416837149, 0, + -99999999999999, 99999999999999}; + + EXPECT_EQ(static_cast(result.tbl->view().column(1).size()), col1_data.size()); cudf::test::fixed_point_column_wrapper col1( - std::begin(col1_data), std::end(col1_data), validity, numeric::scale_type{-5}); + col1_data.begin(), col1_data.end(), validity, numeric::scale_type{-5}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->view().column(1), col1); cudf::io::parquet_reader_options read_strict_opts = read_opts; @@ -786,7 +788,7 @@ TEST_F(ParquetReaderTest, DecimalRead) // dec7p3: Decimal(precision=7, scale=3) backed by 
FIXED_LENGTH_BYTE_ARRAY(length = 4) // dec12p11: Decimal(precision=12, scale=11) backed by FIXED_LENGTH_BYTE_ARRAY(length = 6) // dec20p1: Decimal(precision=20, scale=1) backed by FIXED_LENGTH_BYTE_ARRAY(length = 9) - unsigned char const fixed_len_bytes_decimal_parquet[] = { + std::array const fixed_len_bytes_decimal_parquet{ 0x50, 0x41, 0x52, 0x31, 0x15, 0x00, 0x15, 0xA8, 0x01, 0x15, 0xAE, 0x01, 0x2C, 0x15, 0x28, 0x15, 0x00, 0x15, 0x06, 0x15, 0x08, 0x1C, 0x36, 0x02, 0x28, 0x04, 0x00, 0x97, 0x45, 0x72, 0x18, 0x04, 0x00, 0x01, 0x81, 0x3B, 0x00, 0x00, 0x00, 0x54, 0xF0, 0x53, 0x04, 0x00, 0x00, @@ -875,75 +877,72 @@ TEST_F(ParquetReaderTest, DecimalRead) cudf::io::parquet_reader_options read_opts = cudf::io::parquet_reader_options::builder(cudf::io::source_info{ - reinterpret_cast(fixed_len_bytes_decimal_parquet), parquet_len}); + reinterpret_cast(fixed_len_bytes_decimal_parquet.data()), parquet_len}); auto result = cudf::io::read_parquet(read_opts); EXPECT_EQ(result.tbl->view().num_columns(), 3); - auto validity_c0 = cudf::test::iterators::nulls_at({19}); - int32_t col0_data[] = {6361295, 698632, 7821423, 7073444, 9631892, 3021012, 5195059, - 9913714, 901749, 7776938, 3186566, 4955569, 5131067, 98619, - 2282579, 7521455, 4430706, 1937859, 4532040, 0}; + auto validity_c0 = cudf::test::iterators::nulls_at({19}); + std::array col0_data{6361295, 698632, 7821423, 7073444, 9631892, 3021012, 5195059, + 9913714, 901749, 7776938, 3186566, 4955569, 5131067, 98619, + 2282579, 7521455, 4430706, 1937859, 4532040, 0}; - EXPECT_EQ(static_cast(result.tbl->view().column(0).size()), - sizeof(col0_data) / sizeof(col0_data[0])); + EXPECT_EQ(static_cast(result.tbl->view().column(0).size()), col0_data.size()); cudf::test::fixed_point_column_wrapper col0( - std::begin(col0_data), std::end(col0_data), validity_c0, numeric::scale_type{-3}); + col0_data.begin(), col0_data.end(), validity_c0, numeric::scale_type{-3}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->view().column(0), col0); - auto validity_c1 = cudf::test::iterators::nulls_at({18}); - int64_t col1_data[] = {361378026250, - 30646804862, - 429930238629, - 418758703536, - 895494171113, - 435283865083, - 809096053722, - -999999999999, - 426465099333, - 526684574144, - 826310892810, - 584686967589, - 113822282951, - 409236212092, - 420631167535, - 918438386086, - -999999999999, - 489053889147, - 0, - 363993164092}; - - EXPECT_EQ(static_cast(result.tbl->view().column(1).size()), - sizeof(col1_data) / sizeof(col1_data[0])); + auto validity_c1 = cudf::test::iterators::nulls_at({18}); + std::array col1_data{361378026250, + 30646804862, + 429930238629, + 418758703536, + 895494171113, + 435283865083, + 809096053722, + -999999999999, + 426465099333, + 526684574144, + 826310892810, + 584686967589, + 113822282951, + 409236212092, + 420631167535, + 918438386086, + -999999999999, + 489053889147, + 0, + 363993164092}; + + EXPECT_EQ(static_cast(result.tbl->view().column(1).size()), col1_data.size()); cudf::test::fixed_point_column_wrapper col1( - std::begin(col1_data), std::end(col1_data), validity_c1, numeric::scale_type{-11}); + col1_data.begin(), col1_data.end(), validity_c1, numeric::scale_type{-11}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->view().column(1), col1); - auto validity_c2 = cudf::test::iterators::nulls_at({6, 14}); - __int128_t col2_data[] = {9078697037144433659, - 9050770539577117612, - 2358363961733893636, - 1566059559232276662, - 6658306200002735268, - 4967909073046397334, - 0, - 7235588493887532473, - 5023160741463849572, - 2765173712965988273, - 
3880866513515749646, - 5019704400576359500, - 5544435986818825655, - 7265381725809874549, - 0, - 1576192427381240677, - 2828305195087094598, - 260308667809395171, - 2460080200895288476, - 2718441925197820439}; - - EXPECT_EQ(static_cast(result.tbl->view().column(2).size()), - sizeof(col2_data) / sizeof(col2_data[0])); + auto validity_c2 = cudf::test::iterators::nulls_at({6, 14}); + std::array<__int128_t, 20> col2_data{9078697037144433659, + 9050770539577117612, + 2358363961733893636, + 1566059559232276662, + 6658306200002735268, + 4967909073046397334, + 0, + 7235588493887532473, + 5023160741463849572, + 2765173712965988273, + 3880866513515749646, + 5019704400576359500, + 5544435986818825655, + 7265381725809874549, + 0, + 1576192427381240677, + 2828305195087094598, + 260308667809395171, + 2460080200895288476, + 2718441925197820439}; + + EXPECT_EQ(static_cast(result.tbl->view().column(2).size()), col2_data.size()); cudf::test::fixed_point_column_wrapper<__int128_t> col2( - std::begin(col2_data), std::end(col2_data), validity_c2, numeric::scale_type{-1}); + col2_data.begin(), col2_data.end(), validity_c2, numeric::scale_type{-1}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->view().column(2), col2); } } @@ -1221,7 +1220,7 @@ TEST_F(ParquetReaderTest, NestingOptimizationTest) TEST_F(ParquetReaderTest, SingleLevelLists) { - unsigned char list_bytes[] = { + std::array list_bytes{ 0x50, 0x41, 0x52, 0x31, 0x15, 0x00, 0x15, 0x28, 0x15, 0x28, 0x15, 0xa7, 0xce, 0x91, 0x8c, 0x06, 0x1c, 0x15, 0x04, 0x15, 0x00, 0x15, 0x06, 0x15, 0x06, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x03, 0x02, 0x02, 0x00, 0x00, 0x00, 0x03, 0x03, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x15, @@ -1239,7 +1238,7 @@ TEST_F(ParquetReaderTest, SingleLevelLists) // read single level list reproducing parquet file cudf::io::parquet_reader_options read_opts = cudf::io::parquet_reader_options::builder( - cudf::io::source_info{reinterpret_cast(list_bytes), sizeof(list_bytes)}); + cudf::io::source_info{reinterpret_cast(list_bytes.data()), list_bytes.size()}); auto table = cudf::io::read_parquet(read_opts); auto const c0 = table.tbl->get_column(0); @@ -1252,7 +1251,7 @@ TEST_F(ParquetReaderTest, SingleLevelLists) TEST_F(ParquetReaderTest, ChunkedSingleLevelLists) { - unsigned char list_bytes[] = { + std::array list_bytes{ 0x50, 0x41, 0x52, 0x31, 0x15, 0x00, 0x15, 0x28, 0x15, 0x28, 0x15, 0xa7, 0xce, 0x91, 0x8c, 0x06, 0x1c, 0x15, 0x04, 0x15, 0x00, 0x15, 0x06, 0x15, 0x06, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x03, 0x02, 0x02, 0x00, 0x00, 0x00, 0x03, 0x03, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x15, @@ -1271,7 +1270,7 @@ TEST_F(ParquetReaderTest, ChunkedSingleLevelLists) auto reader = cudf::io::chunked_parquet_reader( 1L << 31, cudf::io::parquet_reader_options::builder( - cudf::io::source_info{reinterpret_cast(list_bytes), sizeof(list_bytes)})); + cudf::io::source_info{reinterpret_cast(list_bytes.data()), list_bytes.size()})); int iterations = 0; while (reader.has_next() && iterations < 10) { auto chunk = reader.read_chunk(); @@ -1932,7 +1931,7 @@ TEST_F(ParquetReaderTest, FilterFloatNAN) TEST_F(ParquetReaderTest, RepeatedNoAnnotations) { - constexpr unsigned char repeated_bytes[] = { + constexpr std::array repeated_bytes{ 0x50, 0x41, 0x52, 0x31, 0x15, 0x04, 0x15, 0x30, 0x15, 0x30, 0x4c, 0x15, 0x0c, 0x15, 0x00, 0x12, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x15, 0x00, 0x15, 0x0a, 0x15, 0x0a, @@ -1976,9 +1975,9 @@ 
TEST_F(ParquetReaderTest, RepeatedNoAnnotations) 0x61, 0x38, 0x33, 0x39, 0x31, 0x36, 0x63, 0x36, 0x39, 0x62, 0x35, 0x65, 0x29, 0x00, 0x32, 0x01, 0x00, 0x00, 0x50, 0x41, 0x52, 0x31}; - auto read_opts = cudf::io::parquet_reader_options::builder( - cudf::io::source_info{reinterpret_cast(repeated_bytes), sizeof(repeated_bytes)}); - auto result = cudf::io::read_parquet(read_opts); + auto read_opts = cudf::io::parquet_reader_options::builder(cudf::io::source_info{ + reinterpret_cast(repeated_bytes.data()), repeated_bytes.size()}); + auto result = cudf::io::read_parquet(read_opts); EXPECT_EQ(result.tbl->view().column(0).size(), 6); EXPECT_EQ(result.tbl->view().num_columns(), 2); diff --git a/cpp/tests/io/parquet_v2_test.cpp b/cpp/tests/io/parquet_v2_test.cpp index 9e66fc9409f..7c305235ea6 100644 --- a/cpp/tests/io/parquet_v2_test.cpp +++ b/cpp/tests/io/parquet_v2_test.cpp @@ -23,6 +23,8 @@ #include +#include + using cudf::test::iterators::no_nulls; // Base test fixture for V2 header tests @@ -693,9 +695,9 @@ TEST_P(ParquetV2Test, CheckColumnOffsetIndex) // fixed length strings auto str1_elements = cudf::detail::make_counting_transform_iterator(0, [](auto i) { - char buf[30]; - sprintf(buf, "%012d", i); - return std::string(buf); + std::array buf; + sprintf(buf.data(), "%012d", i); + return std::string(buf.data()); }); auto col0 = cudf::test::strings_column_wrapper(str1_elements, str1_elements + num_rows); @@ -715,9 +717,9 @@ TEST_P(ParquetV2Test, CheckColumnOffsetIndex) // mixed length strings auto str2_elements = cudf::detail::make_counting_transform_iterator(0, [](auto i) { - char buf[30]; - sprintf(buf, "%d", i); - return std::string(buf); + std::array buf; + sprintf(buf.data(), "%d", i); + return std::string(buf.data()); }); auto col7 = cudf::test::strings_column_wrapper(str2_elements, str2_elements + num_rows); @@ -787,9 +789,9 @@ TEST_P(ParquetV2Test, CheckColumnOffsetIndexNulls) // fixed length strings auto str1_elements = cudf::detail::make_counting_transform_iterator(0, [](auto i) { - char buf[30]; - sprintf(buf, "%012d", i); - return std::string(buf); + std::array buf; + sprintf(buf.data(), "%012d", i); + return std::string(buf.data()); }); auto col0 = cudf::test::strings_column_wrapper(str1_elements, str1_elements + num_rows); @@ -819,9 +821,9 @@ TEST_P(ParquetV2Test, CheckColumnOffsetIndexNulls) // mixed length strings auto str2_elements = cudf::detail::make_counting_transform_iterator(0, [](auto i) { - char buf[30]; - sprintf(buf, "%d", i); - return std::string(buf); + std::array buf; + sprintf(buf.data(), "%d", i); + return std::string(buf.data()); }); auto col7 = cudf::test::strings_column_wrapper(str2_elements, str2_elements + num_rows, valids); @@ -897,9 +899,9 @@ TEST_P(ParquetV2Test, CheckColumnOffsetIndexNullColumn) // fixed length strings auto str1_elements = cudf::detail::make_counting_transform_iterator(0, [](auto i) { - char buf[30]; - sprintf(buf, "%012d", i); - return std::string(buf); + std::array buf; + sprintf(buf.data(), "%012d", i); + return std::string(buf.data()); }); auto col0 = cudf::test::strings_column_wrapper(str1_elements, str1_elements + num_rows); @@ -914,9 +916,9 @@ TEST_P(ParquetV2Test, CheckColumnOffsetIndexNullColumn) // mixed length strings auto str2_elements = cudf::detail::make_counting_transform_iterator(0, [](auto i) { - char buf[30]; - sprintf(buf, "%d", i); - return std::string(buf); + std::array buf; + sprintf(buf.data(), "%d", i); + return std::string(buf.data()); }); auto col3 = cudf::test::strings_column_wrapper(str2_elements, str2_elements 
+ num_rows); @@ -1034,7 +1036,7 @@ TEST_P(ParquetV2Test, CheckColumnOffsetIndexStruct) // hard coded schema indices. // TODO find a way to do this without magic - size_t const colidxs[] = {1, 3, 4, 5, 8}; + constexpr std::array colidxs{1, 3, 4, 5, 8}; for (size_t r = 0; r < fmd.row_groups.size(); r++) { auto const& rg = fmd.row_groups[r]; for (size_t c = 0; c < rg.columns.size(); c++) { @@ -1129,7 +1131,7 @@ TEST_P(ParquetV2Test, CheckColumnOffsetIndexStructNulls) // col1 will have num_ordered_rows / 2 nulls total // col2 will have num_ordered_rows / 3 nulls total // col3 will have num_ordered_rows / 4 nulls total - int const null_mods[] = {0, 2, 3, 4}; + constexpr std::array null_mods{0, 2, 3, 4}; for (auto const& rg : fmd.row_groups) { for (size_t c = 0; c < rg.columns.size(); c++) { @@ -1299,7 +1301,7 @@ TEST_P(ParquetV2Test, CheckColumnIndexListWithNulls) table_view expected({col0, col1, col2, col3, col4, col5, col6, col7}); - int64_t const expected_null_counts[] = {4, 4, 4, 6, 4, 6, 4, 5, 11}; + std::array expected_null_counts{4, 4, 4, 6, 4, 6, 4, 5, 11}; std::vector const expected_def_hists[] = {{1, 1, 2, 3}, {1, 3, 10}, {1, 1, 2, 10}, diff --git a/cpp/tests/io/parquet_writer_test.cpp b/cpp/tests/io/parquet_writer_test.cpp index c8100038942..8794f2ee304 100644 --- a/cpp/tests/io/parquet_writer_test.cpp +++ b/cpp/tests/io/parquet_writer_test.cpp @@ -31,6 +31,7 @@ #include #include +#include #include using cudf::test::iterators::no_nulls; @@ -879,53 +880,52 @@ TEST_F(ParquetWriterTest, Decimal128Stats) TEST_F(ParquetWriterTest, CheckColumnIndexTruncation) { - char const* coldata[] = { - // in-range 7 bit. should truncate to "yyyyyyyz" - "yyyyyyyyy", - // max 7 bit. should truncate to "x7fx7fx7fx7fx7fx7fx7fx80", since it's - // considered binary, not UTF-8. If UTF-8 it should not truncate. - "\x7f\x7f\x7f\x7f\x7f\x7f\x7f\x7f\x7f", - // max binary. this should not truncate - "\xff\xff\xff\xff\xff\xff\xff\xff\xff", - // in-range 2-byte UTF8 (U+00E9). should truncate to "éééê" - "ééééé", - // max 2-byte UTF8 (U+07FF). should not truncate - "߿߿߿߿߿", - // in-range 3-byte UTF8 (U+0800). should truncate to "ࠀࠁ" - "ࠀࠀࠀ", - // max 3-byte UTF8 (U+FFFF). should not truncate - "\xef\xbf\xbf\xef\xbf\xbf\xef\xbf\xbf", - // in-range 4-byte UTF8 (U+10000). should truncate to "𐀀𐀁" - "𐀀𐀀𐀀", - // max unicode (U+10FFFF). should truncate to \xf4\x8f\xbf\xbf\xf4\x90\x80\x80, - // which is no longer valid unicode, but is still ok UTF-8??? - "\xf4\x8f\xbf\xbf\xf4\x8f\xbf\xbf\xf4\x8f\xbf\xbf", - // max 4-byte UTF8 (U+1FFFFF). should not truncate - "\xf7\xbf\xbf\xbf\xf7\xbf\xbf\xbf\xf7\xbf\xbf\xbf"}; + std::array coldata{// in-range 7 bit. should truncate to "yyyyyyyz" + "yyyyyyyyy", + // max 7 bit. should truncate to "x7fx7fx7fx7fx7fx7fx7fx80", since it's + // considered binary, not UTF-8. If UTF-8 it should not truncate. + "\x7f\x7f\x7f\x7f\x7f\x7f\x7f\x7f\x7f", + // max binary. this should not truncate + "\xff\xff\xff\xff\xff\xff\xff\xff\xff", + // in-range 2-byte UTF8 (U+00E9). should truncate to "éééê" + "ééééé", + // max 2-byte UTF8 (U+07FF). should not truncate + "߿߿߿߿߿", + // in-range 3-byte UTF8 (U+0800). should truncate to "ࠀࠁ" + "ࠀࠀࠀ", + // max 3-byte UTF8 (U+FFFF). should not truncate + "\xef\xbf\xbf\xef\xbf\xbf\xef\xbf\xbf", + // in-range 4-byte UTF8 (U+10000). should truncate to "𐀀𐀁" + "𐀀𐀀𐀀", + // max unicode (U+10FFFF). should truncate to \xf4\x8f\xbf\xbf\xf4\x90\x80\x80, + // which is no longer valid unicode, but is still ok UTF-8??? 
+ "\xf4\x8f\xbf\xbf\xf4\x8f\xbf\xbf\xf4\x8f\xbf\xbf", + // max 4-byte UTF8 (U+1FFFFF). should not truncate + "\xf7\xbf\xbf\xbf\xf7\xbf\xbf\xbf\xf7\xbf\xbf\xbf"}; // NOTE: UTF8 min is initialized with 0xf7bfbfbf. Binary values larger // than that will not become minimum value (when written as UTF-8). - char const* truncated_min[] = {"yyyyyyyy", - "\x7f\x7f\x7f\x7f\x7f\x7f\x7f\x7f", - "\xf7\xbf\xbf\xbf", - "éééé", - "߿߿߿߿", - "ࠀࠀ", - "\xef\xbf\xbf\xef\xbf\xbf", - "𐀀𐀀", - "\xf4\x8f\xbf\xbf\xf4\x8f\xbf\xbf", - "\xf7\xbf\xbf\xbf"}; - - char const* truncated_max[] = {"yyyyyyyz", - "\x7f\x7f\x7f\x7f\x7f\x7f\x7f\x80", - "\xff\xff\xff\xff\xff\xff\xff\xff\xff", - "éééê", - "߿߿߿߿߿", - "ࠀࠁ", - "\xef\xbf\xbf\xef\xbf\xbf\xef\xbf\xbf", - "𐀀𐀁", - "\xf4\x8f\xbf\xbf\xf4\x90\x80\x80", - "\xf7\xbf\xbf\xbf\xf7\xbf\xbf\xbf\xf7\xbf\xbf\xbf"}; + std::array truncated_min{"yyyyyyyy", + "\x7f\x7f\x7f\x7f\x7f\x7f\x7f\x7f", + "\xf7\xbf\xbf\xbf", + "éééé", + "߿߿߿߿", + "ࠀࠀ", + "\xef\xbf\xbf\xef\xbf\xbf", + "𐀀𐀀", + "\xf4\x8f\xbf\xbf\xf4\x8f\xbf\xbf", + "\xf7\xbf\xbf\xbf"}; + + std::array truncated_max{"yyyyyyyz", + "\x7f\x7f\x7f\x7f\x7f\x7f\x7f\x80", + "\xff\xff\xff\xff\xff\xff\xff\xff\xff", + "éééê", + "߿߿߿߿߿", + "ࠀࠁ", + "\xef\xbf\xbf\xef\xbf\xbf\xef\xbf\xbf", + "𐀀𐀁", + "\xf4\x8f\xbf\xbf\xf4\x90\x80\x80", + "\xf7\xbf\xbf\xbf\xf7\xbf\xbf\xbf\xf7\xbf\xbf\xbf"}; auto cols = [&]() { using string_wrapper = column_wrapper; diff --git a/cpp/tests/json/json_tests.cpp b/cpp/tests/json/json_tests.cpp index a9186874e83..42a574ac5c0 100644 --- a/cpp/tests/json/json_tests.cpp +++ b/cpp/tests/json/json_tests.cpp @@ -652,7 +652,7 @@ TEST_F(JsonPathTests, MixedOutput) // various queries on: // clang-format off std::vector input_strings { - "{\"a\": {\"b\" : \"c\"}}", + R"({"a": {"b" : "c"}})", "{" "\"a\": {\"b\" : \"c\"}," @@ -827,7 +827,7 @@ TEST_F(JsonPathTests, AllowSingleQuotes) // various queries on: std::vector input_strings{ // clang-format off - "{\'a\': {\'b\' : \'c\'}}", + R"({'a': {'b' : 'c'}})", "{" "\'a\': {\'b\' : \"c\"}," @@ -902,7 +902,7 @@ TEST_F(JsonPathTests, StringsWithSpecialChars) { std::vector input_strings{ // clang-format off - "{\"item\" : [{\"key\" : \"value[\"}]}", + R"({"item" : [{"key" : "value["}]})", // clang-format on }; @@ -927,7 +927,7 @@ TEST_F(JsonPathTests, StringsWithSpecialChars) { std::vector input_strings{ // clang-format off - "{\"a\" : \"[}{}][][{[\\\"}}[\\\"]\"}", + R"({"a" : "[}{}][][{[\"}}[\"]"})", // clang-format on }; @@ -958,8 +958,8 @@ TEST_F(JsonPathTests, EscapeSequences) std::vector input_strings{ // clang-format off - "{\"a\" : \"\\\" \\\\ \\/ \\b \\f \\n \\r \\t\"}", - "{\"a\" : \"\\u1248 \\uacdf \\uACDF \\u10EF\"}" + R"({"a" : "\" \\ \/ \b \f \n \r \t"})", + R"({"a" : "\u1248 \uacdf \uACDF \u10EF"})" // clang-format on }; diff --git a/cpp/tests/reductions/reduction_tests.cpp b/cpp/tests/reductions/reduction_tests.cpp index 949ffcc26a6..1e9e13ded93 100644 --- a/cpp/tests/reductions/reduction_tests.cpp +++ b/cpp/tests/reductions/reduction_tests.cpp @@ -35,7 +35,6 @@ #include -#include #include using aggregation = cudf::aggregation; @@ -1254,7 +1253,7 @@ struct StringReductionTest : public cudf::test::BaseFixture, }; // ------------------------------------------------------------------------ -std::vector string_list[] = { +std::vector> string_list{{ {"one", "two", "three", "four", "five", "six", "seven", "eight", "nine"}, {"", "two", "three", "four", "five", "six", "seven", "eight", "nine"}, {"one", "", "three", "four", "five", "six", "seven", "eight", "nine"}, @@ -1264,7 +1263,7 @@ 
std::vector<std::string> string_list[] = {
   {"\xF7\xBF\xBF\xBF", "", "", "", "", "", "", "", ""},
   {"one", "two", "three", "four", "\xF7\xBF\xBF\xBF", "six", "seven", "eight", "nine"},
   {"one", "two", "\xF7\xBF\xBF\xBF", "four", "five", "six", "seven", "eight", "nine"},
-};
+}};
 INSTANTIATE_TEST_CASE_P(string_cases, StringReductionTest, testing::ValuesIn(string_list));
 TEST_P(StringReductionTest, MinMax)
 {
@@ -2235,7 +2234,7 @@ TYPED_TEST(ReductionTest, NthElement)

 struct DictionaryStringReductionTest : public StringReductionTest {};

-std::vector<std::string> data_list[] = {
+std::vector<std::vector<std::string>> data_list = {
   {"nine", "two", "five", "three", "five", "six", "two", "eight", "nine"},
 };
 INSTANTIATE_TEST_CASE_P(dictionary_cases,
diff --git a/cpp/tests/reductions/scan_tests.cpp b/cpp/tests/reductions/scan_tests.cpp
index 76dbbaef491..c4463d68a68 100644
--- a/cpp/tests/reductions/scan_tests.cpp
+++ b/cpp/tests/reductions/scan_tests.cpp
@@ -415,8 +415,8 @@ TEST_F(ScanStringsTest, MoreStringsMinMax)
   int row_count = 512;

   auto data_begin = cudf::detail::make_counting_transform_iterator(0, [](auto idx) {
-    char const s[] = {static_cast<char>('a' + (idx % 26)), 0};
-    return std::string(s);
+    char const s = static_cast<char>('a' + (idx % 26));
+    return std::string(1, s);
   });
   auto validity = cudf::detail::make_counting_transform_iterator(
     0, [](auto idx) -> bool { return (idx % 23) != 22; });
diff --git a/cpp/tests/rolling/nth_element_test.cpp b/cpp/tests/rolling/nth_element_test.cpp
index 9cc8b6dec81..2444992e68f 100644
--- a/cpp/tests/rolling/nth_element_test.cpp
+++ b/cpp/tests/rolling/nth_element_test.cpp
@@ -83,7 +83,7 @@ class rolling_exec {
     return *this;
   }

-  std::unique_ptr<cudf::column> test_grouped_nth_element(
+  [[nodiscard]] std::unique_ptr<cudf::column> test_grouped_nth_element(
     cudf::size_type n, std::optional<cudf::null_policy> null_handling = std::nullopt) const
   {
     return cudf::grouped_rolling_window(
@@ -96,7 +96,7 @@ class rolling_exec {
       n, null_handling.value_or(_null_handling)));
   }

-  std::unique_ptr<cudf::column> test_nth_element(
+  [[nodiscard]] std::unique_ptr<cudf::column> test_nth_element(
     cudf::size_type n, std::optional<cudf::null_policy> null_handling = std::nullopt) const
   {
     return cudf::rolling_window(_input,
diff --git a/cpp/tests/streams/transform_test.cpp b/cpp/tests/streams/transform_test.cpp
index 9187672221c..cf81dc6fb42 100644
--- a/cpp/tests/streams/transform_test.cpp
+++ b/cpp/tests/streams/transform_test.cpp
@@ -32,7 +32,7 @@ class TransformTest : public cudf::test::BaseFixture {};

 template 
-void test_udf(char const udf[], Data data_init, cudf::size_type size, bool is_ptx)
+void test_udf(char const* udf, Data data_init, cudf::size_type size, bool is_ptx)
 {
   auto all_valid = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return true; });
   auto data_iter = cudf::detail::make_counting_transform_iterator(0, data_init);
diff --git a/cpp/tests/strings/chars_types_tests.cpp b/cpp/tests/strings/chars_types_tests.cpp
index 7e530b2a34d..5923f8dee5a 100644
--- a/cpp/tests/strings/chars_types_tests.cpp
+++ b/cpp/tests/strings/chars_types_tests.cpp
@@ -24,6 +24,7 @@
 #include 
+#include <array>
 #include 

 struct StringsCharsTest : public cudf::test::BaseFixture {};
@@ -50,20 +51,20 @@ TEST_P(CharsTypes, AllTypes)
                                     "de",
                                     "\t\r\n\f "};

-  bool expecteds[] = {false, false, false, false, false, false, false, false,
-                      false, false, false, false, false, true,  false, false,  // decimal
-                      false, false, false, false, false, false, false, false,
-                      false, true,  false, true,  false, true,  false, false,  // numeric
-                      false, false, false, false, false, false, false, false,
-                      false, false, false, true,  false, true,  false, false,  // digit
- true, true, false, true, false, false, false, false, - false, false, false, false, false, false, true, false, // alpha - false, false, false, false, false, false, false, false, - false, false, false, false, false, false, false, true, // space - false, false, false, true, false, false, false, false, - false, false, false, false, false, false, false, false, // upper - false, true, false, false, false, false, false, false, - false, false, false, false, false, false, true, false}; // lower + std::array expecteds{false, false, false, false, false, false, false, false, + false, false, false, false, false, true, false, false, // decimal + false, false, false, false, false, false, false, false, + false, true, false, true, false, true, false, false, // numeric + false, false, false, false, false, false, false, false, + false, false, false, true, false, true, false, false, // digit + true, true, false, true, false, false, false, false, + false, false, false, false, false, false, true, false, // alpha + false, false, false, false, false, false, false, false, + false, false, false, false, false, false, false, true, // space + false, false, false, true, false, false, false, false, + false, false, false, false, false, false, false, false, // upper + false, true, false, false, false, false, false, false, + false, false, false, false, false, false, true, false}; // lower auto is_parm = GetParam(); diff --git a/cpp/tests/strings/contains_tests.cpp b/cpp/tests/strings/contains_tests.cpp index acf850c7a66..bdfd38267e6 100644 --- a/cpp/tests/strings/contains_tests.cpp +++ b/cpp/tests/strings/contains_tests.cpp @@ -32,6 +32,7 @@ #include #include +#include #include struct StringsContainsTests : public cudf::test::BaseFixture {}; @@ -167,10 +168,8 @@ TEST_F(StringsContainsTests, MatchesTest) auto strings_view = cudf::strings_column_view(strings); { auto const pattern = std::string("lazy"); - bool h_expected[] = {false, false, true, false, false, false, false}; cudf::test::fixed_width_column_wrapper expected( - h_expected, - h_expected + h_strings.size(), + {false, false, true, false, false, false, false}, thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; })); auto prog = cudf::strings::regex_program::create(pattern); auto results = cudf::strings::matches_re(strings_view, *prog); @@ -178,10 +177,8 @@ TEST_F(StringsContainsTests, MatchesTest) } { auto const pattern = std::string("\\d+"); - bool h_expected[] = {false, false, false, true, true, false, false}; cudf::test::fixed_width_column_wrapper expected( - h_expected, - h_expected + h_strings.size(), + {false, false, false, true, true, false, false}, thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; })); auto prog = cudf::strings::regex_program::create(pattern); auto results = cudf::strings::matches_re(strings_view, *prog); @@ -189,10 +186,8 @@ TEST_F(StringsContainsTests, MatchesTest) } { auto const pattern = std::string("@\\w+"); - bool h_expected[] = {false, false, false, false, false, false, false}; cudf::test::fixed_width_column_wrapper expected( - h_expected, - h_expected + h_strings.size(), + {false, false, false, false, false, false, false}, thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; })); auto prog = cudf::strings::regex_program::create(pattern); auto results = cudf::strings::matches_re(strings_view, *prog); @@ -200,10 +195,8 @@ TEST_F(StringsContainsTests, MatchesTest) } { auto const pattern = std::string(".*"); - bool 
h_expected[] = {true, true, true, true, true, false, true}; cudf::test::fixed_width_column_wrapper expected( - h_expected, - h_expected + h_strings.size(), + {true, true, true, true, true, false, true}, thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; })); auto prog = cudf::strings::regex_program::create(pattern); auto results = cudf::strings::matches_re(strings_view, *prog); @@ -335,9 +328,9 @@ TEST_F(StringsContainsTests, EmbeddedNullCharacter) { std::vector data(10); std::generate(data.begin(), data.end(), [n = 0]() mutable { - char first = static_cast('A' + n++); - char raw_data[] = {first, '\0', 'B'}; - return std::string{raw_data, 3}; + char first = static_cast('A' + n++); + std::array raw_data = {first, '\0', 'B'}; + return std::string{raw_data.data(), 3}; }); cudf::test::strings_column_wrapper input(data.begin(), data.end()); auto strings_view = cudf::strings_column_view(input); @@ -749,11 +742,11 @@ TEST_F(StringsContainsTests, ASCII) auto input = cudf::test::strings_column_wrapper({"abc \t\f\r 12", "áé  ❽❽", "aZ ❽4", "XYZ 8"}); auto view = cudf::strings_column_view(input); - std::string patterns[] = {R"(\w+[\s]+\d+)", - R"([^\W]+\s+[^\D]+)", - R"([\w]+[^\S]+[\d]+)", - R"([\w]+\s+[\d]+)", - R"(\w+\s+\d+)"}; + std::array patterns = {R"(\w+[\s]+\d+)", + R"([^\W]+\s+[^\D]+)", + R"([\w]+[^\S]+[\d]+)", + R"([\w]+\s+[\d]+)", + R"(\w+\s+\d+)"}; for (auto ptn : patterns) { auto expected_contains = cudf::test::fixed_width_column_wrapper({1, 0, 0, 0}); @@ -787,24 +780,18 @@ TEST_F(StringsContainsTests, MediumRegex) auto strings_view = cudf::strings_column_view(strings); { - auto results = cudf::strings::contains_re(strings_view, *prog); - bool h_expected[] = {true, false, false}; - cudf::test::fixed_width_column_wrapper expected(h_expected, - h_expected + h_strings.size()); + auto results = cudf::strings::contains_re(strings_view, *prog); + cudf::test::fixed_width_column_wrapper expected({true, false, false}); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected); } { - auto results = cudf::strings::matches_re(strings_view, *prog); - bool h_expected[] = {true, false, false}; - cudf::test::fixed_width_column_wrapper expected(h_expected, - h_expected + h_strings.size()); + auto results = cudf::strings::matches_re(strings_view, *prog); + cudf::test::fixed_width_column_wrapper expected({true, false, false}); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected); } { - auto results = cudf::strings::count_re(strings_view, *prog); - int32_t h_expected[] = {1, 0, 0}; - cudf::test::fixed_width_column_wrapper expected(h_expected, - h_expected + h_strings.size()); + auto results = cudf::strings::count_re(strings_view, *prog); + cudf::test::fixed_width_column_wrapper expected({1, 0, 0}); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected); } } @@ -828,24 +815,18 @@ TEST_F(StringsContainsTests, LargeRegex) auto strings_view = cudf::strings_column_view(strings); { - auto results = cudf::strings::contains_re(strings_view, *prog); - bool h_expected[] = {true, false, false}; - cudf::test::fixed_width_column_wrapper expected(h_expected, - h_expected + h_strings.size()); + auto results = cudf::strings::contains_re(strings_view, *prog); + cudf::test::fixed_width_column_wrapper expected({true, false, false}); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected); } { - auto results = cudf::strings::matches_re(strings_view, *prog); - bool h_expected[] = {true, false, false}; - cudf::test::fixed_width_column_wrapper expected(h_expected, - h_expected + 
h_strings.size()); + auto results = cudf::strings::matches_re(strings_view, *prog); + cudf::test::fixed_width_column_wrapper expected({true, false, false}); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected); } { - auto results = cudf::strings::count_re(strings_view, *prog); - int32_t h_expected[] = {1, 0, 0}; - cudf::test::fixed_width_column_wrapper expected(h_expected, - h_expected + h_strings.size()); + auto results = cudf::strings::count_re(strings_view, *prog); + cudf::test::fixed_width_column_wrapper expected({1, 0, 0}); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected); } } diff --git a/cpp/tests/strings/durations_tests.cpp b/cpp/tests/strings/durations_tests.cpp index 86189b29981..f2e31339035 100644 --- a/cpp/tests/strings/durations_tests.cpp +++ b/cpp/tests/strings/durations_tests.cpp @@ -24,6 +24,7 @@ #include +#include #include struct StringsDurationsTest : public cudf::test::BaseFixture {}; @@ -403,17 +404,17 @@ TEST_F(StringsDurationsTest, ParseSingle) "01", ""}; // error auto size = cudf::column_view(string_src).size(); - int32_t expected_v[]{0, 0, 1, -1, 23, -23, 59, -59, 99, -99, 0, 1, 0}; - auto it1 = - thrust::make_transform_iterator(expected_v, [](auto i) { return cudf::duration_s{i * 3600}; }); + std::array expected_v{0, 0, 1, -1, 23, -23, 59, -59, 99, -99, 0, 1, 0}; + auto it1 = thrust::make_transform_iterator(expected_v.data(), + [](auto i) { return cudf::duration_s{i * 3600}; }); cudf::test::fixed_width_column_wrapper expected_s1(it1, it1 + size); auto results = cudf::strings::to_durations(cudf::strings_column_view(string_src), cudf::data_type(cudf::type_to_id()), "%H"); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected_s1); - auto it2 = - thrust::make_transform_iterator(expected_v, [](auto i) { return cudf::duration_s{i * 60}; }); + auto it2 = thrust::make_transform_iterator(expected_v.data(), + [](auto i) { return cudf::duration_s{i * 60}; }); cudf::test::fixed_width_column_wrapper expected_s2(it2, it2 + size); results = cudf::strings::to_durations(cudf::strings_column_view(string_src), cudf::data_type(cudf::type_to_id()), @@ -421,14 +422,14 @@ TEST_F(StringsDurationsTest, ParseSingle) CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected_s2); auto it3 = - thrust::make_transform_iterator(expected_v, [](auto i) { return cudf::duration_s{i}; }); + thrust::make_transform_iterator(expected_v.data(), [](auto i) { return cudf::duration_s{i}; }); cudf::test::fixed_width_column_wrapper expected_s3(it3, it3 + size); results = cudf::strings::to_durations(cudf::strings_column_view(string_src), cudf::data_type(cudf::type_to_id()), "%S"); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected_s3); - auto it4 = thrust::make_transform_iterator(expected_v, + auto it4 = thrust::make_transform_iterator(expected_v.data(), [](auto i) { return cudf::duration_ms{i * 60000}; }); cudf::test::fixed_width_column_wrapper expected_ms(it4, it4 + size); results = cudf::strings::to_durations(cudf::strings_column_view(string_src), @@ -454,21 +455,21 @@ TEST_F(StringsDurationsTest, ParseMultiple) "01:01:01", ""}; // error auto size = cudf::column_view(string_src).size(); - int32_t expected_v[]{0, - 0, - -1, - -(3600 + 60 + 1), - 23 * 3600 + 1, - -(23 * 3600 + 1), - 59 * 3600, - -59 * 3600, - 99 * 3600, - -99 * 3600, - 0, - 3661, - 0}; + std::array expected_v{0, + 0, + -1, + -(3600 + 60 + 1), + 23 * 3600 + 1, + -(23 * 3600 + 1), + 59 * 3600, + -59 * 3600, + 99 * 3600, + -99 * 3600, + 0, + 3661, + 0}; auto it1 = - thrust::make_transform_iterator(expected_v, [](auto i) { return cudf::duration_s{i}; 
}); + thrust::make_transform_iterator(expected_v.data(), [](auto i) { return cudf::duration_s{i}; }); cudf::test::fixed_width_column_wrapper expected_s1(it1, it1 + size); auto results = cudf::strings::to_durations(cudf::strings_column_view(string_src), cudf::data_type(cudf::type_to_id()), @@ -476,7 +477,7 @@ TEST_F(StringsDurationsTest, ParseMultiple) CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected_s1); auto it2 = thrust::make_transform_iterator( - expected_v, [](auto i) { return cudf::duration_D{i / (24 * 3600)}; }); + expected_v.data(), [](auto i) { return cudf::duration_D{i / (24 * 3600)}; }); cudf::test::fixed_width_column_wrapper expected_D2(it2, it2 + size); results = cudf::strings::to_durations(cudf::strings_column_view(string_src), cudf::data_type(cudf::type_to_id()), @@ -508,28 +509,28 @@ TEST_F(StringsDurationsTest, ParseSubsecond) "01:01:01", ""}; // error auto size = cudf::column_view(string_src).size(); - int64_t expected_v[]{0, - -123456789L, - -1000666999L, - -((3600 + 60 + 1) * 1000000000L + 100000000L), - (23 * 3600 + 1) * 1000000000L + 80L, - -((23 * 3600 + 1) * 1000000000L + 123000000L), - (59 * 3600) * 1000000000L, - -(59 * 3600) * 1000000000L, - (99 * 3600) * 1000000000L, - -(99 * 3600) * 1000000000L, - 0, - (3661) * 1000000000L, - 0}; + std::array expected_v{0, + -123456789L, + -1000666999L, + -((3600 + 60 + 1) * 1000000000L + 100000000L), + (23 * 3600 + 1) * 1000000000L + 80L, + -((23 * 3600 + 1) * 1000000000L + 123000000L), + (59 * 3600) * 1000000000L, + -(59 * 3600) * 1000000000L, + (99 * 3600) * 1000000000L, + -(99 * 3600) * 1000000000L, + 0, + (3661) * 1000000000L, + 0}; auto it1 = - thrust::make_transform_iterator(expected_v, [](auto i) { return cudf::duration_ns{i}; }); + thrust::make_transform_iterator(expected_v.data(), [](auto i) { return cudf::duration_ns{i}; }); cudf::test::fixed_width_column_wrapper expected_ns1(it1, it1 + size); auto results = cudf::strings::to_durations(cudf::strings_column_view(string_src), cudf::data_type(cudf::type_to_id()), "%H:%M:%S"); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected_ns1); - auto it2 = thrust::make_transform_iterator(expected_v, + auto it2 = thrust::make_transform_iterator(expected_v.data(), [](auto i) { return cudf::duration_ms{i / 1000000}; }); cudf::test::fixed_width_column_wrapper expected_ms2(it2, it2 + size); results = cudf::strings::to_durations(cudf::strings_column_view(string_src), @@ -559,25 +560,25 @@ TEST_F(StringsDurationsTest, ParseAMPM) "01:01:01", // error ""}; // error auto size = cudf::column_view(string_src).size(); - int32_t expected_v[]{0, - 0 + 12 * 3600, - 0, - 0 - 12 * 3600, - -1, - -1 - 12 * 3600, - -(3600 + 60 + 1), - -(3600 + 60 + 1) - 12 * 3600, - 11 * 3600 + 59 * 60 + 59, - 11 * 3600 + 59 * 60 + 59 + 12 * 3600, - -(11 * 3600 + 59 * 60 + 59), - -(11 * 3600 + 59 * 60 + 59 + 12 * 3600), - 0, - 0, - 0, - 0, - 0}; + std::array expected_v{0, + 0 + 12 * 3600, + 0, + 0 - 12 * 3600, + -1, + -1 - 12 * 3600, + -(3600 + 60 + 1), + -(3600 + 60 + 1) - 12 * 3600, + 11 * 3600 + 59 * 60 + 59, + 11 * 3600 + 59 * 60 + 59 + 12 * 3600, + -(11 * 3600 + 59 * 60 + 59), + -(11 * 3600 + 59 * 60 + 59 + 12 * 3600), + 0, + 0, + 0, + 0, + 0}; auto it1 = - thrust::make_transform_iterator(expected_v, [](auto i) { return cudf::duration_s{i}; }); + thrust::make_transform_iterator(expected_v.data(), [](auto i) { return cudf::duration_s{i}; }); cudf::test::fixed_width_column_wrapper expected_s1(it1, it1 + size); auto results = cudf::strings::to_durations(cudf::strings_column_view(string_src), 
cudf::data_type(cudf::type_to_id()), @@ -585,7 +586,7 @@ TEST_F(StringsDurationsTest, ParseAMPM) CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected_s1); auto it2 = thrust::make_transform_iterator( - expected_v, [](auto i) { return cudf::duration_D{i / (24 * 3600)}; }); + expected_v.data(), [](auto i) { return cudf::duration_D{i / (24 * 3600)}; }); cudf::test::fixed_width_column_wrapper expected_D2(it2, it2 + size); results = cudf::strings::to_durations(cudf::strings_column_view(string_src), cudf::data_type(cudf::type_to_id()), @@ -616,20 +617,20 @@ TEST_F(StringsDurationsTest, ParseCompoundSpecifier) "01:01:01", // error ""}; // error auto size = cudf::column_view(string_src).size(); - int32_t expected_v[]{0, - 0 + 12 * 3600, - 1, - 1 + 12 * 3600, - (3600 + 60 + 1), - (3600 + 60 + 1) + 12 * 3600, - 11 * 3600 + 59 * 60 + 59, - 11 * 3600 + 59 * 60 + 59 + 12 * 3600, - 0, - 0, - 0, - 0}; + std::array expected_v{0, + 0 + 12 * 3600, + 1, + 1 + 12 * 3600, + (3600 + 60 + 1), + (3600 + 60 + 1) + 12 * 3600, + 11 * 3600 + 59 * 60 + 59, + 11 * 3600 + 59 * 60 + 59 + 12 * 3600, + 0, + 0, + 0, + 0}; auto it1 = - thrust::make_transform_iterator(expected_v, [](auto i) { return cudf::duration_s{i}; }); + thrust::make_transform_iterator(expected_v.data(), [](auto i) { return cudf::duration_s{i}; }); cudf::test::fixed_width_column_wrapper expected_s1(it1, it1 + size); auto results = cudf::strings::to_durations(cudf::strings_column_view(string_src), cudf::data_type(cudf::type_to_id()), @@ -641,8 +642,8 @@ TEST_F(StringsDurationsTest, ParseCompoundSpecifier) "%OI:%OM:%OS %p"); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected_s1); - auto it2 = - thrust::make_transform_iterator(expected_v, [](auto i) { return cudf::duration_ms{i * 1000}; }); + auto it2 = thrust::make_transform_iterator(expected_v.data(), + [](auto i) { return cudf::duration_ms{i * 1000}; }); cudf::test::fixed_width_column_wrapper expected_s2(it2, it2 + size); results = cudf::strings::to_durations(cudf::strings_column_view(string_src), cudf::data_type(cudf::type_to_id()), diff --git a/cpp/tests/strings/extract_tests.cpp b/cpp/tests/strings/extract_tests.cpp index 1491da758d5..61246fb098d 100644 --- a/cpp/tests/strings/extract_tests.cpp +++ b/cpp/tests/strings/extract_tests.cpp @@ -275,8 +275,8 @@ TEST_F(StringsExtractTests, ExtractAllTest) auto pattern = std::string("(\\d+) (\\w+)"); - bool valids[] = {true, true, true, false, false, false, true}; - using LCW = cudf::test::lists_column_wrapper; + std::array valids{true, true, true, false, false, false, true}; + using LCW = cudf::test::lists_column_wrapper; LCW expected({LCW{"123", "banana", "7", "eleven"}, LCW{"41", "apple"}, LCW{"6", "péar", "0", "pair"}, @@ -284,7 +284,7 @@ TEST_F(StringsExtractTests, ExtractAllTest) LCW{}, LCW{}, LCW{"4", "paré"}}, - valids); + valids.data()); auto prog = cudf::strings::regex_program::create(pattern); auto results = cudf::strings::extract_all_record(sv, *prog); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results->view(), expected); diff --git a/cpp/tests/strings/findall_tests.cpp b/cpp/tests/strings/findall_tests.cpp index 6eea1895fb1..73da4d081e2 100644 --- a/cpp/tests/strings/findall_tests.cpp +++ b/cpp/tests/strings/findall_tests.cpp @@ -33,10 +33,10 @@ struct StringsFindallTests : public cudf::test::BaseFixture {}; TEST_F(StringsFindallTests, FindallTest) { - bool valids[] = {true, true, true, true, true, false, true, true}; + std::array valids{true, true, true, true, true, false, true, true}; cudf::test::strings_column_wrapper input( {"3-A", "4-May 5-Day 6-Hay", 
"12-Dec-2021-Jan", "Feb-March", "4 ABC", "", "", "25-9000-Hal"}, - valids); + valids.data()); auto sv = cudf::strings_column_view(input); auto pattern = std::string("(\\d+)-(\\w+)"); @@ -50,7 +50,7 @@ TEST_F(StringsFindallTests, FindallTest) LCW{}, LCW{}, LCW{"25-9000"}}, - valids); + valids.data()); auto prog = cudf::strings::regex_program::create(pattern); auto results = cudf::strings::findall(sv, *prog); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results->view(), expected); diff --git a/cpp/tests/transform/integration/unary_transform_test.cpp b/cpp/tests/transform/integration/unary_transform_test.cpp index 5fa02d9978a..1785848ec77 100644 --- a/cpp/tests/transform/integration/unary_transform_test.cpp +++ b/cpp/tests/transform/integration/unary_transform_test.cpp @@ -30,7 +30,7 @@ namespace transformation { struct UnaryOperationIntegrationTest : public cudf::test::BaseFixture {}; template -void test_udf(char const udf[], Op op, Data data_init, cudf::size_type size, bool is_ptx) +void test_udf(char const* udf, Op op, Data data_init, cudf::size_type size, bool is_ptx) { auto all_valid = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return true; }); auto data_iter = cudf::detail::make_counting_transform_iterator(0, data_init); From 4018d3116b2bfd876253b187894df10cb325db2f Mon Sep 17 00:00:00 2001 From: Tianyu Liu Date: Fri, 27 Sep 2024 13:17:03 -0400 Subject: [PATCH 18/26] Remove superfluous use of std::vector for std::future (#16829) This PR addresses #16888 , where a superfluous use of `std::vector` should be removed. closes #16888 Authors: - Tianyu Liu (https://github.com/kingcrimsontianyu) - Vukasin Milovanovic (https://github.com/vuule) - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Nghia Truong (https://github.com/ttnghia) - Vukasin Milovanovic (https://github.com/vuule) URL: https://github.com/rapidsai/cudf/pull/16829 --- cpp/src/io/parquet/reader_impl.hpp | 4 +-- cpp/src/io/parquet/reader_impl_preprocess.cu | 26 +++++++++----------- 2 files changed, 13 insertions(+), 17 deletions(-) diff --git a/cpp/src/io/parquet/reader_impl.hpp b/cpp/src/io/parquet/reader_impl.hpp index 2d46da14bec..62ffc4d3077 100644 --- a/cpp/src/io/parquet/reader_impl.hpp +++ b/cpp/src/io/parquet/reader_impl.hpp @@ -188,10 +188,10 @@ class reader::impl { * * Does not decompress the chunk data. * - * @return pair of boolean indicating if compressed chunks were found and a vector of futures for + * @return pair of boolean indicating if compressed chunks were found and a future for * read completion */ - std::pair>> read_column_chunks(); + std::pair> read_column_chunks(); /** * @brief Read compressed data and page information for the current pass. 
diff --git a/cpp/src/io/parquet/reader_impl_preprocess.cu b/cpp/src/io/parquet/reader_impl_preprocess.cu
index 8e67f233213..3763c2e8e6d 100644
--- a/cpp/src/io/parquet/reader_impl_preprocess.cu
+++ b/cpp/src/io/parquet/reader_impl_preprocess.cu
@@ -964,7 +964,7 @@ void reader::impl::allocate_level_decode_space()
   }
 }
 
-std::pair<bool, std::vector<std::future<void>>> reader::impl::read_column_chunks()
+std::pair<bool, std::future<void>> reader::impl::read_column_chunks()
 {
   auto const& row_groups_info = _pass_itm_data->row_groups;
 
@@ -989,7 +989,6 @@ std::pair<bool, std::vector<std::future<void>>> reader::impl::read_column_chunks
   // TODO: make this respect the pass-wide skip_rows/num_rows instead of the file-wide
   // skip_rows/num_rows
   // auto remaining_rows = num_rows;
-  std::vector<std::future<void>> read_chunk_tasks;
   size_type chunk_count = 0;
   for (auto const& rg : row_groups_info) {
     auto const& row_group = _metadata->get_row_group(rg.index, rg.source_index);
@@ -1018,16 +1017,15 @@
   }
 
   // Read compressed chunk data to device memory
-  read_chunk_tasks.push_back(read_column_chunks_async(_sources,
-                                                      raw_page_data,
-                                                      chunks,
-                                                      0,
-                                                      chunks.size(),
-                                                      column_chunk_offsets,
-                                                      chunk_source_map,
-                                                      _stream));
-
-  return {total_decompressed_size > 0, std::move(read_chunk_tasks)};
+  return {total_decompressed_size > 0,
+          read_column_chunks_async(_sources,
+                                   raw_page_data,
+                                   chunks,
+                                   0,
+                                   chunks.size(),
+                                   column_chunk_offsets,
+                                   chunk_source_map,
+                                   _stream)};
 }
 
 void reader::impl::read_compressed_data()
@@ -1042,9 +1040,7 @@ void reader::impl::read_compressed_data()
 
   auto const [has_compressed_data, read_chunks_tasks] = read_column_chunks();
   pass.has_compressed_data = has_compressed_data;
-  for (auto& task : read_chunks_tasks) {
-    task.wait();
-  }
+  read_chunks_tasks.wait();
 
   // Process dataset chunk pages into output columns
   auto const total_pages = _has_page_index ? count_page_headers_with_pgidx(chunks, _stream)

From afe9f929abf565c235d5a4e375ef33f2cf032487 Mon Sep 17 00:00:00 2001
From: Vyas Ramasubramani
Date: Fri, 27 Sep 2024 10:55:48 -0700
Subject: [PATCH 19/26] clang-tidy fixes part 2 (#16938)

Subset of improvements to the code base proposed by the latest version of
clang-tidy.
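
As a hedged illustration of the kinds of changes involved (hypothetical code,
not taken from this diff), two recurring patterns in this pass are converting
fixed-size C-style arrays to `std::array`, and suppressing the check with
`NOLINT` where a raw array is required, e.g. to mirror an on-disk layout:

```cpp
#include <array>
#include <cstdint>
#include <cstdio>

// Kept as a C array on purpose: the struct mirrors an on-disk layout, so the
// modernize-avoid-c-arrays diagnostic is suppressed rather than "fixed".
struct file_header {
  uint8_t magic[4];  // NOLINT
  uint8_t version;
};

int main()
{
  // Preferred elsewhere: std::array keeps the same layout but carries its
  // size and interoperates with standard algorithms.
  std::array<int32_t, 3> expected{1, 0, 0};
  std::printf("%d %zu\n", expected[0], expected.size());
  return 0;
}
```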
Authors: - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Vukasin Milovanovic (https://github.com/vuule) URL: https://github.com/rapidsai/cudf/pull/16938 --- cpp/src/datetime/timezone.cpp | 2 +- cpp/src/dictionary/dictionary_column_view.cpp | 5 ++--- cpp/src/interop/dlpack.cpp | 4 ++-- cpp/src/io/avro/avro.cpp | 2 +- cpp/src/io/avro/avro.hpp | 21 ++++++++++--------- cpp/src/io/comp/uncomp.cpp | 4 ++-- cpp/src/io/parquet/parquet_gpu.hpp | 21 ++++++++++--------- cpp/src/jit/parser.cpp | 4 +--- cpp/src/strings/regex/regcomp.cpp | 4 ++-- cpp/src/utilities/stream_pool.cpp | 2 +- 10 files changed, 34 insertions(+), 35 deletions(-) diff --git a/cpp/src/datetime/timezone.cpp b/cpp/src/datetime/timezone.cpp index cf239297255..a6b6cbbf0b5 100644 --- a/cpp/src/datetime/timezone.cpp +++ b/cpp/src/datetime/timezone.cpp @@ -38,7 +38,7 @@ std::string const tzif_system_directory = "/usr/share/zoneinfo/"; struct timezone_file_header { uint32_t magic; ///< "TZif" uint8_t version; ///< 0:version1, '2':version2, '3':version3 - uint8_t reserved15[15]; ///< unused, reserved for future use + uint8_t reserved15[15]; ///< unused, reserved for future use // NOLINT uint32_t isutccnt; ///< number of UTC/local indicators contained in the body uint32_t isstdcnt; ///< number of standard/wall indicators contained in the body uint32_t leapcnt; ///< number of leap second records contained in the body diff --git a/cpp/src/dictionary/dictionary_column_view.cpp b/cpp/src/dictionary/dictionary_column_view.cpp index 4906e5b4f9c..3e4a201bba4 100644 --- a/cpp/src/dictionary/dictionary_column_view.cpp +++ b/cpp/src/dictionary/dictionary_column_view.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -36,8 +36,7 @@ column_view dictionary_column_view::indices() const noexcept { return child(0); column_view dictionary_column_view::get_indices_annotated() const noexcept { - return column_view( - indices().type(), size(), indices().head(), null_mask(), null_count(), offset()); + return {indices().type(), size(), indices().head(), null_mask(), null_count(), offset()}; } column_view dictionary_column_view::keys() const noexcept { return child(1); } diff --git a/cpp/src/interop/dlpack.cpp b/cpp/src/interop/dlpack.cpp index ba5b11b90d8..a1be6aade4e 100644 --- a/cpp/src/interop/dlpack.cpp +++ b/cpp/src/interop/dlpack.cpp @@ -118,8 +118,8 @@ DLDataType data_type_to_DLDataType(data_type type) // Context object to own memory allocated for DLManagedTensor struct dltensor_context { - int64_t shape[2]; - int64_t strides[2]; + int64_t shape[2]; // NOLINT + int64_t strides[2]; // NOLINT rmm::device_buffer buffer; static void deleter(DLManagedTensor* arg) diff --git a/cpp/src/io/avro/avro.cpp b/cpp/src/io/avro/avro.cpp index 2041f03cd81..03cf6d4a0e0 100644 --- a/cpp/src/io/avro/avro.cpp +++ b/cpp/src/io/avro/avro.cpp @@ -199,7 +199,7 @@ bool container::parse(file_metadata* md, size_t max_num_rows, size_t first_row) // Read the next sync markers and ensure they match the first ones we // encountered. If they don't, we have to assume the data is corrupted, // and thus, we terminate processing immediately. 
- uint64_t const sync_marker[] = {get_raw(), get_raw()}; + std::array const sync_marker = {get_raw(), get_raw()}; bool valid_sync_markers = ((sync_marker[0] == md->sync_marker[0]) && (sync_marker[1] == md->sync_marker[1])); if (!valid_sync_markers) { return false; } diff --git a/cpp/src/io/avro/avro.hpp b/cpp/src/io/avro/avro.hpp index f2813a1ba51..2e992546ccc 100644 --- a/cpp/src/io/avro/avro.hpp +++ b/cpp/src/io/avro/avro.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -19,6 +19,7 @@ #include "avro_common.hpp" #include +#include #include #include #include @@ -100,15 +101,15 @@ struct column_desc { */ struct file_metadata { std::map user_data; - std::string codec = ""; - uint64_t sync_marker[2] = {0, 0}; - size_t metadata_size = 0; - size_t total_data_size = 0; - size_t selected_data_size = 0; - size_type num_rows = 0; - size_type skip_rows = 0; - size_type total_num_rows = 0; - uint32_t max_block_size = 0; + std::string codec = ""; + std::array sync_marker = {0, 0}; + size_t metadata_size = 0; + size_t total_data_size = 0; + size_t selected_data_size = 0; + size_type num_rows = 0; + size_type skip_rows = 0; + size_type total_num_rows = 0; + uint32_t max_block_size = 0; std::vector schema; std::vector block_list; std::vector columns; diff --git a/cpp/src/io/comp/uncomp.cpp b/cpp/src/io/comp/uncomp.cpp index 602ff1734b6..1af45b41d8e 100644 --- a/cpp/src/io/comp/uncomp.cpp +++ b/cpp/src/io/comp/uncomp.cpp @@ -42,7 +42,7 @@ struct gz_file_header_s { uint8_t id2; // 0x8b uint8_t comp_mthd; // compression method (0-7=reserved, 8=deflate) uint8_t flags; // flags (GZIPHeaderFlag) - uint8_t mtime[4]; // If non-zero: modification time (Unix format) + uint8_t mtime[4]; // If non-zero: modification time (Unix format) // NOLINT uint8_t xflags; // Extra compressor-specific flags uint8_t os; // OS id }; @@ -103,7 +103,7 @@ struct zip_lfh_s { }; struct bz2_file_header_s { - uint8_t sig[3]; // "BZh" + uint8_t sig[3]; // "BZh" // NOLINT uint8_t blksz; // block size 1..9 in 100kB units (post-RLE) }; diff --git a/cpp/src/io/parquet/parquet_gpu.hpp b/cpp/src/io/parquet/parquet_gpu.hpp index 1390339c1ae..e631e12119d 100644 --- a/cpp/src/io/parquet/parquet_gpu.hpp +++ b/cpp/src/io/parquet/parquet_gpu.hpp @@ -294,7 +294,8 @@ struct PageInfo { int32_t uncompressed_page_size; // uncompressed data size in bytes // for V2 pages, the def and rep level data is not compressed, and lacks the 4-byte length // indicator. instead the lengths for these are stored in the header. - int32_t lvl_bytes[level_type::NUM_LEVEL_TYPES]; // length of the rep/def levels (V2 header) + int32_t // NOLINT + lvl_bytes[level_type::NUM_LEVEL_TYPES]; // length of the rep/def levels (V2 header) // Number of values in this data page or dictionary. // Important : the # of input values does not necessarily // correspond to the number of rows in the output. 
It just reflects the number @@ -345,7 +346,7 @@ struct PageInfo { PageNestingDecodeInfo* nesting_decode; // level decode buffers - uint8_t* lvl_decode_buf[level_type::NUM_LEVEL_TYPES]; + uint8_t* lvl_decode_buf[level_type::NUM_LEVEL_TYPES]; // NOLINT // temporary space for decoding DELTA_BYTE_ARRAY encoded strings int64_t temp_string_size; @@ -431,14 +432,14 @@ struct ColumnChunkDesc { size_t num_values{}; // total number of values in this column size_t start_row{}; // file-wide, absolute starting row of this chunk uint32_t num_rows{}; // number of rows in this chunk - int16_t max_level[level_type::NUM_LEVEL_TYPES]{}; // max definition/repetition level - int16_t max_nesting_depth{}; // max nesting depth of the output - int32_t type_length{}; // type length from schema (for FLBA only) - Type physical_type{}; // parquet physical data type - uint8_t - level_bits[level_type::NUM_LEVEL_TYPES]{}; // bits to encode max definition/repetition levels - int32_t num_data_pages{}; // number of data pages - int32_t num_dict_pages{}; // number of dictionary pages + int16_t max_level[level_type::NUM_LEVEL_TYPES]{}; // max definition/repetition level // NOLINT + int16_t max_nesting_depth{}; // max nesting depth of the output + int32_t type_length{}; // type length from schema (for FLBA only) + Type physical_type{}; // parquet physical data type + uint8_t level_bits[level_type::NUM_LEVEL_TYPES]{}; // bits to encode max // NOLINT + // definition/repetition levels + int32_t num_data_pages{}; // number of data pages + int32_t num_dict_pages{}; // number of dictionary pages PageInfo const* dict_page{}; string_index_pair* str_dict_index{}; // index for string dictionary bitmask_type** valid_map_base{}; // base pointers of valid bit map for this column diff --git a/cpp/src/jit/parser.cpp b/cpp/src/jit/parser.cpp index 398c36821cc..519ac2d1a2e 100644 --- a/cpp/src/jit/parser.cpp +++ b/cpp/src/jit/parser.cpp @@ -19,8 +19,6 @@ #include #include -#include -#include #include #include #include @@ -28,7 +26,7 @@ namespace cudf { namespace jit { -constexpr char percent_escape[] = "_"; +constexpr char percent_escape[] = "_"; // NOLINT inline bool is_white(char const c) { return c == ' ' || c == '\n' || c == '\r' || c == '\t'; } diff --git a/cpp/src/strings/regex/regcomp.cpp b/cpp/src/strings/regex/regcomp.cpp index 7c4c89bd3fb..51c6e765edd 100644 --- a/cpp/src/strings/regex/regcomp.cpp +++ b/cpp/src/strings/regex/regcomp.cpp @@ -35,7 +35,7 @@ namespace strings { namespace detail { namespace { // Bitmask of all operators -#define OPERATOR_MASK 0200 +enum { OPERATOR_MASK = 0200 }; enum OperatorType : int32_t { START = 0200, // Start, used for marker on stack LBRA_NC = 0203, // non-capturing group @@ -50,7 +50,7 @@ enum OperatorType : int32_t { COUNTED_LAZY = 0215, NOP = 0302, // No operation, internal use only }; -#define ITEM_MASK 0300 +enum { ITEM_MASK = 0300 }; static reclass cclass_w(CCLASS_W); // \w static reclass cclass_s(CCLASS_S); // \s diff --git a/cpp/src/utilities/stream_pool.cpp b/cpp/src/utilities/stream_pool.cpp index 9824c472b20..8c29182bfb5 100644 --- a/cpp/src/utilities/stream_pool.cpp +++ b/cpp/src/utilities/stream_pool.cpp @@ -82,7 +82,7 @@ class rmm_cuda_stream_pool : public cuda_stream_pool { return streams; } - std::size_t get_stream_pool_size() const override { return STREAM_POOL_SIZE; } + [[nodiscard]] std::size_t get_stream_pool_size() const override { return STREAM_POOL_SIZE; } }; /** From 670cc3f9c6add1fddde142ec3dece65643d3f022 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke 
<10647082+mroeschke@users.noreply.github.com>
Date: Fri, 27 Sep 2024 08:43:53 -1000
Subject: [PATCH 20/26] Avoid public constructors when called with columns to
 avoid unnecessary validation (#16747)

This PR continues an effort to avoid some public constructors when passing
columns, to avoid unnecessary validation.

Maybe we should consider disallowing public constructors from accepting
columns altogether, but I suspect some RAPIDS libraries are passing columns
to public constructors.

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)
  - Vyas Ramasubramani (https://github.com/vyasr)

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/cudf/pull/16747
---
 python/cudf/cudf/core/column/categorical.py |  2 +-
 python/cudf/cudf/core/dataframe.py          | 44 +++++++++------------
 python/cudf/cudf/core/multiindex.py         | 26 +++++-------
 python/cudf/cudf/core/reshape.py            | 29 +++++++-------
 python/cudf/cudf/core/window/ewm.py         | 33 ++++++++--------
 python/cudf/cudf/core/window/rolling.py     | 39 +++++++++---------
 6 files changed, 77 insertions(+), 96 deletions(-)

diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py
index de5ed15771d..864e87b5377 100644
--- a/python/cudf/cudf/core/column/categorical.py
+++ b/python/cudf/cudf/core/column/categorical.py
@@ -1337,7 +1337,7 @@ def _set_categories(
 
         # Ensure new_categories is unique first
         if not (is_unique or new_cats.is_unique):
-            new_cats = cudf.Series(new_cats)._column.unique()
+            new_cats = new_cats.unique()
 
         if cur_cats.equals(new_cats, check_dtypes=True):
             # TODO: Internal usages don't always need a copy; add a copy keyword
diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
index 16b0aa95c35..79ed5a0e187 100644
--- a/python/cudf/cudf/core/dataframe.py
+++ b/python/cudf/cudf/core/dataframe.py
@@ -6287,14 +6287,17 @@ def _prepare_for_rowwise_op(self, method, skipna, numeric_only):
         )
 
         if not skipna and any(col.nullable for col in filtered._columns):
-            mask = DataFrame(
+            length = filtered._data.nrows
+            ca = ColumnAccessor(
                 {
-                    name: filtered._data[name]._get_mask_as_column()
-                    if filtered._data[name].nullable
-                    else as_column(True, length=len(filtered._data[name]))
-                    for name in filtered._column_names
-                }
+                    name: col._get_mask_as_column()
+                    if col.nullable
+                    else as_column(True, length=length)
+                    for name, col in filtered._data.items()
+                },
+                verify=False,
             )
+            mask = DataFrame._from_data(ca)
             mask = mask.all(axis=1)
         else:
             mask = None
@@ -6679,19 +6682,10 @@ def _apply_cupy_method_axis_1(self, method, *args, **kwargs):
             )
             return Series._from_column(result, index=self.index)
         else:
-            result_df = DataFrame(result).set_index(self.index)
+            result_df = DataFrame(result, index=self.index)
             result_df._set_columns_like(prepared._data)
             return result_df
 
-    @_performance_tracking
-    def _columns_view(self, columns):
-        """
-        Return a subset of the DataFrame's columns as a view.
-        """
-        return DataFrame(
-            {col: self._data[col] for col in columns}, index=self.index
-        )
-
     @_performance_tracking
     def select_dtypes(self, include=None, exclude=None):
         """Return a subset of the DataFrame's columns based on the column dtypes.
@@ -6763,8 +6757,6 @@ def select_dtypes(self, include=None, exclude=None): if not isinstance(exclude, (list, tuple)): exclude = (exclude,) if exclude is not None else () - df = DataFrame(index=self.index) - # cudf_dtype_from_pydata_dtype can distinguish between # np.float and np.number selection = tuple(map(frozenset, (include, exclude))) @@ -6820,12 +6812,12 @@ def select_dtypes(self, include=None, exclude=None): # remove all exclude types inclusion = inclusion - exclude_subtypes - for k, col in self._column_labels_and_values: - infered_type = cudf_dtype_from_pydata_dtype(col.dtype) - if infered_type in inclusion: - df._insert(len(df._data), k, col) - - return df + to_select = [ + label + for label, dtype in self._dtypes + if cudf_dtype_from_pydata_dtype(dtype) in inclusion + ] + return self.loc[:, to_select] @ioutils.doc_to_parquet() def to_parquet( @@ -7331,7 +7323,7 @@ def cov(self, min_periods=None, ddof: int = 1, numeric_only: bool = False): cov = cupy.cov(self.values, ddof=ddof, rowvar=False) cols = self._data.to_pandas_index() - df = DataFrame(cupy.asfortranarray(cov)).set_index(cols) + df = DataFrame(cupy.asfortranarray(cov), index=cols) df._set_columns_like(self._data) return df @@ -7374,7 +7366,7 @@ def corr( corr = cupy.corrcoef(values, rowvar=False) cols = self._data.to_pandas_index() - df = DataFrame(cupy.asfortranarray(corr)).set_index(cols) + df = DataFrame(cupy.asfortranarray(corr), index=cols) df._set_columns_like(self._data) return df diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index 6de3981ba66..92d094d9de5 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -700,7 +700,10 @@ def _compute_validity_mask(self, index, row_tuple, max_length): lookup_dict[i] = row lookup = cudf.DataFrame(lookup_dict) frame = cudf.DataFrame._from_data( - ColumnAccessor(dict(enumerate(index._columns)), verify=False) + ColumnAccessor( + dict(enumerate(index._columns)), + verify=False, + ) ) with warnings.catch_warnings(): warnings.simplefilter("ignore", FutureWarning) @@ -780,18 +783,12 @@ def _index_and_downcast(self, result, index, index_key): index_key = index_key[0] slice_access = isinstance(index_key, slice) - out_index = cudf.DataFrame() - # Select the last n-k columns where n is the number of columns and k is + # Count the last n-k columns where n is the number of columns and k is # the length of the indexing tuple size = 0 if not isinstance(index_key, (numbers.Number, slice)): size = len(index_key) - for k in range(size, len(index._data)): - out_index.insert( - out_index._num_columns, - k, - cudf.Series._from_column(index._columns[k]), - ) + num_selected = max(0, index.nlevels - size) # determine if we should downcast from a DataFrame to a Series need_downcast = ( @@ -814,16 +811,13 @@ def _index_and_downcast(self, result, index, index_key): result = cudf.Series._from_data( {}, name=tuple(col[0] for col in index._columns) ) - elif out_index._num_columns == 1: + elif num_selected == 1: # If there's only one column remaining in the output index, convert # it into an Index and name the final index values according # to that column's name. 
- last_column = index._columns[-1] - out_index = cudf.Index._from_column( - last_column, name=index.names[-1] - ) - index = out_index - elif out_index._num_columns > 1: + *_, last_column = index._data.columns + index = cudf.Index._from_column(last_column, name=index.names[-1]) + elif num_selected > 1: # Otherwise pop the leftmost levels, names, and codes from the # source index until it has the correct number of columns (n-k) result.reset_index(drop=True) diff --git a/python/cudf/cudf/core/reshape.py b/python/cudf/cudf/core/reshape.py index 401fef67ee6..6e5abb2b82b 100644 --- a/python/cudf/cudf/core/reshape.py +++ b/python/cudf/cudf/core/reshape.py @@ -961,14 +961,14 @@ def _merge_sorted( ) -def _pivot(df, index, columns): +def _pivot(col_accessor: ColumnAccessor, index, columns) -> cudf.DataFrame: """ Reorganize the values of the DataFrame according to the given index and columns. Parameters ---------- - df : DataFrame + col_accessor : DataFrame index : cudf.Index Index labels of the result columns : cudf.Index @@ -985,7 +985,7 @@ def as_tuple(x): return x if isinstance(x, tuple) else (x,) nrows = len(index_labels) - for col_label, col in df._column_labels_and_values: + for col_label, col in col_accessor.items(): names = [ as_tuple(col_label) + as_tuple(name) for name in column_labels ] @@ -1067,22 +1067,21 @@ def pivot(data, columns=None, index=no_default, values=no_default): 2 three """ - df = data values_is_list = True if values is no_default: - values = df._columns_view( - col for col in df._column_names if col not in (index, columns) - ) + cols_to_select = [ + col for col in data._column_names if col not in (index, columns) + ] + elif not isinstance(values, (list, tuple)): + cols_to_select = [values] + values_is_list = False else: - if not isinstance(values, (list, tuple)): - values = [values] - values_is_list = False - values = df._columns_view(values) + cols_to_select = values if index is no_default: - index = df.index + index = data.index else: - index = cudf.Index(df.loc[:, index]) - columns = cudf.Index(df.loc[:, columns]) + index = cudf.Index(data.loc[:, index]) + columns = cudf.Index(data.loc[:, columns]) # Create a DataFrame composed of columns from both # columns and index @@ -1096,7 +1095,7 @@ def pivot(data, columns=None, index=no_default, values=no_default): if len(columns_index) != len(columns_index.drop_duplicates()): raise ValueError("Duplicate index-column pairs found. 
Cannot reshape.") - result = _pivot(values, index, columns) + result = _pivot(data._data.select_by_label(cols_to_select), index, columns) # MultiIndex to Index if not values_is_list: diff --git a/python/cudf/cudf/core/window/ewm.py b/python/cudf/cudf/core/window/ewm.py index ef0f6958aeb..094df955273 100644 --- a/python/cudf/cudf/core/window/ewm.py +++ b/python/cudf/cudf/core/window/ewm.py @@ -2,7 +2,7 @@ from __future__ import annotations import warnings -from typing import Literal +from typing import TYPE_CHECKING, Literal import numpy as np @@ -10,6 +10,9 @@ from cudf.api.types import is_numeric_dtype from cudf.core.window.rolling import _RollingBase +if TYPE_CHECKING: + from cudf.core.column.column import ColumnBase + class ExponentialMovingWindow(_RollingBase): r""" @@ -179,8 +182,10 @@ def cov( ): raise NotImplementedError("cov not yet supported.") - def _apply_agg_series(self, sr, agg_name): - if not is_numeric_dtype(sr.dtype): + def _apply_agg_column( + self, source_column: ColumnBase, agg_name: str + ) -> ColumnBase: + if not is_numeric_dtype(source_column.dtype): raise TypeError("No numeric types to aggregate") # libcudf ewm has special casing for nulls only @@ -188,20 +193,14 @@ def _apply_agg_series(self, sr, agg_name): # pandas does nans in the same positions mathematically. # as such we need to convert the nans to nulls before # passing them in. - to_libcudf_column = sr._column.astype("float64").nans_to_nulls() - - return self.obj._from_data_like_self( - self.obj._data._from_columns_like_self( - [ - scan( - agg_name, - to_libcudf_column, - True, - com=self.com, - adjust=self.adjust, - ) - ] - ) + to_libcudf_column = source_column.astype("float64").nans_to_nulls() + + return scan( + agg_name, + to_libcudf_column, + True, + com=self.com, + adjust=self.adjust, ) diff --git a/python/cudf/cudf/core/window/rolling.py b/python/cudf/cudf/core/window/rolling.py index 043a41145e5..967edc2ab15 100644 --- a/python/cudf/cudf/core/window/rolling.py +++ b/python/cudf/cudf/core/window/rolling.py @@ -2,6 +2,7 @@ from __future__ import annotations import warnings +from typing import TYPE_CHECKING import numba import pandas as pd @@ -16,25 +17,29 @@ from cudf.utils import cudautils from cudf.utils.utils import GetAttrGetItemMixin +if TYPE_CHECKING: + from cudf.core.column.column import ColumnBase + class _RollingBase: """ - Contains methods common to all kinds of rolling + Contains routines to apply a window aggregation to a column. 
""" - def _apply_agg_dataframe(self, df, agg_name): - result_df = cudf.DataFrame({}) - for i, col_name in enumerate(df.columns): - result_col = self._apply_agg_series(df[col_name], agg_name) - result_df.insert(i, col_name, result_col) - result_df.index = df.index - return result_df + obj: cudf.DataFrame | cudf.Series - def _apply_agg(self, agg_name): - if isinstance(self.obj, cudf.Series): - return self._apply_agg_series(self.obj, agg_name) - else: - return self._apply_agg_dataframe(self.obj, agg_name) + def _apply_agg_column( + self, source_column: ColumnBase, agg_name: str + ) -> ColumnBase: + raise NotImplementedError + + def _apply_agg(self, agg_name: str) -> cudf.DataFrame | cudf.Series: + applied = ( + self._apply_agg_column(col, agg_name) for col in self.obj._columns + ) + return self.obj._from_data_like_self( + self.obj._data._from_columns_like_self(applied) + ) class Rolling(GetAttrGetItemMixin, _RollingBase, Reducible): @@ -290,14 +295,6 @@ def _apply_agg_column(self, source_column, agg_name): agg_params=self.agg_params, ) - def _apply_agg(self, agg_name): - applied = ( - self._apply_agg_column(col, agg_name) for col in self.obj._columns - ) - return self.obj._from_data_like_self( - self.obj._data._from_columns_like_self(applied) - ) - def _reduce( self, op: str, From 22d481a4e3a34d517ad9a9ac46b8b1b456d365c6 Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Fri, 27 Sep 2024 14:45:47 -0400 Subject: [PATCH 21/26] Fix JsonLargeReaderTest.MultiBatch use of LIBCUDF_JSON_BATCH_SIZE env var (#16927) Fixes the `unsetenv` to use `LIBCUDF_JSON_BATCH_SIZE` Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Shruti Shivakumar (https://github.com/shrshi) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/16927 --- cpp/tests/large_strings/json_tests.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/tests/large_strings/json_tests.cu b/cpp/tests/large_strings/json_tests.cu index 80bde168b75..a212d7d654a 100644 --- a/cpp/tests/large_strings/json_tests.cu +++ b/cpp/tests/large_strings/json_tests.cu @@ -96,5 +96,5 @@ TEST_F(JsonLargeReaderTest, MultiBatch) } // go back to normal batch_size - unsetenv("LIBCUDF_LARGE_STRINGS_THRESHOLD"); + unsetenv("LIBCUDF_JSON_BATCH_SIZE"); } From 6973ef806bc9d3cbda37a4c7caa763da12b84b7f Mon Sep 17 00:00:00 2001 From: Shruti Shivakumar Date: Fri, 27 Sep 2024 16:50:15 -0400 Subject: [PATCH 22/26] Parse newline as whitespace character while tokenizing JSONL inputs with non-newline delimiter (#16923) Addresses #16915 Authors: - Shruti Shivakumar (https://github.com/shrshi) Approvers: - Basit Ayantunde (https://github.com/lamarrr) - Karthikeyan (https://github.com/karthikeyann) - Vukasin Milovanovic (https://github.com/vuule) URL: https://github.com/rapidsai/cudf/pull/16923 --- cpp/src/io/json/nested_json_gpu.cu | 4 +- cpp/tests/io/json/json_test.cpp | 24 ++++ cpp/tests/io/json/nested_json_test.cpp | 178 +++++++++++++++++++++++++ 3 files changed, 204 insertions(+), 2 deletions(-) diff --git a/cpp/src/io/json/nested_json_gpu.cu b/cpp/src/io/json/nested_json_gpu.cu index 1c15e147b13..bf81162a0ac 100644 --- a/cpp/src/io/json/nested_json_gpu.cu +++ b/cpp/src/io/json/nested_json_gpu.cu @@ -618,12 +618,12 @@ struct PdaSymbolToSymbolGroupId { constexpr auto pda_sgid_lookup_size = static_cast(sizeof(tos_sg_to_pda_sgid) / sizeof(tos_sg_to_pda_sgid[0])); // We map the delimiter character to LINE_BREAK symbol group id, and the newline character - // to OTHER. 
Note that delimiter cannot be any of opening(closing) brace, bracket, quote, + // to WHITE_SPACE. Note that delimiter cannot be any of opening(closing) brace, bracket, quote, // escape, comma, colon or whitespace characters. auto const symbol_position = symbol == delimiter ? static_cast('\n') - : (symbol == '\n' ? static_cast(delimiter) : static_cast(symbol)); + : (symbol == '\n' ? static_cast(' ') : static_cast(symbol)); PdaSymbolGroupIdT symbol_gid = tos_sg_to_pda_sgid[min(symbol_position, pda_sgid_lookup_size - 1)]; return stack_idx * static_cast(symbol_group_id::NUM_PDA_INPUT_SGS) + diff --git a/cpp/tests/io/json/json_test.cpp b/cpp/tests/io/json/json_test.cpp index 68ec255b39d..a094ac7d772 100644 --- a/cpp/tests/io/json/json_test.cpp +++ b/cpp/tests/io/json/json_test.cpp @@ -2575,6 +2575,30 @@ TEST_F(JsonReaderTest, ViableDelimiter) EXPECT_THROW(json_parser_options.set_delimiter('\t'), std::invalid_argument); } +TEST_F(JsonReaderTest, ViableDelimiterNewlineWS) +{ + // Test input + std::string input = R"({"a": + 100})"; + + cudf::io::json_reader_options json_parser_options = + cudf::io::json_reader_options::builder(cudf::io::source_info{input.c_str(), input.size()}) + .lines(true) + .delimiter('\0'); + + auto result = cudf::io::read_json(json_parser_options); + EXPECT_EQ(result.tbl->num_columns(), 1); + EXPECT_EQ(result.tbl->num_rows(), 1); + + EXPECT_EQ(result.tbl->get_column(0).type().id(), cudf::type_id::INT64); + + EXPECT_EQ(result.metadata.schema_info[0].name, "a"); + + auto col1_iterator = thrust::constant_iterator(100); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(0), + int64_wrapper(col1_iterator, col1_iterator + 1)); +} + // Test case for dtype prune: // all paths, only one. // one present, another not present, nothing present diff --git a/cpp/tests/io/json/nested_json_test.cpp b/cpp/tests/io/json/nested_json_test.cpp index 327169ae563..f32aba0e632 100644 --- a/cpp/tests/io/json/nested_json_test.cpp +++ b/cpp/tests/io/json/nested_json_test.cpp @@ -29,6 +29,7 @@ #include #include #include +#include #include #include #include @@ -1196,4 +1197,181 @@ TEST_P(JsonDelimiterParamTest, RecoveringTokenStreamNewlineAndDelimiter) } } +TEST_P(JsonDelimiterParamTest, RecoveringTokenStreamNewlineAsWSAndDelimiter) +{ + // Test input. 
Inline comments used to indicate character indexes + // 012345678 <= line 0 + char const delimiter = GetParam(); + + /* Input: (Note that \n is considered whitespace according to the JSON spec when it is not used as + * a delimiter for JSONL) + * {"a":2} + * {"a":{"a":{"a":[321{"a":[1]} + * + * {"b":123} + * {"b":123} + * {"b"\n:\n\n\n123\n} + */ + std::string input = R"({"a":2})" + "\n"; + // starting position 8 (zero indexed) + input += R"({"a":)" + std::string(1, delimiter); + // starting position 14 (zero indexed) + input += R"({"a":{"a":[321)" + std::string(1, delimiter); + // starting position 29 (zero indexed) + input += R"({"a":[1]})" + std::string("\n\n") + std::string(1, delimiter); + // starting position 41 (zero indexed) + input += R"({"b":123})" + "\n"; + // starting position 51 (zero indexed) + input += R"({"b":123})" + std::string(1, delimiter); + // starting position 61 (zero indexed) + input += R"({"b")" + std::string("\n:\n\n\n123\n}"); + + // Golden token stream sample + using token_t = cuio_json::token_t; + std::vector> golden_token_stream; + if (delimiter != '\n') { + golden_token_stream = {// Line 0 (valid) + {0, token_t::StructBegin}, + {1, token_t::StructMemberBegin}, + {1, token_t::FieldNameBegin}, + {3, token_t::FieldNameEnd}, + {5, token_t::ValueBegin}, + {6, token_t::ValueEnd}, + {6, token_t::StructMemberEnd}, + {6, token_t::StructEnd}, + // Line 1 (invalid) + {0, token_t::StructBegin}, + {0, token_t::StructEnd}, + // Line 2 (valid) + {29, token_t::StructBegin}, + {30, token_t::StructMemberBegin}, + {30, token_t::FieldNameBegin}, + {32, token_t::FieldNameEnd}, + {34, token_t::ListBegin}, + {35, token_t::ValueBegin}, + {36, token_t::ValueEnd}, + {36, token_t::ListEnd}, + {37, token_t::StructMemberEnd}, + {37, token_t::StructEnd}, + // Line 3 (valid) + {41, token_t::StructBegin}, + {42, token_t::StructMemberBegin}, + {42, token_t::FieldNameBegin}, + {44, token_t::FieldNameEnd}, + {46, token_t::ValueBegin}, + {49, token_t::ValueEnd}, + {49, token_t::StructMemberEnd}, + {49, token_t::StructEnd}, + // Line 4 (valid) + {61, token_t::StructBegin}, + {62, token_t::StructMemberBegin}, + {62, token_t::FieldNameBegin}, + {64, token_t::FieldNameEnd}, + {70, token_t::ValueBegin}, + {73, token_t::ValueEnd}, + {74, token_t::StructMemberEnd}, + {74, token_t::StructEnd}}; + } else { + /* Input: + * {"a":2} + * {"a": + * {"a":{"a":[321 + * {"a":[1]} + * + * + * {"b":123} + * {"b":123} + * {"b"\n:\n\n\n123\n} + */ + golden_token_stream = {// Line 0 (valid) + {0, token_t::StructBegin}, + {1, token_t::StructMemberBegin}, + {1, token_t::FieldNameBegin}, + {3, token_t::FieldNameEnd}, + {5, token_t::ValueBegin}, + {6, token_t::ValueEnd}, + {6, token_t::StructMemberEnd}, + {6, token_t::StructEnd}, + // Line 1 (invalid) + {0, token_t::StructBegin}, + {0, token_t::StructEnd}, + // Line 2 (invalid) + {0, token_t::StructBegin}, + {0, token_t::StructEnd}, + // Line 3 (valid) + {29, token_t::StructBegin}, + {30, token_t::StructMemberBegin}, + {30, token_t::FieldNameBegin}, + {32, token_t::FieldNameEnd}, + {34, token_t::ListBegin}, + {35, token_t::ValueBegin}, + {36, token_t::ValueEnd}, + {36, token_t::ListEnd}, + {37, token_t::StructMemberEnd}, + {37, token_t::StructEnd}, + // Line 4 (valid) + {41, token_t::StructBegin}, + {42, token_t::StructMemberBegin}, + {42, token_t::FieldNameBegin}, + {44, token_t::FieldNameEnd}, + {46, token_t::ValueBegin}, + {49, token_t::ValueEnd}, + {49, token_t::StructMemberEnd}, + {49, token_t::StructEnd}, + // Line 5 (valid) + {51, token_t::StructBegin}, + {52, 
token_t::StructMemberBegin}, + {52, token_t::FieldNameBegin}, + {54, token_t::FieldNameEnd}, + {56, token_t::ValueBegin}, + {59, token_t::ValueEnd}, + {59, token_t::StructMemberEnd}, + {59, token_t::StructEnd}, + // Line 6 (invalid) + {0, token_t::StructBegin}, + {0, token_t::StructEnd}, + {0, token_t::StructBegin}, + {0, token_t::StructEnd}, + {0, token_t::StructBegin}, + {0, token_t::StructEnd}, + {0, token_t::StructBegin}, + {0, token_t::StructEnd}}; + } + + auto const stream = cudf::get_default_stream(); + + // Prepare input & output buffers + cudf::string_scalar const d_scalar(input, true, stream); + auto const d_input = cudf::device_span{ + d_scalar.data(), static_cast(d_scalar.size())}; + + // Default parsing options + cudf::io::json_reader_options const in_opts = + cudf::io::json_reader_options::builder(cudf::io::source_info{}) + .recovery_mode(cudf::io::json_recovery_mode_t::RECOVER_WITH_NULL) + .delimiter(delimiter) + .lines(true); + + // Parse the JSON and get the token stream + auto [d_tokens_gpu, d_token_indices_gpu] = cuio_json::detail::get_token_stream( + d_input, in_opts, stream, cudf::get_current_device_resource_ref()); + // Copy back the number of tokens that were written + auto const tokens_gpu = cudf::detail::make_std_vector_async(d_tokens_gpu, stream); + auto const token_indices_gpu = cudf::detail::make_std_vector_async(d_token_indices_gpu, stream); + + stream.synchronize(); + // Verify the number of tokens matches + ASSERT_EQ(golden_token_stream.size(), tokens_gpu.size()); + ASSERT_EQ(golden_token_stream.size(), token_indices_gpu.size()); + + for (std::size_t i = 0; i < tokens_gpu.size(); i++) { + // Ensure the index the tokens are pointing to do match + EXPECT_EQ(golden_token_stream[i].first, token_indices_gpu[i]) << "Mismatch at #" << i; + // Ensure the token category is correct + EXPECT_EQ(golden_token_stream[i].second, tokens_gpu[i]) << "Mismatch at #" << i; + } +} + CUDF_TEST_PROGRAM_MAIN() From 1ad8dedfb062312a07d9100322d13c1d43470148 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Fri, 27 Sep 2024 20:14:52 -0700 Subject: [PATCH 23/26] Apply clang-tidy autofixes (#16949) With the last few clang-tidy PRs merged, clang-tidy now completes successfully enough that autofixes work again. This is the result of an autofix pass. 
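
As a hedged illustration (hypothetical code, not taken from this diff),
typical autofixes in this pass swap verbose type-trait spellings for their
`_t` aliases and replace `push_back` of a temporary with `emplace_back`:

```cpp
#include <string>
#include <type_traits>
#include <vector>

// modernize-type-traits: was `typename std::common_type<L, R>::type`.
template <typename L, typename R>
using common_t = std::common_type_t<L, R>;

int main()
{
  std::vector<std::string> rows;
  // modernize-use-emplace: was `rows.push_back(std::string("...:|"));`.
  // emplace_back constructs the string in place instead of copying a temporary.
  rows.emplace_back("...:|");

  static_assert(std::is_same_v<common_t<int, long>, long>, "common_type check");
  return static_cast<int>(rows.size());
}
```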
Authors: - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Bradley Dice (https://github.com/bdice) - Karthikeyan (https://github.com/karthikeyann) - David Wendt (https://github.com/davidwendt) URL: https://github.com/rapidsai/cudf/pull/16949 --- cpp/.clang-tidy | 3 +- cpp/include/cudf/io/parquet.hpp | 2 +- cpp/include/cudf_test/type_list_utilities.hpp | 5 +-- cpp/src/io/functions.cpp | 5 ++- cpp/src/io/utilities/output_builder.cuh | 4 +- cpp/tests/ast/transform_tests.cpp | 6 +-- cpp/tests/binaryop/util/operation.h | 44 +++++++++---------- cpp/tests/groupby/mean_tests.cpp | 4 +- cpp/tests/io/json/json_test.cpp | 3 +- cpp/tests/io/parquet_common.hpp | 6 +-- cpp/tests/io/text/multibyte_split_test.cpp | 2 +- 11 files changed, 41 insertions(+), 43 deletions(-) diff --git a/cpp/.clang-tidy b/cpp/.clang-tidy index d766d98b45e..b791d846d1d 100644 --- a/cpp/.clang-tidy +++ b/cpp/.clang-tidy @@ -3,7 +3,8 @@ Checks: 'modernize-*, -modernize-use-equals-default, -modernize-concat-nested-namespaces, - -modernize-use-trailing-return-type' + -modernize-use-trailing-return-type, + -modernize-use-bool-literals' # -modernize-use-equals-default # auto-fix is broken (doesn't insert =default correctly) # -modernize-concat-nested-namespaces # auto-fix is broken (can delete code) diff --git a/cpp/include/cudf/io/parquet.hpp b/cpp/include/cudf/io/parquet.hpp index ee03a382bec..bfe76d5690c 100644 --- a/cpp/include/cudf/io/parquet.hpp +++ b/cpp/include/cudf/io/parquet.hpp @@ -1200,7 +1200,7 @@ class parquet_writer_options : public parquet_writer_options_base { * @param sink The sink used for writer output * @param table Table to be written to output */ - explicit parquet_writer_options(sink_info const& sink, table_view const& table); + explicit parquet_writer_options(sink_info const& sink, table_view table); public: /** diff --git a/cpp/include/cudf_test/type_list_utilities.hpp b/cpp/include/cudf_test/type_list_utilities.hpp index 1793a8ecce0..3c96c59f0b7 100644 --- a/cpp/include/cudf_test/type_list_utilities.hpp +++ b/cpp/include/cudf_test/type_list_utilities.hpp @@ -414,9 +414,8 @@ struct RemoveIfImpl> { template struct RemoveIfImpl> { - using type = - Concat::value, Types<>, Types>::type, - typename RemoveIfImpl>::type>; + using type = Concat::value, Types<>, Types>, + typename RemoveIfImpl>::type>; }; // @endcond diff --git a/cpp/src/io/functions.cpp b/cpp/src/io/functions.cpp index 0ca54da5aaf..de8eea9e99b 100644 --- a/cpp/src/io/functions.cpp +++ b/cpp/src/io/functions.cpp @@ -38,6 +38,7 @@ #include #include +#include namespace cudf::io { @@ -852,8 +853,8 @@ void parquet_writer_options_base::set_sorting_columns(std::vector gather(rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) const + [[nodiscard]] rmm::device_uvector gather(rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) const { rmm::device_uvector output{size(), stream, mr}; auto output_it = output.begin(); diff --git a/cpp/tests/ast/transform_tests.cpp b/cpp/tests/ast/transform_tests.cpp index 6b350c137d0..a4bde50a21e 100644 --- a/cpp/tests/ast/transform_tests.cpp +++ b/cpp/tests/ast/transform_tests.cpp @@ -378,7 +378,7 @@ TEST_F(TransformTest, DeeplyNestedArithmeticLogicalExpression) auto expressions = std::list(); auto op = arithmetic_operator; - expressions.push_back(cudf::ast::operation(op, col_ref, col_ref)); + expressions.emplace_back(op, col_ref, col_ref); for (int64_t i = 0; i < depth_level - 1; i++) { if (i == depth_level - 2) { @@ -387,9 +387,9 @@ TEST_F(TransformTest, 
DeeplyNestedArithmeticLogicalExpression) op = arithmetic_operator; } if (nested_left_tree) { - expressions.push_back(cudf::ast::operation(op, expressions.back(), col_ref)); + expressions.emplace_back(op, expressions.back(), col_ref); } else { - expressions.push_back(cudf::ast::operation(op, col_ref, expressions.back())); + expressions.emplace_back(op, col_ref, expressions.back()); } } return expressions; diff --git a/cpp/tests/binaryop/util/operation.h b/cpp/tests/binaryop/util/operation.h index c900c4c558c..d36b48d666a 100644 --- a/cpp/tests/binaryop/util/operation.h +++ b/cpp/tests/binaryop/util/operation.h @@ -48,7 +48,7 @@ struct Add { void>* = nullptr> OutT operator()(TypeLhs lhs, TypeRhs rhs) const { - using TypeCommon = typename std::common_type::type; + using TypeCommon = std::common_type_t; return static_cast(static_cast(lhs) + static_cast(rhs)); } }; @@ -72,7 +72,7 @@ struct Sub { void>* = nullptr> OutT operator()(TypeLhs lhs, TypeRhs rhs) const { - using TypeCommon = typename std::common_type::type; + using TypeCommon = std::common_type_t; return static_cast(static_cast(lhs) - static_cast(rhs)); } }; @@ -83,7 +83,7 @@ struct Mul { std::enable_if_t::value, void>* = nullptr> TypeOut operator()(TypeLhs lhs, TypeRhs rhs) const { - using TypeCommon = typename std::common_type::type; + using TypeCommon = std::common_type_t; return static_cast(static_cast(lhs) * static_cast(rhs)); } @@ -112,7 +112,7 @@ struct Div { std::enable_if_t::value, void>* = nullptr> TypeOut operator()(TypeLhs lhs, TypeRhs rhs) { - using TypeCommon = typename std::common_type::type; + using TypeCommon = std::common_type_t; return static_cast(static_cast(lhs) / static_cast(rhs)); } @@ -191,33 +191,31 @@ struct FloorDiv { template struct Mod { - template ::type>)>* = nullptr> + template >)>* = nullptr> TypeOut operator()(TypeLhs lhs, TypeRhs rhs) { - using TypeCommon = typename std::common_type::type; + using TypeCommon = std::common_type_t; return static_cast(static_cast(lhs) % static_cast(rhs)); } - template ::type, float>)>* = nullptr> + template < + typename OutT = TypeOut, + typename LhsT = TypeLhs, + typename RhsT = TypeRhs, + std::enable_if_t<(std::is_same_v, float>)>* = nullptr> TypeOut operator()(TypeLhs lhs, TypeRhs rhs) { return static_cast(fmod(static_cast(lhs), static_cast(rhs))); } template < - typename OutT = TypeOut, - typename LhsT = TypeLhs, - typename RhsT = TypeRhs, - std::enable_if_t<(std::is_same_v::type, double>)>* = - nullptr> + typename OutT = TypeOut, + typename LhsT = TypeLhs, + typename RhsT = TypeRhs, + std::enable_if_t<(std::is_same_v, double>)>* = nullptr> TypeOut operator()(TypeLhs lhs, TypeRhs rhs) { return static_cast(fmod(static_cast(lhs), static_cast(rhs))); @@ -326,7 +324,7 @@ struct LogBase { template struct PMod { - using CommonArgsT = typename std::common_type::type; + using CommonArgsT = std::common_type_t; TypeOut operator()(TypeLhs x, TypeRhs y) const { @@ -351,8 +349,8 @@ struct PyMod { TypeOut operator()(TypeLhs x, TypeRhs y) const { if constexpr (std::is_floating_point_v or std::is_floating_point_v) { - double x1 = static_cast(x); - double y1 = static_cast(y); + auto x1 = static_cast(x); + auto y1 = static_cast(y); return fmod(fmod(x1, y1) + y1, y1); } else { return ((x % y) + y) % y; diff --git a/cpp/tests/groupby/mean_tests.cpp b/cpp/tests/groupby/mean_tests.cpp index 0cb5ee30a8b..e9c72293649 100644 --- a/cpp/tests/groupby/mean_tests.cpp +++ b/cpp/tests/groupby/mean_tests.cpp @@ -49,7 +49,7 @@ TYPED_TEST(groupby_mean_test, basic) { using V = TypeParam; using R = 
cudf::detail::target_type_t<V, cudf::aggregation::MEAN>;
-  using RT = typename std::conditional<cudf::is_chrono<R>(), int, double>::type;
+  using RT = std::conditional_t<cudf::is_chrono<R>(), int, double>;
 
   cudf::test::fixed_width_column_wrapper keys{1, 2, 3, 1, 2, 2, 1, 3, 3, 2};
   cudf::test::fixed_width_column_wrapper vals{0, 1, 2, 3, 4, 5, 6, 7, 8, 9};
@@ -114,7 +114,7 @@ TYPED_TEST(groupby_mean_test, null_keys_and_values)
 {
   using V = TypeParam;
   using R = cudf::detail::target_type_t<V, cudf::aggregation::MEAN>;
-  using RT = typename std::conditional<cudf::is_chrono<R>(), int, double>::type;
+  using RT = std::conditional_t<cudf::is_chrono<R>(), int, double>;
 
   cudf::test::fixed_width_column_wrapper keys(
     {1, 2, 3, 1, 2, 2, 1, 3, 3, 2, 4},
diff --git a/cpp/tests/io/json/json_test.cpp b/cpp/tests/io/json/json_test.cpp
index a094ac7d772..49ad0c408dc 100644
--- a/cpp/tests/io/json/json_test.cpp
+++ b/cpp/tests/io/json/json_test.cpp
@@ -858,8 +858,7 @@ TEST_P(JsonReaderRecordTest, JsonLinesObjects)
 
 TEST_P(JsonReaderRecordTest, JsonLinesObjectsStrings)
 {
-  auto const test_opt = GetParam();
-  auto test_json_objects = [test_opt](std::string const& data) {
+  auto test_json_objects = [](std::string const& data) {
     cudf::io::json_reader_options in_options =
       cudf::io::json_reader_options::builder(cudf::io::source_info{data.data(), data.size()})
         .lines(true);
diff --git a/cpp/tests/io/parquet_common.hpp b/cpp/tests/io/parquet_common.hpp
index bc6145d77da..bd1579eaa1b 100644
--- a/cpp/tests/io/parquet_common.hpp
+++ b/cpp/tests/io/parquet_common.hpp
@@ -35,9 +35,9 @@
 template <typename T, typename SourceElementT = T>
 using column_wrapper =
-  typename std::conditional<std::is_same_v<T, cudf::string_view>,
-                            cudf::test::strings_column_wrapper,
-                            cudf::test::fixed_width_column_wrapper<T, SourceElementT>>::type;
+  std::conditional_t<std::is_same_v<T, cudf::string_view>,
+                     cudf::test::strings_column_wrapper,
+                     cudf::test::fixed_width_column_wrapper<T, SourceElementT>>;
 using column     = cudf::column;
 using table      = cudf::table;
 using table_view = cudf::table_view;
diff --git a/cpp/tests/io/text/multibyte_split_test.cpp b/cpp/tests/io/text/multibyte_split_test.cpp
index 408d54bd5ff..74d08061df9 100644
--- a/cpp/tests/io/text/multibyte_split_test.cpp
+++ b/cpp/tests/io/text/multibyte_split_test.cpp
@@ -145,7 +145,7 @@ TEST_F(MultibyteSplitTest, LargeInput)
 
   for (auto i = 0; i < (2 * 32 * 128 * 1024); i++) {
     host_input += "...:|";
-    host_expected.emplace_back(std::string("...:|"));
+    host_expected.emplace_back("...:|");
   }
 
   auto expected = strings_column_wrapper{host_expected.begin(), host_expected.end()};

From e2bcbb880f540987eb3fbd0fede9fed826ea2fdf Mon Sep 17 00:00:00 2001
From: Vukasin Milovanovic
Date: Fri, 27 Sep 2024 23:21:53 -0700
Subject: [PATCH 24/26] Rework `read_csv` IO to avoid reading whole input with
 a single `host_read` (#16826)

Issue https://github.com/rapidsai/cudf/issues/13797

The CSV reader ingests all input data with a single call to `host_read`.
This is a problem for a few reasons:

1. With `cudaHostRegister` we cannot reliably copy from the mapped region to
   the GPU without issues with mixing registered and unregistered areas. The
   reader can't know the datasource implementation details needed to avoid
   this issue.
2. Currently the reader performs the H2D copies manually, so there are no
   multi-threaded or pinned memory optimizations. Using `device_read` has
   the potential to outperform manual copies.

This PR changes `read_csv` IO to perform small `host_read`s to get data such
as the BOM and the first row. Most of the data is then read in chunks using
`device_read` calls. We can further remove host_reads by moving some of the
host processing to the GPU.

No significant changes in performance.
We are likely to get performance improvements from future changes like increasing the kvikIO thread pool size. Authors: - Vukasin Milovanovic (https://github.com/vuule) - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - MithunR (https://github.com/mythrocks) - Karthikeyan (https://github.com/karthikeyann) URL: https://github.com/rapidsai/cudf/pull/16826 --- cpp/src/io/csv/reader_impl.cu | 299 ++++++++++++++++++++-------------- 1 file changed, 178 insertions(+), 121 deletions(-) diff --git a/cpp/src/io/csv/reader_impl.cu b/cpp/src/io/csv/reader_impl.cu index ebca334a715..8c32fc85f78 100644 --- a/cpp/src/io/csv/reader_impl.cu +++ b/cpp/src/io/csv/reader_impl.cu @@ -46,11 +46,8 @@ #include #include -#include #include -#include #include -#include #include #include #include @@ -88,7 +85,7 @@ class selected_rows_offsets { : all{std::move(data)}, selected{selected_span} { } - selected_rows_offsets(rmm::cuda_stream_view stream) : all{0, stream}, selected{all} {} + explicit selected_rows_offsets(rmm::cuda_stream_view stream) : all{0, stream}, selected{all} {} operator device_span() const { return selected; } void shrink(size_t size) @@ -196,15 +193,11 @@ void erase_except_last(C& container, rmm::cuda_stream_view stream) container.resize(1, stream); } -size_t find_first_row_start(char row_terminator, host_span data) +constexpr std::array UTF8_BOM = {0xEF, 0xBB, 0xBF}; +[[nodiscard]] bool has_utf8_bom(host_span data) { - // For now, look for the first terminator (assume the first terminator isn't within a quote) - // TODO: Attempt to infer this from the data - size_t pos = 0; - while (pos < data.size() && data[pos] != row_terminator) { - ++pos; - } - return std::min(pos + 1, data.size()); + return data.size() >= UTF8_BOM.size() && + memcmp(data.data(), UTF8_BOM.data(), UTF8_BOM.size()) == 0; } /** @@ -213,20 +206,28 @@ size_t find_first_row_start(char row_terminator, host_span data) * This function scans the input data to record the row offsets (relative to the start of the * input data). A row is actually the data/offset between two termination symbols. 
* - * @param data Uncompressed input data in host memory - * @param range_begin Only include rows starting after this position - * @param range_end Only include rows starting before this position - * @param skip_rows Number of rows to skip from the start - * @param num_rows Number of rows to read; -1: all remaining data - * @param load_whole_file Hint that the entire data will be needed on gpu - * @param stream CUDA stream used for device memory operations and kernel launches - * @return Input data and row offsets in the device memory + * @param[in] source The source data (may be compressed) + * @param[in] reader_opts Settings for controlling reading behavior + * @param[in] parse_opts Settings for controlling parsing behavior + * @param[out] header The header row, if any + * @param[in] data Host buffer containing uncompressed data, if input is compressed + * @param[in] byte_range_offset Offset of the byte range + * @param[in] range_begin Start of the first row, relative to the byte range start + * @param[in] range_end End of the data to read, relative to the byte range start; equal to the + * data size if all data after byte_range_offset needs to be read + * @param[in] skip_rows Number of rows to skip from the start + * @param[in] num_rows Number of rows to read; -1 means all + * @param[in] load_whole_file Indicates if the whole file should be read + * @param[in] stream CUDA stream used for device memory operations and kernel launches + * @return Input data and row offsets in the device memory */ std::pair, selected_rows_offsets> load_data_and_gather_row_offsets( + cudf::io::datasource* source, csv_reader_options const& reader_opts, parse_options const& parse_opts, std::vector& header, - host_span data, + std::optional> data, + size_t byte_range_offset, size_t range_begin, size_t range_end, size_t skip_rows, @@ -235,50 +236,81 @@ std::pair, selected_rows_offsets> load_data_and_gather rmm::cuda_stream_view stream) { constexpr size_t max_chunk_bytes = 64 * 1024 * 1024; // 64MB - size_t buffer_size = std::min(max_chunk_bytes, data.size()); - size_t max_blocks = - std::max((buffer_size / cudf::io::csv::gpu::rowofs_block_bytes) + 1, 2); - cudf::detail::hostdevice_vector row_ctx(max_blocks, stream); - size_t buffer_pos = std::min(range_begin - std::min(range_begin, sizeof(char)), data.size()); - size_t pos = std::min(range_begin, data.size()); - size_t header_rows = (reader_opts.get_header() >= 0) ? reader_opts.get_header() + 1 : 0; - uint64_t ctx = 0; + + auto const data_size = data.has_value() ? data->size() : source->size(); + auto const buffer_size = std::min(max_chunk_bytes, data_size); + auto const max_input_size = [&] { + if (range_end == data_size) { + return data_size - byte_range_offset; + } else { + return std::min(reader_opts.get_byte_range_size_with_padding(), + data_size - byte_range_offset); + } + }(); + auto const header_rows = (reader_opts.get_header() >= 0) ? reader_opts.get_header() + 1 : 0; // For compatibility with the previous parser, a row is considered in-range if the // previous row terminator is within the given range - range_end += (range_end < data.size()); + range_end += (range_end < data_size); - // Reserve memory by allocating and then resetting the size - rmm::device_uvector d_data{ - (load_whole_file) ? data.size() : std::min(buffer_size * 2, data.size()), stream}; - d_data.resize(0, stream); + auto pos = range_begin; + // When using byte range, need the line terminator of last line before the range + auto input_pos = byte_range_offset == 0 ? 
pos : pos - 1; + uint64_t ctx = 0; + + rmm::device_uvector d_data{0, stream}; + d_data.reserve((load_whole_file) ? data_size : std::min(buffer_size * 2, max_input_size), stream); rmm::device_uvector all_row_offsets{0, stream}; + + auto const max_blocks = + std::max((buffer_size / cudf::io::csv::gpu::rowofs_block_bytes) + 1, 2); + cudf::detail::hostdevice_vector row_ctx(max_blocks, stream); do { - size_t target_pos = std::min(pos + max_chunk_bytes, data.size()); - size_t chunk_size = target_pos - pos; + auto const target_pos = std::min(pos + max_chunk_bytes, max_input_size); + auto const chunk_size = target_pos - pos; auto const previous_data_size = d_data.size(); - d_data.resize(target_pos - buffer_pos, stream); - CUDF_CUDA_TRY(cudaMemcpyAsync(d_data.begin() + previous_data_size, - data.begin() + buffer_pos + previous_data_size, - target_pos - buffer_pos - previous_data_size, - cudaMemcpyDefault, - stream.value())); + d_data.resize(target_pos - input_pos, stream); + + auto const read_offset = byte_range_offset + input_pos + previous_data_size; + auto const read_size = target_pos - input_pos - previous_data_size; + if (data.has_value()) { + CUDF_CUDA_TRY(cudaMemcpyAsync(d_data.data() + previous_data_size, + data->data() + read_offset, + target_pos - input_pos - previous_data_size, + cudaMemcpyDefault, + stream.value())); + } else { + if (source->is_device_read_preferred(read_size)) { + source->device_read(read_offset, + read_size, + reinterpret_cast(d_data.data() + previous_data_size), + stream); + } else { + auto const buffer = source->host_read(read_offset, read_size); + CUDF_CUDA_TRY(cudaMemcpyAsync(d_data.data() + previous_data_size, + buffer->data(), + buffer->size(), + cudaMemcpyDefault, + stream.value())); + stream.synchronize(); // To prevent buffer going out of scope before we copy the data. + } + } // Pass 1: Count the potential number of rows in each character block for each // possible parser state at the beginning of the block. 
- uint32_t num_blocks = cudf::io::csv::gpu::gather_row_offsets(parse_opts.view(), - row_ctx.device_ptr(), - device_span(), - d_data, - chunk_size, - pos, - buffer_pos, - data.size(), - range_begin, - range_end, - skip_rows, - stream); + auto const num_blocks = cudf::io::csv::gpu::gather_row_offsets(parse_opts.view(), + row_ctx.device_ptr(), + device_span(), + d_data, + chunk_size, + pos, + input_pos, + max_input_size, + range_begin, + range_end, + skip_rows, + stream); CUDF_CUDA_TRY(cudaMemcpyAsync(row_ctx.host_ptr(), row_ctx.device_ptr(), num_blocks * sizeof(uint64_t), @@ -312,14 +344,14 @@ std::pair, selected_rows_offsets> load_data_and_gather d_data, chunk_size, pos, - buffer_pos, - data.size(), + input_pos, + max_input_size, range_begin, range_end, skip_rows, stream); // With byte range, we want to keep only one row out of the specified range - if (range_end < data.size()) { + if (range_end < data_size) { CUDF_CUDA_TRY(cudaMemcpyAsync(row_ctx.host_ptr(), row_ctx.device_ptr(), num_blocks * sizeof(uint64_t), @@ -356,18 +388,18 @@ std::pair, selected_rows_offsets> load_data_and_gather size_t discard_bytes = std::max(d_data.size(), sizeof(char)) - sizeof(char); if (discard_bytes != 0) { erase_except_last(d_data, stream); - buffer_pos += discard_bytes; + input_pos += discard_bytes; } } pos = target_pos; - } while (pos < data.size()); + } while (pos < max_input_size); auto const non_blank_row_offsets = io::csv::gpu::remove_blank_rows(parse_opts.view(), d_data, all_row_offsets, stream); auto row_offsets = selected_rows_offsets{std::move(all_row_offsets), non_blank_row_offsets}; // Remove header rows and extract header - size_t const header_row_index = std::max(header_rows, 1) - 1; + auto const header_row_index = std::max(header_rows, 1) - 1; if (header_row_index + 1 < row_offsets.size()) { CUDF_CUDA_TRY(cudaMemcpyAsync(row_ctx.host_ptr(), row_offsets.data() + header_row_index, @@ -376,11 +408,20 @@ std::pair, selected_rows_offsets> load_data_and_gather stream.value())); stream.synchronize(); - auto const header_start = buffer_pos + row_ctx[0]; - auto const header_end = buffer_pos + row_ctx[1]; - CUDF_EXPECTS(header_start <= header_end && header_end <= data.size(), + auto const header_start = input_pos + row_ctx[0]; + auto const header_end = input_pos + row_ctx[1]; + CUDF_EXPECTS(header_start <= header_end && header_end <= max_input_size, "Invalid csv header location"); - header.assign(data.begin() + header_start, data.begin() + header_end); + header.resize(header_end - header_start); + if (data.has_value()) { + std::copy(data->begin() + byte_range_offset + header_start, + data->begin() + byte_range_offset + header_end, + header.begin()); + } else { + source->host_read(header_start + byte_range_offset, + header_end - header_start, + reinterpret_cast(header.data())); + } if (header_rows > 0) { row_offsets.erase_first_n(header_rows); } } // Apply num_rows limit @@ -397,73 +438,89 @@ std::pair, selected_rows_offsets> select_data_and_row_ parse_options const& parse_opts, rmm::cuda_stream_view stream) { - auto range_offset = reader_opts.get_byte_range_offset(); - auto range_size = reader_opts.get_byte_range_size(); - auto range_size_padded = reader_opts.get_byte_range_size_with_padding(); - auto skip_rows = reader_opts.get_skiprows(); - auto skip_end_rows = reader_opts.get_skipfooter(); - auto num_rows = reader_opts.get_nrows(); + auto range_offset = reader_opts.get_byte_range_offset(); + auto range_size = reader_opts.get_byte_range_size(); + auto skip_rows = reader_opts.get_skiprows(); + auto 
skip_end_rows = reader_opts.get_skipfooter(); + auto num_rows = reader_opts.get_nrows(); if (range_offset > 0 || range_size > 0) { CUDF_EXPECTS(reader_opts.get_compression() == compression_type::NONE, "Reading compressed data using `byte range` is unsupported"); } + // TODO: Allow parsing the header outside the mapped range + CUDF_EXPECTS((range_offset == 0 || reader_opts.get_header() < 0), + "byte_range offset with header not supported"); - // Transfer source data to GPU - if (!source->is_empty()) { - auto buffer = - source->host_read(range_offset, range_size_padded != 0 ? range_size_padded : source->size()); - auto h_data = - host_span(reinterpret_cast(buffer->data()), buffer->size()); - - std::vector h_uncomp_data_owner; - if (reader_opts.get_compression() != compression_type::NONE) { - h_uncomp_data_owner = - decompress(reader_opts.get_compression(), {buffer->data(), buffer->size()}); - h_data = {reinterpret_cast(h_uncomp_data_owner.data()), - h_uncomp_data_owner.size()}; - buffer.reset(); - } + if (source->is_empty()) { + return {rmm::device_uvector{0, stream}, selected_rows_offsets{stream}}; + } - // check for and skip UTF-8 BOM - uint8_t const UTF8_BOM[] = {0xEF, 0xBB, 0xBF}; - if (h_data.size() >= sizeof(UTF8_BOM) && - memcmp(h_data.data(), UTF8_BOM, sizeof(UTF8_BOM)) == 0) { - h_data = h_data.subspan(sizeof(UTF8_BOM), h_data.size() - sizeof(UTF8_BOM)); - } + std::optional> h_data; + std::vector h_uncomp_data_owner; + if (reader_opts.get_compression() != compression_type::NONE) { + auto const h_comp_data = source->host_read(0, source->size()); + h_uncomp_data_owner = + decompress(reader_opts.get_compression(), {h_comp_data->data(), h_comp_data->size()}); + h_data = host_span{reinterpret_cast(h_uncomp_data_owner.data()), + h_uncomp_data_owner.size()}; + } - // None of the parameters for row selection is used, we are parsing the entire file - bool const load_whole_file = range_offset == 0 && range_size == 0 && skip_rows <= 0 && - skip_end_rows <= 0 && num_rows == -1; - - // With byte range, find the start of the first data row - size_t const data_start_offset = - (range_offset != 0) ? find_first_row_start(parse_opts.terminator, h_data) : 0; - - // TODO: Allow parsing the header outside the mapped range - CUDF_EXPECTS((range_offset == 0 || reader_opts.get_header() < 0), - "byte_range offset with header not supported"); - - // Gather row offsets - auto data_row_offsets = - load_data_and_gather_row_offsets(reader_opts, - parse_opts, - header, - h_data, - data_start_offset, - (range_size) ? range_size : h_data.size(), - (skip_rows > 0) ? 
skip_rows : 0,
-                                       num_rows,
-                                       load_whole_file,
-                                       stream);
-    auto& row_offsets = data_row_offsets.second;
-    // Exclude the rows that are to be skipped from the end
-    if (skip_end_rows > 0 && static_cast(skip_end_rows) < row_offsets.size()) {
-      row_offsets.shrink(row_offsets.size() - skip_end_rows);
+  size_t data_start_offset = range_offset;
+  if (h_data.has_value()) {
+    if (has_utf8_bom(*h_data)) { data_start_offset += sizeof(UTF8_BOM); }
+  } else {
+    if (range_offset == 0) {
+      auto bom_buffer = source->host_read(0, std::min(source->size(), sizeof(UTF8_BOM)));
+      auto bom_chars  = host_span{reinterpret_cast(bom_buffer->data()),
+                                  bom_buffer->size()};
+      if (has_utf8_bom(bom_chars)) { data_start_offset += sizeof(UTF8_BOM); }
+    } else {
+      auto find_data_start_chunk_size = 1024ul;
+      while (data_start_offset < source->size()) {
+        auto const read_size =
+          std::min(find_data_start_chunk_size, source->size() - data_start_offset);
+        auto buffer = source->host_read(data_start_offset, read_size);
+        auto buffer_chars =
+          host_span{reinterpret_cast(buffer->data()), buffer->size()};
+
+        if (auto first_row_start =
+              std::find(buffer_chars.begin(), buffer_chars.end(), parse_opts.terminator);
+            first_row_start != buffer_chars.end()) {
+          data_start_offset += std::distance(buffer_chars.begin(), first_row_start) + 1;
+          break;
+        }
+        data_start_offset += read_size;
+        find_data_start_chunk_size *= 2;
+      }
     }
-    return data_row_offsets;
   }
-  return {rmm::device_uvector{0, stream}, selected_rows_offsets{stream}};
+
+  // None of the parameters for row selection is used, we are parsing the entire file
+  bool const load_whole_file =
+    range_offset == 0 && range_size == 0 && skip_rows <= 0 && skip_end_rows <= 0 && num_rows == -1;
+
+  // Transfer source data to GPU and gather row offsets
+  auto const uncomp_size = h_data.has_value() ? h_data->size() : source->size();
+  auto data_row_offsets  = load_data_and_gather_row_offsets(source,
+                                                            reader_opts,
+                                                            parse_opts,
+                                                            header,
+                                                            h_data,
+                                                            range_offset,
+                                                            data_start_offset - range_offset,
+                                                            (range_size) ? range_size : uncomp_size,
+                                                            (skip_rows > 0) ? skip_rows : 0,
+                                                            num_rows,
+                                                            load_whole_file,
+                                                            stream);
+  auto& row_offsets = data_row_offsets.second;
+  // Exclude the rows that are to be skipped from the end
+  if (skip_end_rows > 0 && static_cast(skip_end_rows) < row_offsets.size()) {
+    row_offsets.shrink(row_offsets.size() - skip_end_rows);
+  }
+
+  return data_row_offsets;
 }

 void select_data_types(host_span user_dtypes,

From 2b6408b5989dd7a52abe3cef3c5af7ff4681ee56 Mon Sep 17 00:00:00 2001
From: Lawrence Mitchell
Date: Mon, 30 Sep 2024 11:10:51 +0100
Subject: [PATCH 25/26] Fix order-preservation in cudf-polars groupby (#16907)

When we are requested to maintain order in groupby aggregations, we must
post-process the result by computing a permutation between the wanted
order (of the input keys) and the order returned by the groupby
aggregation. To do this, we can perform a join between the two unique
key tables.

Previously, we assumed that the gather map returned in this join for the
left (wanted order) table was the identity. However, this is not
guaranteed. So, in addition to computing the match between the wanted key
order and the key order we have, we must also apply the permutation
between the left gather map order and the identity.
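To make the key step concrete, here is a minimal NumPy model of the
permutation argument (illustrative, made-up data; the real code below
operates on pylibcudf tables):

    import numpy as np

    want = np.array([30, 10, 20])  # unique keys in the order we must return
    have = np.array([10, 20, 30])  # unique keys in the order the groupby produced

    # An inner join of `want` against `have` yields matching row pairs in an
    # arbitrary order; emulate that with a scrambled pairing.
    left_order = np.array([2, 0, 1])   # rows of `want`
    right_order = np.array([1, 2, 0])  # matching rows of `have`
    assert (want[left_order] == have[right_order]).all()

    # Sorting `right_order` by `left_order` makes the left map the identity,
    # so the sorted right map gathers `have` (and the aggregated values)
    # into the wanted order.
    perm = right_order[np.argsort(left_order)]
    assert (have[perm] == want).all()  # array([30, 10, 20])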
- Closes #16893 Authors: - Lawrence Mitchell (https://github.com/wence-) Approvers: - https://github.com/brandon-b-miller URL: https://github.com/rapidsai/cudf/pull/16907 --- python/cudf_polars/cudf_polars/dsl/ir.py | 31 ++++++++++++++++++------ python/cudf_polars/tests/test_groupby.py | 22 +++++++++++++++++ 2 files changed, 45 insertions(+), 8 deletions(-) diff --git a/python/cudf_polars/cudf_polars/dsl/ir.py b/python/cudf_polars/cudf_polars/dsl/ir.py index 8cd56c8ee3a..1c61075be22 100644 --- a/python/cudf_polars/cudf_polars/dsl/ir.py +++ b/python/cudf_polars/cudf_polars/dsl/ir.py @@ -603,24 +603,39 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: req.evaluate(result_subs, mapping=mapping) for req in self.agg_requests ] broadcasted = broadcast(*result_keys, *results) - result_keys = broadcasted[: len(result_keys)] - results = broadcasted[len(result_keys) :] # Handle order preservation of groups - # like cudf classic does - # https://github.com/rapidsai/cudf/blob/5780c4d8fb5afac2e04988a2ff5531f94c22d3a3/python/cudf/cudf/core/groupby/groupby.py#L723-L743 if self.maintain_order and not sorted: - left = plc.stream_compaction.stable_distinct( + # The order we want + want = plc.stream_compaction.stable_distinct( plc.Table([k.obj for k in keys]), list(range(group_keys.num_columns())), plc.stream_compaction.DuplicateKeepOption.KEEP_FIRST, plc.types.NullEquality.EQUAL, plc.types.NanEquality.ALL_EQUAL, ) - right = plc.Table([key.obj for key in result_keys]) - _, indices = plc.join.left_join(left, right, plc.types.NullEquality.EQUAL) + # The order we have + have = plc.Table([key.obj for key in broadcasted[: len(keys)]]) + + # We know an inner join is OK because by construction + # want and have are permutations of each other. + left_order, right_order = plc.join.inner_join( + want, have, plc.types.NullEquality.EQUAL + ) + # Now left_order is an arbitrary permutation of the ordering we + # want, and right_order is a matching permutation of the ordering + # we have. To get to the original ordering, we need + # left_order == iota(nrows), with right_order permuted + # appropriately. This can be obtained by sorting + # right_order by left_order. 
+ (right_order,) = plc.sorting.sort_by_key( + plc.Table([right_order]), + plc.Table([left_order]), + [plc.types.Order.ASCENDING], + [plc.types.NullOrder.AFTER], + ).columns() ordered_table = plc.copying.gather( plc.Table([col.obj for col in broadcasted]), - indices, + right_order, plc.copying.OutOfBoundsPolicy.DONT_CHECK, ) broadcasted = [ diff --git a/python/cudf_polars/tests/test_groupby.py b/python/cudf_polars/tests/test_groupby.py index 74bf8b9e4e2..1e8246496cd 100644 --- a/python/cudf_polars/tests/test_groupby.py +++ b/python/cudf_polars/tests/test_groupby.py @@ -4,6 +4,7 @@ import itertools +import numpy as np import pytest import polars as pl @@ -191,3 +192,24 @@ def test_groupby_literal_in_agg(df, key, expr): def test_groupby_unary_non_pointwise_raises(df, expr): q = df.group_by("key1").agg(expr) assert_ir_translation_raises(q, NotImplementedError) + + +@pytest.mark.parametrize("nrows", [30, 300, 300_000]) +@pytest.mark.parametrize("nkeys", [1, 2, 4]) +def test_groupby_maintain_order_random(nrows, nkeys, with_nulls): + key_names = [f"key{key}" for key in range(nkeys)] + key_values = [np.random.randint(100, size=nrows) for _ in key_names] + value = np.random.randint(-100, 100, size=nrows) + df = pl.DataFrame(dict(zip(key_names, key_values, strict=True), value=value)) + if with_nulls: + df = df.with_columns( + *( + pl.when(pl.col(name) == 1) + .then(None) + .otherwise(pl.col(name)) + .alias(name) + for name in key_names + ) + ) + q = df.lazy().group_by(key_names, maintain_order=True).agg(pl.col("value").sum()) + assert_gpu_result_equal(q) From 9b2f892c5ec59605bfdc3a2abe4885176950589a Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Mon, 30 Sep 2024 11:12:17 +0100 Subject: [PATCH 26/26] Fix order-preservation in pandas-compat unsorted groupby (#16942) When sort=False is requested in groupby aggregations and pandas compatibility mode is enabled, we are on the hook to produce the grouped aggregation result in an order which matches the input key order. We previously nearly did this, but the reordering relied on the (incorrect) assumption that when joining two tables with a left join, the resulting gather map for the left table is the identity. This is not the case. To fix this, we must permute the right (result) table gather map by the ordering that makes the left map the identity (AKA, sort by key with the left map as keys) before permuting the result. While here, replace the (bounds-checking) IndexedFrame.take call with usage of the internal (non-bounds-checking) _gather method. This avoids a redundant reduction over the indices, since by construction they are in bounds. 
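For context, the behaviour being matched is pandas' unsorted groupby, which
returns groups in order of first appearance (a small illustrative example
with made-up data):

    import pandas as pd

    df = pd.DataFrame({"key": [2, 1, 2, 3, 1], "value": [1, 2, 3, 4, 5]})
    # With sort=False, pandas returns groups in first-appearance order:
    # 2, 1, 3. In pandas compatibility mode, cudf must now produce the
    # same row order.
    print(df.groupby("key", sort=False).agg({"value": "sum"}))
    #      value
    # key
    # 2        4
    # 1        7
    # 3        4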
- Closes #16908 Authors: - Lawrence Mitchell (https://github.com/wence-) Approvers: - Matthew Roeschke (https://github.com/mroeschke) URL: https://github.com/rapidsai/cudf/pull/16942 --- python/cudf/cudf/core/groupby/groupby.py | 27 +++++++++++++---- .../groupby/test_ordering_pandas_compat.py | 29 +++++++++++++++++++ 2 files changed, 51 insertions(+), 5 deletions(-) create mode 100644 python/cudf/cudf/tests/groupby/test_ordering_pandas_compat.py diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index be05075a2cd..81b20488d8d 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -27,6 +27,7 @@ from cudf.core.abc import Serializable from cudf.core.column.column import ColumnBase, StructDtype, as_column from cudf.core.column_accessor import ColumnAccessor +from cudf.core.copy_types import GatherMap from cudf.core.join._join_helpers import _match_join_keys from cudf.core.mixins import Reducible, Scannable from cudf.core.multiindex import MultiIndex @@ -754,17 +755,33 @@ def agg(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs): left_cols = list(self.grouping.keys.drop_duplicates()._columns) right_cols = list(result_index._columns) join_keys = [ - _match_join_keys(lcol, rcol, "left") + _match_join_keys(lcol, rcol, "inner") for lcol, rcol in zip(left_cols, right_cols) ] # TODO: In future, see if we can centralize # logic else where that has similar patterns. join_keys = map(list, zip(*join_keys)) - _, indices = libcudf.join.join( - *join_keys, - how="left", + # By construction, left and right keys are related by + # a permutation, so we can use an inner join. + left_order, right_order = libcudf.join.join( + *join_keys, how="inner" + ) + # left order is some permutation of the ordering we + # want, and right order is a matching gather map for + # the result table. Get the correct order by sorting + # the right gather map. + (right_order,) = libcudf.sort.sort_by_key( + [right_order], + [left_order], + [True], + ["first"], + stable=False, + ) + result = result._gather( + GatherMap.from_column_unchecked( + right_order, len(result), nullify=False + ) ) - result = result.take(indices) if not self._as_index: result = result.reset_index() diff --git a/python/cudf/cudf/tests/groupby/test_ordering_pandas_compat.py b/python/cudf/cudf/tests/groupby/test_ordering_pandas_compat.py new file mode 100644 index 00000000000..a009802bab0 --- /dev/null +++ b/python/cudf/cudf/tests/groupby/test_ordering_pandas_compat.py @@ -0,0 +1,29 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+import numpy as np +import pytest + +import cudf +from cudf.testing import assert_eq + + +@pytest.fixture(params=[False, True], ids=["without_nulls", "with_nulls"]) +def with_nulls(request): + return request.param + + +@pytest.mark.parametrize("nrows", [30, 300, 300_000]) +@pytest.mark.parametrize("nkeys", [1, 2, 4]) +def test_groupby_maintain_order_random(nrows, nkeys, with_nulls): + key_names = [f"key{key}" for key in range(nkeys)] + key_values = [np.random.randint(100, size=nrows) for _ in key_names] + value = np.random.randint(-100, 100, size=nrows) + df = cudf.DataFrame(dict(zip(key_names, key_values), value=value)) + if with_nulls: + for key in key_names: + df.loc[df[key] == 1, key] = None + with cudf.option_context("mode.pandas_compatible", True): + got = df.groupby(key_names, sort=False).agg({"value": "sum"}) + expect = ( + df.to_pandas().groupby(key_names, sort=False).agg({"value": "sum"}) + ) + assert_eq(expect, got, check_index_type=not with_nulls)