From 19846b6c0ac40fc91ad28573af04ac7403754acb Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Wed, 14 Aug 2024 17:15:03 -1000
Subject: [PATCH] Disallow cudf.Index accepting column in favor of ._from_column (#16549)

Similar to https://github.com/rapidsai/cudf/pull/16454, this PR disallows the
public `cudf.Index` constructor from accepting a private `ColumnBase` object,
in favor of `_from_column` (which was added in the linked PR).

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

URL: https://github.com/rapidsai/cudf/pull/16549
---
 python/cudf/cudf/_lib/parquet.pyx | 4 +-
 python/cudf/cudf/_lib/utils.pyx | 6 +-
 python/cudf/cudf/api/types.py | 2 +-
 python/cudf/cudf/core/_base_index.py | 2 +-
 python/cudf/cudf/core/algorithms.py | 6 +-
 python/cudf/cudf/core/column/categorical.py | 8 +-
 python/cudf/cudf/core/column/datetime.py | 10 +-
 python/cudf/cudf/core/column/methods.py | 6 +-
 python/cudf/cudf/core/column/string.py | 2 +-
 python/cudf/cudf/core/cut.py | 2 +-
 python/cudf/cudf/core/dataframe.py | 8 +-
 python/cudf/cudf/core/dtypes.py | 14 +-
 python/cudf/cudf/core/groupby/groupby.py | 9 +-
 python/cudf/cudf/core/index.py | 238 ++++++++++++--------
 python/cudf/cudf/core/indexed_frame.py | 24 +-
 python/cudf/cudf/core/multiindex.py | 7 +-
 python/cudf/cudf/core/resample.py | 4 +-
 python/cudf/cudf/core/series.py | 4 +-
 python/cudf/cudf/core/tools/datetimes.py | 16 +-
 python/cudf/cudf/testing/testing.py | 8 +-
 python/cudf/cudf/tests/test_multiindex.py | 4 +-
 python/cudf/cudf/tests/test_string.py | 2 +-
 22 files changed, 232 insertions(+), 154 deletions(-)

diff --git a/python/cudf/cudf/_lib/parquet.pyx b/python/cudf/cudf/_lib/parquet.pyx
index 4a4b13b0b31..0fffb6ade58 100644
--- a/python/cudf/cudf/_lib/parquet.pyx
+++ b/python/cudf/cudf/_lib/parquet.pyx
@@ -222,7 +222,7 @@ cdef object _process_metadata(object df,
             if len(filtered_idx) > 0:
                 idx = cudf.concat(filtered_idx)
             else:
-                idx = cudf.Index(cudf.core.column.column_empty(0))
+                idx = cudf.Index._from_column(cudf.core.column.column_empty(0))
         else:
             start = range_index_meta["start"] + skip_rows
             stop = range_index_meta["stop"]
@@ -240,7 +240,7 @@ cdef object _process_metadata(object df,
             index_data = df[index_col]
             actual_index_names = list(index_col_names.values())
             if len(index_data._data) == 1:
-                idx = cudf.Index(
+                idx = cudf.Index._from_column(
                     index_data._data.columns[0],
                     name=actual_index_names[0]
                 )
diff --git a/python/cudf/cudf/_lib/utils.pyx b/python/cudf/cudf/_lib/utils.pyx
index f136cd997a7..267432a0182 100644
--- a/python/cudf/cudf/_lib/utils.pyx
+++ b/python/cudf/cudf/_lib/utils.pyx
@@ -93,12 +93,12 @@ cpdef generate_pandas_metadata(table, index):
     materialize_index = False
     if index is not False:
         for level, name in enumerate(table._index.names):
-            if isinstance(table._index, cudf.core.multiindex.MultiIndex):
+            if isinstance(table._index, cudf.MultiIndex):
                 idx = table.index.get_level_values(level)
             else:
                 idx = table.index

-            if isinstance(idx, cudf.core.index.RangeIndex):
+            if isinstance(idx, cudf.RangeIndex):
                 if index is None:
                     descr = {
                         "kind": "range",
@@ -110,7 +110,7 @@ cpdef generate_pandas_metadata(table, index):
                 else:
                     materialize_index = True
                     # When `index=True`, RangeIndex needs to be materialized.
-                    materialized_idx = cudf.Index(idx._values, name=idx.name)
+                    materialized_idx = idx._as_int_index()
                     descr = _index_level_name(
                         index_name=materialized_idx.name,
                         level=level,
diff --git a/python/cudf/cudf/api/types.py b/python/cudf/cudf/api/types.py
index 294ae2fd985..9c436dfad18 100644
--- a/python/cudf/cudf/api/types.py
+++ b/python/cudf/cudf/api/types.py
@@ -249,7 +249,7 @@ def _union_categoricals(
             new_categories=sorted_categories
         )

-    return cudf.Index(result_col)
+    return cudf.CategoricalIndex._from_column(result_col)


 def is_bool_dtype(arr_or_dtype):
diff --git a/python/cudf/cudf/core/_base_index.py b/python/cudf/cudf/core/_base_index.py
index c91514202c5..d13351c49dd 100644
--- a/python/cudf/cudf/core/_base_index.py
+++ b/python/cudf/cudf/core/_base_index.py
@@ -1979,7 +1979,7 @@ def from_pandas(cls, index: pd.Index, nan_as_null=no_default):
                 name=index.name,
             )
         else:
-            return cudf.Index(
+            return cudf.Index._from_column(
                 column.as_column(index, nan_as_null=nan_as_null),
                 name=index.name,
             )
diff --git a/python/cudf/cudf/core/algorithms.py b/python/cudf/cudf/core/algorithms.py
index 6c69fbd2637..e27d6ec8d3e 100644
--- a/python/cudf/cudf/core/algorithms.py
+++ b/python/cudf/cudf/core/algorithms.py
@@ -8,7 +8,7 @@
 import numpy as np

 from cudf.core.column import as_column
-from cudf.core.index import RangeIndex, ensure_index
+from cudf.core.index import Index, RangeIndex
 from cudf.core.scalar import Scalar
 from cudf.options import get_option
 from cudf.utils.dtypes import can_convert_to_column
@@ -112,7 +112,9 @@ def factorize(values, sort=False, use_na_sentinel=True, size_hint=None):
         dtype="int64" if get_option("mode.pandas_compatible") else None,
     ).values

-    return labels, cats.values if return_cupy_array else ensure_index(cats)
+    return labels, cats.values if return_cupy_array else Index._from_column(
+        cats
+    )


 def _interpolation(column: ColumnBase, index: BaseIndex) -> ColumnBase:
diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py
index 6fa69eb9cc1..d25983842f9 100644
--- a/python/cudf/cudf/core/column/categorical.py
+++ b/python/cudf/cudf/core/column/categorical.py
@@ -601,11 +601,13 @@ def __setitem__(self, key, value):
             to_add_categories = 0
         else:
             if cudf.api.types.is_scalar(value):
-                arr = [value]
+                arr = column.as_column(value, length=1, nan_as_null=False)
             else:
-                arr = value
+                arr = column.as_column(value, nan_as_null=False)
             to_add_categories = len(
-                cudf.Index(arr, nan_as_null=False).difference(self.categories)
+                cudf.Index._from_column(arr).difference(
+                    cudf.Index._from_column(self.categories)
+                )
             )

         if to_add_categories > 0:
diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py
index ce67ce81e6b..1dbc94384d3 100644
--- a/python/cudf/cudf/core/column/datetime.py
+++ b/python/cudf/cudf/core/column/datetime.py
@@ -250,6 +250,10 @@ def __contains__(self, item: ScalarLike) -> bool:
     def time_unit(self) -> str:
         return np.datetime_data(self.dtype)[0]

+    @property
+    def quarter(self) -> ColumnBase:
+        return libcudf.datetime.extract_quarter(self)
+
     @property
     def year(self) -> ColumnBase:
         return self.get_dt_field("year")
@@ -308,7 +312,7 @@ def is_quarter_start(self) -> ColumnBase:
     @property
     def is_year_end(self) -> ColumnBase:
         day_of_year = self.day_of_year
-        leap_dates = libcudf.datetime.is_leap_year(self)
+        leap_dates = self.is_leap_year

         leap = day_of_year == cudf.Scalar(366)
         non_leap = day_of_year == cudf.Scalar(365)
@@ -316,6 +320,10 @@ def is_year_end(self) -> ColumnBase:
             False
         )

+    @property
+    def is_leap_year(self) -> ColumnBase:
+        return libcudf.datetime.is_leap_year(self)
+
     @property
     def is_year_start(self) -> ColumnBase:
         return (self.day_of_year == 1).fillna(False)
diff --git a/python/cudf/cudf/core/column/methods.py b/python/cudf/cudf/core/column/methods.py
index 8c46d238057..05a0ab2e09a 100644
--- a/python/cudf/cudf/core/column/methods.py
+++ b/python/cudf/cudf/core/column/methods.py
@@ -65,8 +65,8 @@ def _return_or_inplace(
         """
         if inplace:
             self._parent._mimic_inplace(
-                self._parent.__class__._from_data(
-                    {self._parent.name: new_col}
+                type(self._parent)._from_column(
+                    new_col, name=self._parent.name
                 ),
                 inplace=True,
             )
@@ -92,6 +92,6 @@ def _return_or_inplace(
                 index=self._parent.index if retain_index else None,
             )
         elif isinstance(self._parent, cudf.BaseIndex):
-            return cudf.Index(new_col, name=self._parent.name)
+            return cudf.Index._from_column(new_col, name=self._parent.name)
         else:
             return self._parent._mimic_inplace(new_col, inplace=False)
diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py
index 1a4b558749d..a710a9f46c2 100644
--- a/python/cudf/cudf/core/column/string.py
+++ b/python/cudf/cudf/core/column/string.py
@@ -4693,7 +4693,7 @@ def character_tokenize(self) -> SeriesOrIndex:
                 result_col, name=self._parent.name, index=index
             )
         elif isinstance(self._parent, cudf.BaseIndex):
-            return cudf.Index(result_col, name=self._parent.name)
+            return cudf.Index._from_column(result_col, name=self._parent.name)
         else:
             return result_col
diff --git a/python/cudf/cudf/core/cut.py b/python/cudf/cudf/core/cut.py
index 197f46ee9fe..a4ceea266b4 100644
--- a/python/cudf/cudf/core/cut.py
+++ b/python/cudf/cudf/core/cut.py
@@ -292,7 +292,7 @@ def cut(
     )

     # we return a categorical index, as we don't have a Categorical method
-    categorical_index = cudf.CategoricalIndex._from_data({None: col})
+    categorical_index = cudf.CategoricalIndex._from_column(col)

     if isinstance(orig_x, (pd.Series, cudf.Series)):
         # if we have a series input we return a series output
diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
index a53c7bcc63c..3033abd53f5 100644
--- a/python/cudf/cudf/core/dataframe.py
+++ b/python/cudf/cudf/core/dataframe.py
@@ -326,7 +326,7 @@ def _getitem_tuple_arg(self, arg):
                         range(len(tmp_arg[0]))
                     )
                 },
-                index=cudf.Index(tmp_arg[0]),
+                index=cudf.Index._from_column(tmp_arg[0]),
             )
             columns_df[cantor_name] = column.as_column(
                 range(len(columns_df))
@@ -1758,7 +1758,7 @@ def _concat(
         for cols in columns:
             table_index = None
             if 1 == first_data_column_position:
-                table_index = cudf.Index(cols[0])
+                table_index = cudf.Index._from_column(cols[0])
             elif first_data_column_position > 1:
                 table_index = cudf.MultiIndex._from_data(
                     data=dict(
@@ -1810,7 +1810,7 @@ def _concat(
             if not isinstance(out.index, MultiIndex) and isinstance(
                 out.index.dtype, cudf.CategoricalDtype
             ):
-                out = out.set_index(cudf.Index(out.index._values))
+                out = out.set_index(out.index)
             for name, col in out._data.items():
                 out._data[name] = col._with_type_metadata(
                     tables[0]._data[name].dtype
@@ -3007,7 +3007,7 @@ def set_index(
             and not isinstance(keys[0], (cudf.MultiIndex, pd.MultiIndex))
         ):
             # Don't turn single level MultiIndex into an Index
-            idx = cudf.Index(data_to_add[0], name=names[0])
+            idx = cudf.Index._from_column(data_to_add[0], name=names[0])
         else:
             idx = MultiIndex._from_data(dict(enumerate(data_to_add)))
             idx.names = names
diff --git a/python/cudf/cudf/core/dtypes.py b/python/cudf/cudf/core/dtypes.py
index 27afec18b4e..6d532e01cba 100644
--- a/python/cudf/cudf/core/dtypes.py
+++ b/python/cudf/cudf/core/dtypes.py
@@ -182,7 +182,7 @@ def __init__(self, categories=None, ordered: bool = False) -> None:
         self._ordered = ordered

     @property
-    def categories(self) -> "cudf.core.index.Index":
+    def categories(self) -> cudf.Index:
         """
         An ``Index`` containing the unique categories allowed.

@@ -194,10 +194,12 @@ def categories(self) -> "cudf.core.index.Index":
         Index(['b', 'a'], dtype='object')
         """
         if self._categories is None:
-            return cudf.Index(
-                cudf.core.column.column_empty(0, dtype="object", masked=False)
+            col = cudf.core.column.column_empty(
+                0, dtype="object", masked=False
             )
-        return cudf.Index(self._categories, copy=False)
+        else:
+            col = self._categories
+        return cudf.Index._from_column(col)

     @property
     def type(self):
@@ -259,7 +261,9 @@ def to_pandas(self) -> pd.CategoricalDtype:
         categories = self._categories.to_pandas()
         return pd.CategoricalDtype(categories=categories, ordered=self.ordered)

-    def _init_categories(self, categories: Any):
+    def _init_categories(
+        self, categories: Any
+    ) -> cudf.core.column.ColumnBase | None:
         if categories is None:
             return categories
         if len(categories) == 0 and not isinstance(
diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py
index 9b71ea57f1f..4f283d41b17 100644
--- a/python/cudf/cudf/core/groupby/groupby.py
+++ b/python/cudf/cudf/core/groupby/groupby.py
@@ -403,8 +403,7 @@ def indices(self) -> dict[ScalarLike, cp.ndarray]:
         if len(group_keys) > 1:
             index = cudf.MultiIndex.from_arrays(group_keys)
         else:
-            (group_keys,) = group_keys
-            index = cudf.Index(group_keys)
+            index = cudf.Index._from_column(group_keys[0])
         return dict(
             zip(index.to_pandas(), cp.split(indices.values, offsets[1:-1]))
         )
@@ -2583,7 +2582,7 @@ def _mimic_pandas_order(
             # corresponding output rows in pandas, to do that here
            # expand the result by reindexing.
             ri = cudf.RangeIndex(0, len(self.obj))
-            result.index = cudf.Index(ordering)
+            result.index = cudf.Index._from_column(ordering)
             # This reorders and expands
             result = result.reindex(ri)
         else:
@@ -3154,7 +3153,9 @@ def keys(self):
                 dict(zip(range(nkeys), self._key_columns))
             )._set_names(self.names)
         else:
-            return cudf.Index(self._key_columns[0], name=self.names[0])
+            return cudf.Index._from_column(
+                self._key_columns[0], name=self.names[0]
+            )

     @property
     def values(self) -> cudf.core.frame.Frame:
diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py
index 3eab27bd165..c55f86d48e1 100644
--- a/python/cudf/cudf/core/index.py
+++ b/python/cudf/cudf/core/index.py
@@ -18,7 +18,6 @@
 import cudf
 from cudf import _lib as libcudf
-from cudf._lib.datetime import extract_quarter, is_leap_year
 from cudf._lib.filling import sequence
 from cudf._lib.search import search_sorted
 from cudf._lib.types import size_type_dtype
@@ -819,22 +818,23 @@ def sort_values(
     @_performance_tracking
     def _gather(self, gather_map, nullify=False, check_bounds=True):
         gather_map = cudf.core.column.as_column(gather_map)
-        return cudf.Index._from_data(
-            {self.name: self._values.take(gather_map, nullify, check_bounds)}
+        return cudf.Index._from_column(
+            self._column.take(gather_map, nullify, check_bounds),
+            name=self.name,
         )

     @_performance_tracking
     def _apply_boolean_mask(self, boolean_mask):
-        return cudf.Index._from_data(
-            {self.name: self._values.apply_boolean_mask(boolean_mask)}
+        return cudf.Index._from_column(
+            self._column.apply_boolean_mask(boolean_mask), name=self.name
         )

     def repeat(self, repeats, axis=None):
         return self._as_int_index().repeat(repeats, axis)

     def _split(self, splits):
-        return cudf.Index._from_data(
-            {self.name: self._as_int_index()._split(splits)}
+        return cudf.Index._from_column(
+            self._as_int_index()._split(splits), name=self.name
         )

     def _binaryop(self, other, op: str):
@@ -1087,10 +1087,13 @@ def __array_ufunc__(self, ufunc, method, *inputs, **kwargs):
     def _from_column(
         cls, column: ColumnBase, *, name: Hashable = None
     ) -> Self:
-        ca = cudf.core.column_accessor.ColumnAccessor(
-            {name: column}, verify=False
-        )
-        return _index_from_data(ca)
+        if cls is Index:
+            ca = cudf.core.column_accessor.ColumnAccessor(
+                {name: column}, verify=False
+            )
+            return _index_from_data(ca)
+        else:
+            return super()._from_column(column, name=name)

     @classmethod
     @_performance_tracking
@@ -1223,8 +1226,8 @@ def _concat(cls, objs):
         if all(isinstance(obj, RangeIndex) for obj in non_empties):
             result = _concat_range_index(non_empties)
         else:
-            data = concat_columns([o._values for o in non_empties])
-            result = Index(data)
+            data = concat_columns([o._column for o in non_empties])
+            result = Index._from_column(data)

         names = {obj.name for obj in objs}
         if len(names) == 1:
@@ -1491,7 +1494,7 @@ def __repr__(self):
     def __getitem__(self, index):
         res = self._get_elements_from_column(index)
         if isinstance(res, ColumnBase):
-            res = Index(res, name=self.name)
+            res = Index._from_column(res, name=self.name)
         return res

     @property  # type: ignore
@@ -1610,8 +1613,8 @@ def _clean_nulls_from_index(self):
             if isinstance(self, (DatetimeIndex, TimedeltaIndex))
             else str(cudf.NA)
         )
-        return cudf.Index(
-            self._values.astype("str").fillna(fill_value),
+        return cudf.Index._from_column(
+            self._column.astype("str").fillna(fill_value),
             name=self.name,
         )

@@ -1866,6 +1869,17 @@ def _from_data(
         result._freq = _validate_freq(freq)
         return result

+    @classmethod
+    @_performance_tracking
+    def _from_column(
+        cls, column: ColumnBase, *, name: Hashable = None, freq: Any = None
+    ) -> Self:
+        if column.dtype.kind != "M":
+            raise ValueError("column must have a datetime type.")
+        result = super()._from_column(column, name=name)
+        result._freq = _validate_freq(freq)
+        return result
+
     def __getitem__(self, index):
         value = super().__getitem__(index)
         if cudf.get_option("mode.pandas_compatible") and isinstance(
@@ -1923,8 +1937,8 @@ def strftime(self, date_format: str) -> Index:
         date_format : str
             Date format string (e.g. "%Y-%m-%d").
         """
-        return Index._from_data(
-            {self.name: self._column.strftime(date_format)}
+        return Index._from_column(
+            self._column.strftime(date_format), name=self.name
         )

     @property
@@ -1989,7 +2003,9 @@ def to_pydatetime(self) -> np.ndarray:
         return self.to_pandas().to_pydatetime()

     def to_julian_date(self) -> Index:
-        return Index._from_data({self.name: self._column.to_julian_date()})
+        return Index._from_column(
+            self._column.to_julian_date(), name=self.name
+        )

     def to_period(self, freq) -> pd.PeriodIndex:
         return self.to_pandas().to_period(freq=freq)
@@ -2000,7 +2016,9 @@ def normalize(self) -> Self:

         Currently not implemented.
         """
-        return type(self)._from_data({self.name: self._column.normalize()})
+        return type(self)._from_column(
+            self._column.normalize(), name=self.name
+        )

     @property
     def time(self) -> np.ndarray:
@@ -2084,7 +2102,7 @@ def days_in_month(self) -> Index:
         """
         Get the total number of days in the month that the date falls on.
         """
-        return Index._from_data({self.name: self._column.days_in_month})
+        return Index._from_column(self._column.days_in_month, name=self.name)

     daysinmonth = days_in_month

@@ -2093,7 +2111,7 @@ def day_of_week(self) -> Index:
         """
         Get the day of week that the date falls on.
         """
-        return Index._from_data({self.name: self._column.day_of_week})
+        return Index._from_column(self._column.day_of_week, name=self.name)

     @property  # type: ignore
     @_performance_tracking
@@ -2234,15 +2252,15 @@ def microsecond(self):
         >>> datetime_index.microsecond
         Index([0, 1, 2], dtype='int32')
         """  # noqa: E501
-        return Index(
+        return Index._from_column(
             (
                 # Need to manually promote column to int32 because
                 # pandas-matching binop behaviour requires that this
                 # __mul__ returns an int16 column.
-                self._values.get_dt_field("millisecond").astype("int32")
+                self._column.get_dt_field("millisecond").astype("int32")
                 * cudf.Scalar(1000, dtype="int32")
             )
-            + self._values.get_dt_field("microsecond"),
+            + self._column.get_dt_field("microsecond"),
             name=self.name,
         )

@@ -2374,7 +2392,7 @@ def is_leap_year(self) -> cupy.ndarray:
         ndarray
            Booleans indicating if dates belong to a leap year.
""" - res = is_leap_year(self._values).fillna(False) + res = self._column.is_leap_year.fillna(False) return cupy.asarray(res) @property # type: ignore @@ -2400,8 +2418,7 @@ def quarter(self): >>> gIndex.quarter Index([2, 4], dtype='int8') """ - res = extract_quarter(self._values) - return Index(res, dtype="int8") + return Index._from_column(self._column.quarter.astype("int8")) @_performance_tracking def day_name(self, locale: str | None = None) -> Index: @@ -2423,7 +2440,7 @@ def day_name(self, locale: str | None = None) -> Index: dtype='object') """ day_names = self._column.get_day_names(locale) - return Index._from_data({self.name: day_names}) + return Index._from_column(day_names, name=self.name) @_performance_tracking def month_name(self, locale: str | None = None) -> Index: @@ -2442,7 +2459,7 @@ def month_name(self, locale: str | None = None) -> Index: Index(['December', 'January', 'January', 'January', 'January', 'February'], dtype='object') """ month_names = self._column.get_month_names(locale) - return Index._from_data({self.name: month_names}) + return Index._from_column(month_names, name=self.name) @_performance_tracking def isocalendar(self) -> cudf.DataFrame: @@ -2481,14 +2498,14 @@ def to_pandas( @_performance_tracking def _get_dt_field(self, field: str) -> Index: """Return an Index of a numerical component of the DatetimeIndex.""" - out_column = self._values.get_dt_field(field) + out_column = self._column.get_dt_field(field) out_column = NumericalColumn( data=out_column.base_data, dtype=out_column.dtype, mask=out_column.base_mask, offset=out_column.offset, ) - return Index(out_column, name=self.name) + return Index._from_column(out_column, name=self.name) def _is_boolean(self): return False @@ -2522,9 +2539,7 @@ def ceil(self, freq): >>> gIndex.ceil("T") DatetimeIndex(['2020-05-31 08:06:00', '1999-12-31 18:41:00'], dtype='datetime64[ns]') """ # noqa: E501 - out_column = self._values.ceil(freq) - - return self.__class__._from_data({self.name: out_column}) + return type(self)._from_column(self._column.ceil(freq), name=self.name) @_performance_tracking def floor(self, freq): @@ -2555,9 +2570,9 @@ def floor(self, freq): >>> gIndex.floor("T") DatetimeIndex(['2020-05-31 08:59:00', '1999-12-31 18:44:00'], dtype='datetime64[ns]') """ # noqa: E501 - out_column = self._values.floor(freq) - - return self.__class__._from_data({self.name: out_column}) + return type(self)._from_column( + self._column.floor(freq), name=self.name + ) @_performance_tracking def round(self, freq): @@ -2595,9 +2610,9 @@ def round(self, freq): >>> dt_idx.round('T') DatetimeIndex(['2001-01-01 00:05:00', '2001-01-01 00:05:00', '2001-01-01 00:05:00'], dtype='datetime64[ns]') """ # noqa: E501 - out_column = self._values.round(freq) - - return self.__class__._from_data({self.name: out_column}) + return type(self)._from_column( + self._column.round(freq), name=self.name + ) def tz_localize( self, @@ -2647,8 +2662,8 @@ def tz_localize( to 'NaT'. 
""" # noqa: E501 result_col = self._column.tz_localize(tz, ambiguous, nonexistent) - return DatetimeIndex._from_data( - {self.name: result_col}, freq=self._freq + return DatetimeIndex._from_column( + result_col, name=self.name, freq=self._freq ) def tz_convert(self, tz: str | None): @@ -2684,7 +2699,7 @@ def tz_convert(self, tz: str | None): dtype='datetime64[ns, Europe/London]') """ # noqa: E501 result_col = self._column.tz_convert(tz) - return DatetimeIndex._from_data({self.name: result_col}) + return DatetimeIndex._from_column(result_col, name=self.name) def repeat(self, repeats, axis=None): res = super().repeat(repeats, axis=axis) @@ -2794,6 +2809,15 @@ def __init__( super().__init__(data, name=name) + @classmethod + @_performance_tracking + def _from_column( + cls, column: ColumnBase, *, name: Hashable = None, freq: Any = None + ) -> Self: + if column.dtype.kind != "m": + raise ValueError("column must have a timedelta type.") + return super()._from_column(column, name=name) + def __getitem__(self, index): value = super().__getitem__(index) if cudf.get_option("mode.pandas_compatible") and isinstance( @@ -2876,7 +2900,7 @@ def ceil(self, freq: str) -> Self: This method is currently not implemented. """ - return type(self)._from_data({self.name: self._column.ceil(freq)}) + return type(self)._from_column(self._column.ceil(freq), name=self.name) def floor(self, freq: str) -> Self: """ @@ -2884,7 +2908,9 @@ def floor(self, freq: str) -> Self: This method is currently not implemented. """ - return type(self)._from_data({self.name: self._column.floor(freq)}) + return type(self)._from_column( + self._column.floor(freq), name=self.name + ) def round(self, freq: str) -> Self: """ @@ -2892,41 +2918,51 @@ def round(self, freq: str) -> Self: This method is currently not implemented. """ - return type(self)._from_data({self.name: self._column.round(freq)}) + return type(self)._from_column( + self._column.round(freq), name=self.name + ) @property # type: ignore @_performance_tracking - def days(self): + def days(self) -> cudf.Index: """ Number of days for each element. """ # Need to specifically return `int64` to avoid overflow. - return Index(self._values.days, name=self.name, dtype="int64") + return Index._from_column( + self._column.days.astype("int64"), name=self.name + ) @property # type: ignore @_performance_tracking - def seconds(self): + def seconds(self) -> cudf.Index: """ Number of seconds (>= 0 and less than 1 day) for each element. """ - return Index(self._values.seconds, name=self.name, dtype="int32") + return Index._from_column( + self._column.seconds.astype("int32"), name=self.name + ) @property # type: ignore @_performance_tracking - def microseconds(self): + def microseconds(self) -> cudf.Index: """ Number of microseconds (>= 0 and less than 1 second) for each element. """ - return Index(self._values.microseconds, name=self.name, dtype="int32") + return Index._from_column( + self._column.microseconds.astype("int32"), name=self.name + ) @property # type: ignore @_performance_tracking - def nanoseconds(self): + def nanoseconds(self) -> cudf.Index: """ Number of nanoseconds (>= 0 and less than 1 microsecond) for each element. 
""" - return Index(self._values.nanoseconds, name=self.name, dtype="int32") + return Index._from_column( + self._column.nanoseconds.astype("int32"), name=self.name + ) @property # type: ignore @_performance_tracking @@ -3061,17 +3097,26 @@ def __init__( data = data.as_ordered(ordered=False) super().__init__(data, name=name) + @classmethod + @_performance_tracking + def _from_column( + cls, column: ColumnBase, *, name: Hashable = None, freq: Any = None + ) -> Self: + if not isinstance(column.dtype, cudf.CategoricalDtype): + raise ValueError("column must have a categorial type.") + return super()._from_column(column, name=name) + @property def ordered(self) -> bool: return self._column.ordered @property # type: ignore @_performance_tracking - def codes(self): + def codes(self) -> cudf.Index: """ The category codes of this categorical. """ - return Index(self._values.codes) + return Index._from_column(self._column.codes) @property # type: ignore @_performance_tracking @@ -3094,24 +3139,24 @@ def add_categories(self, new_categories) -> Self: `new_categories` will be included at the last/highest place in the categories and will be unused directly after this call. """ - return type(self)._from_data( - {self.name: self._column.add_categories(new_categories)} + return type(self)._from_column( + self._column.add_categories(new_categories), name=self.name ) def as_ordered(self) -> Self: """ Set the Categorical to be ordered. """ - return type(self)._from_data( - {self.name: self._column.as_ordered(ordered=True)} + return type(self)._from_column( + self._column.as_ordered(ordered=True), name=self.name ) def as_unordered(self) -> Self: """ Set the Categorical to be unordered. """ - return type(self)._from_data( - {self.name: self._column.as_ordered(ordered=False)} + return type(self)._from_column( + self._column.as_ordered(ordered=False), name=self.name ) def remove_categories(self, removals) -> Self: @@ -3125,8 +3170,8 @@ def remove_categories(self, removals) -> Self: removals : category or list of categories The categories which should be removed. """ - return type(self)._from_data( - {self.name: self._column.remove_categories(removals)} + return type(self)._from_column( + self._column.remove_categories(removals), name=self.name ) def remove_unused_categories(self) -> Self: @@ -3135,8 +3180,8 @@ def remove_unused_categories(self) -> Self: This method is currently not supported. """ - return type(self)._from_data( - {self.name: self._column.remove_unused_categories()} + return type(self)._from_column( + self._column.remove_unused_categories(), name=self.name ) def rename_categories(self, new_categories) -> Self: @@ -3145,8 +3190,8 @@ def rename_categories(self, new_categories) -> Self: This method is currently not supported. """ - return type(self)._from_data( - {self.name: self._column.rename_categories(new_categories)} + return type(self)._from_column( + self._column.rename_categories(new_categories), name=self.name ) def reorder_categories(self, new_categories, ordered=None) -> Self: @@ -3164,12 +3209,9 @@ def reorder_categories(self, new_categories, ordered=None) -> Self: Whether or not the categorical is treated as a ordered categorical. If not given, do not change the ordered information. 
""" - return type(self)._from_data( - { - self.name: self._column.reorder_categories( - new_categories, ordered=ordered - ) - } + return type(self)._from_column( + self._column.reorder_categories(new_categories, ordered=ordered), + name=self.name, ) def set_categories( @@ -3191,12 +3233,11 @@ def set_categories( considered as a rename of the old categories or as reordered categories. """ - return type(self)._from_data( - { - self.name: self._column.set_categories( - new_categories, ordered=ordered, rename=rename - ) - } + return type(self)._from_column( + self._column.set_categories( + new_categories, ordered=ordered, rename=rename + ), + name=self.name, ) @@ -3411,6 +3452,15 @@ def __init__( def closed(self): return self.dtype.closed + @classmethod + @_performance_tracking + def _from_column( + cls, column: ColumnBase, *, name: Hashable = None, freq: Any = None + ) -> Self: + if not isinstance(column.dtype, cudf.IntervalDtype): + raise ValueError("column must have a interval type.") + return super()._from_column(column, name=name) + @classmethod @_performance_tracking def from_breaks( @@ -3593,8 +3643,8 @@ def set_closed( Whether the intervals are closed on the left-side, right-side, both or neither. """ - return type(self)._from_data( - {self.name: self._column.set_closed(closed)} + return type(self)._from_column( + self._column.set_closed(closed), name=self.name ) def to_tuples(self, na_tuple: bool = True) -> pd.Index: @@ -3680,15 +3730,7 @@ def as_index( elif isinstance(arbitrary, BaseIndex): idx = arbitrary.copy(deep=copy).rename(name) elif isinstance(arbitrary, ColumnBase): - idx = _index_from_data({name: arbitrary}) - elif isinstance(arbitrary, cudf.Series): - return as_index( - arbitrary._column, - nan_as_null=nan_as_null, - copy=copy, - name=name, - dtype=dtype, - ) + raise ValueError("Use cudf.Index._from_column instead.") elif isinstance(arbitrary, (pd.RangeIndex, range)): idx = RangeIndex( start=arbitrary.start, @@ -3708,11 +3750,9 @@ def as_index( elif isinstance(arbitrary, cudf.DataFrame) or is_scalar(arbitrary): raise ValueError("Index data must be 1-dimensional and list-like") else: - return as_index( + return Index._from_column( column.as_column(arbitrary, dtype=dtype, nan_as_null=nan_as_null), - copy=copy, name=name, - dtype=dtype, ) if dtype is not None: idx = idx.astype(dtype) @@ -3749,7 +3789,9 @@ def _concat_range_index(indexes: list[RangeIndex]) -> BaseIndex: elif step is None: # First non-empty index had only one element if obj.start == start: - result = Index(concat_columns([x._values for x in indexes])) + result = Index._from_column( + concat_columns([x._column for x in indexes]) + ) return result step = obj.start - start @@ -3757,7 +3799,9 @@ def _concat_range_index(indexes: list[RangeIndex]) -> BaseIndex: next_ is not None and obj.start != next_ ) if non_consecutive: - result = Index(concat_columns([x._values for x in indexes])) + result = Index._from_column( + concat_columns([x._column for x in indexes]) + ) return result if step is not None: next_ = obj[-1] + step diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 3b44a0f5864..8be9f0ad78e 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -182,11 +182,16 @@ def _indices_from_labels(obj, labels): ) else: labels = labels.astype(obj.index.dtype) + idx_labels = cudf.Index._from_column(labels) + else: + idx_labels = labels # join is not guaranteed to maintain the index ordering # so we will sort it with its initial 
ordering which is stored # in column "__" - lhs = cudf.DataFrame({"__": as_column(range(len(labels)))}, index=labels) + lhs = cudf.DataFrame( + {"__": as_column(range(len(idx_labels)))}, index=idx_labels + ) rhs = cudf.DataFrame({"_": as_column(range(len(obj)))}, index=obj.index) return lhs.join(rhs).sort_values(by=["__", "_"])["_"] @@ -6642,7 +6647,11 @@ def _drop_rows_by_labels( # 3. Use "leftanti" join to drop # TODO: use internal API with "leftanti" and specify left and right # join keys to bypass logic check - to_join = cudf.DataFrame(index=cudf.Index(labels, name=level)) + if isinstance(labels, ColumnBase): + join_index = cudf.Index._from_column(labels, name=level) + else: + join_index = cudf.Index(labels, name=level) + to_join = cudf.DataFrame(index=join_index) join_res = working_df.join(to_join, how="leftanti") # 4. Reconstruct original layout, and rename @@ -6669,12 +6678,11 @@ def _drop_rows_by_labels( if errors == "raise" and not labels.isin(obj.index).all(): raise KeyError("One or more values not found in axis") - key_df = cudf.DataFrame._from_data( - data={}, - index=cudf.Index( - labels, name=getattr(labels, "name", obj.index.name) - ), - ) + if isinstance(labels, ColumnBase): + idx = cudf.Index._from_column(labels, name=obj.index.name) + else: + idx = cudf.Index(labels, name=labels.name) + key_df = cudf.DataFrame._from_data(data={}, index=idx) if isinstance(obj, cudf.DataFrame): res = obj.join(key_df, how="leftanti") else: diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index ab88b191570..a66e2936e3b 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -811,8 +811,9 @@ def _index_and_downcast(self, result, index, index_key): # it into an Index and name the final index values according # to that column's name. 
             *_, last_column = index._data.columns
-            out_index = cudf.Index(last_column)
-            out_index.name = index.names[-1]
+            out_index = cudf.Index._from_column(
+                last_column, name=index.names[-1]
+            )
             index = out_index
         elif out_index._num_columns > 1:
             # Otherwise pop the leftmost levels, names, and codes from the
@@ -1061,7 +1062,7 @@ def get_level_values(self, level):
             raise KeyError(f"Level not found: '{level}'")
         else:
             level_idx = colnames.index(level)
-            level_values = cudf.Index(
+            level_values = cudf.Index._from_column(
                 self._data[level], name=self.names[level_idx]
             )
         return level_values
diff --git a/python/cudf/cudf/core/resample.py b/python/cudf/cudf/core/resample.py
index 715bbf89b15..e0aee28bfeb 100644
--- a/python/cudf/cudf/core/resample.py
+++ b/python/cudf/cudf/core/resample.py
@@ -145,7 +145,9 @@ def copy(self, deep=True):
     def keys(self):
         index = super().keys
         if self._freq is not None and isinstance(index, cudf.DatetimeIndex):
-            return cudf.DatetimeIndex._from_data(index._data, freq=self._freq)
+            return cudf.DatetimeIndex._from_column(
+                index._column, name=index.name, freq=self._freq
+            )
         return index

     def serialize(self):
diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py
index 822b966364f..2fb4fde6552 100644
--- a/python/cudf/cudf/core/series.py
+++ b/python/cudf/cudf/core/series.py
@@ -3245,8 +3245,8 @@ def value_counts(
             interval_col = IntervalColumn.from_struct_column(
                 res.index._column._get_decategorized_column()
             )
-            res.index = cudf.IntervalIndex._from_data(
-                {res.index.name: interval_col}
+            res.index = cudf.IntervalIndex._from_column(
+                interval_col, name=res.index.name
             )
         res.name = result_name
         return res
diff --git a/python/cudf/cudf/core/tools/datetimes.py b/python/cudf/cudf/core/tools/datetimes.py
index c50a36b68b5..a92bf420147 100644
--- a/python/cudf/cudf/core/tools/datetimes.py
+++ b/python/cudf/cudf/core/tools/datetimes.py
@@ -18,7 +18,6 @@
 )
 from cudf.api.types import is_integer, is_scalar
 from cudf.core import column
-from cudf.core.column_accessor import ColumnAccessor
 from cudf.core.index import ensure_index

 # https://github.com/pandas-dev/pandas/blob/2.2.x/pandas/core/tools/datetimes.py#L1112
@@ -288,8 +287,7 @@ def to_datetime(
             utc=utc,
         )
         if isinstance(arg, (cudf.BaseIndex, pd.Index)):
-            ca = ColumnAccessor({arg.name: col}, verify=False)
-            return cudf.DatetimeIndex._from_data(ca)
+            return cudf.DatetimeIndex._from_column(col, name=arg.name)
         elif isinstance(arg, (cudf.Series, pd.Series)):
             return cudf.Series._from_column(
                 col, name=arg.name, index=ensure_index(arg.index)
@@ -297,7 +295,7 @@ def to_datetime(
         elif is_scalar(arg):
             return col.element_indexing(0)
         else:
-            return cudf.Index(col)
+            return cudf.Index._from_column(col)
     except Exception as e:
         if errors == "raise":
             raise e
@@ -900,7 +898,9 @@ def date_range(
         end = cudf.Scalar(end, dtype=dtype).value.astype("int64")
         arr = np.linspace(start=start, stop=end, num=periods)
         result = cudf.core.column.as_column(arr).astype("datetime64[ns]")
-        return cudf.DatetimeIndex._from_data({name: result}).tz_localize(tz)
+        return cudf.DatetimeIndex._from_column(result, name=name).tz_localize(
+            tz
+        )

     # The code logic below assumes `freq` is defined. It is first normalized
    # into `DateOffset` for further computation with timestamps.
@@ -1001,9 +1001,9 @@ def date_range(
         "datetime64[ns]"
     )

-    return cudf.DatetimeIndex._from_data({name: res}, freq=freq).tz_localize(
-        tz
-    )
+    return cudf.DatetimeIndex._from_column(
+        res, name=name, freq=freq
+    ).tz_localize(tz)


 def _has_fixed_frequency(freq: DateOffset) -> bool:
diff --git a/python/cudf/cudf/testing/testing.py b/python/cudf/cudf/testing/testing.py
index c2072d90e98..31ad24a4664 100644
--- a/python/cudf/cudf/testing/testing.py
+++ b/python/cudf/cudf/testing/testing.py
@@ -398,8 +398,12 @@ def assert_index_equal(
             )

     for level in range(left.nlevels):
-        llevel = cudf.Index(left._columns[level], name=left.names[level])
-        rlevel = cudf.Index(right._columns[level], name=right.names[level])
+        llevel = cudf.Index._from_column(
+            left._columns[level], name=left.names[level]
+        )
+        rlevel = cudf.Index._from_column(
+            right._columns[level], name=right.names[level]
+        )
         mul_obj = f"MultiIndex level [{level}]"
         assert_index_equal(
             llevel,
diff --git a/python/cudf/cudf/tests/test_multiindex.py b/python/cudf/cudf/tests/test_multiindex.py
index a68f4574da3..b1e095e8853 100644
--- a/python/cudf/cudf/tests/test_multiindex.py
+++ b/python/cudf/cudf/tests/test_multiindex.py
@@ -167,7 +167,9 @@ def test_string_index():
     pdf.index = stringIndex.to_pandas()
     gdf.index = stringIndex
     assert_eq(pdf, gdf)
-    stringIndex = cudf.Index(as_column(["a", "b", "c", "d", "e"]), name="name")
+    stringIndex = cudf.Index._from_column(
+        as_column(["a", "b", "c", "d", "e"]), name="name"
+    )
     pdf.index = stringIndex.to_pandas()
     gdf.index = stringIndex
     assert_eq(pdf, gdf)
diff --git a/python/cudf/cudf/tests/test_string.py b/python/cudf/cudf/tests/test_string.py
index 30880f074c0..cc88cc79769 100644
--- a/python/cudf/cudf/tests/test_string.py
+++ b/python/cudf/cudf/tests/test_string.py
@@ -1092,7 +1092,7 @@ def test_string_index():
     pdf.index = stringIndex.to_pandas()
     gdf.index = stringIndex
     assert_eq(pdf, gdf)
-    stringIndex = cudf.Index(
+    stringIndex = cudf.Index._from_column(
         cudf.core.column.as_column(["a", "b", "c", "d", "e"]), name="name"
     )
     pdf.index = stringIndex.to_pandas()
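
Usage sketch (illustrative; not part of the diff above). The migration the patch enforces is: wrap a private ColumnBase with the `_from_column` classmethod instead of passing it to the public `cudf.Index` constructor, which remains the entry point for ordinary array-likes. A minimal example, assuming a column built with `cudf.core.column.as_column`; the exact error raised for a ColumnBase passed to the public path is only hinted at by the ValueError added to `as_index` above:

    # Illustrative sketch, not from the patch.
    import cudf
    from cudf.core.column import as_column

    col = as_column(["a", "b", "c"])  # private ColumnBase object

    # Old internal pattern (now disallowed): cudf.Index(col, name="letters")
    # New internal pattern: wrap the column explicitly.
    idx = cudf.Index._from_column(col, name="letters")

    # The public constructor is unchanged for array-like inputs.
    idx_public = cudf.Index(["a", "b", "c"], name="letters")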