Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Reindex Improvements #10815

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
22 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
202 changes: 94 additions & 108 deletions python/cudf/cudf/core/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -639,7 +639,7 @@ def __init__(
if columns is not None:
self._data = data._data
self._reindex(
columns=columns, index=index, deep=False, inplace=True
column_names=columns, index=index, deep=False, inplace=True
)
else:
self._data = data._data
Expand Down Expand Up @@ -2180,129 +2180,114 @@ def _set_column_names_like(self, other):
other._data.names, other._data.multiindex, other._data.level_names
)

@_cudf_nvtx_annotate
def _reindex(
    self, columns, dtypes=None, deep=False, index=None, inplace=False
):
    """
    Shared implementation behind `.reindex`.

    Parameters
    ----------
    columns : array-like
        Column labels the result should contain. Labels that are not
        present in the frame produce new, fully-masked columns. When
        ``None``, the frame's current column labels are used.
    dtypes : dict
        Per-label dtype for the newly created columns; labels missing
        from the mapping fall back to ``np.float64``.
    deep : boolean, optional, default False
        Forwarded to ``copy(deep=...)`` for every retained column.
    index : Index or array-like, default None
        The row labels to conform to. ``None`` keeps the current index.
    inplace : bool, default False
        Forwarded to ``_mimic_inplace`` together with the result.

    Returns
    -------
    DataFrame
    """
    dtypes = {} if dtypes is None else dtypes

    frame = self
    if index is not None:
        index = cudf.core.index.as_index(index)

        # The join-based path is only usable when both indexes have the
        # same number of levels and matching dtypes level by level.
        compatible = (frame.index.nlevels == index.nlevels) and all(
            old_col.dtype == new_col.dtype
            for old_col, new_col in zip(
                frame.index._data.columns, index._data.columns
            )
        )

        if compatible:
            frame = DataFrame(None, index).join(frame, how="left", sort=True)
            # The join output is sorted; argsort-of-argsort gives, for each
            # requested label, its position in that sorted output.
            restore_order = index.argsort(ascending=True).argsort()
            frame = frame.take(restore_order)
        else:
            # Remember the current labels before swapping in an empty
            # frame, so that they are all recreated below.
            if columns is None:
                columns = list(frame._column_names)
            frame = DataFrame()

    if index is None:
        index = frame.index
    names = list(frame._data.names) if columns is None else columns

    cols = {}
    for name in names:
        if name in frame._data:
            cols[name] = frame._data[name].copy(deep=deep)
        else:
            cols[name] = column_empty(
                dtype=dtypes.get(name, np.float64),
                masked=True,
                row_count=len(index),
            )

    accessor = cudf.core.column_accessor.ColumnAccessor(
        cols,
        multiindex=self._data.multiindex,
        level_names=self._data.level_names,
    )
    result = self.__class__._from_data(data=accessor, index=index)

    return self._mimic_inplace(result, inplace=inplace)

@_cudf_nvtx_annotate
def reindex(
self, labels=None, axis=None, index=None, columns=None, copy=True
self,
labels=None,
index=None,
columns=None,
axis=None,
method=None,
copy=True,
level=None,
fill_value=NA,
limit=None,
tolerance=None,
):
"""
Return a new DataFrame whose axes conform to a new index

``DataFrame.reindex`` supports two calling conventions:
- ``(index=index_labels, columns=column_names)``
- ``(labels, axis={0 or 'index', 1 or 'columns'})``
Conform DataFrame to new index. Places NA/NaN in locations
having no value in the previous index. A new object is produced
unless the new index is equivalent to the current one and copy=False.

Parameters
----------
labels : Index, Series-convertible, optional, default None
axis : {0 or 'index', 1 or 'columns'}, optional, default 0
New labels / index to conform the axis specified by ``axis`` to.
index : Index, Series-convertible, optional, default None
Shorthand for ``df.reindex(labels=index_labels, axis=0)``
The index labels specifying the index to conform to.
columns : array-like, optional, default None
Shorthand for ``df.reindex(labels=column_names, axis=1)``
copy : boolean, optional, default True
The column labels specifying the columns to conform to.
axis : Axis to target.
Can be either the axis name
(``index``, ``columns``) or number (0, 1).
method : Not supported
copy : boolean, default True
Return a new object, even if the passed indexes are the same.
level : Not supported
fill_value : Value to use for missing values.
Defaults to ``NA``, but can be any “compatible” value.
limit : Not supported
tolerance : Not supported

Returns
-------
A DataFrame whose axes conform to the new index(es)
DataFrame with changed index.

Examples
--------
>>> import cudf
>>> df = cudf.DataFrame()
>>> df['key'] = [0, 1, 2, 3, 4]
>>> df['val'] = [float(i + 10) for i in range(5)]
>>> df_new = df.reindex(index=[0, 3, 4, 5],
... columns=['key', 'val', 'sum'])
``DataFrame.reindex`` supports two calling conventions
* ``(index=index_labels, columns=column_labels, ...)``
* ``(labels, axis={'index', 'columns'}, ...)``
We _highly_ recommend using keyword arguments to clarify your intent.

Create a dataframe with some fictional data.
>>> index = ['Firefox', 'Chrome', 'Safari', 'IE10', 'Konqueror']
>>> df = cudf.DataFrame({'http_status': [200, 200, 404, 404, 301],
... 'response_time': [0.04, 0.02, 0.07, 0.08, 1.0]},
... index=index)
>>> df
key val
0 0 10.0
1 1 11.0
2 2 12.0
3 3 13.0
4 4 14.0
>>> df_new
key val sum
0 0 10.0 <NA>
3 3 13.0 <NA>
4 4 14.0 <NA>
5 <NA> <NA> <NA>
http_status response_time
Firefox 200 0.04
Chrome 200 0.02
Safari 404 0.07
IE10 404 0.08
Konqueror 301 1.00
>>> new_index = ['Safari', 'Iceweasel', 'Comodo Dragon', 'IE10',
... 'Chrome']
>>> df.reindex(new_index)
http_status response_time
Safari 404 0.07
Iceweasel <NA> <NA>
Comodo Dragon <NA> <NA>
IE10 404 0.08
Chrome 200 0.02

.. pandas-compat::
**DataFrame.reindex**

Note: One difference from Pandas is that ``NA`` is used for rows
that do not match, rather than ``NaN``. One side effect of this is
that the column ``http_status`` retains an integer dtype in cuDF
where it is cast to float in Pandas.

We can fill in the missing values by
passing a value to the keyword ``fill_value``.

>>> df.reindex(new_index, fill_value=0)
http_status response_time
Safari 404 0.07
Iceweasel 0 0.00
Comodo Dragon 0 0.00
IE10 404 0.08
Chrome 200 0.02

We can also reindex the columns.
>>> df.reindex(columns=['http_status', 'user_agent'])
http_status user_agent
Firefox 200 <NA>
Chrome 200 <NA>
Safari 404 <NA>
IE10 404 <NA>
Konqueror 301 <NA>

Or we can use “axis-style” keyword arguments
>>> df.reindex(['http_status', 'user_agent'], axis="columns")
http_status user_agent
Firefox 200 <NA>
Chrome 200 <NA>
Safari 404 <NA>
IE10 404 <NA>
Konqueror 301 <NA>
"""

if labels is None and index is None and columns is None:
Expand All @@ -2329,11 +2314,12 @@ def reindex(
)

return df._reindex(
columns=columns,
column_names=columns,
dtypes=self._dtypes,
deep=copy,
index=index,
inplace=False,
fill_value=fill_value,
)

@_cudf_nvtx_annotate
Expand Down
98 changes: 98 additions & 0 deletions python/cudf/cudf/core/indexed_frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@
from cudf.core.column_accessor import ColumnAccessor
from cudf.core.frame import Frame, _drop_rows_by_labels
from cudf.core.index import Index, RangeIndex, _index_from_columns
from cudf.core.missing import NA
from cudf.core.multiindex import MultiIndex
from cudf.core.udf.utils import _compile_or_get, _supported_cols_from_frame
from cudf.utils import docutils
Expand Down Expand Up @@ -1288,6 +1289,103 @@ def _align_to_index(

return result

@_cudf_nvtx_annotate
def _reindex(
    self,
    column_names,
    dtypes=None,
    deep=False,
    index=None,
    inplace=False,
    fill_value=NA,
):
    """
    Helper for `.reindex`

    Parameters
    ----------
    column_names : array-like
        The list of columns to select from the Frame,
        if ``column_names`` is a superset of ``Frame.columns`` new
        columns are created. If ``None``, the existing column names
        are used.
    dtypes : dict
        Mapping of dtypes for the empty columns being created.
        Names absent from the mapping default to ``np.float64``.
    deep : boolean, optional, default False
        Whether to make deep copy or shallow copy of the columns.
    index : Index or array-like, default None
        The ``index`` to be used to reindex the Frame with.
    inplace : bool, default False
        Whether to perform the operation in place on the data.
    fill_value : scalar, default ``NA``
        Value passed to ``fillna`` on the reindexed result to replace
        nulls.

    Returns
    -------
    Series or DataFrame
    """
    if dtypes is None:
        dtypes = {}

    df = self
    if index is not None:
        index = cudf.core.index.as_index(index)

        # The join below is only attempted when the current and requested
        # indexes have the same number of levels and equal dtypes per level.
        idx_dtype_match = (df.index.nlevels == index.nlevels) and all(
            left_dtype == right_dtype
            for left_dtype, right_dtype in zip(
                (col.dtype for col in df.index._data.columns),
                (col.dtype for col in index._data.columns),
            )
        )

        if not idx_dtype_match:
            # Capture the current names first: after `df` is replaced by an
            # empty frame they can no longer be read from it.
            column_names = (
                column_names
                if column_names is not None
                else list(df._column_names)
            )
            df = cudf.DataFrame()
        else:
            # `lhs` carries only the requested index; left-joining the data
            # onto it keeps one row per requested label.
            lhs = cudf.DataFrame._from_data({}, index=index)
            rhs = cudf.DataFrame._from_data(
                {
                    # bookkeeping workaround for unnamed series
                    # NOTE(review): `or` rewrites every falsy label (not
                    # only None) to 0 - confirm callers rely on this.
                    name or 0: col
                    for name, col in df._data.items()
                },
                index=df._index,
            )
            df = lhs.join(rhs, how="left", sort=True)
            # double-argsort to map back from sorted to unsorted positions
            df = df.take(index.argsort(ascending=True).argsort())

    index = index if index is not None else df.index
    names = (
        column_names if column_names is not None else list(df._data.names)
    )
    # Reuse columns that exist in `df`; every other requested name gets a
    # new masked column with one row per index entry.
    cols = {
        name: (
            df._data[name].copy(deep=deep)
            if name in df._data
            else cudf.core.column.column.column_empty(
                dtype=dtypes.get(name, np.float64),
                masked=True,
                row_count=len(index),
            )
        )
        for name in names
    }
    result = self.__class__._from_data(
        data=cudf.core.column_accessor.ColumnAccessor(
            cols,
            multiindex=self._data.multiindex,
            level_names=self._data.level_names,
        ),
        index=index,
    )

    result.fillna(fill_value, inplace=True)
    return self._mimic_inplace(result, inplace=inplace)

def round(self, decimals=0, how="half_even"):
"""
Round to a variable number of decimal places.
Expand Down
Loading