Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implement support for retaining Pandas index #6061

Merged
merged 48 commits into from
Apr 19, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
48 commits
Select commit Hold shift + click to select a range
82a61e1
Implement support for retaining Pandas index
philippjfr Jan 6, 2024
6915ad5
Column takes precedence over index
philippjfr Jan 6, 2024
a5a2772
Small fix
philippjfr Jan 6, 2024
96b69e3
Further fixes
philippjfr Jan 6, 2024
f8e1d48
Fix ibis
philippjfr Jan 6, 2024
feaba77
Merge branch 'main' into pandas_index_support
hoxbro Feb 1, 2024
5df94ad
Remove xfail for downsample test
hoxbro Feb 1, 2024
9db1b27
Merge branch 'main' into pandas_index_support
hoxbro Feb 8, 2024
b3398af
Rename is_index to isindex to match with isscalar
hoxbro Feb 8, 2024
e2ed038
Small updates
hoxbro Feb 8, 2024
1adc533
Update aggregate to work with MultiIndex
hoxbro Feb 14, 2024
f4ad81f
Try commenting out reset_index
hoxbro Feb 14, 2024
cf5cf3a
Add select test and clean up
hoxbro Feb 14, 2024
f522fdb
iloc with scalar values + fix
hoxbro Feb 15, 2024
4a2e368
Update iloc to work with slice of indexes
hoxbro Feb 15, 2024
3cb074f
Handle iloc scalar and slice
hoxbro Feb 15, 2024
8e51d8a
Hvplot fixes
hoxbro Feb 15, 2024
03af717
Add test case for pandas range index
hoxbro Feb 15, 2024
ab1f2c2
Add last tests and updates for iloc
hoxbro Feb 21, 2024
77d855f
Add sort test
hoxbro Feb 22, 2024
d53e70e
Add out of bounds check
hoxbro Feb 22, 2024
b38d4da
Add select test
hoxbro Feb 22, 2024
10ff6f4
Fix copy paste error
hoxbro Feb 22, 2024
cdc634e
Merge branch 'main' into pandas_index_support
hoxbro Apr 4, 2024
287fd3a
Add sample test
hoxbro Apr 4, 2024
b4fa999
Add test for values
hoxbro Apr 4, 2024
fa5898f
Add reindex
hoxbro Apr 4, 2024
40d313c
Redo reindex which breaks hvplot
hoxbro Apr 4, 2024
e6fe458
Remove commented out code
hoxbro Apr 4, 2024
e117f2a
Check if data has an index attribute
hoxbro Apr 5, 2024
3d44737
Add tmp pin on panel
hoxbro Apr 5, 2024
6b97541
Add unit test to groupby
hoxbro Apr 5, 2024
5ec0ec3
Fast code select if only indexes
hoxbro Apr 8, 2024
526c188
Clean up
hoxbro Apr 8, 2024
36c2e09
Update holoviews/core/data/pandas.py
hoxbro Apr 17, 2024
e585deb
Remove comment
hoxbro Apr 17, 2024
7b83b10
Account for tuples in fast path
hoxbro Apr 17, 2024
c53320f
Always use index selection if available
philippjfr Apr 17, 2024
24f0f5e
Do not shadow index column
philippjfr Apr 17, 2024
2b68d45
Only use loc for multi-index
philippjfr Apr 17, 2024
b41d983
Updating with failing tests
hoxbro Apr 19, 2024
2ea4fd0
Fix indexing tests
philippjfr Apr 19, 2024
201527d
Apply suggestions from code review
philippjfr Apr 19, 2024
4546212
Simplify depth check
philippjfr Apr 19, 2024
d02f6ab
Index up to the point it is valid
philippjfr Apr 19, 2024
ffeffe1
Add test to check lexsort_depth import stays valid
philippjfr Apr 19, 2024
b0aee28
Fix import
philippjfr Apr 19, 2024
1476fac
Wrap inside of except
hoxbro Apr 19, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 15 additions & 4 deletions holoviews/core/data/ibis.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,7 @@
from .. import util
from ..element import Element
from ..ndmapping import NdMapping, item_check, sorted_context
from . import pandas
from .interface import Interface
from .interface import DataError, Interface
from .util import cached


Expand Down Expand Up @@ -94,6 +93,17 @@ def init(cls, eltype, data, keys, values):
values = list(data.columns[: nvdim if nvdim else None])
return data, dict(kdims=keys, vdims=values), {}

@classmethod
def validate(cls, dataset, vdims=True):
    """Check that every declared dimension exists as a column of the data.

    Looks up the dimension names of the dataset (all dimensions when
    ``vdims`` is true, otherwise only the key dimensions) and compares
    them against ``dataset.data.columns``.

    Raises:
        DataError: if one or more dimension names are not columns.
    """
    scope = 'all' if vdims else 'key'
    declared = dataset.dimensions(scope, label='name')
    available = list(dataset.data.columns)
    not_found = [name for name in declared if name not in available]
    if not not_found:
        return
    raise DataError("Supplied data does not contain specified "
                    "dimensions, the following dimensions were "
                    "not found: %s" % repr(not_found), cls)

@classmethod
def compute(cls, dataset):
    """Return a clone of the dataset wrapping the executed data.

    Calls ``execute()`` on ``dataset.data`` and passes the result to
    ``dataset.clone``.
    """
    executed = dataset.data.execute()
    return dataset.clone(executed)
Expand Down Expand Up @@ -216,8 +226,9 @@ def redim(cls, dataset, dimensions):
**{v.name: dataset.data[k] for k, v in dimensions.items()}
)

validate = pandas.PandasInterface.validate
reindex = pandas.PandasInterface.reindex
@classmethod
def reindex(cls, dataset, kdims=None, vdims=None):
    """Return the dataset's underlying data unchanged.

    ``kdims`` and ``vdims`` are accepted for interface compatibility
    but are not used by this implementation.
    """
    data = dataset.data
    return data

@classmethod
def _index_ibis_table(cls, data):
Expand Down
194 changes: 144 additions & 50 deletions holoviews/core/data/pandas.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,9 +32,7 @@ class PandasInterface(Interface, PandasAPI):

@classmethod
def dimension_type(cls, dataset, dim):
name = dataset.get_dimension(dim, strict=True).name
idx = list(dataset.data.columns).index(name)
return dataset.data.dtypes.iloc[idx].type
return cls.dtype(dataset, dim).type

@classmethod
def init(cls, eltype, data, kdims, vdims):
Expand All @@ -46,9 +44,7 @@ def init(cls, eltype, data, kdims, vdims):
data = data.to_frame(name=name)
if util.is_dataframe(data):
ncols = len(data.columns)
index_names = data.index.names if isinstance(data, pd.DataFrame) else [data.index.name]
if index_names == [None]:
index_names = ['index']
index_names = cls.indexes(data)
if eltype._auto_indexable_1d and ncols == 1 and kdims is None:
kdims = list(index_names)

Expand All @@ -74,17 +70,7 @@ def init(cls, eltype, data, kdims, vdims):
"Having a non-string as a column name in a DataFrame is not supported."
)

# Handle reset of index if kdims reference index by name
for kd in kdims:
kd = dimension_name(kd)
if kd in data.columns:
continue
if any(kd == ('index' if name is None else name)
for name in index_names):
data = data.reset_index()
break

if kdims:
if kdims and not (len(kdims) == len(index_names) and {dimension_name(kd) for kd in kdims} == set(index_names)):
kdim = dimension_name(kdims[0])
if eltype._auto_indexable_1d and ncols == 1 and kdim not in data.columns:
data = data.copy()
Expand Down Expand Up @@ -147,31 +133,67 @@ def init(cls, eltype, data, kdims, vdims):
raise ValueError('PandasInterface could not find specified dimensions in the data.')
else:
data = pd.DataFrame(data, columns=columns)
return data, {'kdims':kdims, 'vdims':vdims}, {}

return data, {'kdims': kdims, 'vdims': vdims}, {}

@classmethod
def isscalar(cls, dataset, dim):
    """Report whether the column for ``dim`` holds exactly one unique value.

    The dimension is resolved strictly via ``dataset.get_dimension`` and
    its name is used to look up the column in ``dataset.data``.
    """
    column_name = dataset.get_dimension(dim, strict=True).name
    distinct = dataset.data[column_name].unique()
    return len(distinct) == 1

@classmethod
def dtype(cls, dataset, dimension):
    """Return the dtype of the values backing ``dimension``.

    Values are taken from the index when ``cls.isindex`` reports the
    dimension as an index, otherwise from the column named after the
    dimension. A scalar value is wrapped in a one-element NumPy array
    so that a dtype can still be reported.
    """
    dim = dataset.get_dimension(dimension, strict=True)
    column_name = dim.name
    frame = dataset.data
    if cls.isindex(dataset, dim):
        values = cls.index_values(dataset, dim)
    else:
        values = frame[column_name]
    if not util.isscalar(values):
        return values.dtype
    # Scalars carry no dtype attribute; let NumPy infer one.
    return np.array([values]).dtype

@classmethod
def indexes(cls, data):
    """Return the names of the index level(s) of ``data``.

    For a ``pd.DataFrame`` the names come from ``data.index.names``;
    for anything else a one-element list holding ``data.index.name``
    is used. When the result is a single unnamed level, a placeholder
    name is substituted: ``'index'``, or ``'_index'`` if the data
    already has a column called ``'index'``.
    """
    if isinstance(data, pd.DataFrame):
        names = data.index.names
    else:
        names = [data.index.name]
    if names == [None]:
        # Avoid clashing with an existing column called 'index'.
        placeholder = '_index' if 'index' in data.columns else 'index'
        names = [placeholder]
    return names

@classmethod
def isindex(cls, dataset, dimension):
    """Report whether ``dimension`` is backed by the index of the data.

    A dimension whose name matches a column is never treated as an
    index (columns take precedence); otherwise the name is looked up
    among the names returned by ``cls.indexes``.
    """
    dim = dataset.get_dimension(dimension, strict=True)
    is_column = dim.name in dataset.data.columns
    return (not is_column) and dim.name in cls.indexes(dataset.data)

@classmethod
def index_values(cls, dataset, dimension):
    """Return the index values associated with ``dimension``.

    For a ``pd.MultiIndex`` the level named after the dimension is
    extracted; for any other index the index object itself is returned.
    """
    dim = dataset.get_dimension(dimension, strict=True)
    idx = dataset.data.index
    if not isinstance(idx, pd.MultiIndex):
        return idx
    return idx.get_level_values(dim.name)

@classmethod
def validate(cls, dataset, vdims=True):
dim_types = 'all' if vdims else 'key'
dimensions = dataset.dimensions(dim_types, label='name')
cols = list(dataset.data.columns)
cols = list(dataset.data.columns) + cls.indexes(dataset.data)
not_found = [d for d in dimensions if d not in cols]
if not_found:
raise DataError("Supplied data does not contain specified "
"dimensions, the following dimensions were "
"not found: %s" % repr(not_found), cls)


@classmethod
def range(cls, dataset, dimension):
dimension = dataset.get_dimension(dimension, strict=True)
column = dataset.data[dimension.name]
if cls.isindex(dataset, dimension):
column = cls.index_values(dataset, dimension)
else:
column = dataset.data[dimension.name]
if column.dtype.kind == 'O':
if (not isinstance(dataset.data, pd.DataFrame) or
util.pandas_version < Version('0.17.0')):
Expand All @@ -184,6 +206,8 @@ def range(cls, dataset, dimension):
pass
if not len(column):
return np.nan, np.nan
if isinstance(column, pd.Index):
return column[0], column[-1]
return column.iloc[0], column.iloc[-1]
else:
if dimension.nodata is not None:
Expand Down Expand Up @@ -246,10 +270,9 @@ def groupby(cls, dataset, dimensions, container_type, group_type, **kwargs):

@classmethod
def aggregate(cls, dataset, dimensions, function, **kwargs):
data = dataset.data
cols = [d.name for d in dataset.kdims if d in dimensions]
vdims = dataset.dimensions('value', label='name')
reindexed = data[cols+vdims]
reindexed = cls.dframe(dataset, dimensions=cols+vdims)
if function in [np.std, np.var]:
# Fix for consistency with other backend
# pandas uses ddof=1 for std and var
Expand Down Expand Up @@ -298,9 +321,11 @@ def unpack_scalar(cls, dataset, data):

@classmethod
def reindex(cls, dataset, kdims=None, vdims=None):
# DataFrame based tables don't need to be reindexed
return dataset.data

data = dataset.data
if isinstance(data.index, pd.MultiIndex):
kdims = [kdims] if isinstance(kdims, (str, Dimension)) else kdims
data = data.reset_index().set_index(list(map(str, kdims)), drop=True)
return data

@classmethod
def mask(cls, dataset, mask, mask_value=np.nan):
Expand All @@ -309,7 +334,6 @@ def mask(cls, dataset, mask, mask_value=np.nan):
masked.loc[mask, cols] = mask_value
return masked


@classmethod
def redim(cls, dataset, dimensions):
column_renames = {k: v.name for k, v in dimensions.items()}
Expand All @@ -327,39 +351,94 @@ def sort(cls, dataset, by=None, reverse=False):
return dataset.data.sort(columns=cols, ascending=not reverse)
return dataset.data.sort_values(by=cols, ascending=not reverse)

@classmethod
def sorted_index(cls, df):
    """Report whether the index of ``df`` is sorted.

    Uses ``is_lexsorted()`` when the index object exposes it and falls
    back to ``is_monotonic_increasing`` otherwise.
    """
    index = df.index
    if not hasattr(index, 'is_lexsorted'):
        return index.is_monotonic_increasing
    return index.is_lexsorted()

@classmethod
def sort_depth(cls, df):
    """Return the lexsort depth of the index of ``df``, or 0 if unavailable.

    Relies on the underscore-prefixed ``_lexsort_depth`` helper from
    ``pandas.core.indexes.multi``. If that import fails, or the index
    lacks the attributes passed to it (``codes``, ``nlevels``), the
    depth is reported as 0.
    """
    try:
        from pandas.core.indexes.multi import _lexsort_depth
        index = df.index
        depth = _lexsort_depth(index.codes, index.nlevels)
    except (ImportError, AttributeError):
        depth = 0
    return depth

@classmethod
def index_selection(cls, df, selection):
    """Build per-index-level selectors from a selection mapping.

    Walks the index names of ``df`` in order. Levels that are absent
    from ``selection`` get a full ``slice(None, None)``; levels that
    are present get their selector normalised: tuples shorter than
    four items become a ``slice``, and anything that is not a list or
    slice is wrapped in a one-element list.

    Returns:
        A dict mapping index name to selector. The dict is empty when
        none of the index names appear in ``selection``. It may cover
        only a leading subset of the levels when a slice is requested
        on a level that cannot be sliced (see the comment in the loop).
    """
    names = cls.indexes(df)
    level_count = len(names)
    is_sorted = cls.sorted_index(df)
    depth = df.index.nlevels if is_sorted else cls.sort_depth(df)

    selectors = {}
    matched_any = False
    for level, name in enumerate(names):
        if name not in selection:
            selectors[name] = slice(None, None)
            continue
        matched_any = True
        sel = selection[name]
        if isinstance(sel, tuple) and len(sel) < 4:
            sel = slice(*sel)
        elif not isinstance(sel, (list, slice)):
            sel = [sel]
        unsliceable = level_count > 1 and not is_sorted and level > depth
        if isinstance(sel, slice) and unsliceable:
            # If the index is not monotonic we cannot slice
            # so return indexer up to the point it is valid
            return selectors
        selectors[name] = sel
    return selectors if matched_any else {}

@classmethod
def select(cls, dataset, selection_mask=None, **selection):
df = dataset.data
if selection_mask is None:
selection_mask = cls.select_mask(dataset, selection)
if index_sel:= cls.index_selection(df, selection):
try:
if len(index_sel) == 1:
df = df[next(iter(index_sel.values()))]
else:
df = df.loc[tuple(index_sel.values()), :]
except KeyError:
# If index lookup fails we fall back to boolean indexing
index_sel = {}
column_sel = {k: v for k, v in selection.items() if k not in index_sel}
if column_sel:
selection_mask = cls.select_mask(dataset, column_sel)

indexed = cls.indexed(dataset, selection)
if isinstance(selection_mask, pd.Series):
df = df[selection_mask]
else:
elif selection_mask is not None:
df = df.iloc[selection_mask]
if indexed and len(df) == 1 and len(dataset.vdims) == 1:
return df[dataset.vdims[0].name].iloc[0]
return df


@classmethod
def values(
cls,
dataset,
dim,
expanded=True,
flat=True,
compute=True,
keep_index=False,
cls,
dataset,
dim,
expanded=True,
flat=True,
compute=True,
keep_index=False,
):
dim = dataset.get_dimension(dim, strict=True)
data = dataset.data[dim.name]
isindex = cls.isindex(dataset, dim)
if isindex:
data = cls.index_values(dataset, dim)
philippjfr marked this conversation as resolved.
Show resolved Hide resolved
else:
data = dataset.data[dim.name]
if keep_index:
return data
if data.dtype.kind == 'M' and getattr(data.dtype, 'tz', None):
data = data.dt.tz_localize(None)
data = (data if isindex else data.dt).tz_localize(None)
if not expanded:
return pd.unique(data)
return data.values if hasattr(data, 'values') else data
Expand Down Expand Up @@ -405,38 +484,53 @@ def as_dframe(cls, dataset):
if it already a dataframe type.
"""
if issubclass(dataset.interface, PandasInterface):
if any(cls.isindex(dataset, dim) for dim in dataset.dimensions()):
return dataset.data.reset_index()
return dataset.data
else:
return dataset.dframe()


@classmethod
def dframe(cls, dataset, dimensions):
data = dataset.data
if dimensions:
return dataset.data[dimensions]
if any(cls.isindex(dataset, d) for d in dimensions):
data = data.reset_index()
return data[dimensions]
else:
return dataset.data.copy()

return data.copy()

@classmethod
def iloc(cls, dataset, index):
rows, cols = index
scalar = False
columns = list(dataset.data.columns)
if isinstance(cols, slice):
cols = [d.name for d in dataset.dimensions()][cols]
elif np.isscalar(cols):
scalar = np.isscalar(rows)
cols = [dataset.get_dimension(cols).name]
dim = dataset.get_dimension(cols)
if dim is None:
raise ValueError('column is out of bounds')
cols = [dim.name]
else:
cols = [dataset.get_dimension(d).name for d in index[1]]
cols = [columns.index(c) for c in cols]
cols = [dataset.get_dimension(d).name for d in cols]
if np.isscalar(rows):
rows = [rows]

data = dataset.data
indexes = cls.indexes(data)
columns = list(data.columns)
id_cols = [columns.index(c) for c in cols if c not in indexes]
if not id_cols:
if len(indexes) > 1:
data = data.index.to_frame()[cols].iloc[rows].reset_index(drop=True)
data = data.values.ravel()[0] if scalar else data
else:
data = data.index.values[rows[0]] if scalar else data.index[rows]
return data
if scalar:
return dataset.data.iloc[rows[0], cols[0]]
return dataset.data.iloc[rows, cols]
return data.iloc[rows[0], id_cols[0]]
return data.iloc[rows, id_cols]


Interface.register(PandasInterface)
Loading
Loading