diff --git a/holoviews/core/data/ibis.py b/holoviews/core/data/ibis.py index f904995d07..bc1268a687 100644 --- a/holoviews/core/data/ibis.py +++ b/holoviews/core/data/ibis.py @@ -8,8 +8,7 @@ from .. import util from ..element import Element from ..ndmapping import NdMapping, item_check, sorted_context -from . import pandas -from .interface import Interface +from .interface import DataError, Interface from .util import cached @@ -94,6 +93,17 @@ def init(cls, eltype, data, keys, values): values = list(data.columns[: nvdim if nvdim else None]) return data, dict(kdims=keys, vdims=values), {} + @classmethod + def validate(cls, dataset, vdims=True): + dim_types = 'all' if vdims else 'key' + dimensions = dataset.dimensions(dim_types, label='name') + cols = list(dataset.data.columns) + not_found = [d for d in dimensions if d not in cols] + if not_found: + raise DataError("Supplied data does not contain specified " + "dimensions, the following dimensions were " + "not found: %s" % repr(not_found), cls) + @classmethod def compute(cls, dataset): return dataset.clone(dataset.data.execute()) @@ -216,8 +226,9 @@ def redim(cls, dataset, dimensions): **{v.name: dataset.data[k] for k, v in dimensions.items()} ) - validate = pandas.PandasInterface.validate - reindex = pandas.PandasInterface.reindex + @classmethod + def reindex(cls, dataset, kdims=None, vdims=None): + return dataset.data @classmethod def _index_ibis_table(cls, data): diff --git a/holoviews/core/data/pandas.py b/holoviews/core/data/pandas.py index ce7b2fc086..2292e084b7 100644 --- a/holoviews/core/data/pandas.py +++ b/holoviews/core/data/pandas.py @@ -32,9 +32,7 @@ class PandasInterface(Interface, PandasAPI): @classmethod def dimension_type(cls, dataset, dim): - name = dataset.get_dimension(dim, strict=True).name - idx = list(dataset.data.columns).index(name) - return dataset.data.dtypes.iloc[idx].type + return cls.dtype(dataset, dim).type @classmethod def init(cls, eltype, data, kdims, vdims): @@ -46,9 +44,7 @@ def init(cls, eltype, data, kdims, vdims): data = data.to_frame(name=name) if util.is_dataframe(data): ncols = len(data.columns) - index_names = data.index.names if isinstance(data, pd.DataFrame) else [data.index.name] - if index_names == [None]: - index_names = ['index'] + index_names = cls.indexes(data) if eltype._auto_indexable_1d and ncols == 1 and kdims is None: kdims = list(index_names) @@ -74,17 +70,7 @@ def init(cls, eltype, data, kdims, vdims): "Having a non-string as a column name in a DataFrame is not supported." ) - # Handle reset of index if kdims reference index by name - for kd in kdims: - kd = dimension_name(kd) - if kd in data.columns: - continue - if any(kd == ('index' if name is None else name) - for name in index_names): - data = data.reset_index() - break - - if kdims: + if kdims and not (len(kdims) == len(index_names) and {dimension_name(kd) for kd in kdims} == set(index_names)): kdim = dimension_name(kdims[0]) if eltype._auto_indexable_1d and ncols == 1 and kdim not in data.columns: data = data.copy() @@ -147,31 +133,67 @@ def init(cls, eltype, data, kdims, vdims): raise ValueError('PandasInterface could not find specified dimensions in the data.') else: data = pd.DataFrame(data, columns=columns) - return data, {'kdims':kdims, 'vdims':vdims}, {} - + return data, {'kdims': kdims, 'vdims': vdims}, {} @classmethod def isscalar(cls, dataset, dim): name = dataset.get_dimension(dim, strict=True).name return len(dataset.data[name].unique()) == 1 + @classmethod + def dtype(cls, dataset, dimension): + dim = dataset.get_dimension(dimension, strict=True) + name = dim.name + df = dataset.data + if cls.isindex(dataset, dim): + data = cls.index_values(dataset, dim) + else: + data = df[name] + if util.isscalar(data): + return np.array([data]).dtype + else: + return data.dtype + + @classmethod + def indexes(cls, data): + index_names = data.index.names if isinstance(data, pd.DataFrame) else [data.index.name] + if index_names == [None]: + index_names = ['_index'] if 'index' in data.columns else ['index'] + return index_names + + @classmethod + def isindex(cls, dataset, dimension): + dimension = dataset.get_dimension(dimension, strict=True) + if dimension.name in dataset.data.columns: + return False + return dimension.name in cls.indexes(dataset.data) + + @classmethod + def index_values(cls, dataset, dimension): + dimension = dataset.get_dimension(dimension, strict=True) + index = dataset.data.index + if isinstance(index, pd.MultiIndex): + return index.get_level_values(dimension.name) + return index @classmethod def validate(cls, dataset, vdims=True): dim_types = 'all' if vdims else 'key' dimensions = dataset.dimensions(dim_types, label='name') - cols = list(dataset.data.columns) + cols = list(dataset.data.columns) + cls.indexes(dataset.data) not_found = [d for d in dimensions if d not in cols] if not_found: raise DataError("Supplied data does not contain specified " "dimensions, the following dimensions were " "not found: %s" % repr(not_found), cls) - @classmethod def range(cls, dataset, dimension): dimension = dataset.get_dimension(dimension, strict=True) - column = dataset.data[dimension.name] + if cls.isindex(dataset, dimension): + column = cls.index_values(dataset, dimension) + else: + column = dataset.data[dimension.name] if column.dtype.kind == 'O': if (not isinstance(dataset.data, pd.DataFrame) or util.pandas_version < Version('0.17.0')): @@ -184,6 +206,8 @@ def range(cls, dataset, dimension): pass if not len(column): return np.nan, np.nan + if isinstance(column, pd.Index): + return column[0], column[-1] return column.iloc[0], column.iloc[-1] else: if dimension.nodata is not None: @@ -246,10 +270,9 @@ def groupby(cls, dataset, dimensions, container_type, group_type, **kwargs): @classmethod def aggregate(cls, dataset, dimensions, function, **kwargs): - data = dataset.data cols = [d.name for d in dataset.kdims if d in dimensions] vdims = dataset.dimensions('value', label='name') - reindexed = data[cols+vdims] + reindexed = cls.dframe(dataset, dimensions=cols+vdims) if function in [np.std, np.var]: # Fix for consistency with other backend # pandas uses ddof=1 for std and var @@ -298,9 +321,11 @@ def unpack_scalar(cls, dataset, data): @classmethod def reindex(cls, dataset, kdims=None, vdims=None): - # DataFrame based tables don't need to be reindexed - return dataset.data - + data = dataset.data + if isinstance(data.index, pd.MultiIndex): + kdims = [kdims] if isinstance(kdims, (str, Dimension)) else kdims + data = data.reset_index().set_index(list(map(str, kdims)), drop=True) + return data @classmethod def mask(cls, dataset, mask, mask_value=np.nan): @@ -309,7 +334,6 @@ def mask(cls, dataset, mask, mask_value=np.nan): masked.loc[mask, cols] = mask_value return masked - @classmethod def redim(cls, dataset, dimensions): column_renames = {k: v.name for k, v in dimensions.items()} @@ -327,39 +351,94 @@ def sort(cls, dataset, by=None, reverse=False): return dataset.data.sort(columns=cols, ascending=not reverse) return dataset.data.sort_values(by=cols, ascending=not reverse) + @classmethod + def sorted_index(cls, df): + if hasattr(df.index, 'is_lexsorted'): + return df.index.is_lexsorted() + return df.index.is_monotonic_increasing + + @classmethod + def sort_depth(cls, df): + try: + from pandas.core.indexes.multi import _lexsort_depth + return _lexsort_depth(df.index.codes, df.index.nlevels) + except (ImportError, AttributeError): + return 0 + + @classmethod + def index_selection(cls, df, selection): + indexes = cls.indexes(df) + nindex = len(indexes) + sorted_index = cls.sorted_index(df) + if sorted_index: + depth = df.index.nlevels + else: + depth = cls.sort_depth(df) + index_sel = {} + skip_index = True + for level, idx in enumerate(indexes): + if idx not in selection: + index_sel[idx] = slice(None, None) + continue + skip_index = False + sel = selection[idx] + if isinstance(sel, tuple) and len(sel) < 4: + sel = slice(*sel) + elif not isinstance(sel, (list, slice)): + sel = [sel] + if isinstance(sel, slice) and nindex > 1 and not sorted_index and level>depth: + # If the index is not monotonic we cannot slice + # so return indexer up to the point it is valid + return index_sel + index_sel[idx] = sel + return {} if skip_index else index_sel @classmethod def select(cls, dataset, selection_mask=None, **selection): df = dataset.data if selection_mask is None: - selection_mask = cls.select_mask(dataset, selection) + if index_sel:= cls.index_selection(df, selection): + try: + if len(index_sel) == 1: + df = df[next(iter(index_sel.values()))] + else: + df = df.loc[tuple(index_sel.values()), :] + except KeyError: + # If index lookup fails we fall back to boolean indexing + index_sel = {} + column_sel = {k: v for k, v in selection.items() if k not in index_sel} + if column_sel: + selection_mask = cls.select_mask(dataset, column_sel) indexed = cls.indexed(dataset, selection) if isinstance(selection_mask, pd.Series): df = df[selection_mask] - else: + elif selection_mask is not None: df = df.iloc[selection_mask] if indexed and len(df) == 1 and len(dataset.vdims) == 1: return df[dataset.vdims[0].name].iloc[0] return df - @classmethod def values( - cls, - dataset, - dim, - expanded=True, - flat=True, - compute=True, - keep_index=False, + cls, + dataset, + dim, + expanded=True, + flat=True, + compute=True, + keep_index=False, ): dim = dataset.get_dimension(dim, strict=True) - data = dataset.data[dim.name] + isindex = cls.isindex(dataset, dim) + if isindex: + data = cls.index_values(dataset, dim) + else: + data = dataset.data[dim.name] if keep_index: return data if data.dtype.kind == 'M' and getattr(data.dtype, 'tz', None): - data = data.dt.tz_localize(None) + data = (data if isindex else data.dt).tz_localize(None) if not expanded: return pd.unique(data) return data.values if hasattr(data, 'values') else data @@ -405,38 +484,53 @@ def as_dframe(cls, dataset): if it already a dataframe type. """ if issubclass(dataset.interface, PandasInterface): + if any(cls.isindex(dataset, dim) for dim in dataset.dimensions()): + return dataset.data.reset_index() return dataset.data else: return dataset.dframe() - @classmethod def dframe(cls, dataset, dimensions): + data = dataset.data if dimensions: - return dataset.data[dimensions] + if any(cls.isindex(dataset, d) for d in dimensions): + data = data.reset_index() + return data[dimensions] else: - return dataset.data.copy() - + return data.copy() @classmethod def iloc(cls, dataset, index): rows, cols = index scalar = False - columns = list(dataset.data.columns) if isinstance(cols, slice): cols = [d.name for d in dataset.dimensions()][cols] elif np.isscalar(cols): scalar = np.isscalar(rows) - cols = [dataset.get_dimension(cols).name] + dim = dataset.get_dimension(cols) + if dim is None: + raise ValueError('column is out of bounds') + cols = [dim.name] else: - cols = [dataset.get_dimension(d).name for d in index[1]] - cols = [columns.index(c) for c in cols] + cols = [dataset.get_dimension(d).name for d in cols] if np.isscalar(rows): rows = [rows] + data = dataset.data + indexes = cls.indexes(data) + columns = list(data.columns) + id_cols = [columns.index(c) for c in cols if c not in indexes] + if not id_cols: + if len(indexes) > 1: + data = data.index.to_frame()[cols].iloc[rows].reset_index(drop=True) + data = data.values.ravel()[0] if scalar else data + else: + data = data.index.values[rows[0]] if scalar else data.index[rows] + return data if scalar: - return dataset.data.iloc[rows[0], cols[0]] - return dataset.data.iloc[rows, cols] + return data.iloc[rows[0], id_cols[0]] + return data.iloc[rows, id_cols] Interface.register(PandasInterface) diff --git a/holoviews/tests/core/data/test_pandasinterface.py b/holoviews/tests/core/data/test_pandasinterface.py index 415ec6acce..bb2e520d7e 100644 --- a/holoviews/tests/core/data/test_pandasinterface.py +++ b/holoviews/tests/core/data/test_pandasinterface.py @@ -1,5 +1,6 @@ import numpy as np import pandas as pd +import pytest from holoviews.core.data import Dataset from holoviews.core.data.interface import DataError @@ -163,6 +164,11 @@ def test_dataset_with_interface_column(self): ds = Dataset(df) self.assertEqual(list(ds.data.columns), ['interface']) + def test_dataset_range_with_object_index(self): + df = pd.DataFrame(range(4), columns=["values"], index=list("BADC")) + ds = Dataset(df, kdims='index') + assert ds.range('index') == ('A', 'D') + class PandasInterfaceTests(BasePandasInterfaceTests): @@ -177,3 +183,211 @@ def test_data_with_tz(self): df = pd.DataFrame({"dates": dates_tz}) data = Dataset(df).dimension_values("dates") np.testing.assert_equal(dates, data) + + @pytest.mark.xfail(reason="Breaks hvplot") + def test_reindex(self): + ds = Dataset(pd.DataFrame({'x': np.arange(10), 'y': np.arange(10), 'z': np.random.rand(10)})) + df = ds.interface.reindex(ds, ['x']) + assert df.index.names == ['x'] + df = ds.interface.reindex(ds, ['y']) + assert df.index.names == ['y'] + + +class PandasInterfaceMultiIndex(HeterogeneousColumnTests, InterfaceTests): + datatype = 'dataframe' + data_type = pd.DataFrame + + __test__ = True + + def setUp(self): + frame = pd.DataFrame({"number": [1, 1, 2, 2], "color": ["red", "blue", "red", "blue"]}) + index = pd.MultiIndex.from_frame(frame, names=("number", "color")) + self.df = pd.DataFrame(range(4), index=index, columns=["values"]) + super().setUp() + + def test_lexsort_depth_import(self): + # Indexing relies on knowing the lexsort_depth but this is a + # private import so we want to know should this import ever + # be changed + from pandas.core.indexes.multi import _lexsort_depth # noqa + + def test_no_kdims(self): + ds = Dataset(self.df) + assert ds.kdims == [Dimension("values")] + assert isinstance(ds.data.index, pd.MultiIndex) + + def test_index_kdims(self): + ds = Dataset(self.df, kdims=["number", "color"]) + assert ds.kdims == [Dimension("number"), Dimension("color")] + assert ds.vdims == [Dimension("values")] + assert isinstance(ds.data.index, pd.MultiIndex) + + def test_index_aggregate(self): + ds = Dataset(self.df, kdims=["number", "color"]) + expected = pd.DataFrame({'number': [1, 2], 'values': [0.5, 2.5], 'values_var': [0.25, 0.25]}) + agg = ds.aggregate("number", function=np.mean, spreadfn=np.var) + pd.testing.assert_frame_equal(agg.data, expected) + + def test_index_select_monotonic(self): + ds = Dataset(self.df, kdims=["number", "color"]) + selected = ds.select(number=1) + expected = pd.DataFrame({'color': ['red', 'blue'], 'values': [0, 1], 'number': [1, 1]}).set_index(['number', 'color']) + assert isinstance(selected.data.index, pd.MultiIndex) + pd.testing.assert_frame_equal(selected.data, expected) + + def test_index_select(self): + ds = Dataset(self.df, kdims=["number", "color"]) + selected = ds.select(number=1) + expected = pd.DataFrame({'color': ['red', 'blue'], 'values': [0, 1], 'number': [1, 1]}).set_index(['number', 'color']) + assert isinstance(selected.data.index, pd.MultiIndex) + pd.testing.assert_frame_equal(selected.data, expected) + + def test_index_select_all_indexes(self): + ds = Dataset(self.df, kdims=["number", "color"]) + selected = ds.select(number=1, color='red') + assert selected == 0 + + def test_index_select_all_indexes_lists(self): + ds = Dataset(self.df, kdims=["number", "color"]) + selected = ds.select(number=[1], color=['red']) + expected = pd.DataFrame({'color': ['red'], 'values': [0], 'number': [1]}).set_index(['number', 'color']) + assert isinstance(selected.data.index, pd.MultiIndex) + pd.testing.assert_frame_equal(selected.data, expected) + + def test_index_select_all_indexes_slice_and_scalar(self): + ds = Dataset(self.df, kdims=["number", "color"]) + selected = ds.select(number=(0, 1), color='red') + expected = pd.DataFrame({'color': ['red'], 'values': [0], 'number': [1]}).set_index(['number', 'color']) + assert isinstance(selected.data.index, pd.MultiIndex) + pd.testing.assert_frame_equal(selected.data, expected) + + def test_iloc_scalar_scalar_only_index(self): + ds = Dataset(self.df, kdims=["number", "color"]) + selected = ds.iloc[0, 0] + expected = 1 + assert selected == expected + + def test_iloc_slice_scalar_only_index(self): + ds = Dataset(self.df, kdims=["number", "color"]) + selected = ds.iloc[:, 0] + expected = self.df.reset_index()[["number"]] + pd.testing.assert_frame_equal(selected.data, expected) + + def test_iloc_slice_slice_only_index(self): + ds = Dataset(self.df, kdims=["number", "color"]) + selected = ds.iloc[:, :2] + expected = self.df.reset_index()[["number", "color"]] + pd.testing.assert_frame_equal(selected.data, expected) + + def test_iloc_scalar_slice_only_index(self): + ds = Dataset(self.df, kdims=["number", "color"]) + selected = ds.iloc[0, :2] + expected = pd.DataFrame({"number": 1, "color": "red"}, index=[0]) + pd.testing.assert_frame_equal(selected.data, expected) + + def test_iloc_scalar_scalar(self): + ds = Dataset(self.df, kdims=["number", "color"]) + selected = ds.iloc[0, 2] + expected = 0 + assert selected == expected + + def test_iloc_slice_scalar(self): + ds = Dataset(self.df, kdims=["number", "color"]) + selected = ds.iloc[:, 2] + expected = self.df.iloc[:, [0]] + pd.testing.assert_frame_equal(selected.data, expected) + + def test_iloc_slice_slice(self): + ds = Dataset(self.df, kdims=["number", "color"]) + selected = ds.iloc[:, :3] + expected = self.df.iloc[:, [0]] + pd.testing.assert_frame_equal(selected.data, expected) + + def test_iloc_scalar_slice(self): + ds = Dataset(self.df, kdims=["number", "color"]) + selected = ds.iloc[0, :3] + expected = self.df.iloc[[0], [0]] + pd.testing.assert_frame_equal(selected.data, expected) + + def test_out_of_bounds(self): + ds = Dataset(self.df, kdims=["number", "color"]) + with pytest.raises(ValueError, match="column is out of bounds"): + ds.iloc[0, 3] + + def test_sort(self): + ds = Dataset(self.df, kdims=["number", "color"]) + sorted_ds = ds.sort("color") + np.testing.assert_array_equal(sorted_ds.dimension_values("values"), [1, 3, 0, 2]) + np.testing.assert_array_equal(sorted_ds.dimension_values("number"), [1, 2, 1, 2]) + + def test_select_monotonic(self): + ds = Dataset(self.df.sort_index(), kdims=["number", "color"]) + selected = ds.select(color="red") + pd.testing.assert_frame_equal(selected.data, self.df.iloc[[0, 2], :]) + + selected = ds.select(number=1, color='red') + assert selected == 0 + + def test_select_not_monotonic(self): + frame = pd.DataFrame({"number": [1, 1, 2, 2], "color": [2, 1, 2, 1]}) + index = pd.MultiIndex.from_frame(frame, names=frame.columns) + df = pd.DataFrame(range(4), index=index, columns=["values"]) + ds = Dataset(df, kdims=list(frame.columns)) + + data = ds.select(color=slice(2, 3)).data + expected = pd.DataFrame({"number": [1, 2], "color": [2, 2], "values": [0, 2]}).set_index(['number', 'color']) + pd.testing.assert_frame_equal(data, expected) + + def test_select_not_in_index(self): + ds = Dataset(self.df, kdims=["number", "color"]) + selected = ds.select(number=[2, 3]) + expected = self.df.loc[[2]] + pd.testing.assert_frame_equal(selected.data, expected) + + def test_sample(self): + ds = Dataset(self.df, kdims=["number", "color"]) + sample = ds.interface.sample(ds, [1]) + assert sample.to_dict() == {'values': {(1, 'blue'): 1}} + + self.df.iloc[0, 0] = 1 + ds = Dataset(self.df, kdims=["number", "color"]) + sample = ds.interface.sample(ds, [1]) + assert sample.to_dict() == {'values': {(1, 'red'): 1, (1, 'blue'): 1}} + + def test_values(self): + ds = Dataset(self.df, kdims=["number", "color"]) + assert (ds.interface.values(ds, 'color') == ['red', 'blue', 'red', 'blue']).all() + assert (ds.interface.values(ds, 'number') == [1, 1, 2, 2]).all() + assert (ds.interface.values(ds, 'values') == [0, 1, 2, 3]).all() + + def test_reindex(self): + ds = Dataset(self.df, kdims=["number", "color"]) + df = ds.interface.reindex(ds, ['number', 'color']) + assert df.index.names == ['number', 'color'] + + df = ds.interface.reindex(ds, ['number']) + assert df.index.names == ['number'] + + df = ds.interface.reindex(ds, ['values']) + assert df.index.names == ['values'] + + def test_groupby_one_index(self): + ds = Dataset(self.df, kdims=["number", "color"]) + grouped = ds.groupby("number") + assert list(grouped.keys()) == [1, 2] + for k, v in grouped.items(): + pd.testing.assert_frame_equal(v.data, ds.select(number=k).data) + + def test_groupby_two_indexes(self): + ds = Dataset(self.df, kdims=["number", "color"]) + grouped = ds.groupby(["number", "color"]) + assert list(grouped.keys()) == list(self.df.index) + for k, v in grouped.items(): + pd.testing.assert_frame_equal(v.data, ds.select(number=[k[0]], color=[k[1]]).data) + + def test_groupby_one_index_one_column(self): + ds = Dataset(self.df, kdims=["number", "color"]) + grouped = ds.groupby('values') + assert list(grouped.keys()) == [0, 1, 2, 3] + for k, v in grouped.items(): + pd.testing.assert_frame_equal(v.data, ds.select(values=k).data) diff --git a/holoviews/tests/operation/test_downsample.py b/holoviews/tests/operation/test_downsample.py index 066a0dd6b8..3ca6a4c929 100644 --- a/holoviews/tests/operation/test_downsample.py +++ b/holoviews/tests/operation/test_downsample.py @@ -49,8 +49,6 @@ def _compute_mask(self, element): assert runs[0] == 1 -# Should be fixed when https://github.com/holoviz/holoviews/pull/6061 is merged -@pytest.mark.xfail(reason="This will make a copy of the data") def test_downsample1d_shared_data_index(): runs = [0]