From a28aab005b42eabe0b1651d2330ed2f3268bb9f8 Mon Sep 17 00:00:00 2001 From: Stephan Hoyer Date: Fri, 25 May 2018 20:29:45 -0700 Subject: [PATCH] Fix DataArray.stack() with non-unique coordinates on pandas 0.23 (#2168) --- doc/whats-new.rst | 4 ++++ xarray/core/utils.py | 14 ++++++++------ xarray/tests/test_dataarray.py | 7 +++++++ xarray/tests/test_utils.py | 12 +++++++++++- 4 files changed, 30 insertions(+), 7 deletions(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 4a01065bd70..055369f0352 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -59,6 +59,10 @@ Bug fixes dimension were improperly skipped. By `Stephan Hoyer <https://github.com/shoyer>`_ +- Fix :meth:`~DataArray.stack` with non-unique coordinates on pandas 0.23 + (:issue:`2160`). + By `Stephan Hoyer <https://github.com/shoyer>`_ + - Selecting data indexed by a length-1 ``CFTimeIndex`` with a slice of strings now behaves as it does when using a length-1 ``DatetimeIndex`` (i.e. it no longer falsely returns an empty array when the slice includes the value in diff --git a/xarray/core/utils.py b/xarray/core/utils.py index 06bb3ede393..f6c5830cc9e 100644 --- a/xarray/core/utils.py +++ b/xarray/core/utils.py @@ -76,13 +76,12 @@ def safe_cast_to_index(array): def multiindex_from_product_levels(levels, names=None): """Creating a MultiIndex from a product without refactorizing levels. - Keeping levels the same is faster, and also gives back the original labels - when we unstack. + Keeping levels the same gives back the original labels when we unstack. Parameters ---------- - levels : sequence of arrays - Unique labels for each level. + levels : sequence of pd.Index + Values for each MultiIndex level. names : optional sequence of objects Names for each level. 
@@ -90,8 +89,11 @@ def multiindex_from_product_levels(levels, names=None): ------- pandas.MultiIndex """ - labels_mesh = np.meshgrid(*[np.arange(len(lev)) for lev in levels], - indexing='ij') + if any(not isinstance(lev, pd.Index) for lev in levels): + raise TypeError('levels must be a list of pd.Index objects') + + split_labels, levels = zip(*[lev.factorize() for lev in levels]) + labels_mesh = np.meshgrid(*split_labels, indexing='ij') labels = [x.ravel() for x in labels_mesh] return pd.MultiIndex(levels, labels, sortorder=0, names=names) diff --git a/xarray/tests/test_dataarray.py b/xarray/tests/test_dataarray.py index 35e270f0db7..a03d265c3e3 100644 --- a/xarray/tests/test_dataarray.py +++ b/xarray/tests/test_dataarray.py @@ -1673,6 +1673,13 @@ def test_unstack_pandas_consistency(self): actual = DataArray(s, dims='z').unstack('z') assert_identical(expected, actual) + def test_stack_nonunique_consistency(self): + orig = DataArray([[0, 1], [2, 3]], dims=['x', 'y'], + coords={'x': [0, 1], 'y': [0, 0]}) + actual = orig.stack(z=['x', 'y']) + expected = DataArray(orig.to_pandas().stack(), dims='z') + assert_identical(expected, actual) + def test_transpose(self): assert_equal(self.dv.variable.transpose(), self.dv.transpose().variable) diff --git a/xarray/tests/test_utils.py b/xarray/tests/test_utils.py index 0b3b0ee7dd6..1f73743d01d 100644 --- a/xarray/tests/test_utils.py +++ b/xarray/tests/test_utils.py @@ -72,7 +72,8 @@ def test_safe_cast_to_index_datetime_datetime(enable_cftimeindex): def test_multiindex_from_product_levels(): - result = utils.multiindex_from_product_levels([['b', 'a'], [1, 3, 2]]) + result = utils.multiindex_from_product_levels( + [pd.Index(['b', 'a']), pd.Index([1, 3, 2])]) np.testing.assert_array_equal( result.labels, [[0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 1, 2]]) np.testing.assert_array_equal(result.levels[0], ['b', 'a']) @@ -82,6 +83,15 @@ def test_multiindex_from_product_levels(): np.testing.assert_array_equal(result.values, other.values) +def 
test_multiindex_from_product_levels_non_unique(): + result = utils.multiindex_from_product_levels( + [pd.Index(['b', 'a']), pd.Index([1, 1, 2])]) + np.testing.assert_array_equal( + result.labels, [[0, 0, 0, 1, 1, 1], [0, 0, 1, 0, 0, 1]]) + np.testing.assert_array_equal(result.levels[0], ['b', 'a']) + np.testing.assert_array_equal(result.levels[1], [1, 2]) + + class TestArrayEquiv(TestCase): def test_0d(self): # verify our work around for pd.isnull not working for 0-dimensional