From d07b23834381f4d23ff317c39ee83852c1eefbec Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 7 Jun 2018 13:43:19 -0500 Subject: [PATCH] REGR: NA-values in ctors with string dtype ```python In [1]: import pandas as pd In [2]: pd.Series([1, 2, None], dtype='str')[2] # None ``` Closes #21083 --- doc/source/whatsnew/v0.23.1.txt | 8 ++++++ pandas/core/series.py | 15 ++++++++++- pandas/tests/frame/test_constructors.py | 8 ++++++ pandas/tests/series/test_constructors.py | 33 +++++++++++++++++++----- 4 files changed, 57 insertions(+), 7 deletions(-) diff --git a/doc/source/whatsnew/v0.23.1.txt b/doc/source/whatsnew/v0.23.1.txt index e29cb0a5a2626..6c5238a56be08 100644 --- a/doc/source/whatsnew/v0.23.1.txt +++ b/doc/source/whatsnew/v0.23.1.txt @@ -10,6 +10,14 @@ and bug fixes. We recommend that all users upgrade to this version. :local: :backlinks: none +.. _whatsnew_0231.fixed_regressions: + +Fixed Regressions +~~~~~~~~~~~~~~~~~ + +- Fixed regression in constructors coercing NA values like ``None`` to strings when passing ``dtype=str`` (:issue:`21083`) + + .. _whatsnew_0231.enhancements: New features diff --git a/pandas/core/series.py b/pandas/core/series.py index 8bd48c629ffef..02a4cbfce9929 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -4054,7 +4054,20 @@ def _try_cast(arr, take_fast_path): isinstance(subarr, np.ndarray))): subarr = construct_1d_object_array_from_listlike(subarr) elif not is_extension_type(subarr): - subarr = np.array(subarr, dtype=dtype, copy=copy) + subarr2 = np.array(subarr, dtype=dtype, copy=copy) + + if dtype and dtype.kind in ("U", "S"): + # GH-21083 + # We can't just return np.array(subarr, dtype='str') since + # NumPy will convert the non-string objects into strings + # Including NA values. Se we have to go + # string -> object -> update NA, which requires an + # additional pass over the data. + na_values = isna(subarr) + subarr2 = subarr2.astype(object) + subarr2[na_values] = np.asarray(subarr)[na_values] + + subarr = subarr2 except (ValueError, TypeError): if is_categorical_dtype(dtype): # We *do* allow casting to categorical, since we know diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index 300e1acdea911..3a813ec6032fc 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -151,6 +151,14 @@ def test_constructor_complex_dtypes(self): assert a.dtype == df.a.dtype assert b.dtype == df.b.dtype + def test_constructor_dtype_str_na_values(self): + # https://github.com/pandas-dev/pandas/issues/21083 + df = DataFrame({'A': ['x', None]}, dtype=str) + result = df.isna() + expected = DataFrame({"A": [False, True]}) + tm.assert_frame_equal(result, expected) + assert df.iloc[1, 0] is None + def test_constructor_rec(self): rec = self.frame.to_records(index=False) diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py index 7e59325c32ddc..7a09e2abecb77 100644 --- a/pandas/tests/series/test_constructors.py +++ b/pandas/tests/series/test_constructors.py @@ -29,6 +29,17 @@ from .common import TestData +@pytest.fixture(params=[str, 'str', 'U']) +def string_dtype(request): + """Parametrized fixture for string dtypes. + + * str + * 'str' + * 'U' + """ + return request.param + + class TestSeriesConstructors(TestData): def test_invalid_dtype(self): @@ -137,6 +148,14 @@ def test_constructor_no_data_index_order(self): result = pd.Series(index=['b', 'a', 'c']) assert result.index.tolist() == ['b', 'a', 'c'] + def test_constructor_dtype_str_na_values(self): + # https://github.com/pandas-dev/pandas/issues/21083 + ser = Series(['x', None], dtype=str) + result = ser.isna() + expected = Series([False, True]) + tm.assert_series_equal(result, expected) + assert ser.iloc[1] is None + def test_constructor_series(self): index1 = ['d', 'b', 'a', 'c'] index2 = sorted(index1) @@ -164,22 +183,24 @@ def test_constructor_list_like(self): @pytest.mark.parametrize('input_vals', [ ([1, 2]), - ([1.0, 2.0, np.nan]), (['1', '2']), (list(pd.date_range('1/1/2011', periods=2, freq='H'))), (list(pd.date_range('1/1/2011', periods=2, freq='H', tz='US/Eastern'))), ([pd.Interval(left=0, right=5)]), ]) - def test_constructor_list_str(self, input_vals): + def test_constructor_list_str(self, input_vals, string_dtype): # GH 16605 # Ensure that data elements from a list are converted to strings # when dtype is str, 'str', or 'U' + result = Series(input_vals, dtype=string_dtype) + expected = Series(input_vals).astype(string_dtype) + assert_series_equal(result, expected) - for dtype in ['str', str, 'U']: - result = Series(input_vals, dtype=dtype) - expected = Series(input_vals).astype(dtype) - assert_series_equal(result, expected) + def test_constructor_list_str_na(self, string_dtype): + result = Series([1.0, 2.0, np.nan], dtype=string_dtype) + expected = Series(['1.0', '2.0', None], dtype=object) + assert_series_equal(result, expected) def test_constructor_generator(self): gen = (i for i in range(10))