diff --git a/py-polars/polars/series/series.py b/py-polars/polars/series/series.py index a13dd7883af2..3d62f1e588ca 100644 --- a/py-polars/polars/series/series.py +++ b/py-polars/polars/series/series.py @@ -4339,7 +4339,7 @@ def to_numpy( """ def raise_no_zero_copy() -> None: - if zero_copy_only: + if zero_copy_only and not self.is_empty(): msg = "cannot return a zero-copy array" raise ValueError(msg) diff --git a/py-polars/src/series/export.rs b/py-polars/src/series/export.rs index 1c95acb7d0ad..bffdcef0772b 100644 --- a/py-polars/src/series/export.rs +++ b/py-polars/src/series/export.rs @@ -195,6 +195,11 @@ impl PySeries { let np_arr = PyArray1::from_iter(py, ca.into_iter().map(|s| s.into_py(py))); np_arr.into_py(py) }, + Categorical(_, _) | Enum(_, _) => { + let ca = s.categorical().unwrap(); + let np_arr = PyArray1::from_iter(py, ca.iter_str().map(|s| s.into_py(py))); + np_arr.into_py(py) + }, #[cfg(feature = "object")] Object(_, _) => { let ca = s diff --git a/py-polars/tests/unit/interop/numpy/test_numpy.py b/py-polars/tests/unit/interop/numpy/test_numpy.py index 80f11c63c886..55391c56d556 100644 --- a/py-polars/tests/unit/interop/numpy/test_numpy.py +++ b/py-polars/tests/unit/interop/numpy/test_numpy.py @@ -76,3 +76,14 @@ def test_to_numpy_zero_copy_path() -> None: x = df.to_numpy() assert x.flags["F_CONTIGUOUS"] assert str(x[0, :]) == "[1. 2. 1. 1. 1.]" + + +def test_numpy_disambiguation() -> None: + a = np.array([1, 2]) + df = pl.DataFrame({"a": a}) + result = df.with_columns(b=a).to_dict(as_series=False) # type: ignore[arg-type] + expected = { + "a": [1, 2], + "b": [1, 2], + } + assert result == expected diff --git a/py-polars/tests/unit/interop/numpy/test_to_numpy_df.py b/py-polars/tests/unit/interop/numpy/test_to_numpy_df.py index e77496d7289f..4419ed71ce24 100644 --- a/py-polars/tests/unit/interop/numpy/test_to_numpy_df.py +++ b/py-polars/tests/unit/interop/numpy/test_to_numpy_df.py @@ -1,5 +1,6 @@ from __future__ import annotations +from decimal import Decimal as D from typing import TYPE_CHECKING import numpy as np @@ -101,3 +102,20 @@ def test__array__() -> None: expected_array = np.array([[1, 1], [2, 2], [3, 3]], dtype=np.uint8) assert_array_equal(out_array, expected_array) assert out_array.flags["F_CONTIGUOUS"] is True + + +def test_numpy_preserve_uint64_4112() -> None: + df = pl.DataFrame({"a": [1, 2, 3]}).with_columns(pl.col("a").hash()) + assert df.to_numpy().dtype == np.dtype("uint64") + assert df.to_numpy(structured=True).dtype == np.dtype([("a", "uint64")]) + + +@pytest.mark.parametrize("use_pyarrow", [True, False]) +def test_df_to_numpy_decimal(use_pyarrow: bool) -> None: + decimal_data = [D("1.234"), D("2.345"), D("-3.456")] + df = pl.Series("n", decimal_data).to_frame() + + result = df.to_numpy(use_pyarrow=use_pyarrow) + + expected = np.array(decimal_data).reshape((-1, 1)) + assert_array_equal(result, expected) diff --git a/py-polars/tests/unit/interop/numpy/test_to_numpy_series.py b/py-polars/tests/unit/interop/numpy/test_to_numpy_series.py index eb3042b15210..0cf505dc39a9 100644 --- a/py-polars/tests/unit/interop/numpy/test_to_numpy_series.py +++ b/py-polars/tests/unit/interop/numpy/test_to_numpy_series.py @@ -1,7 +1,8 @@ from __future__ import annotations -from datetime import datetime, time, timedelta +from datetime import date, datetime, time, timedelta from decimal import Decimal as D +from pathlib import Path from typing import TYPE_CHECKING, Any import numpy as np @@ -16,6 +17,264 @@ import numpy.typing as npt +def assert_zero_copy(s: pl.Series, arr: np.ndarray[Any, Any]) -> None: + if s.len() == 0: + return + s_ptr = s._get_buffers()["values"]._get_buffer_info()[0] + arr_ptr = arr.__array_interface__["data"][0] + assert s_ptr == arr_ptr + + +def assert_zero_copy_only_raises(s: pl.Series) -> None: + with pytest.raises(ValueError, match="cannot return a zero-copy array"): + s.to_numpy(use_pyarrow=False, zero_copy_only=True) + + +@pytest.mark.parametrize( + ("dtype", "expected_dtype"), + [ + (pl.Int8, np.int8), + (pl.Int16, np.int16), + (pl.Int32, np.int32), + (pl.Int64, np.int64), + (pl.UInt8, np.uint8), + (pl.UInt16, np.uint16), + (pl.UInt32, np.uint32), + (pl.UInt64, np.uint64), + (pl.Float32, np.float32), + (pl.Float64, np.float64), + ], +) +def test_series_to_numpy_numeric_zero_copy( + dtype: pl.PolarsDataType, expected_dtype: npt.DTypeLike +) -> None: + s = pl.Series([1, 2, 3]).cast(dtype) # =dtype, strict=False) + result = s.to_numpy(use_pyarrow=False, zero_copy_only=True) + + assert_zero_copy(s, result) + assert result.tolist() == s.to_list() + assert result.dtype == expected_dtype + + +@pytest.mark.parametrize( + ("dtype", "expected_dtype"), + [ + (pl.Int8, np.float32), + (pl.Int16, np.float32), + (pl.Int32, np.float64), + (pl.Int64, np.float64), + (pl.UInt8, np.float32), + (pl.UInt16, np.float32), + (pl.UInt32, np.float64), + (pl.UInt64, np.float64), + (pl.Float32, np.float32), + (pl.Float64, np.float64), + ], +) +def test_series_to_numpy_numeric_with_nulls( + dtype: pl.PolarsDataType, expected_dtype: npt.DTypeLike +) -> None: + s = pl.Series([1, 2, None], dtype=dtype, strict=False) + result = s.to_numpy(use_pyarrow=False) + + assert result.tolist()[:-1] == s.to_list()[:-1] + assert np.isnan(result[-1]) + assert result.dtype == expected_dtype + assert_zero_copy_only_raises(s) + + +@pytest.mark.parametrize( + ("dtype", "expected_dtype"), + [ + (pl.Duration, np.dtype("timedelta64[us]")), + (pl.Duration("ms"), np.dtype("timedelta64[ms]")), + (pl.Duration("us"), np.dtype("timedelta64[us]")), + (pl.Duration("ns"), np.dtype("timedelta64[ns]")), + (pl.Datetime, np.dtype("datetime64[us]")), + (pl.Datetime("ms"), np.dtype("datetime64[ms]")), + (pl.Datetime("us"), np.dtype("datetime64[us]")), + (pl.Datetime("ns"), np.dtype("datetime64[ns]")), + ], +) +def test_series_to_numpy_temporal_zero_copy( + dtype: pl.PolarsDataType, expected_dtype: npt.DTypeLike +) -> None: + values = [0, 2_000, 1_000_000] + s = pl.Series(values, dtype=dtype, strict=False) + result = s.to_numpy(use_pyarrow=False, zero_copy_only=True) + + assert_zero_copy(s, result) + # NumPy tolist returns integers for ns precision + if s.dtype.time_unit == "ns": # type: ignore[attr-defined] + assert result.tolist() == values + else: + assert result.tolist() == s.to_list() + assert result.dtype == expected_dtype + + +def test_series_to_numpy_date() -> None: + values = [date(1970, 1, 1), date(2024, 2, 28)] + s = pl.Series(values) + + result = s.to_numpy(use_pyarrow=False) + + assert s.to_list() == result.tolist() + assert result.dtype == np.dtype("datetime64[D]") + assert_zero_copy_only_raises(s) + + +@pytest.mark.parametrize( + ("dtype", "expected_dtype"), + [ + (pl.Date, np.dtype("datetime64[D]")), + (pl.Duration("ms"), np.dtype("timedelta64[ms]")), + (pl.Duration("us"), np.dtype("timedelta64[us]")), + (pl.Duration("ns"), np.dtype("timedelta64[ns]")), + (pl.Datetime, np.dtype("datetime64[us]")), + (pl.Datetime("ms"), np.dtype("datetime64[ms]")), + (pl.Datetime("us"), np.dtype("datetime64[us]")), + (pl.Datetime("ns"), np.dtype("datetime64[ns]")), + ], +) +def test_series_to_numpy_temporal_with_nulls( + dtype: pl.PolarsDataType, expected_dtype: npt.DTypeLike +) -> None: + values = [0, 2_000, 1_000_000, None] + s = pl.Series(values, dtype=dtype, strict=False) + result = s.to_numpy(use_pyarrow=False) + + # NumPy tolist returns integers for ns precision + if getattr(s.dtype, "time_unit", None) == "ns": + assert result.tolist() == values + else: + assert result.tolist() == s.to_list() + assert result.dtype == expected_dtype + assert_zero_copy_only_raises(s) + + +@pytest.mark.parametrize( + ("dtype", "values"), + [ + (pl.Time, [time(10, 30, 45), time(23, 59, 59)]), + (pl.Categorical, ["a", "b", "a"]), + (pl.Enum(["a", "b", "c"]), ["a", "b", "a"]), + (pl.String, ["a", "bc", "def"]), + (pl.Binary, [b"a", b"bc", b"def"]), + (pl.Decimal, [D("1.234"), D("2.345"), D("-3.456")]), + (pl.Object, [Path(), Path("abc")]), + # TODO: Implement for List types + # (pl.List, [[1], [2, 3]]), + # (pl.List, [["a"], ["b", "c"], []]), + ], +) +@pytest.mark.parametrize("with_nulls", [False, True]) +def test_to_numpy_object_dtypes( + dtype: pl.PolarsDataType, values: list[Any], with_nulls: bool +) -> None: + if with_nulls: + values.append(None) + + s = pl.Series(values, dtype=dtype) + result = s.to_numpy(use_pyarrow=False) + + assert result.tolist() == values + assert result.dtype == np.object_ + assert_zero_copy_only_raises(s) + + +def test_series_to_numpy_bool() -> None: + s = pl.Series([True, False]) + result = s.to_numpy(use_pyarrow=False) + + assert s.to_list() == result.tolist() + assert result.dtype == np.bool_ + assert_zero_copy_only_raises(s) + + +def test_series_to_numpy_bool_with_nulls() -> None: + s = pl.Series([True, False, None]) + result = s.to_numpy(use_pyarrow=False) + + assert s.to_list() == result.tolist() + assert result.dtype == np.object_ + assert_zero_copy_only_raises(s) + + +def test_series_to_numpy_array_of_int() -> None: + values = [[1, 2], [3, 4], [5, 6]] + s = pl.Series(values, dtype=pl.Array(pl.Int64, 2)) + result = s.to_numpy(use_pyarrow=False) + + expected = np.array(values) + assert_array_equal(result, expected) + assert result.dtype == np.int64 + + +def test_series_to_numpy_array_of_str() -> None: + values = [["1", "2", "3"], ["4", "5", "10000"]] + s = pl.Series(values, dtype=pl.Array(pl.String, 3)) + result = s.to_numpy(use_pyarrow=False) + assert result.tolist() == values + assert result.dtype == np.object_ + + +@pytest.mark.skip( + reason="Currently bugged, see: https://github.com/pola-rs/polars/issues/14268" +) +def test_series_to_numpy_array_with_nulls() -> None: + values = [[1, 2], [3, 4], None] + s = pl.Series(values, dtype=pl.Array(pl.Int64, 2)) + result = s.to_numpy(use_pyarrow=False) + + expected = np.array([[1.0, 2.0], [3.0, 4.0], [np.nan, np.nan]]) + assert_array_equal(result, expected) + assert result.dtype == np.float64 + assert_zero_copy_only_raises(s) + + +def test_to_numpy_null() -> None: + s = pl.Series([None, None], dtype=pl.Null) + result = s.to_numpy(use_pyarrow=False) + expected = np.array([np.nan, np.nan], dtype=np.float32) + assert_array_equal(result, expected) + assert result.dtype == np.float32 + assert_zero_copy_only_raises(s) + + +def test_to_numpy_empty() -> None: + series = pl.Series(dtype=pl.String) + result = series.to_numpy(use_pyarrow=False, zero_copy_only=True) + assert result.dtype == np.object_ + assert result.shape == (0,) + assert result.size == 0 + + +def test_series_to_numpy_temporal() -> None: + s0 = pl.Series("date", [123543, 283478, 1243]).cast(pl.Date) + s1 = pl.Series( + "datetime", [datetime(2021, 1, 2, 3, 4, 5), datetime(2021, 2, 3, 4, 5, 6)] + ) + s2 = pl.datetime_range( + datetime(2021, 1, 1, 0), + datetime(2021, 1, 1, 1), + interval="1h", + time_unit="ms", + eager=True, + ) + assert str(s0.to_numpy()) == "['2308-04-02' '2746-02-20' '1973-05-28']" + assert ( + str(s1.to_numpy()[:2]) + == "['2021-01-02T03:04:05.000000' '2021-02-03T04:05:06.000000']" + ) + assert ( + str(s2.to_numpy()[:2]) + == "['2021-01-01T00:00:00.000' '2021-01-01T01:00:00.000']" + ) + s3 = pl.Series([timedelta(hours=1), timedelta(hours=-2)]) + out = np.array([3_600_000_000_000, -7_200_000_000_000], dtype="timedelta64[ns]") + assert (s3.to_numpy() == out).all() + + @given( s=series( min_size=1, max_size=10, excluded_dtypes=[pl.Categorical, pl.List, pl.Struct] @@ -29,7 +288,7 @@ ) @settings(max_examples=250) def test_series_to_numpy(s: pl.Series) -> None: - result = s.to_numpy() + result = s.to_numpy(use_pyarrow=False) values = s.to_list() dtype_map = { @@ -44,26 +303,6 @@ def test_series_to_numpy(s: pl.Series) -> None: assert_array_equal(result, expected) -@pytest.mark.parametrize("use_pyarrow", [True, False]) -@pytest.mark.parametrize("has_null", [True, False]) -@pytest.mark.parametrize("dtype", [pl.Time, pl.Boolean, pl.String]) -def test_to_numpy_no_zero_copy( - use_pyarrow: bool, has_null: bool, dtype: pl.PolarsDataType -) -> None: - data: list[Any] = ["a", None] if dtype == pl.String else [0, None] - series = pl.Series(data if has_null else data[:1], dtype=dtype) - with pytest.raises(ValueError): - series.to_numpy(zero_copy_only=True, use_pyarrow=use_pyarrow) - - -def test_to_numpy_empty_no_pyarrow() -> None: - series = pl.Series([], dtype=pl.Null) - result = series.to_numpy() - assert result.dtype == pl.Float32 - assert result.shape == (0,) - assert result.size == 0 - - @pytest.mark.parametrize("writable", [False, True]) @pytest.mark.parametrize("pyarrow_available", [False, True]) def test_to_numpy2( @@ -133,132 +372,3 @@ def test_view_deprecated() -> None: result = s.view() assert isinstance(result, np.ndarray) assert np.all(result == np.array([1.0, 2.5, 3.0])) - - -def test_numpy_disambiguation() -> None: - a = np.array([1, 2]) - df = pl.DataFrame({"a": a}) - result = df.with_columns(b=a).to_dict(as_series=False) # type: ignore[arg-type] - expected = { - "a": [1, 2], - "b": [1, 2], - } - assert result == expected - - -def test_series_to_numpy_bool() -> None: - s = pl.Series([True, False]) - result = s.to_numpy(use_pyarrow=False) - assert s.to_list() == result.tolist() - assert result.dtype == np.bool_ - - -def test_series_to_numpy_bool_with_nulls() -> None: - s = pl.Series([True, False, None]) - result = s.to_numpy(use_pyarrow=False) - assert s.to_list() == result.tolist() - assert result.dtype == np.object_ - - -def test_array_to_numpy() -> None: - s = pl.Series([[1, 2], [3, 4], [5, 6]], dtype=pl.Array(pl.Int64, 2)) - assert (s.to_numpy() == np.array([[1, 2], [3, 4], [5, 6]])).all() - - -def test_numpy_preserve_uint64_4112() -> None: - df = pl.DataFrame({"a": [1, 2, 3]}).with_columns(pl.col("a").hash()) - assert df.to_numpy().dtype == np.dtype("uint64") - assert df.to_numpy(structured=True).dtype == np.dtype([("a", "uint64")]) - - -def test_to_numpy_datelike() -> None: - s = pl.Series( - "dt", - [ - datetime(2022, 7, 5, 10, 30, 45, 123456), - None, - datetime(2023, 2, 5, 15, 22, 30, 987654), - ], - ) - assert str(s.to_numpy()) == str( - np.array( - ["2022-07-05T10:30:45.123456", "NaT", "2023-02-05T15:22:30.987654"], - dtype="datetime64[us]", - ) - ) - assert str(s.drop_nulls().to_numpy()) == str( - np.array( - ["2022-07-05T10:30:45.123456", "2023-02-05T15:22:30.987654"], - dtype="datetime64[us]", - ) - ) - - -def test_series_to_numpy_temporal() -> None: - s0 = pl.Series("date", [123543, 283478, 1243]).cast(pl.Date) - s1 = pl.Series( - "datetime", [datetime(2021, 1, 2, 3, 4, 5), datetime(2021, 2, 3, 4, 5, 6)] - ) - s2 = pl.datetime_range( - datetime(2021, 1, 1, 0), - datetime(2021, 1, 1, 1), - interval="1h", - time_unit="ms", - eager=True, - ) - assert str(s0.to_numpy()) == "['2308-04-02' '2746-02-20' '1973-05-28']" - assert ( - str(s1.to_numpy()[:2]) - == "['2021-01-02T03:04:05.000000' '2021-02-03T04:05:06.000000']" - ) - assert ( - str(s2.to_numpy()[:2]) - == "['2021-01-01T00:00:00.000' '2021-01-01T01:00:00.000']" - ) - s3 = pl.Series([timedelta(hours=1), timedelta(hours=-2)]) - out = np.array([3_600_000_000_000, -7_200_000_000_000], dtype="timedelta64[ns]") - assert (s3.to_numpy() == out).all() - - s4 = pl.Series([time(10, 30, 45), time(23, 59, 59)]) - out = np.array([time(10, 30, 45), time(23, 59, 59)], dtype="object") - assert (s4.to_numpy() == out).all() - - -@pytest.mark.parametrize("use_pyarrow", [True, False]) -def test_decimal_numpy_export(use_pyarrow: bool) -> None: - decimal_data = [D("1.234"), D("2.345"), D("-3.456")] - - s = pl.Series("n", decimal_data) - df = s.to_frame() - - assert_array_equal( - np.array(decimal_data), - s.to_numpy(use_pyarrow=use_pyarrow), - ) - assert_array_equal( - np.array(decimal_data).reshape((-1, 1)), - df.to_numpy(use_pyarrow=use_pyarrow), - ) - - -@pytest.mark.parametrize( - ("dtype", "expected_dtype"), - [ - (pl.Int8, np.float32), - (pl.Int16, np.float32), - (pl.Int32, np.float64), - (pl.Int64, np.float64), - (pl.UInt8, np.float32), - (pl.UInt16, np.float32), - (pl.UInt32, np.float64), - (pl.UInt64, np.float64), - (pl.Float32, np.float32), - (pl.Float64, np.float64), - ], -) -def test_series_to_numpy_numeric_with_nulls( - dtype: pl.PolarsDataType, expected_dtype: npt.DTypeLike -) -> None: - s = pl.Series([1, 2, None], dtype=dtype, strict=False) - result = s.to_numpy(use_pyarrow=False) - assert result.dtype == expected_dtype