From 4e34fabff1cf6f00627eff39c9f6f25d64be9ca8 Mon Sep 17 00:00:00 2001 From: Stijn de Gooijer Date: Mon, 5 Feb 2024 00:28:19 +0100 Subject: [PATCH 1/8] Add support for Categorical/Enum --- py-polars/src/series/export.rs | 11 +++++++++++ .../unit/interop/numpy/test_to_numpy_series.py | 14 +++++++++++++- 2 files changed, 24 insertions(+), 1 deletion(-) diff --git a/py-polars/src/series/export.rs b/py-polars/src/series/export.rs index 1c95acb7d0ad..4024e27e4f2d 100644 --- a/py-polars/src/series/export.rs +++ b/py-polars/src/series/export.rs @@ -195,6 +195,17 @@ impl PySeries { let np_arr = PyArray1::from_iter(py, ca.into_iter().map(|s| s.into_py(py))); np_arr.into_py(py) }, + Categorical(rev_map, _) | Enum(rev_map, _) => { + let rev_map = rev_map.clone().unwrap(); + let mapping = &*rev_map; + let f = |idx: u32| mapping.get(idx); + let ca = s.categorical().unwrap(); + let np_arr = PyArray1::from_iter( + py, + ca.physical().into_iter().map(|s| s.map(f).into_py(py)), + ); + np_arr.into_py(py) + }, #[cfg(feature = "object")] Object(_, _) => { let ca = s diff --git a/py-polars/tests/unit/interop/numpy/test_to_numpy_series.py b/py-polars/tests/unit/interop/numpy/test_to_numpy_series.py index eb3042b15210..304b65a412de 100644 --- a/py-polars/tests/unit/interop/numpy/test_to_numpy_series.py +++ b/py-polars/tests/unit/interop/numpy/test_to_numpy_series.py @@ -29,7 +29,7 @@ ) @settings(max_examples=250) def test_series_to_numpy(s: pl.Series) -> None: - result = s.to_numpy() + result = s.to_numpy(use_pyarrow=False) values = s.to_list() dtype_map = { @@ -64,6 +64,18 @@ def test_to_numpy_empty_no_pyarrow() -> None: assert result.size == 0 +def test_to_numpy_categorical() -> None: + s = pl.Series(["a", "b", "a", None], dtype=pl.Categorical) + result = s.to_numpy(use_pyarrow=False) + assert result.tolist() == s.to_list() + + +def test_to_numpy_enum() -> None: + s = pl.Series(["a", "b", "a", None], dtype=pl.Enum(["a", "b", "c"])) + result = s.to_numpy(use_pyarrow=False) + assert result.tolist() == s.to_list() + + @pytest.mark.parametrize("writable", [False, True]) @pytest.mark.parametrize("pyarrow_available", [False, True]) def test_to_numpy2( From eb296292a64487365202de9ca50afc9df6cd66ff Mon Sep 17 00:00:00 2001 From: Stijn de Gooijer Date: Mon, 5 Feb 2024 00:37:29 +0100 Subject: [PATCH 2/8] Add test for Null --- .../tests/unit/interop/numpy/test_to_numpy_series.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/py-polars/tests/unit/interop/numpy/test_to_numpy_series.py b/py-polars/tests/unit/interop/numpy/test_to_numpy_series.py index 304b65a412de..42e92d39bbfa 100644 --- a/py-polars/tests/unit/interop/numpy/test_to_numpy_series.py +++ b/py-polars/tests/unit/interop/numpy/test_to_numpy_series.py @@ -76,6 +76,14 @@ def test_to_numpy_enum() -> None: assert result.tolist() == s.to_list() +def test_to_numpy_null() -> None: + s = pl.Series([None, None], dtype=pl.Null) + result = s.to_numpy(use_pyarrow=False) + expected = np.array([np.nan, np.nan], dtype=np.float32) + assert_array_equal(result, expected) + assert result.dtype == np.float32 + + @pytest.mark.parametrize("writable", [False, True]) @pytest.mark.parametrize("pyarrow_available", [False, True]) def test_to_numpy2( From d82050c666c0eb28b3c30b034b84e7b6303f9a3d Mon Sep 17 00:00:00 2001 From: Stijn de Gooijer Date: Mon, 5 Feb 2024 09:50:46 +0100 Subject: [PATCH 3/8] More tests --- .../unit/interop/numpy/test_to_numpy_df.py | 6 + .../interop/numpy/test_to_numpy_series.py | 226 ++++++++++++------ 2 files changed, 156 insertions(+), 76 deletions(-) diff --git a/py-polars/tests/unit/interop/numpy/test_to_numpy_df.py b/py-polars/tests/unit/interop/numpy/test_to_numpy_df.py index e77496d7289f..16a1bb89862e 100644 --- a/py-polars/tests/unit/interop/numpy/test_to_numpy_df.py +++ b/py-polars/tests/unit/interop/numpy/test_to_numpy_df.py @@ -101,3 +101,9 @@ def test__array__() -> None: expected_array = np.array([[1, 1], [2, 2], [3, 3]], dtype=np.uint8) assert_array_equal(out_array, expected_array) assert out_array.flags["F_CONTIGUOUS"] is True + + +def test_numpy_preserve_uint64_4112() -> None: + df = pl.DataFrame({"a": [1, 2, 3]}).with_columns(pl.col("a").hash()) + assert df.to_numpy().dtype == np.dtype("uint64") + assert df.to_numpy(structured=True).dtype == np.dtype([("a", "uint64")]) diff --git a/py-polars/tests/unit/interop/numpy/test_to_numpy_series.py b/py-polars/tests/unit/interop/numpy/test_to_numpy_series.py index 42e92d39bbfa..2f62eef8403f 100644 --- a/py-polars/tests/unit/interop/numpy/test_to_numpy_series.py +++ b/py-polars/tests/unit/interop/numpy/test_to_numpy_series.py @@ -2,6 +2,7 @@ from datetime import datetime, time, timedelta from decimal import Decimal as D +from pathlib import Path from typing import TYPE_CHECKING, Any import numpy as np @@ -16,6 +17,155 @@ import numpy.typing as npt +def assert_zero_copy(s: pl.Series, arr: np.ndarray[Any, Any]) -> None: + s_ptr = s._get_buffers()["values"]._get_buffer_info()[0] + arr_ptr = arr.__array_interface__["data"][0] + assert s_ptr == arr_ptr + + +def assert_zero_copy_only_raises(s: pl.Series) -> None: + with pytest.raises(ValueError, match="cannot return a zero-copy array"): + s.to_numpy(use_pyarrow=False, zero_copy_only=True) + + +@pytest.mark.parametrize( + ("dtype", "expected_dtype"), + [ + (pl.Int8, np.int8), + (pl.Int16, np.int16), + (pl.Int32, np.int32), + (pl.Int64, np.int64), + (pl.UInt8, np.uint8), + (pl.UInt16, np.uint16), + (pl.UInt32, np.uint32), + (pl.UInt64, np.uint64), + (pl.Float32, np.float32), + (pl.Float64, np.float64), + ], +) +def test_series_to_numpy_numeric_zero_copy( + dtype: pl.PolarsDataType, expected_dtype: npt.DTypeLike +) -> None: + s = pl.Series([1, 2, 3], dtype=dtype, strict=False) + result = s.to_numpy(use_pyarrow=False) + + assert_zero_copy(s, result) + assert result.tolist() == s.to_list() + assert result.dtype == expected_dtype + + +@pytest.mark.parametrize( + ("dtype", "expected_dtype"), + [ + (pl.Int8, np.float32), + (pl.Int16, np.float32), + (pl.Int32, np.float64), + (pl.Int64, np.float64), + (pl.UInt8, np.float32), + (pl.UInt16, np.float32), + (pl.UInt32, np.float64), + (pl.UInt64, np.float64), + (pl.Float32, np.float32), + (pl.Float64, np.float64), + ], +) +def test_series_to_numpy_numeric_with_nulls( + dtype: pl.PolarsDataType, expected_dtype: npt.DTypeLike +) -> None: + s = pl.Series([1, 2, None], dtype=dtype, strict=False) + result = s.to_numpy(use_pyarrow=False) + + assert result.tolist()[:-1] == s.to_list()[:-1] + assert np.isnan(result[-1]) + assert result.dtype == expected_dtype + assert_zero_copy_only_raises(s) + + +@pytest.mark.parametrize( + ("dtype", "values"), + [ + (pl.Categorical, ["a", "b", "a"]), + (pl.Enum(["a", "b", "c"]), ["a", "b", "a"]), + (pl.String, ["a", "bc", "def"]), + (pl.Binary, [b"a", b"bc", b"def"]), + (pl.Object, [Path(), Path("abc")]), + # (pl.List, [[1], [2, 3]]), + # (pl.List, [["a"], ["b", "c"], []]), + ], +) +def test_to_numpy_various_dtypes(dtype: pl.PolarsDataType, values: list[Any]) -> None: + values.append(None) + s = pl.Series(values, dtype=dtype) + result = s.to_numpy(use_pyarrow=False) + + assert result.tolist() == values + assert result.dtype == np.object_ + assert_zero_copy_only_raises(s) + + +def test_series_to_numpy_bool() -> None: + s = pl.Series([True, False]) + result = s.to_numpy(use_pyarrow=False) + + assert s.to_list() == result.tolist() + assert result.dtype == np.bool_ + assert_zero_copy_only_raises(s) + + +def test_series_to_numpy_bool_with_nulls() -> None: + s = pl.Series([True, False, None]) + result = s.to_numpy(use_pyarrow=False) + + assert s.to_list() == result.tolist() + assert result.dtype == np.object_ + assert_zero_copy_only_raises(s) + + +def test_series_to_numpy_array_of_int() -> None: + values = [[1, 2], [3, 4], [5, 6]] + s = pl.Series(values, dtype=pl.Array(pl.Int64, 2)) + result = s.to_numpy(use_pyarrow=False) + + expected = np.array(values) + assert_array_equal(result, expected) + assert result.dtype == np.int64 + + +def test_series_to_numpy_array_of_str() -> None: + values = [["1", "2", "3"], ["4", "5", "10000"]] + s = pl.Series(values, dtype=pl.Array(pl.String, 3)) + result = s.to_numpy(use_pyarrow=False) + assert result.tolist() == values + assert result.dtype == np.object_ + + +def test_series_to_numpy_array_with_nulls() -> None: + values = [[1, 2], [3, 4], None] + s = pl.Series(values, dtype=pl.Array(pl.Int64, 2)) + result = s.to_numpy(use_pyarrow=False) + + expected = np.array([[1.0, 2.0], [3.0, 4.0], [np.nan, np.nan]]) + assert_array_equal(result, expected) + assert result.dtype == np.float64 + assert_zero_copy_only_raises(s) + + +def test_to_numpy_null() -> None: + s = pl.Series([None, None], dtype=pl.Null) + result = s.to_numpy(use_pyarrow=False) + expected = np.array([np.nan, np.nan], dtype=np.float32) + assert_array_equal(result, expected) + assert result.dtype == np.float32 + + +def test_to_numpy_empty() -> None: + series = pl.Series() + result = series.to_numpy(use_pyarrow=False) + assert result.dtype == np.float32 + assert result.shape == (0,) + assert result.size == 0 + + @given( s=series( min_size=1, max_size=10, excluded_dtypes=[pl.Categorical, pl.List, pl.Struct] @@ -56,34 +206,6 @@ def test_to_numpy_no_zero_copy( series.to_numpy(zero_copy_only=True, use_pyarrow=use_pyarrow) -def test_to_numpy_empty_no_pyarrow() -> None: - series = pl.Series([], dtype=pl.Null) - result = series.to_numpy() - assert result.dtype == pl.Float32 - assert result.shape == (0,) - assert result.size == 0 - - -def test_to_numpy_categorical() -> None: - s = pl.Series(["a", "b", "a", None], dtype=pl.Categorical) - result = s.to_numpy(use_pyarrow=False) - assert result.tolist() == s.to_list() - - -def test_to_numpy_enum() -> None: - s = pl.Series(["a", "b", "a", None], dtype=pl.Enum(["a", "b", "c"])) - result = s.to_numpy(use_pyarrow=False) - assert result.tolist() == s.to_list() - - -def test_to_numpy_null() -> None: - s = pl.Series([None, None], dtype=pl.Null) - result = s.to_numpy(use_pyarrow=False) - expected = np.array([np.nan, np.nan], dtype=np.float32) - assert_array_equal(result, expected) - assert result.dtype == np.float32 - - @pytest.mark.parametrize("writable", [False, True]) @pytest.mark.parametrize("pyarrow_available", [False, True]) def test_to_numpy2( @@ -166,31 +288,6 @@ def test_numpy_disambiguation() -> None: assert result == expected -def test_series_to_numpy_bool() -> None: - s = pl.Series([True, False]) - result = s.to_numpy(use_pyarrow=False) - assert s.to_list() == result.tolist() - assert result.dtype == np.bool_ - - -def test_series_to_numpy_bool_with_nulls() -> None: - s = pl.Series([True, False, None]) - result = s.to_numpy(use_pyarrow=False) - assert s.to_list() == result.tolist() - assert result.dtype == np.object_ - - -def test_array_to_numpy() -> None: - s = pl.Series([[1, 2], [3, 4], [5, 6]], dtype=pl.Array(pl.Int64, 2)) - assert (s.to_numpy() == np.array([[1, 2], [3, 4], [5, 6]])).all() - - -def test_numpy_preserve_uint64_4112() -> None: - df = pl.DataFrame({"a": [1, 2, 3]}).with_columns(pl.col("a").hash()) - assert df.to_numpy().dtype == np.dtype("uint64") - assert df.to_numpy(structured=True).dtype == np.dtype([("a", "uint64")]) - - def test_to_numpy_datelike() -> None: s = pl.Series( "dt", @@ -259,26 +356,3 @@ def test_decimal_numpy_export(use_pyarrow: bool) -> None: np.array(decimal_data).reshape((-1, 1)), df.to_numpy(use_pyarrow=use_pyarrow), ) - - -@pytest.mark.parametrize( - ("dtype", "expected_dtype"), - [ - (pl.Int8, np.float32), - (pl.Int16, np.float32), - (pl.Int32, np.float64), - (pl.Int64, np.float64), - (pl.UInt8, np.float32), - (pl.UInt16, np.float32), - (pl.UInt32, np.float64), - (pl.UInt64, np.float64), - (pl.Float32, np.float32), - (pl.Float64, np.float64), - ], -) -def test_series_to_numpy_numeric_with_nulls( - dtype: pl.PolarsDataType, expected_dtype: npt.DTypeLike -) -> None: - s = pl.Series([1, 2, None], dtype=dtype, strict=False) - result = s.to_numpy(use_pyarrow=False) - assert result.dtype == expected_dtype From 41fa55c167ee299177d4bfb6029c85dd393ab114 Mon Sep 17 00:00:00 2001 From: Stijn de Gooijer Date: Mon, 5 Feb 2024 10:20:39 +0100 Subject: [PATCH 4/8] Add tests --- .../unit/interop/numpy/test_to_numpy_series.py | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/py-polars/tests/unit/interop/numpy/test_to_numpy_series.py b/py-polars/tests/unit/interop/numpy/test_to_numpy_series.py index 2f62eef8403f..977d66b6cf7d 100644 --- a/py-polars/tests/unit/interop/numpy/test_to_numpy_series.py +++ b/py-polars/tests/unit/interop/numpy/test_to_numpy_series.py @@ -18,6 +18,8 @@ def assert_zero_copy(s: pl.Series, arr: np.ndarray[Any, Any]) -> None: + if s.len() == 0: + return s_ptr = s._get_buffers()["values"]._get_buffer_info()[0] arr_ptr = arr.__array_interface__["data"][0] assert s_ptr == arr_ptr @@ -47,7 +49,7 @@ def test_series_to_numpy_numeric_zero_copy( dtype: pl.PolarsDataType, expected_dtype: npt.DTypeLike ) -> None: s = pl.Series([1, 2, 3], dtype=dtype, strict=False) - result = s.to_numpy(use_pyarrow=False) + result = s.to_numpy(use_pyarrow=False, zero_copy_only=True) assert_zero_copy(s, result) assert result.tolist() == s.to_list() @@ -89,6 +91,7 @@ def test_series_to_numpy_numeric_with_nulls( (pl.String, ["a", "bc", "def"]), (pl.Binary, [b"a", b"bc", b"def"]), (pl.Object, [Path(), Path("abc")]), + # TODO: Implement for List types # (pl.List, [[1], [2, 3]]), # (pl.List, [["a"], ["b", "c"], []]), ], @@ -139,6 +142,9 @@ def test_series_to_numpy_array_of_str() -> None: assert result.dtype == np.object_ +@pytest.mark.skip( + reason="Currently bugged, see: https://github.com/pola-rs/polars/issues/14268" +) def test_series_to_numpy_array_with_nulls() -> None: values = [[1, 2], [3, 4], None] s = pl.Series(values, dtype=pl.Array(pl.Int64, 2)) @@ -156,12 +162,16 @@ def test_to_numpy_null() -> None: expected = np.array([np.nan, np.nan], dtype=np.float32) assert_array_equal(result, expected) assert result.dtype == np.float32 + assert_zero_copy_only_raises(s) +@pytest.mark.skip( + reason="Currently bugged, see: https://github.com/pola-rs/polars/issues/14274" +) def test_to_numpy_empty() -> None: - series = pl.Series() - result = series.to_numpy(use_pyarrow=False) - assert result.dtype == np.float32 + series = pl.Series(dtype=pl.String) + result = series.to_numpy(use_pyarrow=False, zero_copy_only=True) + assert result.dtype == np.int64 assert result.shape == (0,) assert result.size == 0 From cdb6fb7549913260d45f42a1643a3411cabf080a Mon Sep 17 00:00:00 2001 From: Stijn de Gooijer Date: Mon, 5 Feb 2024 11:15:14 +0100 Subject: [PATCH 5/8] Fix zero copy raise on empty Series --- py-polars/polars/series/series.py | 2 +- py-polars/tests/unit/interop/numpy/test_to_numpy_series.py | 5 +---- 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/py-polars/polars/series/series.py b/py-polars/polars/series/series.py index a13dd7883af2..3d62f1e588ca 100644 --- a/py-polars/polars/series/series.py +++ b/py-polars/polars/series/series.py @@ -4339,7 +4339,7 @@ def to_numpy( """ def raise_no_zero_copy() -> None: - if zero_copy_only: + if zero_copy_only and not self.is_empty(): msg = "cannot return a zero-copy array" raise ValueError(msg) diff --git a/py-polars/tests/unit/interop/numpy/test_to_numpy_series.py b/py-polars/tests/unit/interop/numpy/test_to_numpy_series.py index 977d66b6cf7d..506835fbd7ff 100644 --- a/py-polars/tests/unit/interop/numpy/test_to_numpy_series.py +++ b/py-polars/tests/unit/interop/numpy/test_to_numpy_series.py @@ -165,13 +165,10 @@ def test_to_numpy_null() -> None: assert_zero_copy_only_raises(s) -@pytest.mark.skip( - reason="Currently bugged, see: https://github.com/pola-rs/polars/issues/14274" -) def test_to_numpy_empty() -> None: series = pl.Series(dtype=pl.String) result = series.to_numpy(use_pyarrow=False, zero_copy_only=True) - assert result.dtype == np.int64 + assert result.dtype == np.object_ assert result.shape == (0,) assert result.size == 0 From 74d64388f9f8e83c6272cc4db762be11bda2e808 Mon Sep 17 00:00:00 2001 From: Stijn de Gooijer Date: Mon, 5 Feb 2024 11:54:41 +0100 Subject: [PATCH 6/8] Clean up decimal tests --- .../unit/interop/numpy/test_to_numpy_df.py | 12 +++++++++++ .../interop/numpy/test_to_numpy_series.py | 20 ++----------------- 2 files changed, 14 insertions(+), 18 deletions(-) diff --git a/py-polars/tests/unit/interop/numpy/test_to_numpy_df.py b/py-polars/tests/unit/interop/numpy/test_to_numpy_df.py index 16a1bb89862e..4419ed71ce24 100644 --- a/py-polars/tests/unit/interop/numpy/test_to_numpy_df.py +++ b/py-polars/tests/unit/interop/numpy/test_to_numpy_df.py @@ -1,5 +1,6 @@ from __future__ import annotations +from decimal import Decimal as D from typing import TYPE_CHECKING import numpy as np @@ -107,3 +108,14 @@ def test_numpy_preserve_uint64_4112() -> None: df = pl.DataFrame({"a": [1, 2, 3]}).with_columns(pl.col("a").hash()) assert df.to_numpy().dtype == np.dtype("uint64") assert df.to_numpy(structured=True).dtype == np.dtype([("a", "uint64")]) + + +@pytest.mark.parametrize("use_pyarrow", [True, False]) +def test_df_to_numpy_decimal(use_pyarrow: bool) -> None: + decimal_data = [D("1.234"), D("2.345"), D("-3.456")] + df = pl.Series("n", decimal_data).to_frame() + + result = df.to_numpy(use_pyarrow=use_pyarrow) + + expected = np.array(decimal_data).reshape((-1, 1)) + assert_array_equal(result, expected) diff --git a/py-polars/tests/unit/interop/numpy/test_to_numpy_series.py b/py-polars/tests/unit/interop/numpy/test_to_numpy_series.py index 506835fbd7ff..93b41f2d3eb7 100644 --- a/py-polars/tests/unit/interop/numpy/test_to_numpy_series.py +++ b/py-polars/tests/unit/interop/numpy/test_to_numpy_series.py @@ -90,13 +90,14 @@ def test_series_to_numpy_numeric_with_nulls( (pl.Enum(["a", "b", "c"]), ["a", "b", "a"]), (pl.String, ["a", "bc", "def"]), (pl.Binary, [b"a", b"bc", b"def"]), + (pl.Decimal, [D("1.234"), D("2.345"), D("-3.456")]), (pl.Object, [Path(), Path("abc")]), # TODO: Implement for List types # (pl.List, [[1], [2, 3]]), # (pl.List, [["a"], ["b", "c"], []]), ], ) -def test_to_numpy_various_dtypes(dtype: pl.PolarsDataType, values: list[Any]) -> None: +def test_to_numpy_object_dtypes(dtype: pl.PolarsDataType, values: list[Any]) -> None: values.append(None) s = pl.Series(values, dtype=dtype) result = s.to_numpy(use_pyarrow=False) @@ -346,20 +347,3 @@ def test_series_to_numpy_temporal() -> None: s4 = pl.Series([time(10, 30, 45), time(23, 59, 59)]) out = np.array([time(10, 30, 45), time(23, 59, 59)], dtype="object") assert (s4.to_numpy() == out).all() - - -@pytest.mark.parametrize("use_pyarrow", [True, False]) -def test_decimal_numpy_export(use_pyarrow: bool) -> None: - decimal_data = [D("1.234"), D("2.345"), D("-3.456")] - - s = pl.Series("n", decimal_data) - df = s.to_frame() - - assert_array_equal( - np.array(decimal_data), - s.to_numpy(use_pyarrow=use_pyarrow), - ) - assert_array_equal( - np.array(decimal_data).reshape((-1, 1)), - df.to_numpy(use_pyarrow=use_pyarrow), - ) From 3d2014032c9c1176ca287cb8a846b52e833e13cf Mon Sep 17 00:00:00 2001 From: Stijn de Gooijer Date: Mon, 5 Feb 2024 12:28:13 +0100 Subject: [PATCH 7/8] Use iter_str --- py-polars/src/series/export.rs | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/py-polars/src/series/export.rs b/py-polars/src/series/export.rs index 4024e27e4f2d..bffdcef0772b 100644 --- a/py-polars/src/series/export.rs +++ b/py-polars/src/series/export.rs @@ -195,15 +195,9 @@ impl PySeries { let np_arr = PyArray1::from_iter(py, ca.into_iter().map(|s| s.into_py(py))); np_arr.into_py(py) }, - Categorical(rev_map, _) | Enum(rev_map, _) => { - let rev_map = rev_map.clone().unwrap(); - let mapping = &*rev_map; - let f = |idx: u32| mapping.get(idx); + Categorical(_, _) | Enum(_, _) => { let ca = s.categorical().unwrap(); - let np_arr = PyArray1::from_iter( - py, - ca.physical().into_iter().map(|s| s.map(f).into_py(py)), - ); + let np_arr = PyArray1::from_iter(py, ca.iter_str().map(|s| s.into_py(py))); np_arr.into_py(py) }, #[cfg(feature = "object")] From fe248bfc0b4be089b8733e643f5ccd93ad3f5f70 Mon Sep 17 00:00:00 2001 From: Stijn de Gooijer Date: Mon, 5 Feb 2024 13:29:02 +0100 Subject: [PATCH 8/8] More test refactor --- .../tests/unit/interop/numpy/test_numpy.py | 11 ++ .../interop/numpy/test_to_numpy_series.py | 185 ++++++++++-------- 2 files changed, 116 insertions(+), 80 deletions(-) diff --git a/py-polars/tests/unit/interop/numpy/test_numpy.py b/py-polars/tests/unit/interop/numpy/test_numpy.py index 80f11c63c886..55391c56d556 100644 --- a/py-polars/tests/unit/interop/numpy/test_numpy.py +++ b/py-polars/tests/unit/interop/numpy/test_numpy.py @@ -76,3 +76,14 @@ def test_to_numpy_zero_copy_path() -> None: x = df.to_numpy() assert x.flags["F_CONTIGUOUS"] assert str(x[0, :]) == "[1. 2. 1. 1. 1.]" + + +def test_numpy_disambiguation() -> None: + a = np.array([1, 2]) + df = pl.DataFrame({"a": a}) + result = df.with_columns(b=a).to_dict(as_series=False) # type: ignore[arg-type] + expected = { + "a": [1, 2], + "b": [1, 2], + } + assert result == expected diff --git a/py-polars/tests/unit/interop/numpy/test_to_numpy_series.py b/py-polars/tests/unit/interop/numpy/test_to_numpy_series.py index 93b41f2d3eb7..0cf505dc39a9 100644 --- a/py-polars/tests/unit/interop/numpy/test_to_numpy_series.py +++ b/py-polars/tests/unit/interop/numpy/test_to_numpy_series.py @@ -1,6 +1,6 @@ from __future__ import annotations -from datetime import datetime, time, timedelta +from datetime import date, datetime, time, timedelta from decimal import Decimal as D from pathlib import Path from typing import TYPE_CHECKING, Any @@ -48,7 +48,7 @@ def assert_zero_copy_only_raises(s: pl.Series) -> None: def test_series_to_numpy_numeric_zero_copy( dtype: pl.PolarsDataType, expected_dtype: npt.DTypeLike ) -> None: - s = pl.Series([1, 2, 3], dtype=dtype, strict=False) + s = pl.Series([1, 2, 3]).cast(dtype) # =dtype, strict=False) result = s.to_numpy(use_pyarrow=False, zero_copy_only=True) assert_zero_copy(s, result) @@ -83,9 +83,79 @@ def test_series_to_numpy_numeric_with_nulls( assert_zero_copy_only_raises(s) +@pytest.mark.parametrize( + ("dtype", "expected_dtype"), + [ + (pl.Duration, np.dtype("timedelta64[us]")), + (pl.Duration("ms"), np.dtype("timedelta64[ms]")), + (pl.Duration("us"), np.dtype("timedelta64[us]")), + (pl.Duration("ns"), np.dtype("timedelta64[ns]")), + (pl.Datetime, np.dtype("datetime64[us]")), + (pl.Datetime("ms"), np.dtype("datetime64[ms]")), + (pl.Datetime("us"), np.dtype("datetime64[us]")), + (pl.Datetime("ns"), np.dtype("datetime64[ns]")), + ], +) +def test_series_to_numpy_temporal_zero_copy( + dtype: pl.PolarsDataType, expected_dtype: npt.DTypeLike +) -> None: + values = [0, 2_000, 1_000_000] + s = pl.Series(values, dtype=dtype, strict=False) + result = s.to_numpy(use_pyarrow=False, zero_copy_only=True) + + assert_zero_copy(s, result) + # NumPy tolist returns integers for ns precision + if s.dtype.time_unit == "ns": # type: ignore[attr-defined] + assert result.tolist() == values + else: + assert result.tolist() == s.to_list() + assert result.dtype == expected_dtype + + +def test_series_to_numpy_date() -> None: + values = [date(1970, 1, 1), date(2024, 2, 28)] + s = pl.Series(values) + + result = s.to_numpy(use_pyarrow=False) + + assert s.to_list() == result.tolist() + assert result.dtype == np.dtype("datetime64[D]") + assert_zero_copy_only_raises(s) + + +@pytest.mark.parametrize( + ("dtype", "expected_dtype"), + [ + (pl.Date, np.dtype("datetime64[D]")), + (pl.Duration("ms"), np.dtype("timedelta64[ms]")), + (pl.Duration("us"), np.dtype("timedelta64[us]")), + (pl.Duration("ns"), np.dtype("timedelta64[ns]")), + (pl.Datetime, np.dtype("datetime64[us]")), + (pl.Datetime("ms"), np.dtype("datetime64[ms]")), + (pl.Datetime("us"), np.dtype("datetime64[us]")), + (pl.Datetime("ns"), np.dtype("datetime64[ns]")), + ], +) +def test_series_to_numpy_temporal_with_nulls( + dtype: pl.PolarsDataType, expected_dtype: npt.DTypeLike +) -> None: + values = [0, 2_000, 1_000_000, None] + s = pl.Series(values, dtype=dtype, strict=False) + result = s.to_numpy(use_pyarrow=False) + + # NumPy tolist returns integers for ns precision + if getattr(s.dtype, "time_unit", None) == "ns": + assert result.tolist() == values + else: + assert result.tolist() == s.to_list() + assert result.dtype == expected_dtype + assert_zero_copy_only_raises(s) + + @pytest.mark.parametrize( ("dtype", "values"), [ + (pl.Time, [time(10, 30, 45), time(23, 59, 59)]), (pl.Categorical, ["a", "b", "a"]), (pl.Enum(["a", "b", "c"]), ["a", "b", "a"]), (pl.String, ["a", "bc", "def"]), @@ -97,8 +167,13 @@ def test_series_to_numpy_numeric_with_nulls( # (pl.List, [["a"], ["b", "c"], []]), ], ) -def test_to_numpy_object_dtypes(dtype: pl.PolarsDataType, values: list[Any]) -> None: - values.append(None) +@pytest.mark.parametrize("with_nulls", [False, True]) +def test_to_numpy_object_dtypes( + dtype: pl.PolarsDataType, values: list[Any], with_nulls: bool +) -> None: + if with_nulls: + values.append(None) + s = pl.Series(values, dtype=dtype) result = s.to_numpy(use_pyarrow=False) @@ -174,6 +249,32 @@ def test_to_numpy_empty() -> None: assert result.size == 0 +def test_series_to_numpy_temporal() -> None: + s0 = pl.Series("date", [123543, 283478, 1243]).cast(pl.Date) + s1 = pl.Series( + "datetime", [datetime(2021, 1, 2, 3, 4, 5), datetime(2021, 2, 3, 4, 5, 6)] + ) + s2 = pl.datetime_range( + datetime(2021, 1, 1, 0), + datetime(2021, 1, 1, 1), + interval="1h", + time_unit="ms", + eager=True, + ) + assert str(s0.to_numpy()) == "['2308-04-02' '2746-02-20' '1973-05-28']" + assert ( + str(s1.to_numpy()[:2]) + == "['2021-01-02T03:04:05.000000' '2021-02-03T04:05:06.000000']" + ) + assert ( + str(s2.to_numpy()[:2]) + == "['2021-01-01T00:00:00.000' '2021-01-01T01:00:00.000']" + ) + s3 = pl.Series([timedelta(hours=1), timedelta(hours=-2)]) + out = np.array([3_600_000_000_000, -7_200_000_000_000], dtype="timedelta64[ns]") + assert (s3.to_numpy() == out).all() + + @given( s=series( min_size=1, max_size=10, excluded_dtypes=[pl.Categorical, pl.List, pl.Struct] @@ -202,18 +303,6 @@ def test_series_to_numpy(s: pl.Series) -> None: assert_array_equal(result, expected) -@pytest.mark.parametrize("use_pyarrow", [True, False]) -@pytest.mark.parametrize("has_null", [True, False]) -@pytest.mark.parametrize("dtype", [pl.Time, pl.Boolean, pl.String]) -def test_to_numpy_no_zero_copy( - use_pyarrow: bool, has_null: bool, dtype: pl.PolarsDataType -) -> None: - data: list[Any] = ["a", None] if dtype == pl.String else [0, None] - series = pl.Series(data if has_null else data[:1], dtype=dtype) - with pytest.raises(ValueError): - series.to_numpy(zero_copy_only=True, use_pyarrow=use_pyarrow) - - @pytest.mark.parametrize("writable", [False, True]) @pytest.mark.parametrize("pyarrow_available", [False, True]) def test_to_numpy2( @@ -283,67 +372,3 @@ def test_view_deprecated() -> None: result = s.view() assert isinstance(result, np.ndarray) assert np.all(result == np.array([1.0, 2.5, 3.0])) - - -def test_numpy_disambiguation() -> None: - a = np.array([1, 2]) - df = pl.DataFrame({"a": a}) - result = df.with_columns(b=a).to_dict(as_series=False) # type: ignore[arg-type] - expected = { - "a": [1, 2], - "b": [1, 2], - } - assert result == expected - - -def test_to_numpy_datelike() -> None: - s = pl.Series( - "dt", - [ - datetime(2022, 7, 5, 10, 30, 45, 123456), - None, - datetime(2023, 2, 5, 15, 22, 30, 987654), - ], - ) - assert str(s.to_numpy()) == str( - np.array( - ["2022-07-05T10:30:45.123456", "NaT", "2023-02-05T15:22:30.987654"], - dtype="datetime64[us]", - ) - ) - assert str(s.drop_nulls().to_numpy()) == str( - np.array( - ["2022-07-05T10:30:45.123456", "2023-02-05T15:22:30.987654"], - dtype="datetime64[us]", - ) - ) - - -def test_series_to_numpy_temporal() -> None: - s0 = pl.Series("date", [123543, 283478, 1243]).cast(pl.Date) - s1 = pl.Series( - "datetime", [datetime(2021, 1, 2, 3, 4, 5), datetime(2021, 2, 3, 4, 5, 6)] - ) - s2 = pl.datetime_range( - datetime(2021, 1, 1, 0), - datetime(2021, 1, 1, 1), - interval="1h", - time_unit="ms", - eager=True, - ) - assert str(s0.to_numpy()) == "['2308-04-02' '2746-02-20' '1973-05-28']" - assert ( - str(s1.to_numpy()[:2]) - == "['2021-01-02T03:04:05.000000' '2021-02-03T04:05:06.000000']" - ) - assert ( - str(s2.to_numpy()[:2]) - == "['2021-01-01T00:00:00.000' '2021-01-01T01:00:00.000']" - ) - s3 = pl.Series([timedelta(hours=1), timedelta(hours=-2)]) - out = np.array([3_600_000_000_000, -7_200_000_000_000], dtype="timedelta64[ns]") - assert (s3.to_numpy() == out).all() - - s4 = pl.Series([time(10, 30, 45), time(23, 59, 59)]) - out = np.array([time(10, 30, 45), time(23, 59, 59)], dtype="object") - assert (s4.to_numpy() == out).all()