From 5e6a433acf9edaeed17d6ad853002e43b536d986 Mon Sep 17 00:00:00 2001 From: Stijn de Gooijer Date: Tue, 21 May 2024 12:34:06 +0200 Subject: [PATCH] refactor(python): Move `DataFrame.to_numpy` implementation to Rust side (#16354) --- py-polars/polars/dataframe/frame.py | 31 +-- py-polars/src/interop/numpy/to_numpy_df.rs | 229 ++++++++++++------ .../src/interop/numpy/to_numpy_series.rs | 19 +- .../unit/interop/numpy/test_to_numpy_df.py | 22 +- .../interop/numpy/test_to_numpy_series.py | 2 +- 5 files changed, 195 insertions(+), 108 deletions(-) diff --git a/py-polars/polars/dataframe/frame.py b/py-polars/polars/dataframe/frame.py index 085ed0d259e0..207a826a8b6c 100644 --- a/py-polars/polars/dataframe/frame.py +++ b/py-polars/polars/dataframe/frame.py @@ -1565,15 +1565,11 @@ def to_numpy( rec.array([(1, 6.5, 'a'), (2, 7. , 'b'), (3, 8.5, 'c')], dtype=[('foo', 'u1'), ('bar', ' None: + if structured: if not allow_copy and not self.is_empty(): - msg = f"copy not allowed: {msg}" + msg = "copy not allowed: cannot create structured array without copying data" raise RuntimeError(msg) - if structured: - raise_on_copy("cannot create structured array without copying data") - arrays = [] struct_dtype = [] for s in self.iter_columns(): @@ -1588,28 +1584,7 @@ def raise_on_copy(msg: str) -> None: out[c] = arrays[idx] return out - if order == "fortran": - array = self._df.to_numpy_view() - if array is not None: - if writable and not array.flags.writeable: - raise_on_copy("cannot create writable array without copying data") - array = array.copy() - return array - - raise_on_copy( - "only numeric data without nulls in Fortran-like order can be converted without copy" - ) - - out = self._df.to_numpy(order) - if out is None: - return np.vstack( - [ - self.to_series(i).to_numpy(use_pyarrow=use_pyarrow) - for i in range(self.width) - ] - ).T - - return out + return self._df.to_numpy(order, writable=writable, allow_copy=allow_copy) @overload def to_jax( diff --git a/py-polars/src/interop/numpy/to_numpy_df.rs b/py-polars/src/interop/numpy/to_numpy_df.rs index 075119c85a5c..f65113722d52 100644 --- a/py-polars/src/interop/numpy/to_numpy_df.rs +++ b/py-polars/src/interop/numpy/to_numpy_df.rs @@ -4,97 +4,182 @@ use numpy::{Element, IntoPyArray}; use polars_core::prelude::*; use polars_core::utils::dtypes_to_supertype; use polars_core::with_match_physical_numeric_polars_type; +use pyo3::exceptions::PyRuntimeError; +use pyo3::intern; use pyo3::prelude::*; +use pyo3::types::PyList; +use super::to_numpy_series::series_to_numpy; use super::utils::create_borrowed_np_array; use crate::conversion::Wrap; use crate::dataframe::PyDataFrame; #[pymethods] -#[allow(clippy::wrong_self_convention)] impl PyDataFrame { + /// Convert this DataFrame to a NumPy ndarray. + fn to_numpy( + &self, + py: Python, + order: Wrap, + writable: bool, + allow_copy: bool, + ) -> PyResult { + df_to_numpy(py, &self.df, order.0, writable, allow_copy) + } + /// Create a view of the data as a NumPy ndarray. /// /// WARNING: The resulting view will show the underlying value for nulls, /// which may be any value. The caller is responsible for handling nulls /// appropriately. - pub fn to_numpy_view(&self, py: Python) -> Option { - if self.df.is_empty() { - return None; - } - let first = self.df.get_columns().first().unwrap().dtype(); - // TODO: Support Datetime/Duration types - if !first.is_numeric() { - return None; - } - if !self - .df - .get_columns() - .iter() - .all(|s| s.null_count() == 0 && s.dtype() == first && s.chunks().len() == 1) - { - return None; - } + fn to_numpy_view(&self, py: Python) -> Option { + try_df_to_numpy_view(py, &self.df) + } +} - let owner = self.clone().into_py(py); // Keep the DataFrame memory alive. - - fn get_ptr(py: Python, columns: &[Series], owner: PyObject) -> Option - where - T: PolarsNumericType, - T::Native: Element, - { - let slices = columns - .iter() - .map(|s| { - let ca: &ChunkedArray = s.unpack().unwrap(); - ca.cont_slice().unwrap() - }) - .collect::>(); - - let first = slices.first().unwrap(); - unsafe { - let mut end_ptr = first.as_ptr().add(first.len()); - // Check if all arrays are from the same buffer - let all_contiguous = slices[1..].iter().all(|slice| { - let valid = slice.as_ptr() == end_ptr; - - end_ptr = slice.as_ptr().add(slice.len()); - - valid - }); - - if all_contiguous { - let start_ptr = first.as_ptr(); - let dtype = T::Native::get_dtype_bound(py); - let dims = [first.len(), columns.len()].into_dimension(); - Some(create_borrowed_np_array::<_>( - py, - dtype, - dims, - flags::NPY_ARRAY_FARRAY_RO, - start_ptr as _, - owner, - )) - } else { - None +fn df_to_numpy( + py: Python, + df: &DataFrame, + order: IndexOrder, + writable: bool, + allow_copy: bool, +) -> PyResult { + // TODO: Use `is_empty` when fixed: + // https://github.com/pola-rs/polars/pull/16351 + if df.height() == 0 { + // Take this path to ensure a writable array. + // This does not actually copy data for an empty DataFrame. + return df_to_numpy_with_copy(py, df, order, true); + } + + if matches!(order, IndexOrder::Fortran) { + if let Some(mut arr) = try_df_to_numpy_view(py, df) { + if writable { + if !allow_copy { + return Err(PyRuntimeError::new_err( + "copy not allowed: cannot create a writable array without copying data", + )); } + arr = arr.call_method0(py, intern!(py, "copy"))?; } + return Ok(arr); } - with_match_physical_numeric_polars_type!(first, |$T| { - get_ptr::<$T>(py, self.df.get_columns(), owner) + } + + if !allow_copy { + return Err(PyRuntimeError::new_err( + "copy not allowed: cannot convert to a NumPy array without copying data", + )); + } + + df_to_numpy_with_copy(py, df, order, writable) +} + +fn try_df_to_numpy_view(py: Python, df: &DataFrame) -> Option { + if df.is_empty() { + return None; + } + let first = df.get_columns().first().unwrap().dtype(); + // TODO: Support Datetime/Duration/Array types + if !first.is_numeric() { + return None; + } + if !df + .get_columns() + .iter() + .all(|s| s.null_count() == 0 && s.dtype() == first && s.chunks().len() == 1) + { + return None; + } + + let owner = PyDataFrame::from(df.clone()).into_py(py); // Keep the DataFrame memory alive. + + with_match_physical_numeric_polars_type!(first, |$T| { + get_ptr::<$T>(py, df.get_columns(), owner) + }) +} +fn get_ptr(py: Python, columns: &[Series], owner: PyObject) -> Option +where + T: PolarsNumericType, + T::Native: Element, +{ + let slices = columns + .iter() + .map(|s| { + let ca: &ChunkedArray = s.unpack().unwrap(); + ca.cont_slice().unwrap() }) + .collect::>(); + + let first = slices.first().unwrap(); + unsafe { + let mut end_ptr = first.as_ptr().add(first.len()); + // Check if all arrays are from the same buffer + let all_contiguous = slices[1..].iter().all(|slice| { + let valid = slice.as_ptr() == end_ptr; + + end_ptr = slice.as_ptr().add(slice.len()); + + valid + }); + + if all_contiguous { + let start_ptr = first.as_ptr(); + let dtype = T::Native::get_dtype_bound(py); + let dims = [first.len(), columns.len()].into_dimension(); + Some(create_borrowed_np_array::<_>( + py, + dtype, + dims, + flags::NPY_ARRAY_FARRAY_RO, + start_ptr as _, + owner, + )) + } else { + None + } } +} - /// Convert this DataFrame to a NumPy ndarray. - pub fn to_numpy(&self, py: Python, order: Wrap) -> Option { - let st = dtypes_to_supertype(self.df.iter().map(|s| s.dtype())).ok()?; - - let pyarray = match st { - dt if dt.is_numeric() => with_match_physical_numeric_polars_type!(dt, |$T| { - self.df.to_ndarray::<$T>(order.0).ok()?.into_pyarray_bound(py).into_py(py) - }), - _ => return None, - }; - Some(pyarray) +fn df_to_numpy_with_copy( + py: Python, + df: &DataFrame, + order: IndexOrder, + writable: bool, +) -> PyResult { + if let Some(arr) = try_df_to_numpy_numeric_supertype(py, df, order) { + Ok(arr) + } else { + df_columns_to_numpy(py, df, writable) } } +fn try_df_to_numpy_numeric_supertype( + py: Python, + df: &DataFrame, + order: IndexOrder, +) -> Option { + let st = dtypes_to_supertype(df.iter().map(|s| s.dtype())).ok()?; + + let np_array = match st { + dt if dt.is_numeric() => with_match_physical_numeric_polars_type!(dt, |$T| { + df.to_ndarray::<$T>(order).ok()?.into_pyarray_bound(py).into_py(py) + }), + _ => return None, + }; + Some(np_array) +} +fn df_columns_to_numpy(py: Python, df: &DataFrame, writable: bool) -> PyResult { + let np_arrays = df + .iter() + .map(|s| series_to_numpy(py, s, writable, true).unwrap()); + + // TODO: Handle multidimensional column arrays + + let numpy = PyModule::import_bound(py, "numpy")?; + let arr = numpy + .getattr(intern!(py, "vstack")) + .unwrap() + .call1((PyList::new_bound(py, np_arrays),))? + .getattr(intern!(py, "T"))?; + Ok(arr.into()) +} diff --git a/py-polars/src/interop/numpy/to_numpy_series.rs b/py-polars/src/interop/numpy/to_numpy_series.rs index df04e3890c94..ac8b52a5ce56 100644 --- a/py-polars/src/interop/numpy/to_numpy_series.rs +++ b/py-polars/src/interop/numpy/to_numpy_series.rs @@ -4,7 +4,7 @@ use numpy::npyffi::flags; use numpy::{Element, PyArray1}; use polars_core::prelude::*; use polars_core::with_match_physical_numeric_polars_type; -use pyo3::exceptions::PyValueError; +use pyo3::exceptions::PyRuntimeError; use pyo3::intern; use pyo3::prelude::*; use pyo3::types::PySlice; @@ -41,17 +41,22 @@ impl PySeries { } /// Convert a Series to a NumPy ndarray. -fn series_to_numpy(py: Python, s: &Series, writable: bool, allow_copy: bool) -> PyResult { +pub(super) fn series_to_numpy( + py: Python, + s: &Series, + writable: bool, + allow_copy: bool, +) -> PyResult { if s.is_empty() { // Take this path to ensure a writable array. - // This does not actually copy data for empty Series. + // This does not actually copy data for an empty Series. return series_to_numpy_with_copy(py, s, true); } if let Some((mut arr, writable_flag)) = try_series_to_numpy_view(py, s, false, allow_copy) { if writable && !writable_flag { if !allow_copy { - return Err(PyValueError::new_err( - "cannot return a zero-copy writable array", + return Err(PyRuntimeError::new_err( + "copy not allowed: cannot create a writable array without copying data", )); } arr = arr.call_method0(py, intern!(py, "copy"))?; @@ -60,7 +65,9 @@ fn series_to_numpy(py: Python, s: &Series, writable: bool, allow_copy: bool) -> } if !allow_copy { - return Err(PyValueError::new_err("cannot return a zero-copy array")); + return Err(PyRuntimeError::new_err( + "copy not allowed: cannot convert to a NumPy array without copying data", + )); } series_to_numpy_with_copy(py, s, writable) diff --git a/py-polars/tests/unit/interop/numpy/test_to_numpy_df.py b/py-polars/tests/unit/interop/numpy/test_to_numpy_df.py index 68b1f7696dd7..3e3eb5826e2c 100644 --- a/py-polars/tests/unit/interop/numpy/test_to_numpy_df.py +++ b/py-polars/tests/unit/interop/numpy/test_to_numpy_df.py @@ -10,6 +10,8 @@ import polars as pl if TYPE_CHECKING: + import numpy.typing as npt + from polars.type_aliases import IndexOrder @@ -152,7 +154,7 @@ def test_df_to_numpy_structured_not_zero_copy() -> None: def test_df_to_numpy_writable_not_zero_copy() -> None: df = pl.DataFrame({"a": [1, 2]}) - msg = "cannot create writable array without copying data" + msg = "copy not allowed: cannot create a writable array without copying data" with pytest.raises(RuntimeError, match=msg): df.to_numpy(allow_copy=False, writable=True) @@ -161,3 +163,21 @@ def test_df_to_numpy_not_zero_copy() -> None: df = pl.DataFrame({"a": [1, 2, None]}) with pytest.raises(RuntimeError): df.to_numpy(allow_copy=False) + + +@pytest.mark.parametrize( + ("schema", "expected_dtype"), + [ + ({"a": pl.Int8, "b": pl.Int8}, np.int8), + ({"a": pl.Int8, "b": pl.UInt16}, np.int32), + ({"a": pl.Int8, "b": pl.String}, np.object_), + ], +) +def test_df_to_numpy_empty_dtype_viewable( + schema: dict[str, pl.PolarsDataType], expected_dtype: npt.DTypeLike +) -> None: + df = pl.DataFrame(schema=schema) + result = df.to_numpy(allow_copy=False) + assert result.shape == (0, 2) + assert result.dtype == expected_dtype + assert result.flags.writeable is True diff --git a/py-polars/tests/unit/interop/numpy/test_to_numpy_series.py b/py-polars/tests/unit/interop/numpy/test_to_numpy_series.py index 501d127e6c7b..f757457a5ea9 100644 --- a/py-polars/tests/unit/interop/numpy/test_to_numpy_series.py +++ b/py-polars/tests/unit/interop/numpy/test_to_numpy_series.py @@ -26,7 +26,7 @@ def assert_zero_copy(s: pl.Series, arr: np.ndarray[Any, Any]) -> None: def assert_allow_copy_false_raises(s: pl.Series) -> None: - with pytest.raises(ValueError, match="cannot return a zero-copy array"): + with pytest.raises(RuntimeError, match="copy not allowed"): s.to_numpy(use_pyarrow=False, allow_copy=False)