Skip to content

Commit

Permalink
refactor(python): Move DataFrame.to_numpy implementation to Rust si…
Browse files Browse the repository at this point in the history
…de (#16354)
  • Loading branch information
stinodego authored May 21, 2024
1 parent e60ea16 commit 5e6a433
Show file tree
Hide file tree
Showing 5 changed files with 195 additions and 108 deletions.
31 changes: 3 additions & 28 deletions py-polars/polars/dataframe/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -1565,15 +1565,11 @@ def to_numpy(
rec.array([(1, 6.5, 'a'), (2, 7. , 'b'), (3, 8.5, 'c')],
dtype=[('foo', 'u1'), ('bar', '<f4'), ('ham', '<U1')])
"""

def raise_on_copy(msg: str) -> None:
if structured:
if not allow_copy and not self.is_empty():
msg = f"copy not allowed: {msg}"
msg = "copy not allowed: cannot create structured array without copying data"
raise RuntimeError(msg)

if structured:
raise_on_copy("cannot create structured array without copying data")

arrays = []
struct_dtype = []
for s in self.iter_columns():
Expand All @@ -1588,28 +1584,7 @@ def raise_on_copy(msg: str) -> None:
out[c] = arrays[idx]
return out

if order == "fortran":
array = self._df.to_numpy_view()
if array is not None:
if writable and not array.flags.writeable:
raise_on_copy("cannot create writable array without copying data")
array = array.copy()
return array

raise_on_copy(
"only numeric data without nulls in Fortran-like order can be converted without copy"
)

out = self._df.to_numpy(order)
if out is None:
return np.vstack(
[
self.to_series(i).to_numpy(use_pyarrow=use_pyarrow)
for i in range(self.width)
]
).T

return out
return self._df.to_numpy(order, writable=writable, allow_copy=allow_copy)

@overload
def to_jax(
Expand Down
229 changes: 157 additions & 72 deletions py-polars/src/interop/numpy/to_numpy_df.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,97 +4,182 @@ use numpy::{Element, IntoPyArray};
use polars_core::prelude::*;
use polars_core::utils::dtypes_to_supertype;
use polars_core::with_match_physical_numeric_polars_type;
use pyo3::exceptions::PyRuntimeError;
use pyo3::intern;
use pyo3::prelude::*;
use pyo3::types::PyList;

use super::to_numpy_series::series_to_numpy;
use super::utils::create_borrowed_np_array;
use crate::conversion::Wrap;
use crate::dataframe::PyDataFrame;

#[pymethods]
#[allow(clippy::wrong_self_convention)]
impl PyDataFrame {
/// Convert this DataFrame to a NumPy ndarray.
fn to_numpy(
&self,
py: Python,
order: Wrap<IndexOrder>,
writable: bool,
allow_copy: bool,
) -> PyResult<PyObject> {
df_to_numpy(py, &self.df, order.0, writable, allow_copy)
}

/// Create a view of the data as a NumPy ndarray.
///
/// WARNING: The resulting view will show the underlying value for nulls,
/// which may be any value. The caller is responsible for handling nulls
/// appropriately.
pub fn to_numpy_view(&self, py: Python) -> Option<PyObject> {
if self.df.is_empty() {
return None;
}
let first = self.df.get_columns().first().unwrap().dtype();
// TODO: Support Datetime/Duration types
if !first.is_numeric() {
return None;
}
if !self
.df
.get_columns()
.iter()
.all(|s| s.null_count() == 0 && s.dtype() == first && s.chunks().len() == 1)
{
return None;
}
fn to_numpy_view(&self, py: Python) -> Option<PyObject> {
try_df_to_numpy_view(py, &self.df)
}
}

let owner = self.clone().into_py(py); // Keep the DataFrame memory alive.

fn get_ptr<T>(py: Python, columns: &[Series], owner: PyObject) -> Option<PyObject>
where
T: PolarsNumericType,
T::Native: Element,
{
let slices = columns
.iter()
.map(|s| {
let ca: &ChunkedArray<T> = s.unpack().unwrap();
ca.cont_slice().unwrap()
})
.collect::<Vec<_>>();

let first = slices.first().unwrap();
unsafe {
let mut end_ptr = first.as_ptr().add(first.len());
// Check if all arrays are from the same buffer
let all_contiguous = slices[1..].iter().all(|slice| {
let valid = slice.as_ptr() == end_ptr;

end_ptr = slice.as_ptr().add(slice.len());

valid
});

if all_contiguous {
let start_ptr = first.as_ptr();
let dtype = T::Native::get_dtype_bound(py);
let dims = [first.len(), columns.len()].into_dimension();
Some(create_borrowed_np_array::<_>(
py,
dtype,
dims,
flags::NPY_ARRAY_FARRAY_RO,
start_ptr as _,
owner,
))
} else {
None
fn df_to_numpy(
py: Python,
df: &DataFrame,
order: IndexOrder,
writable: bool,
allow_copy: bool,
) -> PyResult<PyObject> {
// TODO: Use `is_empty` when fixed:
// https://github.com/pola-rs/polars/pull/16351
if df.height() == 0 {
// Take this path to ensure a writable array.
// This does not actually copy data for an empty DataFrame.
return df_to_numpy_with_copy(py, df, order, true);
}

if matches!(order, IndexOrder::Fortran) {
if let Some(mut arr) = try_df_to_numpy_view(py, df) {
if writable {
if !allow_copy {
return Err(PyRuntimeError::new_err(
"copy not allowed: cannot create a writable array without copying data",
));
}
arr = arr.call_method0(py, intern!(py, "copy"))?;
}
return Ok(arr);
}
with_match_physical_numeric_polars_type!(first, |$T| {
get_ptr::<$T>(py, self.df.get_columns(), owner)
}

if !allow_copy {
return Err(PyRuntimeError::new_err(
"copy not allowed: cannot convert to a NumPy array without copying data",
));
}

df_to_numpy_with_copy(py, df, order, writable)
}

fn try_df_to_numpy_view(py: Python, df: &DataFrame) -> Option<PyObject> {
if df.is_empty() {
return None;
}
let first = df.get_columns().first().unwrap().dtype();
// TODO: Support Datetime/Duration/Array types
if !first.is_numeric() {
return None;
}
if !df
.get_columns()
.iter()
.all(|s| s.null_count() == 0 && s.dtype() == first && s.chunks().len() == 1)
{
return None;
}

let owner = PyDataFrame::from(df.clone()).into_py(py); // Keep the DataFrame memory alive.

with_match_physical_numeric_polars_type!(first, |$T| {
get_ptr::<$T>(py, df.get_columns(), owner)
})
}
fn get_ptr<T>(py: Python, columns: &[Series], owner: PyObject) -> Option<PyObject>
where
T: PolarsNumericType,
T::Native: Element,
{
let slices = columns
.iter()
.map(|s| {
let ca: &ChunkedArray<T> = s.unpack().unwrap();
ca.cont_slice().unwrap()
})
.collect::<Vec<_>>();

let first = slices.first().unwrap();
unsafe {
let mut end_ptr = first.as_ptr().add(first.len());
// Check if all arrays are from the same buffer
let all_contiguous = slices[1..].iter().all(|slice| {
let valid = slice.as_ptr() == end_ptr;

end_ptr = slice.as_ptr().add(slice.len());

valid
});

if all_contiguous {
let start_ptr = first.as_ptr();
let dtype = T::Native::get_dtype_bound(py);
let dims = [first.len(), columns.len()].into_dimension();
Some(create_borrowed_np_array::<_>(
py,
dtype,
dims,
flags::NPY_ARRAY_FARRAY_RO,
start_ptr as _,
owner,
))
} else {
None
}
}
}

/// Convert this DataFrame to a NumPy ndarray.
pub fn to_numpy(&self, py: Python, order: Wrap<IndexOrder>) -> Option<PyObject> {
let st = dtypes_to_supertype(self.df.iter().map(|s| s.dtype())).ok()?;

let pyarray = match st {
dt if dt.is_numeric() => with_match_physical_numeric_polars_type!(dt, |$T| {
self.df.to_ndarray::<$T>(order.0).ok()?.into_pyarray_bound(py).into_py(py)
}),
_ => return None,
};
Some(pyarray)
fn df_to_numpy_with_copy(
py: Python,
df: &DataFrame,
order: IndexOrder,
writable: bool,
) -> PyResult<PyObject> {
if let Some(arr) = try_df_to_numpy_numeric_supertype(py, df, order) {
Ok(arr)
} else {
df_columns_to_numpy(py, df, writable)
}
}
fn try_df_to_numpy_numeric_supertype(
py: Python,
df: &DataFrame,
order: IndexOrder,
) -> Option<PyObject> {
let st = dtypes_to_supertype(df.iter().map(|s| s.dtype())).ok()?;

let np_array = match st {
dt if dt.is_numeric() => with_match_physical_numeric_polars_type!(dt, |$T| {
df.to_ndarray::<$T>(order).ok()?.into_pyarray_bound(py).into_py(py)
}),
_ => return None,
};
Some(np_array)
}
fn df_columns_to_numpy(py: Python, df: &DataFrame, writable: bool) -> PyResult<PyObject> {
let np_arrays = df
.iter()
.map(|s| series_to_numpy(py, s, writable, true).unwrap());

// TODO: Handle multidimensional column arrays

let numpy = PyModule::import_bound(py, "numpy")?;
let arr = numpy
.getattr(intern!(py, "vstack"))
.unwrap()
.call1((PyList::new_bound(py, np_arrays),))?
.getattr(intern!(py, "T"))?;
Ok(arr.into())
}
19 changes: 13 additions & 6 deletions py-polars/src/interop/numpy/to_numpy_series.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ use numpy::npyffi::flags;
use numpy::{Element, PyArray1};
use polars_core::prelude::*;
use polars_core::with_match_physical_numeric_polars_type;
use pyo3::exceptions::PyValueError;
use pyo3::exceptions::PyRuntimeError;
use pyo3::intern;
use pyo3::prelude::*;
use pyo3::types::PySlice;
Expand Down Expand Up @@ -41,17 +41,22 @@ impl PySeries {
}

/// Convert a Series to a NumPy ndarray.
fn series_to_numpy(py: Python, s: &Series, writable: bool, allow_copy: bool) -> PyResult<PyObject> {
pub(super) fn series_to_numpy(
py: Python,
s: &Series,
writable: bool,
allow_copy: bool,
) -> PyResult<PyObject> {
if s.is_empty() {
// Take this path to ensure a writable array.
// This does not actually copy data for empty Series.
// This does not actually copy data for an empty Series.
return series_to_numpy_with_copy(py, s, true);
}
if let Some((mut arr, writable_flag)) = try_series_to_numpy_view(py, s, false, allow_copy) {
if writable && !writable_flag {
if !allow_copy {
return Err(PyValueError::new_err(
"cannot return a zero-copy writable array",
return Err(PyRuntimeError::new_err(
"copy not allowed: cannot create a writable array without copying data",
));
}
arr = arr.call_method0(py, intern!(py, "copy"))?;
Expand All @@ -60,7 +65,9 @@ fn series_to_numpy(py: Python, s: &Series, writable: bool, allow_copy: bool) ->
}

if !allow_copy {
return Err(PyValueError::new_err("cannot return a zero-copy array"));
return Err(PyRuntimeError::new_err(
"copy not allowed: cannot convert to a NumPy array without copying data",
));
}

series_to_numpy_with_copy(py, s, writable)
Expand Down
22 changes: 21 additions & 1 deletion py-polars/tests/unit/interop/numpy/test_to_numpy_df.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@
import polars as pl

if TYPE_CHECKING:
import numpy.typing as npt

from polars.type_aliases import IndexOrder


Expand Down Expand Up @@ -152,7 +154,7 @@ def test_df_to_numpy_structured_not_zero_copy() -> None:

def test_df_to_numpy_writable_not_zero_copy() -> None:
df = pl.DataFrame({"a": [1, 2]})
msg = "cannot create writable array without copying data"
msg = "copy not allowed: cannot create a writable array without copying data"
with pytest.raises(RuntimeError, match=msg):
df.to_numpy(allow_copy=False, writable=True)

Expand All @@ -161,3 +163,21 @@ def test_df_to_numpy_not_zero_copy() -> None:
df = pl.DataFrame({"a": [1, 2, None]})
with pytest.raises(RuntimeError):
df.to_numpy(allow_copy=False)


@pytest.mark.parametrize(
("schema", "expected_dtype"),
[
({"a": pl.Int8, "b": pl.Int8}, np.int8),
({"a": pl.Int8, "b": pl.UInt16}, np.int32),
({"a": pl.Int8, "b": pl.String}, np.object_),
],
)
def test_df_to_numpy_empty_dtype_viewable(
schema: dict[str, pl.PolarsDataType], expected_dtype: npt.DTypeLike
) -> None:
df = pl.DataFrame(schema=schema)
result = df.to_numpy(allow_copy=False)
assert result.shape == (0, 2)
assert result.dtype == expected_dtype
assert result.flags.writeable is True
2 changes: 1 addition & 1 deletion py-polars/tests/unit/interop/numpy/test_to_numpy_series.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ def assert_zero_copy(s: pl.Series, arr: np.ndarray[Any, Any]) -> None:


def assert_allow_copy_false_raises(s: pl.Series) -> None:
with pytest.raises(ValueError, match="cannot return a zero-copy array"):
with pytest.raises(RuntimeError, match="copy not allowed"):
s.to_numpy(use_pyarrow=False, allow_copy=False)


Expand Down

0 comments on commit 5e6a433

Please sign in to comment.