Skip to content

Commit

Permalink
fix: deal with partials in PyArrow record batches
Browse files Browse the repository at this point in the history
This fixes the slicing behavior of FixedSizeLists when loaded with PyArrow. I am not sure whether this behavior is also faulty in other places (I especially suspect structs), but as long as there are no reported problems there, I think this fix is okay for now.

Fixes pola-rs#16614.
  • Loading branch information
coastalwhite committed Jun 19, 2024
1 parent d429c91 commit 797ee71
Show file tree
Hide file tree
Showing 3 changed files with 37 additions and 1 deletion.
4 changes: 3 additions & 1 deletion crates/polars-arrow/src/array/fixed_size_list/ffi.rs
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,8 @@ impl<A: ffi::ArrowArrayRef> FromFfi<A> for FixedSizeListArray {
let child = unsafe { array.child(0)? };
let values = ffi::try_from(child)?;

Self::try_new(data_type, values, validity)
let mut fsl = Self::try_new(data_type, values, validity)?;
fsl.slice(array.offset(), array.length());
Ok(fsl)
}
}
19 changes: 19 additions & 0 deletions crates/polars-arrow/src/ffi/array.rs
Original file line number Diff line number Diff line change
Expand Up @@ -544,6 +544,9 @@ pub trait ArrowArrayRef: std::fmt::Debug {

fn n_buffers(&self) -> usize;

fn offset(&self) -> usize;
fn length(&self) -> usize;

fn parent(&self) -> &InternalArrowArray;
fn array(&self) -> &ArrowArray;
fn data_type(&self) -> &ArrowDataType;
Expand Down Expand Up @@ -602,6 +605,14 @@ impl ArrowArrayRef for InternalArrowArray {
/// Returns the `n_buffers` field of `self.array`, converted to `usize`.
fn n_buffers(&self) -> usize {
self.array.n_buffers as usize
}

/// Returns the `offset` field of `self.array`, converted to `usize`.
///
/// NOTE(review): the `as` cast performs no range check. Presumably `offset` is a
/// signed 64-bit field (as in the Arrow C data interface) that producers keep
/// non-negative; a negative value would wrap to a very large `usize` — confirm.
fn offset(&self) -> usize {
self.array.offset as usize
}

/// Returns the `length` field of `self.array`, converted to `usize`.
///
/// NOTE(review): the `as` cast performs no range check. Presumably `length` is a
/// signed 64-bit field (as in the Arrow C data interface) that producers keep
/// non-negative; a negative value would wrap to a very large `usize` — confirm.
fn length(&self) -> usize {
self.array.length as usize
}
}

#[derive(Debug)]
Expand All @@ -628,6 +639,14 @@ impl<'a> ArrowArrayRef for ArrowArrayChild<'a> {
/// Returns the `n_buffers` field of this child's `array`, converted to `usize`.
fn n_buffers(&self) -> usize {
self.array.n_buffers as usize
}

/// Returns the `offset` field of this child's `array`, converted to `usize`.
///
/// NOTE(review): the `as` cast performs no range check. Presumably `offset` is a
/// signed 64-bit field (as in the Arrow C data interface) that producers keep
/// non-negative; a negative value would wrap to a very large `usize` — confirm.
fn offset(&self) -> usize {
self.array.offset as usize
}

/// Returns the `length` field of this child's `array`, converted to `usize`.
///
/// NOTE(review): the `as` cast performs no range check. Presumably `length` is a
/// signed 64-bit field (as in the Arrow C data interface) that producers keep
/// non-negative; a negative value would wrap to a very large `usize` — confirm.
fn length(&self) -> usize {
self.array.length as usize
}
}

impl<'a> ArrowArrayChild<'a> {
Expand Down
15 changes: 15 additions & 0 deletions py-polars/tests/unit/io/test_parquet.py
Original file line number Diff line number Diff line change
Expand Up @@ -1115,3 +1115,18 @@ def test_parquet_statistics_uint64_16683() -> None:

assert statistics.min == 0
assert statistics.max == u64_max


def test_parquet_record_batches_pyarrow_fixed_size_list_16614(tmp_path: Path) -> None:
    """Round-trip a fixed-size-list column through Parquet using the PyArrow reader.

    Writes a frame with a single ``Array(Float32, 2)`` column to disk, reads it
    back with ``use_pyarrow=True`` and checks that both the row count and the
    contents survive unchanged.
    """
    parquet_path = tmp_path / "a.parquet"

    n = 500000
    # NOTE(review): the row count is presumably chosen large enough that PyArrow
    # hands the data over in several record batches — confirm against #16614.
    values = np.linspace((1, 2), (2 * n, 2 * n * 1), n, dtype=np.float32)
    expected = pl.DataFrame(
        {"x": values},
        schema={"x": pl.Array(pl.Float32, 2)},
    )

    expected.write_parquet(parquet_path)
    roundtripped = pl.read_parquet(parquet_path, use_pyarrow=True)

    assert roundtripped["x"].shape[0] == n
    assert_frame_equal(roundtripped, expected)

0 comments on commit 797ee71

Please sign in to comment.