Skip to content

Commit

Permalink
Merge pull request #249 from chmp/fix/sliced-lists
Browse files Browse the repository at this point in the history
Incorrect deserialization from sliced arrays
  • Loading branch information
chmp authored Oct 26, 2024
2 parents c57f208 + 43327f9 commit 32c1e2d
Show file tree
Hide file tree
Showing 13 changed files with 381 additions and 15 deletions.
14 changes: 14 additions & 0 deletions Changes.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,19 @@
# Change log

## 0.12.2

Bug fixes:

- Fixed deserialization from sliced arrays ([#248](https://github.com/chmp/serde_arrow/issues/248)).
Note that the current solution requires up front work when constructing the array deserializers,
as described in the issue. The removal of the performance penalty is tracked in
([#250](https://github.com/chmp/serde_arrow/issues/250))

### Thanks

- [@jkylling](https://github.com/jkylling) for reporting
([#248](https://github.com/chmp/serde_arrow/issues/248)) and for discussing potential solutions

## 0.12.1

New features
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -296,7 +296,12 @@ impl<'a> ArrayDeserializer<'a> {
fields.push((field_meta.name, field_deserializer))
}

Ok(Self::Enum(EnumDeserializer::new(path, view.types, fields)))
Ok(Self::Enum(EnumDeserializer::new(
path,
view.types,
view.offsets,
fields,
)?))
}
}
}
Expand Down
73 changes: 69 additions & 4 deletions serde_arrow/src/internal/deserialization/enum_deserializer.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
use std::collections::BTreeMap;
use std::collections::{BTreeMap, HashMap};

use serde::de::{DeserializeSeed, Deserializer, EnumAccess, Visitor};

Expand All @@ -20,15 +20,80 @@ impl<'a> EnumDeserializer<'a> {
pub fn new(
path: String,
type_ids: &'a [i8],
variants: Vec<(String, ArrayDeserializer<'a>)>,
) -> Self {
Self {
offsets: &'a [i32],
mut variants: Vec<(String, ArrayDeserializer<'a>)>,
) -> Result<Self> {
let initial_offsets = verify_offsets(type_ids, offsets, variants.len())?;

for (type_id, initial_offset) in initial_offsets {
let Some((_, variant)) = variants.get_mut(type_id as usize) else {
fail!("Unexpected error: could not retrieve variant {type_id}");
};
variant.skip(initial_offset as usize)?;
}

Ok(Self {
path,
type_ids,
variants,
next: 0,
})
}
}

fn verify_offsets(type_ids: &[i8], offsets: &[i32], num_fields: usize) -> Result<HashMap<i8, i32>> {
if type_ids.len() != offsets.len() {
fail!("Offsets and type ids must have the same length")
}

for &type_id in type_ids {
if type_id as usize >= num_fields {
fail!(
concat!(
"Invalid enum array:",
"type id ({type_id}) larger the number of fields ({num_fields})",
),
type_id = type_id,
num_fields = num_fields,
);
}
}

let mut last_offsets = HashMap::<i8, i32>::new();
let mut initial_offsets = HashMap::<i8, i32>::new();
for (idx, (&type_id, &offset)) in std::iter::zip(type_ids, offsets).enumerate() {
if offset < 0 {
fail!(
concat!(
"Invalid offsets in enum array for item {idx}:",
"negative offsets ({offset}) is not supports",
),
idx = idx,
offset = offset,
);
}
if let Some(last_offset) = last_offsets.get(&type_id).copied() {
if offset.checked_sub(last_offset) != Some(1) {
fail!(
concat!(
"Invalid offsets in enum array for item {idx}:",
"serde_arrow only supports consecutive offsets.",
"Current offset for type {type_id}: {offset}, previous offset {last_offset}",
),
idx = idx,
type_id = type_id,
offset = offset,
last_offset = last_offset,
);
}
last_offsets.insert(type_id, offset);
} else {
initial_offsets.insert(type_id, offset);
last_offsets.insert(type_id, offset);
}
}

Ok(initial_offsets)
}

impl<'de> Context for EnumDeserializer<'de> {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,11 +23,12 @@ pub struct ListDeserializer<'a, O: Offset> {
impl<'a, O: Offset> ListDeserializer<'a, O> {
pub fn new(
path: String,
item: ArrayDeserializer<'a>,
mut item: ArrayDeserializer<'a>,
offsets: &'a [O],
validity: Option<BitsWithOffset<'a>>,
) -> Result<Self> {
check_supported_list_layout(validity, offsets)?;
item.skip(offsets[0].try_into_usize()?)?;

Ok(Self {
path,
Expand Down
8 changes: 6 additions & 2 deletions serde_arrow/src/internal/deserialization/map_deserializer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -24,13 +24,17 @@ pub struct MapDeserializer<'a> {
impl<'a> MapDeserializer<'a> {
pub fn new(
path: String,
key: ArrayDeserializer<'a>,
value: ArrayDeserializer<'a>,
mut key: ArrayDeserializer<'a>,
mut value: ArrayDeserializer<'a>,
offsets: &'a [i32],
validity: Option<BitsWithOffset<'a>>,
) -> Result<Self> {
check_supported_list_layout(validity, offsets)?;

let initial_offset = usize::try_from(offsets[0])?;
key.skip(initial_offset)?;
value.skip(initial_offset)?;

Ok(Self {
path,
key: Box::new(key),
Expand Down
12 changes: 11 additions & 1 deletion serde_arrow/src/internal/deserialization/simple_deserializer.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,7 @@
use serde::{de::Visitor, Deserializer};
use serde::{
de::{IgnoredAny, Visitor},
Deserializer,
};

use crate::internal::{
error::{fail, Context, Error, Result},
Expand All @@ -7,6 +10,13 @@ use crate::internal::{

#[allow(unused)]
pub trait SimpleDeserializer<'de>: Context + Sized {
fn skip(&mut self, n: usize) -> Result<()> {
for _ in 0..n {
self.deserialize_any(IgnoredAny)?;
}
Ok(())
}

fn deserialize_any<V: Visitor<'de>>(&mut self, visitor: V) -> Result<V::Value> {
fail!(in self, "Deserializer does not implement deserialize_any");
}
Expand Down
35 changes: 29 additions & 6 deletions serde_arrow/src/internal/deserialization/utils.rs
Original file line number Diff line number Diff line change
Expand Up @@ -77,21 +77,44 @@ pub fn check_supported_list_layout<'a, O: Offset>(
validity: Option<BitsWithOffset<'a>>,
offsets: &'a [O],
) -> Result<()> {
let Some(validity) = validity else {
return Ok(());
};

if offsets.is_empty() {
fail!("Unsupported: list offsets must be non empty");
}

for i in 0..offsets.len().saturating_sub(1) {
let curr = offsets[i].try_into_usize()?;
let next = offsets[i + 1].try_into_usize()?;
if !bitset_is_set(&validity, i)? && (next - curr) != 0 {
fail!("Unsupported: lists with data in null values are currently not supported in deserialization");

if next < curr {
fail!("Unsupported: list offsets are assumed to be monotonically increasing");
}
if let Some(validity) = validity.as_ref() {
if !bitset_is_set(&validity, i)? && (next - curr) != 0 {
fail!("Unsupported: lists with data in null values are currently not supported in deserialization");
}
}
}

Ok(())
}

#[test]
fn test_check_supported_list_layout() {
use crate::internal::testing::assert_error_contains;

assert_error_contains(&check_supported_list_layout::<i32>(None, &[]), "non empty");
assert_error_contains(
&check_supported_list_layout::<i32>(None, &[0, 1, 0]),
"monotonically increasing",
);
assert_error_contains(
&check_supported_list_layout::<i32>(
Some(BitsWithOffset {
offset: 0,
data: &[0b_101],
}),
&[0, 5, 10, 15],
),
"data in null values",
);
}
40 changes: 40 additions & 0 deletions serde_arrow/src/test/deserialization/list.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
use crate::internal::{
arrow::{ArrayView, FieldMeta, ListArrayView, PrimitiveArrayView},
deserialization::array_deserializer::ArrayDeserializer,
testing::assert_error_contains,
};

#[test]
fn invalid_offsets() {
let reference = ListArrayView {
validity: None,
offsets: &[],
meta: FieldMeta {
name: String::from("element"),
nullable: false,
metadata: Default::default(),
},
element: Box::new(ArrayView::Int32(PrimitiveArrayView {
validity: None,
values: &[0, 1, 2, 3, 4, 5],
})),
};

let view = ArrayView::List(ListArrayView {
offsets: &[],
..reference.clone()
});
assert_error_contains(
&ArrayDeserializer::new(String::from("foo"), None, view),
"non empty",
);

let view = ArrayView::List(ListArrayView {
offsets: &[0, 5, 2],
..reference.clone()
});
assert_error_contains(
&ArrayDeserializer::new(String::from("foo"), None, view),
"monotonically increasing",
);
}
2 changes: 2 additions & 0 deletions serde_arrow/src/test/deserialization/mod.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
mod list;
mod r#union;
57 changes: 57 additions & 0 deletions serde_arrow/src/test/deserialization/union.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
use crate::internal::{
arrow::{ArrayView, DenseUnionArrayView, FieldMeta, PrimitiveArrayView},
deserialization::array_deserializer::ArrayDeserializer,
testing::assert_error_contains,
};

#[test]
fn non_consecutive_offsets() {
let fields = vec![
(
0,
ArrayView::Int32(PrimitiveArrayView {
validity: None,
values: &[1, 2, 3, 4, 5, 6],
}),
FieldMeta {
name: String::from("foo"),
nullable: false,
metadata: Default::default(),
},
),
(
1,
ArrayView::Int32(PrimitiveArrayView {
validity: None,
values: &[1, 2, 3, 4, 5, 6],
}),
FieldMeta {
name: String::from("foo"),
nullable: false,
metadata: Default::default(),
},
),
];

// first type has an unused element
let view = ArrayView::DenseUnion(DenseUnionArrayView {
types: &[0, 0, 1],
offsets: &[0, 2, 0],
fields: fields.clone(),
});
assert_error_contains(
&ArrayDeserializer::new(String::from("foo"), None, view),
"consecutive offsets",
);

// first type has an unused element
let view = ArrayView::DenseUnion(DenseUnionArrayView {
types: &[0, 0, 0],
offsets: &[0, 1, 4],
fields: fields.clone(),
});
assert_error_contains(
&ArrayDeserializer::new(String::from("foo"), None, view),
"consecutive offsets",
);
}
1 change: 1 addition & 0 deletions serde_arrow/src/test/mod.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
mod api_chrono;
mod deserialization;
mod error_messages;
mod jiff;
mod schema_like;
Expand Down
Loading

0 comments on commit 32c1e2d

Please sign in to comment.