From d336b3f4ccacbfdbf5334345081b647f45a305a2 Mon Sep 17 00:00:00 2001 From: Christopher Prohm Date: Tue, 22 Oct 2024 22:09:57 +0200 Subject: [PATCH 01/10] Add failing example --- .../test_with_arrow/issue_248_sliced_lists.rs | 43 +++++++++++++++++++ serde_arrow/src/test_with_arrow/mod.rs | 1 + 2 files changed, 44 insertions(+) create mode 100644 serde_arrow/src/test_with_arrow/issue_248_sliced_lists.rs diff --git a/serde_arrow/src/test_with_arrow/issue_248_sliced_lists.rs b/serde_arrow/src/test_with_arrow/issue_248_sliced_lists.rs new file mode 100644 index 00000000..6d35ab4d --- /dev/null +++ b/serde_arrow/src/test_with_arrow/issue_248_sliced_lists.rs @@ -0,0 +1,43 @@ +use crate::_impl::arrow::datatypes::FieldRef; +use crate::{ + self as serde_arrow, + schema::{SchemaLike, TracingOptions}, +}; + +#[test] +fn strings() { + #[derive(serde::Deserialize, serde::Serialize, Debug, PartialEq)] + struct Struct { + string: String, + } + let data = (1..=10) + .map(|x| Struct { + string: x.to_string(), + }) + .collect::>(); + let fields = Vec::::from_type::(TracingOptions::default()).unwrap(); + let batch = serde_arrow::to_record_batch(&fields, &data).unwrap(); + let batch = batch.slice(5, 5); + + let actual: Vec = serde_arrow::from_record_batch(&batch).unwrap(); + assert_eq!(data[5..10], actual); +} + +#[test] +fn vec_of_strings() { + #[derive(serde::Deserialize, serde::Serialize, Debug, PartialEq)] + struct Struct { + string: Vec, + } + let data = (1..=10) + .map(|x| Struct { + string: vec![x.to_string()], + }) + .collect::>(); + let fields = Vec::::from_type::(TracingOptions::default()).unwrap(); + let batch = serde_arrow::to_record_batch(&fields, &data).unwrap(); + let batch = batch.slice(5, 5); + + let actual: Vec = serde_arrow::from_record_batch(&batch).unwrap(); + assert_eq!(data[5..10], actual); +} diff --git a/serde_arrow/src/test_with_arrow/mod.rs b/serde_arrow/src/test_with_arrow/mod.rs index e3aed06a..82d61c0f 100644 --- a/serde_arrow/src/test_with_arrow/mod.rs 
+++ b/serde_arrow/src/test_with_arrow/mod.rs @@ -2,6 +2,7 @@ //! mod impls; mod issue_137_schema_like_from_arrow_schema; +mod issue_248_sliced_lists; mod issue_35_preserve_metadata; mod issue_90_top_level_nulls_in_structs; mod items_wrapper; From d11f139d55fd84beb5c7d2318553174a2c0d7a89 Mon Sep 17 00:00:00 2001 From: Christopher Prohm Date: Tue, 22 Oct 2024 22:39:15 +0200 Subject: [PATCH 02/10] Add quickfix for sliced lists --- .../src/internal/deserialization/list_deserializer.rs | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/serde_arrow/src/internal/deserialization/list_deserializer.rs b/serde_arrow/src/internal/deserialization/list_deserializer.rs index c93a81c1..cbe758cd 100644 --- a/serde_arrow/src/internal/deserialization/list_deserializer.rs +++ b/serde_arrow/src/internal/deserialization/list_deserializer.rs @@ -23,12 +23,17 @@ pub struct ListDeserializer<'a, O: Offset> { impl<'a, O: Offset> ListDeserializer<'a, O> { pub fn new( path: String, - item: ArrayDeserializer<'a>, + mut item: ArrayDeserializer<'a>, offsets: &'a [O], validity: Option>, ) -> Result { check_supported_list_layout(validity, offsets)?; + // skip any unused values up front + for _ in 0..offsets[0].try_into_usize()? 
{ + item.deserialize_ignored_any(serde::de::IgnoredAny)?; + } + Ok(Self { path, item: Box::new(item), From 54638c08f17fb6b27f2bb2d7b56d0ca5eb1d494b Mon Sep 17 00:00:00 2001 From: Christopher Prohm Date: Tue, 22 Oct 2024 22:42:41 +0200 Subject: [PATCH 03/10] Fix maps as well --- .../deserialization/map_deserializer.rs | 10 ++++++-- .../test_with_arrow/issue_248_sliced_lists.rs | 24 +++++++++++++++++++ 2 files changed, 32 insertions(+), 2 deletions(-) diff --git a/serde_arrow/src/internal/deserialization/map_deserializer.rs b/serde_arrow/src/internal/deserialization/map_deserializer.rs index e08fca98..1caedfda 100644 --- a/serde_arrow/src/internal/deserialization/map_deserializer.rs +++ b/serde_arrow/src/internal/deserialization/map_deserializer.rs @@ -24,13 +24,19 @@ pub struct MapDeserializer<'a> { impl<'a> MapDeserializer<'a> { pub fn new( path: String, - key: ArrayDeserializer<'a>, - value: ArrayDeserializer<'a>, + mut key: ArrayDeserializer<'a>, + mut value: ArrayDeserializer<'a>, offsets: &'a [i32], validity: Option>, ) -> Result { check_supported_list_layout(validity, offsets)?; + // skip any unused values up front + for _ in 0..usize::try_from(offsets[0])? 
{ + key.deserialize_ignored_any(serde::de::IgnoredAny)?; + value.deserialize_ignored_any(serde::de::IgnoredAny)?; + } + Ok(Self { path, key: Box::new(key), diff --git a/serde_arrow/src/test_with_arrow/issue_248_sliced_lists.rs b/serde_arrow/src/test_with_arrow/issue_248_sliced_lists.rs index 6d35ab4d..da4fbe4a 100644 --- a/serde_arrow/src/test_with_arrow/issue_248_sliced_lists.rs +++ b/serde_arrow/src/test_with_arrow/issue_248_sliced_lists.rs @@ -1,4 +1,7 @@ +use std::collections::HashMap; + use crate::_impl::arrow::datatypes::FieldRef; +use crate::internal::testing::hash_map; use crate::{ self as serde_arrow, schema::{SchemaLike, TracingOptions}, @@ -41,3 +44,24 @@ fn vec_of_strings() { let actual: Vec = serde_arrow::from_record_batch(&batch).unwrap(); assert_eq!(data[5..10], actual); } + +#[test] +fn map_of_strings() { + #[derive(serde::Deserialize, serde::Serialize, Debug, PartialEq)] + struct Struct { + string: HashMap, + } + let data = (1..=10) + .map(|x| Struct { + string: hash_map!(x.to_string() => x.to_string()), + }) + .collect::>(); + let fields = + Vec::::from_type::(TracingOptions::default().map_as_struct(false)) + .unwrap(); + let batch = serde_arrow::to_record_batch(&fields, &data).unwrap(); + let batch = batch.slice(5, 5); + + let actual: Vec = serde_arrow::from_record_batch(&batch).unwrap(); + assert_eq!(data[5..10], actual); +} From 6e08a503cddf1571fa8ffb618a1df65ef034f12f Mon Sep 17 00:00:00 2001 From: Christopher Prohm Date: Tue, 22 Oct 2024 22:59:11 +0200 Subject: [PATCH 04/10] add test for bytes --- ...rs => issue_248_slices_deserialization.rs} | 24 +++++++++++++++++++ serde_arrow/src/test_with_arrow/mod.rs | 2 +- 2 files changed, 25 insertions(+), 1 deletion(-) rename serde_arrow/src/test_with_arrow/{issue_248_sliced_lists.rs => issue_248_slices_deserialization.rs} (75%) diff --git a/serde_arrow/src/test_with_arrow/issue_248_sliced_lists.rs b/serde_arrow/src/test_with_arrow/issue_248_slices_deserialization.rs similarity index 75% rename from 
serde_arrow/src/test_with_arrow/issue_248_sliced_lists.rs rename to serde_arrow/src/test_with_arrow/issue_248_slices_deserialization.rs index da4fbe4a..9ad14109 100644 --- a/serde_arrow/src/test_with_arrow/issue_248_sliced_lists.rs +++ b/serde_arrow/src/test_with_arrow/issue_248_slices_deserialization.rs @@ -1,5 +1,7 @@ use std::collections::HashMap; +use serde_json::json; + use crate::_impl::arrow::datatypes::FieldRef; use crate::internal::testing::hash_map; use crate::{ @@ -7,6 +9,28 @@ use crate::{ schema::{SchemaLike, TracingOptions}, }; +#[test] +fn bytes_as_list() { + #[derive(serde::Deserialize, serde::Serialize, Debug, PartialEq)] + struct Struct { + string: Vec, + } + let data = (1..=10) + .map(|x| Struct { + string: x.to_string().into_bytes(), + }) + .collect::>(); + let fields = Vec::::from_value(json!([ + {"name": "string", "data_type": "Binary"}, + ])) + .unwrap(); + let batch = serde_arrow::to_record_batch(&fields, &data).unwrap(); + let batch = batch.slice(5, 5); + + let actual: Vec = serde_arrow::from_record_batch(&batch).unwrap(); + assert_eq!(data[5..10], actual); +} + #[test] fn strings() { #[derive(serde::Deserialize, serde::Serialize, Debug, PartialEq)] diff --git a/serde_arrow/src/test_with_arrow/mod.rs b/serde_arrow/src/test_with_arrow/mod.rs index 82d61c0f..38c4974a 100644 --- a/serde_arrow/src/test_with_arrow/mod.rs +++ b/serde_arrow/src/test_with_arrow/mod.rs @@ -2,7 +2,7 @@ //! 
mod impls; mod issue_137_schema_like_from_arrow_schema; -mod issue_248_sliced_lists; +mod issue_248_slices_deserialization; mod issue_35_preserve_metadata; mod issue_90_top_level_nulls_in_structs; mod items_wrapper; From e5c755029700815ea27acfef9df5af9526213ca5 Mon Sep 17 00:00:00 2001 From: Christopher Prohm Date: Wed, 23 Oct 2024 20:36:57 +0200 Subject: [PATCH 05/10] Fix impl of check_supported_list_layout --- .../src/internal/deserialization/utils.rs | 35 +++++++++++++++---- 1 file changed, 29 insertions(+), 6 deletions(-) diff --git a/serde_arrow/src/internal/deserialization/utils.rs b/serde_arrow/src/internal/deserialization/utils.rs index 851e1cde..b432dea5 100644 --- a/serde_arrow/src/internal/deserialization/utils.rs +++ b/serde_arrow/src/internal/deserialization/utils.rs @@ -77,10 +77,6 @@ pub fn check_supported_list_layout<'a, O: Offset>( validity: Option>, offsets: &'a [O], ) -> Result<()> { - let Some(validity) = validity else { - return Ok(()); - }; - if offsets.is_empty() { fail!("Unsupported: list offsets must be non empty"); } @@ -88,10 +84,37 @@ pub fn check_supported_list_layout<'a, O: Offset>( for i in 0..offsets.len().saturating_sub(1) { let curr = offsets[i].try_into_usize()?; let next = offsets[i + 1].try_into_usize()?; - if !bitset_is_set(&validity, i)? && (next - curr) != 0 { - fail!("Unsupported: lists with data in null values are currently not supported in deserialization"); + + if next < curr { + fail!("Unsupported: list offsets are assumed to be monotonically increasing"); + } + if let Some(validity) = validity.as_ref() { + if !bitset_is_set(&validity, i)? 
&& (next - curr) != 0 { + fail!("Unsupported: lists with data in null values are currently not supported in deserialization"); + } } } Ok(()) } + +#[test] +fn test_check_supported_list_layout() { + use crate::internal::testing::assert_error_contains; + + assert_error_contains(&check_supported_list_layout::(None, &[]), "non empty"); + assert_error_contains( + &check_supported_list_layout::(None, &[0, 1, 0]), + "monotonically increasing", + ); + assert_error_contains( + &check_supported_list_layout::( + Some(BitsWithOffset { + offset: 0, + data: &[0b_101], + }), + &[0, 5, 10, 15], + ), + "data in null values", + ); +} From 88b80d797770f1cc8c7fccbd708cffff2427a74a Mon Sep 17 00:00:00 2001 From: Christopher Prohm Date: Wed, 23 Oct 2024 20:39:01 +0200 Subject: [PATCH 06/10] Impl checks for supported enum layouts --- .../deserialization/array_deserializer.rs | 7 +++- .../deserialization/enum_deserializer.rs | 39 +++++++++++++++++-- .../issue_248_slices_deserialization.rs | 25 ++++++++++++ 3 files changed, 67 insertions(+), 4 deletions(-) diff --git a/serde_arrow/src/internal/deserialization/array_deserializer.rs b/serde_arrow/src/internal/deserialization/array_deserializer.rs index 4707c4fa..68f57558 100644 --- a/serde_arrow/src/internal/deserialization/array_deserializer.rs +++ b/serde_arrow/src/internal/deserialization/array_deserializer.rs @@ -296,7 +296,12 @@ impl<'a> ArrayDeserializer<'a> { fields.push((field_meta.name, field_deserializer)) } - Ok(Self::Enum(EnumDeserializer::new(path, view.types, fields))) + Ok(Self::Enum(EnumDeserializer::new( + path, + view.types, + view.offsets, + fields, + )?)) } } } diff --git a/serde_arrow/src/internal/deserialization/enum_deserializer.rs b/serde_arrow/src/internal/deserialization/enum_deserializer.rs index 6ad2fbfa..ac745e9d 100644 --- a/serde_arrow/src/internal/deserialization/enum_deserializer.rs +++ b/serde_arrow/src/internal/deserialization/enum_deserializer.rs @@ -1,4 +1,4 @@ -use std::collections::BTreeMap; +use 
std::collections::{BTreeMap, HashMap}; use serde::de::{DeserializeSeed, Deserializer, EnumAccess, Visitor}; @@ -20,15 +20,48 @@ impl<'a> EnumDeserializer<'a> { pub fn new( path: String, type_ids: &'a [i8], + offsets: &'a [i32], variants: Vec<(String, ArrayDeserializer<'a>)>, - ) -> Self { - Self { + ) -> Result { + verify_offsets(type_ids, offsets)?; + + Ok(Self { path, type_ids, variants, next: 0, + }) + } +} + +fn verify_offsets(type_ids: &[i8], offsets: &[i32]) -> Result<()> { + if type_ids.len() != offsets.len() { + fail!("Offsets and type ids must have the same length") + } + + let mut last_offsets = HashMap::::new(); + for (idx, (&type_id, &offset)) in std::iter::zip(type_ids, offsets).enumerate() { + if let Some(last_offset) = last_offsets.get(&type_id).copied() { + if offset.checked_sub(last_offset) != Some(1) { + fail!( + concat!( + "Invalid offsets in enum array for item {idx}:", + "serde_arrow only supports consecutive offsets.", + "Current offset for type {type_id}: {offset}, previous offset {last_offset}", + ), + idx = idx, + type_id = type_id, + offset = offset, + last_offset = last_offset, + ); + } + last_offsets.insert(type_id, offset); + } else { + last_offsets.insert(type_id, offset); } } + + Ok(()) } impl<'de> Context for EnumDeserializer<'de> { diff --git a/serde_arrow/src/test_with_arrow/issue_248_slices_deserialization.rs b/serde_arrow/src/test_with_arrow/issue_248_slices_deserialization.rs index 9ad14109..7a9971d6 100644 --- a/serde_arrow/src/test_with_arrow/issue_248_slices_deserialization.rs +++ b/serde_arrow/src/test_with_arrow/issue_248_slices_deserialization.rs @@ -4,6 +4,7 @@ use serde_json::json; use crate::_impl::arrow::datatypes::FieldRef; use crate::internal::testing::hash_map; +use crate::utils::Item; use crate::{ self as serde_arrow, schema::{SchemaLike, TracingOptions}, @@ -89,3 +90,27 @@ fn map_of_strings() { let actual: Vec = serde_arrow::from_record_batch(&batch).unwrap(); assert_eq!(data[5..10], actual); } + +#[test] +fn 
enums() { + #[derive(serde::Deserialize, serde::Serialize, Debug, PartialEq)] + enum Value { + I64(i64), + I32(i32), + Str(String), + } + + let data = vec![ + Item(Value::I64(0)), + Item(Value::I64(1)), + Item(Value::I32(2)), + Item(Value::Str(String::from("3"))), + ]; + let fields = Vec::::from_type::>(TracingOptions::default()).unwrap(); + + let batch = serde_arrow::to_record_batch(&fields, &data).unwrap(); + let batch = batch.slice(2, 2); + + let actual: Vec> = serde_arrow::from_record_batch(&batch).unwrap(); + assert_eq!(data[2..4], actual); +} From 90a9202e6139da1cab2c8f6a5197868696bab5d1 Mon Sep 17 00:00:00 2001 From: Christopher Prohm Date: Wed, 23 Oct 2024 21:26:30 +0200 Subject: [PATCH 07/10] Add tests for correctness checks of array deserializers --- .../deserialization/enum_deserializer.rs | 12 ++++ serde_arrow/src/test/deserialization/list.rs | 40 +++++++++++ serde_arrow/src/test/deserialization/mod.rs | 2 + serde_arrow/src/test/deserialization/union.rs | 68 +++++++++++++++++++ serde_arrow/src/test/mod.rs | 1 + 5 files changed, 123 insertions(+) create mode 100644 serde_arrow/src/test/deserialization/list.rs create mode 100644 serde_arrow/src/test/deserialization/mod.rs create mode 100644 serde_arrow/src/test/deserialization/union.rs diff --git a/serde_arrow/src/internal/deserialization/enum_deserializer.rs b/serde_arrow/src/internal/deserialization/enum_deserializer.rs index ac745e9d..55cdfb03 100644 --- a/serde_arrow/src/internal/deserialization/enum_deserializer.rs +++ b/serde_arrow/src/internal/deserialization/enum_deserializer.rs @@ -57,6 +57,18 @@ fn verify_offsets(type_ids: &[i8], offsets: &[i32]) -> Result<()> { } last_offsets.insert(type_id, offset); } else { + if offset != 0 { + fail!( + concat!( + "Invalid offsets in enum array for item {idx}:", + "serde_arrow only supports initial zero offsets.", + "Current offset for type {type_id}: {offset}", + ), + idx = idx, + type_id = type_id, + offset = offset, + ); + } last_offsets.insert(type_id, 
offset); } } diff --git a/serde_arrow/src/test/deserialization/list.rs b/serde_arrow/src/test/deserialization/list.rs new file mode 100644 index 00000000..8df4cb39 --- /dev/null +++ b/serde_arrow/src/test/deserialization/list.rs @@ -0,0 +1,40 @@ +use crate::internal::{ + arrow::{ArrayView, FieldMeta, ListArrayView, PrimitiveArrayView}, + deserialization::array_deserializer::ArrayDeserializer, + testing::assert_error_contains, +}; + +#[test] +fn invalid_offsets() { + let reference = ListArrayView { + validity: None, + offsets: &[], + meta: FieldMeta { + name: String::from("element"), + nullable: false, + metadata: Default::default(), + }, + element: Box::new(ArrayView::Int32(PrimitiveArrayView { + validity: None, + values: &[0, 1, 2, 3, 4, 5], + })), + }; + + let view = ArrayView::List(ListArrayView { + offsets: &[], + ..reference.clone() + }); + assert_error_contains( + &ArrayDeserializer::new(String::from("foo"), None, view), + "non empty", + ); + + let view = ArrayView::List(ListArrayView { + offsets: &[0, 5, 2], + ..reference.clone() + }); + assert_error_contains( + &ArrayDeserializer::new(String::from("foo"), None, view), + "monotonically increasing", + ); +} diff --git a/serde_arrow/src/test/deserialization/mod.rs b/serde_arrow/src/test/deserialization/mod.rs new file mode 100644 index 00000000..e92b292f --- /dev/null +++ b/serde_arrow/src/test/deserialization/mod.rs @@ -0,0 +1,2 @@ +mod list; +mod r#union; diff --git a/serde_arrow/src/test/deserialization/union.rs b/serde_arrow/src/test/deserialization/union.rs new file mode 100644 index 00000000..3699117e --- /dev/null +++ b/serde_arrow/src/test/deserialization/union.rs @@ -0,0 +1,68 @@ +use crate::internal::{ + arrow::{ArrayView, DenseUnionArrayView, FieldMeta, PrimitiveArrayView}, + deserialization::array_deserializer::ArrayDeserializer, + testing::assert_error_contains, +}; + +#[test] +fn non_consecutive_offsets() { + let fields = vec![ + ( + 0, + ArrayView::Int32(PrimitiveArrayView { + validity: None, + 
values: &[1, 2, 3, 4, 5, 6], + }), + FieldMeta { + name: String::from("foo"), + nullable: false, + metadata: Default::default(), + }, + ), + ( + 1, + ArrayView::Int32(PrimitiveArrayView { + validity: None, + values: &[1, 2, 3, 4, 5, 6], + }), + FieldMeta { + name: String::from("foo"), + nullable: false, + metadata: Default::default(), + }, + ), + ]; + + // second type does not start at 0 + let view = ArrayView::DenseUnion(DenseUnionArrayView { + types: &[0, 0, 1], + offsets: &[0, 1, 2], + fields: fields.clone(), + }); + assert_error_contains( + &ArrayDeserializer::new(String::from("foo"), None, view), + "initial zero offsets", + ); + + // first type has an unused element + let view = ArrayView::DenseUnion(DenseUnionArrayView { + types: &[0, 0, 1], + offsets: &[0, 2, 0], + fields: fields.clone(), + }); + assert_error_contains( + &ArrayDeserializer::new(String::from("foo"), None, view), + "consecutive offsets", + ); + + // first type has an unused element + let view = ArrayView::DenseUnion(DenseUnionArrayView { + types: &[0, 0, 0], + offsets: &[0, 1, 4], + fields: fields.clone(), + }); + assert_error_contains( + &ArrayDeserializer::new(String::from("foo"), None, view), + "consecutive offsets", + ); +} diff --git a/serde_arrow/src/test/mod.rs b/serde_arrow/src/test/mod.rs index d2a03003..da60f59a 100644 --- a/serde_arrow/src/test/mod.rs +++ b/serde_arrow/src/test/mod.rs @@ -1,4 +1,5 @@ mod api_chrono; +mod deserialization; mod error_messages; mod jiff; mod schema_like; From d7502f8c15b9e2e82564989e61347f9e4861b6ae Mon Sep 17 00:00:00 2001 From: Christopher Prohm Date: Sat, 26 Oct 2024 18:53:46 +0200 Subject: [PATCH 08/10] Update changelog --- Changes.md | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/Changes.md b/Changes.md index 9836d3ef..1ffe6c74 100644 --- a/Changes.md +++ b/Changes.md @@ -1,5 +1,19 @@ # Change log +## 0.12.2 + +Bug fixes: + +- Fixed deserialization from sliced arrays ([#248](https://github.com/chmp/serde_arrow/issues/248)). 
+ Note that the current solution requires up front work when constructing the array deserializers, + as described in the issue. The removal of the performance penality is tracked in + ([#250](https://github.com/chmp/serde_arrow/issues/250)) + +### Thanks + +- [@jkylling](https://github.com/jkylling) for reporting + ([#248](https://github.com/chmp/serde_arrow/issues/248)) and for discussing potential solutions + ## 0.12.1 New features From 459a7db4c3773aeac60089ba899a05a5f0a1ec9a Mon Sep 17 00:00:00 2001 From: Christopher Prohm Date: Sat, 26 Oct 2024 19:16:00 +0200 Subject: [PATCH 09/10] Fix sliced enums --- .../deserialization/enum_deserializer.rs | 52 +++++++++++++------ .../deserialization/list_deserializer.rs | 6 +-- .../deserialization/map_deserializer.rs | 8 ++- .../deserialization/simple_deserializer.rs | 12 ++++- serde_arrow/src/test/deserialization/union.rs | 11 ---- .../issue_248_slices_deserialization.rs | 27 ++++++++++ 6 files changed, 78 insertions(+), 38 deletions(-) diff --git a/serde_arrow/src/internal/deserialization/enum_deserializer.rs b/serde_arrow/src/internal/deserialization/enum_deserializer.rs index 55cdfb03..1d34c0fa 100644 --- a/serde_arrow/src/internal/deserialization/enum_deserializer.rs +++ b/serde_arrow/src/internal/deserialization/enum_deserializer.rs @@ -21,9 +21,16 @@ impl<'a> EnumDeserializer<'a> { path: String, type_ids: &'a [i8], offsets: &'a [i32], - variants: Vec<(String, ArrayDeserializer<'a>)>, + mut variants: Vec<(String, ArrayDeserializer<'a>)>, ) -> Result { - verify_offsets(type_ids, offsets)?; + let initial_offsets = verify_offsets(type_ids, offsets, variants.len())?; + + for (type_id, initial_offset) in initial_offsets { + let Some((_, variant)) = variants.get_mut(type_id as usize) else { + fail!("Unexpected error: could not retrieve variant {type_id}"); + }; + variant.skip(initial_offset as usize)?; + } Ok(Self { path, @@ -34,13 +41,37 @@ impl<'a> EnumDeserializer<'a> { } } -fn verify_offsets(type_ids: &[i8], offsets: 
&[i32]) -> Result<()> { +fn verify_offsets(type_ids: &[i8], offsets: &[i32], num_fields: usize) -> Result> { if type_ids.len() != offsets.len() { fail!("Offsets and type ids must have the same length") } + for &type_id in type_ids { + if type_id as usize >= num_fields { + fail!( + concat!( + "Invalid enum array:", + "type id ({type_id}) larger than the number of fields ({num_fields})", + ), + type_id = type_id, + num_fields = num_fields, + ); + } + } + let mut last_offsets = HashMap::::new(); + let mut initial_offsets = HashMap::::new(); for (idx, (&type_id, &offset)) in std::iter::zip(type_ids, offsets).enumerate() { + if offset < 0 { + fail!( + concat!( + "Invalid offsets in enum array for item {idx}:", + "negative offsets ({offset}) are not supported", + ), + idx = idx, + offset = offset, + ); + } if let Some(last_offset) = last_offsets.get(&type_id).copied() { if offset.checked_sub(last_offset) != Some(1) { fail!( @@ -57,23 +88,12 @@ fn verify_offsets(type_ids: &[i8], offsets: &[i32]) -> Result<()> { } last_offsets.insert(type_id, offset); } else { - if offset != 0 { - fail!( - concat!( - "Invalid offsets in enum array for item {idx}:", - "serde_arrow only supports initial zero offsets.", - "Current offset for type {type_id}: {offset}", - ), - idx = idx, - type_id = type_id, - offset = offset, - ); - } + initial_offsets.insert(type_id, offset); last_offsets.insert(type_id, offset); } } - Ok(()) + Ok(initial_offsets) } impl<'de> Context for EnumDeserializer<'de> { diff --git a/serde_arrow/src/internal/deserialization/list_deserializer.rs b/serde_arrow/src/internal/deserialization/list_deserializer.rs index cbe758cd..5976b4b4 100644 --- a/serde_arrow/src/internal/deserialization/list_deserializer.rs +++ b/serde_arrow/src/internal/deserialization/list_deserializer.rs @@ -28,11 +28,7 @@ impl<'a, O: Offset> ListDeserializer<'a, O> { validity: Option>, ) -> Result { check_supported_list_layout(validity, offsets)?; - - // skip any unused values up front - for _ in 
0..offsets[0].try_into_usize()? { - item.deserialize_ignored_any(serde::de::IgnoredAny)?; - } + item.skip(offsets[0].try_into_usize()?)?; Ok(Self { path, diff --git a/serde_arrow/src/internal/deserialization/map_deserializer.rs b/serde_arrow/src/internal/deserialization/map_deserializer.rs index 1caedfda..f578309c 100644 --- a/serde_arrow/src/internal/deserialization/map_deserializer.rs +++ b/serde_arrow/src/internal/deserialization/map_deserializer.rs @@ -31,11 +31,9 @@ impl<'a> MapDeserializer<'a> { ) -> Result { check_supported_list_layout(validity, offsets)?; - // skip any unused values up front - for _ in 0..usize::try_from(offsets[0])? { - key.deserialize_ignored_any(serde::de::IgnoredAny)?; - value.deserialize_ignored_any(serde::de::IgnoredAny)?; - } + let initial_offset = usize::try_from(offsets[0])?; + key.skip(initial_offset)?; + value.skip(initial_offset)?; Ok(Self { path, diff --git a/serde_arrow/src/internal/deserialization/simple_deserializer.rs b/serde_arrow/src/internal/deserialization/simple_deserializer.rs index 4753bc86..d1ac7b7d 100644 --- a/serde_arrow/src/internal/deserialization/simple_deserializer.rs +++ b/serde_arrow/src/internal/deserialization/simple_deserializer.rs @@ -1,4 +1,7 @@ -use serde::{de::Visitor, Deserializer}; +use serde::{ + de::{IgnoredAny, Visitor}, + Deserializer, +}; use crate::internal::{ error::{fail, Context, Error, Result}, @@ -7,6 +10,13 @@ use crate::internal::{ #[allow(unused)] pub trait SimpleDeserializer<'de>: Context + Sized { + fn skip(&mut self, n: usize) -> Result<()> { + for _ in 0..n { + self.deserialize_any(IgnoredAny)?; + } + Ok(()) + } + fn deserialize_any>(&mut self, visitor: V) -> Result { fail!(in self, "Deserializer does not implement deserialize_any"); } diff --git a/serde_arrow/src/test/deserialization/union.rs b/serde_arrow/src/test/deserialization/union.rs index 3699117e..ec13feca 100644 --- a/serde_arrow/src/test/deserialization/union.rs +++ b/serde_arrow/src/test/deserialization/union.rs @@ 
-33,17 +33,6 @@ fn non_consecutive_offsets() { ), ]; - // second type does not start at 0 - let view = ArrayView::DenseUnion(DenseUnionArrayView { - types: &[0, 0, 1], - offsets: &[0, 1, 2], - fields: fields.clone(), - }); - assert_error_contains( - &ArrayDeserializer::new(String::from("foo"), None, view), - "initial zero offsets", - ); - // first type has an unused element let view = ArrayView::DenseUnion(DenseUnionArrayView { types: &[0, 0, 1], diff --git a/serde_arrow/src/test_with_arrow/issue_248_slices_deserialization.rs b/serde_arrow/src/test_with_arrow/issue_248_slices_deserialization.rs index 7a9971d6..86a7e890 100644 --- a/serde_arrow/src/test_with_arrow/issue_248_slices_deserialization.rs +++ b/serde_arrow/src/test_with_arrow/issue_248_slices_deserialization.rs @@ -100,6 +100,7 @@ fn enums() { Str(String), } + // Note: this works, as the initial offset for the remaining variants is 0 let data = vec![ Item(Value::I64(0)), Item(Value::I64(1)), @@ -114,3 +115,29 @@ fn enums() { let actual: Vec> = serde_arrow::from_record_batch(&batch).unwrap(); assert_eq!(data[2..4], actual); } + +#[test] +fn enums_2() { + #[derive(serde::Deserialize, serde::Serialize, Debug, PartialEq)] + enum Value { + I64(i64), + I32(i32), + Str(String), + } + + // note use always the same variant to force the sliced offsets to result in non-zero initial + // offset for type 0 + let data = vec![ + Item(Value::I64(0)), + Item(Value::I64(1)), + Item(Value::I64(2)), + Item(Value::I64(3)), + ]; + let fields = Vec::::from_type::>(TracingOptions::default()).unwrap(); + + let batch = serde_arrow::to_record_batch(&fields, &data).unwrap(); + let batch = batch.slice(2, 2); + + let actual: Vec> = serde_arrow::from_record_batch(&batch).unwrap(); + assert_eq!(data[2..4], actual); +} From 43327f9181142c1d132f571498780a333a306469 Mon Sep 17 00:00:00 2001 From: Christopher Prohm Date: Sat, 26 Oct 2024 19:17:02 +0200 Subject: [PATCH 10/10] Fix typo --- Changes.md | 2 +- 1 file changed, 1 insertion(+), 1 
deletion(-) diff --git a/Changes.md b/Changes.md index 1ffe6c74..afc36427 100644 --- a/Changes.md +++ b/Changes.md @@ -6,7 +6,7 @@ Bug fixes: - Fixed deserialization from sliced arrays ([#248](https://github.com/chmp/serde_arrow/issues/248)). Note that the current solution requires up front work when constructing the array deserializers, - as described in the issue. The removal of the performance penality is tracked in + as described in the issue. The removal of the performance penalty is tracked in ([#250](https://github.com/chmp/serde_arrow/issues/250)) ### Thanks