From 05c88177f877d09ce5a7500e3166f48ce25935ad Mon Sep 17 00:00:00 2001 From: Christopher Prohm Date: Sun, 13 Oct 2024 13:47:29 +0200 Subject: [PATCH] Add support for interval types --- marrow/Cargo.toml | 8 +- marrow/src/array.rs | 16 +++ marrow/src/error.rs | 17 +++ marrow/src/impl_arrow/impl_api_base.rs | 48 ++++++++ marrow/src/impl_arrow2/impl.rs | 4 + marrow/src/lib.rs | 2 + marrow/src/types.rs | 35 ++++++ marrow/src/view.rs | 7 ++ test_with_arrow/src/lib.rs | 4 +- test_with_arrow/src/tests/arrays.rs | 160 +++++++++++++++++++++++++ test_with_arrow/src/tests/intervals.rs | 45 +++++++ 11 files changed, 340 insertions(+), 6 deletions(-) create mode 100644 marrow/src/types.rs create mode 100644 test_with_arrow/src/tests/intervals.rs diff --git a/marrow/Cargo.toml b/marrow/Cargo.toml index 98f6877..9ee68d1 100644 --- a/marrow/Cargo.toml +++ b/marrow/Cargo.toml @@ -40,11 +40,13 @@ arrow-38 = ["dep:arrow-array-38", "dep:arrow-schema-38", "dep:arrow-data-38", "d arrow-37 = ["dep:arrow-array-37", "dep:arrow-schema-37", "dep:arrow-data-37", "dep:arrow-buffer-37"] # support for different arrow2 versions -arrow2-0-17 = ["dep:arrow2-0-17", "dep:bytemuck", "half/bytemuck"] -arrow2-0-16 = ["dep:arrow2-0-16", "dep:bytemuck", "half/bytemuck"] +arrow2-0-17 = ["dep:arrow2-0-17", "half/bytemuck"] +arrow2-0-16 = ["dep:arrow2-0-16", "half/bytemuck"] [dependencies] +bytemuck = { version = "1", default-features = false, features = ["derive"] } half = { version = "2", default-features = false } + serde = { version = "1.0", default-features = false, features = ["std", "derive"], optional = true } # arrow-version:insert: arrow-array-{version} = {{ package = "arrow-array", version = "{version}", optional = true, default-features = false }} @@ -125,5 +127,3 @@ arrow-schema-37 = { package = "arrow-schema", version = "37", optional = true, d arrow2-0-17 = { package = "arrow2", version = "0.17", optional = true, default-features = false } arrow2-0-16 = { package = "arrow2", version = "0.16", 
optional = true, default-features = false } - -bytemuck = { version = "1", optional = true, default-features = false } diff --git a/marrow/src/array.rs b/marrow/src/array.rs index e562d96..2503f51 100644 --- a/marrow/src/array.rs +++ b/marrow/src/array.rs @@ -4,6 +4,7 @@ use half::f16; use crate::{ datatypes::{FieldMeta, MapMeta, TimeUnit}, error::{fail, ErrorKind, Result}, + types::{DayTimeInterval, MonthDayNanoInterval}, view::{ BitsWithOffset, BooleanView, BytesView, DecimalView, DenseUnionView, DictionaryView, FixedSizeBinaryView, FixedSizeListView, ListView, MapView, NullView, PrimitiveView, @@ -59,6 +60,18 @@ pub enum Array { Timestamp(TimestampArray), /// An `i64` array of durations Duration(TimeArray), + /// Interval with `YearMonth` unit + /// + /// Interval arrays are not supported for `arrow2`. + YearMonthInterval(PrimitiveArray), + /// Interval with `DayTime` unit + /// + /// Interval arrays are not supported for `arrow2`. + DayTimeInterval(PrimitiveArray), + /// Interval with `MonthDayNano` unit + /// + /// Interval arrays are not supported for `arrow2`. 
+ MonthDayNanoInterval(PrimitiveArray), /// A `[u8]` array with `i32` offsets of strings Utf8(BytesArray), /// A `[u8]` array with `i64` offsets of strings @@ -113,6 +126,9 @@ impl Array { Self::Time64(array) => View::Time64(array.as_view()), Self::Timestamp(array) => View::Timestamp(array.as_view()), Self::Duration(array) => View::Duration(array.as_view()), + Self::YearMonthInterval(array) => View::YearMonthInterval(array.as_view()), + Self::DayTimeInterval(array) => View::DayTimeInterval(array.as_view()), + Self::MonthDayNanoInterval(array) => View::MonthDayNanoInterval(array.as_view()), Self::Binary(array) => View::Binary(array.as_view()), Self::LargeBinary(array) => View::LargeBinary(array.as_view()), Self::FixedSizeBinary(array) => View::FixedSizeBinary(array.as_view()), diff --git a/marrow/src/error.rs b/marrow/src/error.rs index 2d805fa..62ec3bd 100644 --- a/marrow/src/error.rs +++ b/marrow/src/error.rs @@ -163,3 +163,20 @@ impl From for MarrowError { ) } } + +impl From for MarrowError { + fn from(err: bytemuck::PodCastError) -> Self { + let err = match err { + bytemuck::PodCastError::TargetAlignmentGreaterAndInputNotAligned => { + "TargetAlignmentGreaterAndInputNotAligned" + } + bytemuck::PodCastError::OutputSliceWouldHaveSlop => "OutputSliceWouldHaveSlop", + bytemuck::PodCastError::SizeMismatch => "SizeMismatch", + bytemuck::PodCastError::AlignmentMismatch => "AlignmentMismatch", + }; + MarrowError::new( + ErrorKind::Unsupported, + format!("bytemuck::PodCastError: {err}"), + ) + } +} diff --git a/marrow/src/impl_arrow/impl_api_base.rs b/marrow/src/impl_arrow/impl_api_base.rs index 1319150..221e7a9 100644 --- a/marrow/src/impl_arrow/impl_api_base.rs +++ b/marrow/src/impl_arrow/impl_api_base.rs @@ -353,6 +353,26 @@ fn build_array_data(value: Array) -> Result { arr.validity, arr.values, ), + A::YearMonthInterval(arr) => primitive_into_data( + arrow_schema::DataType::Interval(arrow_schema::IntervalUnit::YearMonth), + arr.validity, + arr.values, + ), + 
A::DayTimeInterval(arr) => primitive_into_data( + arrow_schema::DataType::Interval(arrow_schema::IntervalUnit::DayTime), + arr.validity, + // NOTE: bytemuck::allocation::try_cast_vec enforces exact alignment. This cannot be + // guaranteed between different arrow versions (arrow < 52 used i64, arrow >= 52 has its + // own type with different alignment). Therefore convert the vector elementwise and + // create a new vector. + try_cast_vec::<_, i64>(arr.values)?, + ), + A::MonthDayNanoInterval(arr) => primitive_into_data( + arrow_schema::DataType::Interval(arrow_schema::IntervalUnit::MonthDayNano), + arr.validity, + // See note for A::DayTimeInterval + try_cast_vec::<_, i128>(arr.values)?, + ), A::Decimal128(arr) => primitive_into_data( arrow_schema::DataType::Decimal128(arr.precision, arr.scale), arr.validity, @@ -725,6 +745,26 @@ impl<'a> TryFrom<&'a dyn arrow_array::Array> for View<'a> { validity: get_bits_with_offset(array), values: array.values(), })) + } else if let Some(array) = any.downcast_ref::() { + Ok(View::YearMonthInterval(PrimitiveView { + validity: get_bits_with_offset(array), + values: array.values(), + })) + } else if let Some(array) = any.downcast_ref::() { + Ok(View::DayTimeInterval(PrimitiveView { + validity: get_bits_with_offset(array), + // bytemuck checks the alignment dynamically. This check always succeeds if the target + // alignment is smaller than or equal to the source alignment. This is the case here, as + // structs are aligned to their largest field (which is at most 64 bits) and arrow + // aligns to 64 bits.
+ values: bytemuck::try_cast_slice(array.values().inner().as_slice())?, + })) + } else if let Some(array) = any.downcast_ref::() { + Ok(View::MonthDayNanoInterval(PrimitiveView { + validity: get_bits_with_offset(array), + // See note for DayTimeInterval + values: bytemuck::try_cast_slice(array.values().inner().as_slice())?, + })) } else if let Some(array) = any.downcast_ref::() { Ok(View::Utf8(BytesView { validity: get_bits_with_offset(array), @@ -987,3 +1027,11 @@ fn get_bits_with_offset(array: &dyn arrow_array::Array) -> Option(a: Vec) -> Result> { + let mut res = Vec::new(); + for item in a { + res.push(bytemuck::try_cast(item)?); + } + Ok(res) +} diff --git a/marrow/src/impl_arrow2/impl.rs b/marrow/src/impl_arrow2/impl.rs index 4e15388..d8e701d 100644 --- a/marrow/src/impl_arrow2/impl.rs +++ b/marrow/src/impl_arrow2/impl.rs @@ -515,6 +515,10 @@ impl TryFrom for Box { validity, )?)) } + A::YearMonthInterval(_) | A::DayTimeInterval(_) | A::MonthDayNanoInterval(_) => fail!( + ErrorKind::Unsupported, + "Interval arrays are not supported for arrow2" + ), } } } diff --git a/marrow/src/lib.rs b/marrow/src/lib.rs index 9f777f8..b46ec87 100644 --- a/marrow/src/lib.rs +++ b/marrow/src/lib.rs @@ -128,6 +128,8 @@ pub mod datatypes; #[deny(missing_docs)] pub mod error; #[deny(missing_docs)] +pub mod types; +#[deny(missing_docs)] pub mod view; mod impl_arrow; diff --git a/marrow/src/types.rs b/marrow/src/types.rs new file mode 100644 index 0000000..ec64d1e --- /dev/null +++ b/marrow/src/types.rs @@ -0,0 +1,35 @@ +//! 
Specialized element types of arrays + +/// Represent a calendar interval as days and milliseconds +#[derive(Debug, PartialEq, Clone, Copy, bytemuck::AnyBitPattern, bytemuck::NoUninit)] +#[repr(C)] +pub struct DayTimeInterval { + /// The number of days in the interval + pub days: i32, + /// The number of milliseconds in the interval + pub milliseconds: i32, +} + +/// Represent a calendar interval as months, days and nanoseconds +#[derive(Debug, PartialEq, Clone, Copy, bytemuck::AnyBitPattern, bytemuck::NoUninit)] +#[repr(C)] +pub struct MonthDayNanoInterval { + /// The number of months in the interval + pub months: i32, + /// The number of days in the interval + pub days: i32, + /// The number of nanoseconds in the interval + pub nanoseconds: i64, +} + +#[test] +fn interval_sizes() { + assert_eq!( + std::mem::size_of::(), + std::mem::size_of::() + ); + assert_eq!( + std::mem::size_of::(), + std::mem::size_of::() + ); +} diff --git a/marrow/src/view.rs b/marrow/src/view.rs index 8097359..5a01cc0 100644 --- a/marrow/src/view.rs +++ b/marrow/src/view.rs @@ -6,6 +6,7 @@ use half::f16; use crate::{ datatypes::{FieldMeta, MapMeta, TimeUnit}, error::{fail, ErrorKind, Result}, + types::{DayTimeInterval, MonthDayNanoInterval}, }; // assert that the `Array` implements the expected traits @@ -58,6 +59,12 @@ pub enum View<'a> { Timestamp(TimestampView<'a>), /// See [`Array::Duration`][crate::array::Array::Duration] Duration(TimeView<'a, i64>), + /// See [`Array::YearMonthInterval`][crate::array::Array::YearMonthInterval] + YearMonthInterval(PrimitiveView<'a, i32>), + /// See [`Array::DayTimeInterval`][crate::array::Array::DayTimeInterval] + DayTimeInterval(PrimitiveView<'a, DayTimeInterval>), + /// See [`Array::MonthDayNanoInterval`][crate::array::Array::MonthDayNanoInterval] + MonthDayNanoInterval(PrimitiveView<'a, MonthDayNanoInterval>), /// See [`Array::Utf8`][crate::array::Array::Utf8] Utf8(BytesView<'a, i32>), /// See [`Array::LargeUtf8`][crate::array::Array::LargeUtf8] 
diff --git a/test_with_arrow/src/lib.rs b/test_with_arrow/src/lib.rs index 17fb4f3..09e92de 100644 --- a/test_with_arrow/src/lib.rs +++ b/test_with_arrow/src/lib.rs @@ -17,8 +17,8 @@ macro_rules! define_test_module { } // arrow-version:insert: define_test_module!("arrow-{version}", arrow_{version}, arrow_array_{version}, arrow_schema_{version}, utils, arrays, data_types,struct_arrays, fixed_size_binary_arrays, intervals, union_arrays); -define_test_module!("arrow-53", arrow_53, arrow_array_53, arrow_schema_53, utils, arrays, data_types,struct_arrays, fixed_size_binary_arrays, union_arrays); -define_test_module!("arrow-52", arrow_52, arrow_array_52, arrow_schema_52, utils, arrays, data_types,struct_arrays, fixed_size_binary_arrays, union_arrays); +define_test_module!("arrow-53", arrow_53, arrow_array_53, arrow_schema_53, utils, arrays, data_types,struct_arrays, fixed_size_binary_arrays, intervals, union_arrays); +define_test_module!("arrow-52", arrow_52, arrow_array_52, arrow_schema_52, utils, arrays, data_types,struct_arrays, fixed_size_binary_arrays, intervals, union_arrays); define_test_module!("arrow-51", arrow_51, arrow_array_51, arrow_schema_51, utils, arrays, data_types,struct_arrays, fixed_size_binary_arrays); define_test_module!("arrow-50", arrow_50, arrow_array_50, arrow_schema_50, utils, arrays, data_types,struct_arrays, fixed_size_binary_arrays); define_test_module!("arrow-49", arrow_49, arrow_array_49, arrow_schema_49, utils, arrays, data_types,struct_arrays, fixed_size_binary_arrays); diff --git a/test_with_arrow/src/tests/arrays.rs b/test_with_arrow/src/tests/arrays.rs index 155fd15..ccb6479 100644 --- a/test_with_arrow/src/tests/arrays.rs +++ b/test_with_arrow/src/tests/arrays.rs @@ -752,6 +752,166 @@ mod duration_nanosecond { } } +mod interval_year_month { + use super::*; + + #[test] + fn not_nullable() -> PanicOnError<()> { + assert_arrays_eq( + as_array_ref::(vec![1, 2, 3]), + Array::YearMonthInterval(PrimitiveArray { + validity: None, + values: 
vec![1, 2, 3], + }), + ) + } + + #[test] + fn nullable() -> PanicOnError<()> { + assert_arrays_eq( + as_array_ref::(vec![Some(1), None, Some(3)]), + Array::YearMonthInterval(PrimitiveArray { + validity: Some(vec![0b_101]), + values: vec![1, 0, 3], + }), + ) + } +} + +mod internval_day_time { + use super::*; + + use marrow::types::DayTimeInterval; + + #[test] + fn not_nullable() -> PanicOnError<()> { + assert_arrays_eq( + as_array_ref::(vec![ + arrow_array::types::IntervalDayTimeType::make_value(1, 2), + arrow_array::types::IntervalDayTimeType::make_value(3, 4), + arrow_array::types::IntervalDayTimeType::make_value(5, 6), + ]), + Array::DayTimeInterval(PrimitiveArray { + validity: None, + values: vec![ + DayTimeInterval { + days: 1, + milliseconds: 2, + }, + DayTimeInterval { + days: 3, + milliseconds: 4, + }, + DayTimeInterval { + days: 5, + milliseconds: 6, + }, + ], + }), + ) + } + + #[test] + fn nullable() -> PanicOnError<()> { + assert_arrays_eq( + as_array_ref::(vec![ + Some(arrow_array::types::IntervalDayTimeType::make_value(1, 2)), + None, + Some(arrow_array::types::IntervalDayTimeType::make_value(5, 6)), + ]), + Array::DayTimeInterval(PrimitiveArray { + validity: Some(vec![0b_101]), + values: vec![ + DayTimeInterval { + days: 1, + milliseconds: 2, + }, + DayTimeInterval { + days: 0, + milliseconds: 0, + }, + DayTimeInterval { + days: 5, + milliseconds: 6, + }, + ], + }), + ) + } +} + +mod interval_month_day_nano { + use super::*; + + use marrow::types::MonthDayNanoInterval; + + #[test] + fn not_nullable() -> PanicOnError<()> { + assert_arrays_eq( + as_array_ref::(vec![ + arrow_array::types::IntervalMonthDayNanoType::make_value(1, 2, 3), + arrow_array::types::IntervalMonthDayNanoType::make_value(4, 5, 6), + arrow_array::types::IntervalMonthDayNanoType::make_value(7, 8, 9), + ]), + Array::MonthDayNanoInterval(PrimitiveArray { + validity: None, + values: vec![ + MonthDayNanoInterval { + months: 1, + days: 2, + nanoseconds: 3, + }, + MonthDayNanoInterval { + 
months: 4, + days: 5, + nanoseconds: 6, + }, + MonthDayNanoInterval { + months: 7, + days: 8, + nanoseconds: 9, + }, + ], + }), + ) + } + + #[test] + fn nullable() -> PanicOnError<()> { + assert_arrays_eq( + as_array_ref::(vec![ + Some(arrow_array::types::IntervalMonthDayNanoType::make_value( + 1, 2, 3, + )), + None, + Some(arrow_array::types::IntervalMonthDayNanoType::make_value( + 7, 8, 9, + )), + ]), + Array::MonthDayNanoInterval(PrimitiveArray { + validity: Some(vec![0b_101]), + values: vec![ + MonthDayNanoInterval { + months: 1, + days: 2, + nanoseconds: 3, + }, + MonthDayNanoInterval { + months: 0, + days: 0, + nanoseconds: 0, + }, + MonthDayNanoInterval { + months: 7, + days: 8, + nanoseconds: 9, + }, + ], + }), + ) + } +} + mod timestamp_second { use super::*; diff --git a/test_with_arrow/src/tests/intervals.rs b/test_with_arrow/src/tests/intervals.rs new file mode 100644 index 0000000..64e8e27 --- /dev/null +++ b/test_with_arrow/src/tests/intervals.rs @@ -0,0 +1,45 @@ +// check the layout of the interval types + +#[test] +fn interval_layout_day_time() { + assert_eq!( + std::mem::size_of::(), + std::mem::size_of::(), + ); + assert_eq!( + std::mem::align_of::(), + std::mem::align_of::(), + ); + assert_eq!( + std::mem::offset_of!(arrow_array::types::IntervalDayTime, days), + std::mem::offset_of!(marrow::types::DayTimeInterval, days), + ); + assert_eq!( + std::mem::offset_of!(arrow_array::types::IntervalDayTime, milliseconds), + std::mem::offset_of!(marrow::types::DayTimeInterval, milliseconds), + ); +} + +#[test] +fn interval_layout_month_day_nano() { + assert_eq!( + std::mem::size_of::(), + std::mem::size_of::(), + ); + assert_eq!( + std::mem::align_of::(), + std::mem::align_of::(), + ); + assert_eq!( + std::mem::offset_of!(arrow_array::types::IntervalMonthDayNano, months), + std::mem::offset_of!(marrow::types::MonthDayNanoInterval, months), + ); + assert_eq!( + std::mem::offset_of!(arrow_array::types::IntervalMonthDayNano, days), + 
std::mem::offset_of!(marrow::types::MonthDayNanoInterval, days), + ); + assert_eq!( + std::mem::offset_of!(arrow_array::types::IntervalMonthDayNano, nanoseconds), + std::mem::offset_of!(marrow::types::MonthDayNanoInterval, nanoseconds), + ); +}