Skip to content

Commit

Permalink
Add support for interval types
Browse files Browse the repository at this point in the history
  • Loading branch information
chmp committed Oct 13, 2024
1 parent 73077ea commit 05c8817
Show file tree
Hide file tree
Showing 11 changed files with 340 additions and 6 deletions.
8 changes: 4 additions & 4 deletions marrow/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -40,11 +40,13 @@ arrow-38 = ["dep:arrow-array-38", "dep:arrow-schema-38", "dep:arrow-data-38", "d
arrow-37 = ["dep:arrow-array-37", "dep:arrow-schema-37", "dep:arrow-data-37", "dep:arrow-buffer-37"]

# support for different arrow2 versions
arrow2-0-17 = ["dep:arrow2-0-17", "dep:bytemuck", "half/bytemuck"]
arrow2-0-16 = ["dep:arrow2-0-16", "dep:bytemuck", "half/bytemuck"]
arrow2-0-17 = ["dep:arrow2-0-17", "half/bytemuck"]
arrow2-0-16 = ["dep:arrow2-0-16", "half/bytemuck"]

[dependencies]
bytemuck = { version = "1", default-features = false, features = ["derive"] }
half = { version = "2", default-features = false }

serde = { version = "1.0", default-features = false, features = ["std", "derive"], optional = true }

# arrow-version:insert: arrow-array-{version} = {{ package = "arrow-array", version = "{version}", optional = true, default-features = false }}
Expand Down Expand Up @@ -125,5 +127,3 @@ arrow-schema-37 = { package = "arrow-schema", version = "37", optional = true, d

arrow2-0-17 = { package = "arrow2", version = "0.17", optional = true, default-features = false }
arrow2-0-16 = { package = "arrow2", version = "0.16", optional = true, default-features = false }

bytemuck = { version = "1", optional = true, default-features = false }
16 changes: 16 additions & 0 deletions marrow/src/array.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ use half::f16;
use crate::{
datatypes::{FieldMeta, MapMeta, TimeUnit},
error::{fail, ErrorKind, Result},
types::{DayTimeInterval, MonthDayNanoInterval},
view::{
BitsWithOffset, BooleanView, BytesView, DecimalView, DenseUnionView, DictionaryView,
FixedSizeBinaryView, FixedSizeListView, ListView, MapView, NullView, PrimitiveView,
Expand Down Expand Up @@ -59,6 +60,18 @@ pub enum Array {
Timestamp(TimestampArray),
/// An `i64` array of durations
Duration(TimeArray<i64>),
/// Interval with `YearMonth` unit
///
/// Interval arrays are not supported for `arrow2`.
YearMonthInterval(PrimitiveArray<i32>),
/// Interval with `DayTime` unit
///
/// Interval arrays are not supported for `arrow2`.
DayTimeInterval(PrimitiveArray<DayTimeInterval>),
/// Interval with `MonthDayNano` unit
///
/// Interval arrays are not supported for `arrow2`.
MonthDayNanoInterval(PrimitiveArray<MonthDayNanoInterval>),
/// A `[u8]` array with `i32` offsets of strings
Utf8(BytesArray<i32>),
/// A `[u8]` array with `i64` offsets of strings
Expand Down Expand Up @@ -113,6 +126,9 @@ impl Array {
Self::Time64(array) => View::Time64(array.as_view()),
Self::Timestamp(array) => View::Timestamp(array.as_view()),
Self::Duration(array) => View::Duration(array.as_view()),
Self::YearMonthInterval(array) => View::YearMonthInterval(array.as_view()),
Self::DayTimeInterval(array) => View::DayTimeInterval(array.as_view()),
Self::MonthDayNanoInterval(array) => View::MonthDayNanoInterval(array.as_view()),
Self::Binary(array) => View::Binary(array.as_view()),
Self::LargeBinary(array) => View::LargeBinary(array.as_view()),
Self::FixedSizeBinary(array) => View::FixedSizeBinary(array.as_view()),
Expand Down
17 changes: 17 additions & 0 deletions marrow/src/error.rs
Original file line number Diff line number Diff line change
Expand Up @@ -163,3 +163,20 @@ impl From<std::num::TryFromIntError> for MarrowError {
)
}
}

impl From<bytemuck::PodCastError> for MarrowError {
fn from(err: bytemuck::PodCastError) -> Self {
let err = match err {
bytemuck::PodCastError::TargetAlignmentGreaterAndInputNotAligned => {
"TargetAlignmentGreaterAndInputNotAligned"
}
bytemuck::PodCastError::OutputSliceWouldHaveSlop => "OutputSliceWouldHaveSlop",
bytemuck::PodCastError::SizeMismatch => "SizeMismatch",
bytemuck::PodCastError::AlignmentMismatch => "AlignmentMismatch",
};
MarrowError::new(
ErrorKind::Unsupported,
format!("bytemuck::PodCastError: {err}"),
)
}
}
48 changes: 48 additions & 0 deletions marrow/src/impl_arrow/impl_api_base.rs
Original file line number Diff line number Diff line change
Expand Up @@ -353,6 +353,26 @@ fn build_array_data(value: Array) -> Result<arrow_data::ArrayData> {
arr.validity,
arr.values,
),
A::YearMonthInterval(arr) => primitive_into_data(
arrow_schema::DataType::Interval(arrow_schema::IntervalUnit::YearMonth),
arr.validity,
arr.values,
),
A::DayTimeInterval(arr) => primitive_into_data(
arrow_schema::DataType::Interval(arrow_schema::IntervalUnit::DayTime),
arr.validity,
// NOTE: bytemuck::allocation::try_cast_vec enforces exact alignment. This cannot be
// guaranteed between different arrow versions (arrow < 52 used i64, arrow >= 52 has its
// own type with different alignment). Therefore convert the vector elementwise and
// create a new vector.
try_cast_vec::<_, i64>(arr.values)?,
),
A::MonthDayNanoInterval(arr) => primitive_into_data(
arrow_schema::DataType::Interval(arrow_schema::IntervalUnit::MonthDayNano),
arr.validity,
// See note for A::DayTimeInterval
try_cast_vec::<_, i128>(arr.values)?,
),
A::Decimal128(arr) => primitive_into_data(
arrow_schema::DataType::Decimal128(arr.precision, arr.scale),
arr.validity,
Expand Down Expand Up @@ -725,6 +745,26 @@ impl<'a> TryFrom<&'a dyn arrow_array::Array> for View<'a> {
validity: get_bits_with_offset(array),
values: array.values(),
}))
} else if let Some(array) = any.downcast_ref::<arrow_array::IntervalYearMonthArray>() {
Ok(View::YearMonthInterval(PrimitiveView {
validity: get_bits_with_offset(array),
values: array.values(),
}))
} else if let Some(array) = any.downcast_ref::<arrow_array::IntervalDayTimeArray>() {
Ok(View::DayTimeInterval(PrimitiveView {
validity: get_bits_with_offset(array),
// bytemuck checks the alignment dynamically. This check always succeeds if the target
// alignment is smaller than or equal to the source alignment. This is the case here, as
// structs are aligned to their largest field (which is at most 64 bits) and arrow
// aligns to 64 bits.
values: bytemuck::try_cast_slice(array.values().inner().as_slice())?,
}))
} else if let Some(array) = any.downcast_ref::<arrow_array::IntervalMonthDayNanoArray>() {
Ok(View::MonthDayNanoInterval(PrimitiveView {
validity: get_bits_with_offset(array),
// See note for DayTimeInterval
values: bytemuck::try_cast_slice(array.values().inner().as_slice())?,
}))
} else if let Some(array) = any.downcast_ref::<arrow_array::StringArray>() {
Ok(View::Utf8(BytesView {
validity: get_bits_with_offset(array),
Expand Down Expand Up @@ -987,3 +1027,11 @@ fn get_bits_with_offset(array: &dyn arrow_array::Array) -> Option<BitsWithOffset
data: validity.validity(),
})
}

/// Convert a vector of `A` into a vector of `B` by casting each element with
/// `bytemuck::try_cast`
///
/// A new vector is always allocated; the allocation of the input vector is not reused. The
/// conversion therefore works by value on every element instead of reinterpreting the buffer
/// as a whole.
///
/// # Errors
///
/// Returns the first error reported by `bytemuck::try_cast`, converted into the error type of
/// `Result` by the `?` operator.
fn try_cast_vec<A: bytemuck::NoUninit, B: bytemuck::AnyBitPattern>(a: Vec<A>) -> Result<Vec<B>> {
    // The output holds exactly one element per input element: reserve the space once instead
    // of growing the vector repeatedly while pushing
    let mut res = Vec::with_capacity(a.len());
    for item in a {
        res.push(bytemuck::try_cast(item)?);
    }
    Ok(res)
}
4 changes: 4 additions & 0 deletions marrow/src/impl_arrow2/impl.rs
Original file line number Diff line number Diff line change
Expand Up @@ -515,6 +515,10 @@ impl TryFrom<Array> for Box<dyn arrow2::array::Array> {
validity,
)?))
}
A::YearMonthInterval(_) | A::DayTimeInterval(_) | A::MonthDayNanoInterval(_) => fail!(
ErrorKind::Unsupported,
"Interval arrays are not supported for arrow2"
),
}
}
}
Expand Down
2 changes: 2 additions & 0 deletions marrow/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,8 @@ pub mod datatypes;
#[deny(missing_docs)]
pub mod error;
#[deny(missing_docs)]
pub mod types;
#[deny(missing_docs)]
pub mod view;

mod impl_arrow;
Expand Down
35 changes: 35 additions & 0 deletions marrow/src/types.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
//! Specialized element types of arrays

/// Represent a calendar interval as days and milliseconds
///
/// The struct is `#[repr(C)]` and consists of two `i32` fields. The `interval_sizes` test in
/// this module checks that it occupies as many bytes as an `i64`. The `bytemuck` derives allow
/// values of this type to be used with the `bytemuck` cast functions.
#[derive(Debug, PartialEq, Clone, Copy, bytemuck::AnyBitPattern, bytemuck::NoUninit)]
#[repr(C)]
pub struct DayTimeInterval {
/// The number of days in the interval
pub days: i32,
/// The number of milliseconds in the interval
pub milliseconds: i32,
}

/// Represent a calendar interval as months, days and nanoseconds
///
/// The struct is `#[repr(C)]` and consists of two `i32` fields followed by one `i64` field. The
/// `interval_sizes` test in this module checks that it occupies as many bytes as an `i128`. The
/// `bytemuck` derives allow values of this type to be used with the `bytemuck` cast functions.
#[derive(Debug, PartialEq, Clone, Copy, bytemuck::AnyBitPattern, bytemuck::NoUninit)]
#[repr(C)]
pub struct MonthDayNanoInterval {
/// The number of months in the interval
pub months: i32,
/// The number of days in the interval
pub days: i32,
/// The number of nanoseconds in the interval
pub nanoseconds: i64,
}

/// Check that the interval structs occupy exactly as many bytes as the integer types they are
/// compared against (`i64` and `i128`)
#[test]
fn interval_sizes() {
    use std::mem::size_of;

    let day_time_bytes = size_of::<DayTimeInterval>();
    assert_eq!(day_time_bytes, size_of::<i64>());

    let month_day_nano_bytes = size_of::<MonthDayNanoInterval>();
    assert_eq!(month_day_nano_bytes, size_of::<i128>());
}
7 changes: 7 additions & 0 deletions marrow/src/view.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ use half::f16;
use crate::{
datatypes::{FieldMeta, MapMeta, TimeUnit},
error::{fail, ErrorKind, Result},
types::{DayTimeInterval, MonthDayNanoInterval},
};

// assert that the `Array` implements the expected traits
Expand Down Expand Up @@ -58,6 +59,12 @@ pub enum View<'a> {
Timestamp(TimestampView<'a>),
/// See [`Array::Duration`][crate::array::Array::Duration]
Duration(TimeView<'a, i64>),
/// See [`Array::YearMonthInterval`][crate::array::Array::YearMonthInterval]
YearMonthInterval(PrimitiveView<'a, i32>),
/// See [`Array::DayTimeInterval`][crate::array::Array::DayTimeInterval]
DayTimeInterval(PrimitiveView<'a, DayTimeInterval>),
/// See [`Array::MonthDayNanoInterval`][crate::array::Array::MonthDayNanoInterval]
MonthDayNanoInterval(PrimitiveView<'a, MonthDayNanoInterval>),
/// See [`Array::Utf8`][crate::array::Array::Utf8]
Utf8(BytesView<'a, i32>),
/// See [`Array::LargeUtf8`][crate::array::Array::LargeUtf8]
Expand Down
4 changes: 2 additions & 2 deletions test_with_arrow/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,8 @@ macro_rules! define_test_module {
}

// arrow-version:insert: define_test_module!("arrow-{version}", arrow_{version}, arrow_array_{version}, arrow_schema_{version}, utils, arrays, data_types,struct_arrays, fixed_size_binary_arrays, intervals, union_arrays);
define_test_module!("arrow-53", arrow_53, arrow_array_53, arrow_schema_53, utils, arrays, data_types,struct_arrays, fixed_size_binary_arrays, union_arrays);
define_test_module!("arrow-52", arrow_52, arrow_array_52, arrow_schema_52, utils, arrays, data_types,struct_arrays, fixed_size_binary_arrays, union_arrays);
define_test_module!("arrow-53", arrow_53, arrow_array_53, arrow_schema_53, utils, arrays, data_types,struct_arrays, fixed_size_binary_arrays, intervals, union_arrays);
define_test_module!("arrow-52", arrow_52, arrow_array_52, arrow_schema_52, utils, arrays, data_types,struct_arrays, fixed_size_binary_arrays, intervals, union_arrays);
define_test_module!("arrow-51", arrow_51, arrow_array_51, arrow_schema_51, utils, arrays, data_types,struct_arrays, fixed_size_binary_arrays);
define_test_module!("arrow-50", arrow_50, arrow_array_50, arrow_schema_50, utils, arrays, data_types,struct_arrays, fixed_size_binary_arrays);
define_test_module!("arrow-49", arrow_49, arrow_array_49, arrow_schema_49, utils, arrays, data_types,struct_arrays, fixed_size_binary_arrays);
Expand Down
Loading

0 comments on commit 05c8817

Please sign in to comment.