From c42b19112d5eff9546b3e41774525d5b345b939a Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Sat, 28 Sep 2024 06:54:31 -0400 Subject: [PATCH 1/7] Add better documentation, examples and builer-style API to `ByteView` --- arrow-array/src/array/byte_view_array.rs | 7 ++- arrow-data/src/byte_view.rs | 68 +++++++++++++++++++++++- 2 files changed, 71 insertions(+), 4 deletions(-) diff --git a/arrow-array/src/array/byte_view_array.rs b/arrow-array/src/array/byte_view_array.rs index c53478d8b057..67e8e4b5b6a3 100644 --- a/arrow-array/src/array/byte_view_array.rs +++ b/arrow-array/src/array/byte_view_array.rs @@ -44,8 +44,11 @@ use super::ByteArrayType; /// /// # See Also /// -/// See [`StringViewArray`] for storing utf8 encoded string data and -/// [`BinaryViewArray`] for storing bytes. +/// * [`StringViewArray`] for storing utf8 encoded string data +/// * [`BinaryViewArray`] for storing bytes +/// * [`ByteView`] to interpret `u128`s layout of the views. +/// +/// [`ByteView`]: arrow_data::ByteView /// /// # Notes /// diff --git a/arrow-data/src/byte_view.rs b/arrow-data/src/byte_view.rs index 6f6d6d175689..98319d3a8eda 100644 --- a/arrow-data/src/byte_view.rs +++ b/arrow-data/src/byte_view.rs @@ -21,8 +21,40 @@ use arrow_schema::ArrowError; /// Helper to access views of [`GenericByteViewArray`] (`StringViewArray` and /// `BinaryViewArray`) where the length is greater than 12 bytes. /// -/// See the documentation on [`GenericByteViewArray`] for more information on -/// the layout of the views. +/// See Also: +/// * [`GenericByteViewArray`] for more information on the layout of the views. +/// * [`validate_binary_view`] and [`validate_string_view`] to validate +/// +/// # Example: Create a new u128 view +/// +/// ```rust +/// # use arrow_data::ByteView;; +/// // Create a view for a string of length 20 +/// // first four bytes are "Rust" +/// // stored in buffer 3 +/// // at offset 42 +/// let prefix = "Rust"; +/// let view = ByteView::new(20, prefix.as_bytes()) +/// .with_buffer_index(3) +/// .with_offset(42); +/// +/// // create the final u128 +/// let v = view.as_u128(); +/// assert_eq!(v, 0x2a000000037473755200000014); +/// ``` +/// +/// # Example: decode a `u128` into its constituent fields +/// ```rust +/// # use arrow_data::ByteView; +/// // Convert a u128 to a ByteView +/// // See validate_{string,binary}_view functions to validate +/// let v = ByteView::from(0x2a000000037473755200000014); +/// +/// assert_eq!(v.length, 20); +/// assert_eq!(v.prefix, 0x74737552); +/// assert_eq!(v.buffer_index, 3); +/// assert_eq!(v.offset, 42); +/// ``` /// /// [`GenericByteViewArray`]: https://docs.rs/arrow/latest/arrow/array/struct.GenericByteViewArray.html #[derive(Debug, Copy, Clone, Default)] @@ -39,6 +71,38 @@ pub struct ByteView { } impl ByteView { + /// Construct a [`ByteView`] for data `length` of bytes with the specified prefix. + /// + /// See example on [`ByteView`] docs + /// + /// Notes: + /// * the length should always be greater than 12 (Data less than 12 + /// bytes is stored as an inline view) + /// * buffer and offset are set to `0` + /// + /// # Panics + /// If the prefix is not exactly 4 bytes + pub fn new(length: u32, prefix: &[u8]) -> Self { + Self { + length, + prefix: u32::from_le_bytes(prefix.try_into().unwrap()), + buffer_index: 0, + offset: 0, + } + } + + /// Set the [`Self::buffer_index`] field + pub fn with_buffer_index(mut self, buffer_index: u32) -> Self { + self.buffer_index = buffer_index; + self + } + + /// Set the [`Self::offset`] field + pub fn with_offset(mut self, offset: u32) -> Self { + self.offset = offset; + self + } + #[inline(always)] /// Convert `ByteView` to `u128` by concatenating the fields pub fn as_u128(self) -> u128 { From 9b1205e997eff1587ae083c476892118af7d1235 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Sat, 28 Sep 2024 08:03:43 -0400 Subject: [PATCH 2/7] simplify tests --- arrow-array/src/array/byte_view_array.rs | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) diff --git a/arrow-array/src/array/byte_view_array.rs b/arrow-array/src/array/byte_view_array.rs index 67e8e4b5b6a3..fa3ec3d7692d 100644 --- a/arrow-array/src/array/byte_view_array.rs +++ b/arrow-array/src/array/byte_view_array.rs @@ -875,12 +875,9 @@ mod tests { #[should_panic(expected = "Invalid buffer index at 0: got index 3 but only has 1 buffers")] fn new_with_invalid_view_data() { let v = "large payload over 12 bytes"; - let view = ByteView { - length: 13, - prefix: u32::from_le_bytes(v.as_bytes()[0..4].try_into().unwrap()), - buffer_index: 3, - offset: 1, - }; + let view = ByteView::new(13, &v.as_bytes()[0..4]) + .with_buffer_index(3) + .with_offset(1); let views = ScalarBuffer::from(vec![view.into()]); let buffers = vec![Buffer::from_slice_ref(v)]; StringViewArray::new(views, buffers, None); @@ -892,12 +889,7 @@ mod tests { )] fn new_with_invalid_utf8_data() { let v: Vec = vec![0xf0, 0x80, 0x80, 0x80]; - let view = ByteView { - length: v.len() as u32, - prefix: u32::from_le_bytes(v[0..4].try_into().unwrap()), - buffer_index: 0, - offset: 0, - }; + let view = ByteView::new(v.len() as u32, &v); let views = ScalarBuffer::from(vec![view.into()]); let buffers = vec![Buffer::from_slice_ref(v)]; StringViewArray::new(views, buffers, None); From 29dd1ec3e7a9222105ef94787a3040a42e8d8a32 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Mon, 30 Sep 2024 14:14:49 -0400 Subject: [PATCH 3/7] Apply suggestions from code review Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> --- arrow-data/src/byte_view.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/arrow-data/src/byte_view.rs b/arrow-data/src/byte_view.rs index 98319d3a8eda..84ab5f853ec4 100644 --- a/arrow-data/src/byte_view.rs +++ b/arrow-data/src/byte_view.rs @@ -82,6 +82,7 @@ impl ByteView { /// /// # Panics /// If the prefix is not exactly 4 bytes + #[inline] pub fn new(length: u32, prefix: &[u8]) -> Self { Self { length, From 179c1ca6aa2e20210119487f56e7cb47a93be4f0 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Mon, 30 Sep 2024 14:15:40 -0400 Subject: [PATCH 4/7] Moar inline --- arrow-data/src/byte_view.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arrow-data/src/byte_view.rs b/arrow-data/src/byte_view.rs index 84ab5f853ec4..a9e08f8be27a 100644 --- a/arrow-data/src/byte_view.rs +++ b/arrow-data/src/byte_view.rs @@ -93,12 +93,14 @@ impl ByteView { } /// Set the [`Self::buffer_index`] field + #[inline] pub fn with_buffer_index(mut self, buffer_index: u32) -> Self { self.buffer_index = buffer_index; self } /// Set the [`Self::offset`] field + #[inline] pub fn with_offset(mut self, offset: u32) -> Self { self.offset = offset; self From 338d7f7b1420f371e0eb5a2ee183093f1d06f885 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Tue, 1 Oct 2024 06:19:28 -0400 Subject: [PATCH 5/7] Update arrow-data/src/byte_view.rs Co-authored-by: Xiangpeng Hao --- arrow-data/src/byte_view.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/arrow-data/src/byte_view.rs b/arrow-data/src/byte_view.rs index a9e08f8be27a..01060502adbb 100644 --- a/arrow-data/src/byte_view.rs +++ b/arrow-data/src/byte_view.rs @@ -84,6 +84,7 @@ impl ByteView { /// If the prefix is not exactly 4 bytes #[inline] pub fn new(length: u32, prefix: &[u8]) -> Self { + debug_assert!(length > 12); Self { length, prefix: u32::from_le_bytes(prefix.try_into().unwrap()), From 522e93f1f36be2a1e5d35df6c967cc3e9c86465c Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Tue, 1 Oct 2024 06:21:12 -0400 Subject: [PATCH 6/7] fmt --- arrow-data/src/byte_view.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arrow-data/src/byte_view.rs b/arrow-data/src/byte_view.rs index 01060502adbb..3b3ec6246066 100644 --- a/arrow-data/src/byte_view.rs +++ b/arrow-data/src/byte_view.rs @@ -84,7 +84,7 @@ impl ByteView { /// If the prefix is not exactly 4 bytes #[inline] pub fn new(length: u32, prefix: &[u8]) -> Self { - debug_assert!(length > 12); + debug_assert!(length > 12); Self { length, prefix: u32::from_le_bytes(prefix.try_into().unwrap()), From 41ce41fa245ab32a8ef30efef49cde36ea84222c Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Tue, 1 Oct 2024 15:31:41 -0400 Subject: [PATCH 7/7] Update test so it fails at the right place --- arrow-array/src/array/byte_view_array.rs | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/arrow-array/src/array/byte_view_array.rs b/arrow-array/src/array/byte_view_array.rs index fa3ec3d7692d..b1b5580577ab 100644 --- a/arrow-array/src/array/byte_view_array.rs +++ b/arrow-array/src/array/byte_view_array.rs @@ -888,8 +888,12 @@ mod tests { expected = "Encountered non-UTF-8 data at index 0: invalid utf-8 sequence of 1 bytes from index 0" )] fn new_with_invalid_utf8_data() { - let v: Vec = vec![0xf0, 0x80, 0x80, 0x80]; - let view = ByteView::new(v.len() as u32, &v); + let v: Vec = vec![ + // invalid UTF8 + 0xf0, 0x80, 0x80, 0x80, // more bytes to make it larger than 12 + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + ]; + let view = ByteView::new(v.len() as u32, &v[0..4]); let views = ScalarBuffer::from(vec![view.into()]); let buffers = vec![Buffer::from_slice_ref(v)]; StringViewArray::new(views, buffers, None);