diff --git a/arrow-array/src/array/dictionary_array.rs b/arrow-array/src/array/dictionary_array.rs index d6c5dd4c3e1..bdb6f0d4b5c 100644 --- a/arrow-array/src/array/dictionary_array.rs +++ b/arrow-array/src/array/dictionary_array.rs @@ -1072,6 +1072,74 @@ mod tests { assert_eq!(dict_array.keys(), &Int16Array::from(vec![3_i16, 4])); } + #[test] + fn test_dictionary_builder_append_many() { + let mut builder = PrimitiveDictionaryBuilder::::new(); + + builder.append(1).unwrap(); + builder.append_n(2, 2).unwrap(); + builder.append_options(None, 2); + builder.append_options(Some(3), 3); + + let array = builder.finish(); + + let values = array + .values() + .as_primitive::() + .iter() + .map(Option::unwrap) + .collect::>(); + assert_eq!(values, &[1, 2, 3]); + let keys = array.keys().iter().collect::>(); + assert_eq!( + keys, + &[ + Some(0), + Some(1), + Some(1), + None, + None, + Some(2), + Some(2), + Some(2) + ] + ); + } + + #[test] + fn test_string_dictionary_builder_append_many() { + let mut builder = StringDictionaryBuilder::::new(); + + builder.append("a").unwrap(); + builder.append_n("b", 2).unwrap(); + builder.append_options(None::<&str>, 2); + builder.append_options(Some("c"), 3); + + let array = builder.finish(); + + let values = array + .values() + .as_string::() + .iter() + .map(Option::unwrap) + .collect::>(); + assert_eq!(values, &["a", "b", "c"]); + let keys = array.keys().iter().collect::>(); + assert_eq!( + keys, + &[ + Some(0), + Some(1), + Some(1), + None, + None, + Some(2), + Some(2), + Some(2) + ] + ); + } + #[test] fn test_dictionary_array_fmt_debug() { let mut builder = PrimitiveDictionaryBuilder::::with_capacity(3, 2); diff --git a/arrow-array/src/builder/generic_bytes_dictionary_builder.rs b/arrow-array/src/builder/generic_bytes_dictionary_builder.rs index 128a4f82066..a327c622a75 100644 --- a/arrow-array/src/builder/generic_bytes_dictionary_builder.rs +++ b/arrow-array/src/builder/generic_bytes_dictionary_builder.rs @@ -195,12 +195,7 @@ where K: 
ArrowDictionaryKeyType, T: ByteArrayType, { - /// Append a value to the array. Return an existing index - /// if already present in the values array or a new index if the - /// value is appended to the values array. - /// - /// Returns an error if the new index would overflow the key type. - pub fn append(&mut self, value: impl AsRef) -> Result { + fn get_or_insert_key(&mut self, value: impl AsRef) -> Result { let value_native: &T::Native = value.as_ref(); let value_bytes: &[u8] = value_native.as_ref(); @@ -223,8 +218,32 @@ where .get(); let key = K::Native::from_usize(idx).ok_or(ArrowError::DictionaryKeyOverflowError)?; + + Ok(key) + } + + /// Append a value to the array. Return an existing index + /// if already present in the values array or a new index if the + /// value is appended to the values array. + /// + /// Returns an error if the new index would overflow the key type. + pub fn append(&mut self, value: impl AsRef) -> Result { + let key = self.get_or_insert_key(value)?; self.keys_builder.append_value(key); + Ok(key) + } + /// Append a value multiple times to the array. + /// This is the same as `append` but allows appending the same value multiple times without doing multiple lookups. + /// + /// Returns an error if the new index would overflow the key type. + pub fn append_n( + &mut self, + value: impl AsRef, + count: usize, + ) -> Result { + let key = self.get_or_insert_key(value)?; + self.keys_builder.append_value_n(key, count); Ok(key) } @@ -237,6 +256,17 @@ where self.append(value).expect("dictionary key overflow"); } + /// Infallibly append a value to this builder repeatedly `count` times. + /// This is the same as `append_value` but allows appending the same value multiple times without doing multiple lookups. 
+ /// + /// # Panics + /// + /// Panics if the resulting length of the dictionary values array would exceed `K::Native::MAX` + pub fn append_values(&mut self, value: impl AsRef, count: usize) { + self.append_n(value, count) + .expect("dictionary key overflow"); + } + /// Appends a null slot into the builder #[inline] pub fn append_null(&mut self) { @@ -256,6 +286,19 @@ where }; } + /// Append an `Option` value into the builder repeatedly `count` times. + /// This is the same as `append_option` but allows appending the same value multiple times without doing multiple lookups. + /// + /// # Panics + /// + /// Panics if the resulting length of the dictionary values array would exceed `K::Native::MAX` + pub fn append_options(&mut self, value: Option>, count: usize) { + match value { + None => self.keys_builder.append_nulls(count), + Some(v) => self.append_values(v, count), + }; + } + /// Builds the `DictionaryArray` and reset this builder. pub fn finish(&mut self) -> DictionaryArray { self.dedup.clear(); @@ -331,8 +374,7 @@ fn get_bytes(values: &GenericByteBuilder, idx: usize) -> &[ /// // The builder builds the dictionary value by value /// builder.append("abc").unwrap(); /// builder.append_null(); -/// builder.append("def").unwrap(); -/// builder.append("def").unwrap(); +/// builder.append_n("def", 2).unwrap(); // appends "def" twice with a single lookup /// builder.append("abc").unwrap(); /// let array = builder.finish(); /// diff --git a/arrow-array/src/builder/primitive_builder.rs b/arrow-array/src/builder/primitive_builder.rs index 39b27bfca89..3191fea6e40 100644 --- a/arrow-array/src/builder/primitive_builder.rs +++ b/arrow-array/src/builder/primitive_builder.rs @@ -202,6 +202,13 @@ impl PrimitiveBuilder { self.values_builder.append(v); } + /// Appends a value of type `T` into the builder `n` times + #[inline] + pub fn append_value_n(&mut self, v: T::Native, n: usize) { + self.null_buffer_builder.append_n_non_nulls(n); + self.values_builder.append_n(n, v); + } + 
/// Appends a null slot into the builder #[inline] pub fn append_null(&mut self) { diff --git a/arrow-array/src/builder/primitive_dictionary_builder.rs b/arrow-array/src/builder/primitive_dictionary_builder.rs index a764fa4c29c..35abe5ba5fb 100644 --- a/arrow-array/src/builder/primitive_dictionary_builder.rs +++ b/arrow-array/src/builder/primitive_dictionary_builder.rs @@ -21,7 +21,6 @@ use crate::{Array, ArrayRef, ArrowPrimitiveType, DictionaryArray}; use arrow_buffer::{ArrowNativeType, ToByteSlice}; use arrow_schema::{ArrowError, DataType}; use std::any::Any; -use std::collections::hash_map::Entry; use std::collections::HashMap; use std::sync::Arc; @@ -210,26 +209,41 @@ where K: ArrowDictionaryKeyType, V: ArrowPrimitiveType, { - /// Append a primitive value to the array. Return an existing index - /// if already present in the values array or a new index if the - /// value is appended to the values array. #[inline] - pub fn append(&mut self, value: V::Native) -> Result { - let key = match self.map.entry(Value(value)) { - Entry::Vacant(vacant) => { - // Append new value. + fn get_or_insert_key(&mut self, value: V::Native) -> Result { + match self.map.get(&Value(value)) { + Some(&key) => { + Ok(K::Native::from_usize(key).ok_or(ArrowError::DictionaryKeyOverflowError)?) + } + None => { let key = self.values_builder.len(); self.values_builder.append_value(value); - vacant.insert(key); - K::Native::from_usize(key).ok_or(ArrowError::DictionaryKeyOverflowError)? + self.map.insert(Value(value), key); + Ok(K::Native::from_usize(key).ok_or(ArrowError::DictionaryKeyOverflowError)?) } - Entry::Occupied(o) => K::Native::usize_as(*o.get()), - }; + } + } + /// Append a primitive value to the array. Return an existing index + /// if already present in the values array or a new index if the + /// value is appended to the values array. 
+ #[inline] + pub fn append(&mut self, value: V::Native) -> Result { + let key = self.get_or_insert_key(value)?; self.keys_builder.append_value(key); Ok(key) } + /// Append a value multiple times to the array. + /// This is the same as `append` but allows appending the same value multiple times without doing multiple lookups. + /// + /// Returns an error if the new index would overflow the key type. + pub fn append_n(&mut self, value: V::Native, count: usize) -> Result { + let key = self.get_or_insert_key(value)?; + self.keys_builder.append_value_n(key, count); + Ok(key) + } + /// Infallibly append a value to this builder /// /// # Panics @@ -240,6 +254,17 @@ where self.append(value).expect("dictionary key overflow"); } + /// Infallibly append a value to this builder repeatedly `count` times. + /// This is the same as `append_value` but allows appending the same value multiple times without doing multiple lookups. + /// + /// # Panics + /// + /// Panics if the resulting length of the dictionary values array would exceed `K::Native::MAX` + pub fn append_values(&mut self, value: V::Native, count: usize) { + self.append_n(value, count) + .expect("dictionary key overflow"); + } + /// Appends a null slot into the builder #[inline] pub fn append_null(&mut self) { @@ -259,6 +284,19 @@ where }; } + /// Append an `Option` value into the builder repeatedly `count` times. + /// This is the same as `append_option` but allows appending the same value multiple times without doing multiple lookups. + /// + /// # Panics + /// + /// Panics if the resulting length of the dictionary values array would exceed `K::Native::MAX` + pub fn append_options(&mut self, value: Option, count: usize) { + match value { + None => self.keys_builder.append_nulls(count), + Some(v) => self.append_values(v, count), + }; + } + /// Builds the `DictionaryArray` and reset this builder. pub fn finish(&mut self) -> DictionaryArray { self.map.clear();