From b2017d7cc3611cd9d578bc675ebc3fe176d3a907 Mon Sep 17 00:00:00 2001 From: Ritchie Vink Date: Sun, 3 Sep 2023 14:30:49 +0200 Subject: [PATCH] Revert "MutableDictionaryArray rewrite: use values stored in the array instead of the hash->hash map" (#1559) --- Cargo.toml | 4 +- src/array/dictionary/mod.rs | 1 - src/array/dictionary/mutable.rs | 135 +++++++++-------- src/array/dictionary/value_map.rs | 207 --------------------------- src/array/indexable.rs | 197 ------------------------- src/array/mod.rs | 4 +- src/compute/cast/primitive_to.rs | 4 +- tests/it/array/dictionary/mutable.rs | 15 ++ 8 files changed, 98 insertions(+), 469 deletions(-) delete mode 100644 src/array/dictionary/value_map.rs delete mode 100644 src/array/indexable.rs diff --git a/Cargo.toml b/Cargo.toml index 1bb20a69556..0f3f9ec27bf 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -32,7 +32,7 @@ hash_hasher = "^2.0.3" simdutf8 = "0.1.4" # A Rust port of SwissTable -hashbrown = { version = "0.14", default-features = false, features = ["ahash"] } +hashbrown = { version = "0.14", default-features = false, optional = true } # for timezone support chrono-tz = { version = "0.8", optional = true } @@ -243,7 +243,7 @@ compute_merge_sort = ["itertools", "compute_sort"] compute_nullif = ["compute_comparison"] compute_partition = ["compute_sort"] compute_regex_match = ["regex"] -compute_sort = ["compute_take"] +compute_sort = ["compute_take", "hashbrown"] compute_substring = [] compute_take = [] compute_temporal = [] diff --git a/src/array/dictionary/mod.rs b/src/array/dictionary/mod.rs index a4be1ea2107..f7d4a0f43d7 100644 --- a/src/array/dictionary/mod.rs +++ b/src/array/dictionary/mod.rs @@ -20,7 +20,6 @@ mod iterator; mod mutable; use crate::array::specification::check_indexes_unchecked; mod typed_iterator; -mod value_map; use crate::array::dictionary::typed_iterator::{DictValue, DictionaryValuesIterTyped}; pub use iterator::*; diff --git a/src/array/dictionary/mutable.rs b/src/array/dictionary/mutable.rs index 16a00a1357b..444de34bcc4 100644 --- a/src/array/dictionary/mutable.rs +++ b/src/array/dictionary/mutable.rs @@ -1,15 +1,15 @@ -use std::hash::Hash; -use std::sync::Arc; +use std::hash::{Hash, Hasher}; +use std::{collections::hash_map::DefaultHasher, sync::Arc}; + +use hash_hasher::HashedMap; -use crate::array::indexable::{AsIndexed, Indexable}; use crate::{ array::{primitive::MutablePrimitiveArray, Array, MutableArray, TryExtend, TryPush}, bitmap::MutableBitmap, datatypes::DataType, - error::Result, + error::{Error, Result}, }; -use super::value_map::ValueMap; use super::{DictionaryArray, DictionaryKey}; /// A mutable, strong-typed version of [`DictionaryArray`]. @@ -30,29 +30,55 @@ use super::{DictionaryArray, DictionaryKey}; #[derive(Debug)] pub struct MutableDictionaryArray { data_type: DataType, - map: ValueMap, - // invariant: `max(keys) < map.values().len()` keys: MutablePrimitiveArray, + map: HashedMap, + // invariant: `keys.len() <= values.len()` + values: M, } impl From> for DictionaryArray { - fn from(other: MutableDictionaryArray) -> Self { + fn from(mut other: MutableDictionaryArray) -> Self { // Safety - the invariant of this struct ensures that this is up-held unsafe { DictionaryArray::::try_new_unchecked( other.data_type, other.keys.into(), - other.map.into_boxed().as_box(), + other.values.as_box(), ) .unwrap() } } } +impl From for MutableDictionaryArray { + fn from(values: M) -> Self { + Self { + data_type: DataType::Dictionary( + K::KEY_TYPE, + Box::new(values.data_type().clone()), + false, + ), + keys: MutablePrimitiveArray::::new(), + map: HashedMap::default(), + values, + } + } +} + impl MutableDictionaryArray { /// Creates an empty [`MutableDictionaryArray`]. pub fn new() -> Self { - Self::try_empty(M::default()).unwrap() + let values = M::default(); + Self { + data_type: DataType::Dictionary( + K::KEY_TYPE, + Box::new(values.data_type().clone()), + false, + ), + keys: MutablePrimitiveArray::::new(), + map: HashedMap::default(), + values, + } } } @@ -63,34 +89,22 @@ impl Default for MutableDictionaryA } impl MutableDictionaryArray { - /// Creates an empty [`MutableDictionaryArray`] from a given empty values array. - /// # Errors - /// Errors if the array is non-empty. - pub fn try_empty(values: M) -> Result { - Ok(Self::from_value_map(ValueMap::::try_empty(values)?)) - } - - /// Creates an empty [`MutableDictionaryArray`] preloaded with a given dictionary of values. - /// Indices associated with those values are automatically assigned based on the order of - /// the values. - /// # Errors - /// Errors if there's more values than the maximum value of `K`. - pub fn from_values(values: M) -> Result - where - M: Indexable, - M::Type: Eq + Hash, - { - Ok(Self::from_value_map(ValueMap::::from_values(values)?)) - } - - fn from_value_map(value_map: ValueMap) -> Self { - let keys = MutablePrimitiveArray::::new(); - let data_type = - DataType::Dictionary(K::KEY_TYPE, Box::new(value_map.data_type().clone()), false); - Self { - data_type, - map: value_map, - keys, + /// Returns whether the value should be pushed to the values or not + fn try_push_valid(&mut self, value: &T) -> Result { + let mut hasher = DefaultHasher::new(); + value.hash(&mut hasher); + let hash = hasher.finish(); + match self.map.get(&hash) { + Some(key) => { + self.keys.push(Some(*key)); + Ok(false) + } + None => { + let key = K::try_from(self.map.len()).map_err(|_| Error::Overflow)?; + self.map.insert(hash, key); + self.keys.push(Some(key)); + Ok(true) + } } } @@ -99,9 +113,14 @@ impl MutableDictionaryArray { self.keys.push(None) } + /// returns a mutable reference to the inner values. + fn mut_values(&mut self) -> &mut M { + &mut self.values + } + /// returns a reference to the inner values. pub fn values(&self) -> &M { - self.map.values() + &self.values } /// converts itself into [`Arc`] @@ -123,10 +142,15 @@ impl MutableDictionaryArray { /// Shrinks the capacity of the [`MutableDictionaryArray`] to fit its current length. pub fn shrink_to_fit(&mut self) { - self.map.shrink_to_fit(); + self.values.shrink_to_fit(); self.keys.shrink_to_fit(); } + /// Returns the dictionary map + pub fn map(&self) -> &HashedMap { + &self.map + } + /// Returns the dictionary keys pub fn keys(&self) -> &MutablePrimitiveArray { &self.keys @@ -136,7 +160,7 @@ impl MutableDictionaryArray { DictionaryArray::::try_new( self.data_type.clone(), std::mem::take(&mut self.keys).into(), - self.map.take_into(), + self.values.as_box(), ) .unwrap() } @@ -184,20 +208,17 @@ impl MutableArray for MutableDictio } } -impl TryExtend> for MutableDictionaryArray +impl TryExtend> for MutableDictionaryArray where K: DictionaryKey, - M: MutableArray + Indexable + TryExtend>, - T: AsIndexed, - M::Type: Eq + Hash, + M: MutableArray + TryExtend>, { fn try_extend>>(&mut self, iter: II) -> Result<()> { for value in iter { if let Some(value) = value { - let key = self - .map - .try_push_valid(value, |arr, v| arr.try_extend(std::iter::once(Some(v))))?; - self.keys.try_push(Some(key))?; + if self.try_push_valid(&value)? { + self.mut_values().try_extend(std::iter::once(Some(value)))?; + } } else { self.push_null(); } @@ -209,19 +230,19 @@ where impl TryPush> for MutableDictionaryArray where K: DictionaryKey, - M: MutableArray + Indexable + TryPush>, - T: AsIndexed, - M::Type: Eq + Hash, + M: MutableArray + TryPush>, + T: Hash, { fn try_push(&mut self, item: Option) -> Result<()> { if let Some(value) = item { - let key = self - .map - .try_push_valid(value, |arr, v| arr.try_push(Some(v)))?; - self.keys.try_push(Some(key))?; + if self.try_push_valid(&value)? { + self.values.try_push(Some(value)) + } else { + Ok(()) + } } else { self.push_null(); + Ok(()) } - Ok(()) } } diff --git a/src/array/dictionary/value_map.rs b/src/array/dictionary/value_map.rs deleted file mode 100644 index 35603de4cf2..00000000000 --- a/src/array/dictionary/value_map.rs +++ /dev/null @@ -1,207 +0,0 @@ -use std::borrow::Borrow; -use std::fmt::{self, Debug}; -use std::hash::{Hash, Hasher}; -use std::pin::Pin; -use std::ptr::NonNull; - -use hashbrown::{Equivalent, HashMap}; - -use crate::array::Array; -use crate::{ - array::indexable::{AsIndexed, Indexable}, - array::MutableArray, - datatypes::DataType, - error::{Error, Result}, -}; - -use super::DictionaryKey; - -struct NonNullSend(NonNull); - -// safety: these pointers are for internal self-referential purposes to pinned array only -unsafe impl Send for NonNullSend {} -unsafe impl Sync for NonNullSend {} - -impl From<&M> for NonNullSend { - #[inline] - fn from(reference: &M) -> Self { - Self(NonNull::from(reference)) - } -} - -struct ValueRef { - array: NonNullSend, - index: usize, -} - -impl ValueRef { - #[inline] - pub fn new(array: &Pin>, index: usize) -> Self { - Self { - array: NonNullSend::from(Pin::get_ref(array.as_ref())), - index, - } - } - - #[inline] - pub fn get_array(&self) -> &M { - // safety: the invariant of the struct - unsafe { self.array.0.as_ref() } - } - - #[inline] - pub unsafe fn get_unchecked(&self) -> M::Value<'_> - where - M: Indexable, - { - self.get_array().value_unchecked_at(self.index) - } - - #[inline] - pub unsafe fn equals(&self, other: &M::Type) -> bool - where - M: Indexable, - M::Type: Eq, - { - self.get_unchecked().borrow() == other - } -} - -impl PartialEq for ValueRef -where - M::Type: PartialEq, -{ - #[inline] - fn eq(&self, other: &Self) -> bool { - // safety: the way these value refs are constructed, they are always within bounds - unsafe { - self.get_unchecked() - .borrow() - .eq(other.get_unchecked().borrow()) - } - } -} - -impl Eq for ValueRef where for<'a> M::Type: Eq {} - -impl Hash for ValueRef -where - M::Type: Hash, -{ - #[inline] - fn hash(&self, state: &mut H) { - // safety: the way these value refs are constructed, they are always within bounds - unsafe { self.get_unchecked().borrow().hash(state) } - } -} - -// To avoid blanket implementation issues with `Equivalent` trait (we only use hashbrown -// instead of the default HashMap to avoid blanket implementation problems with Borrow). -#[derive(Hash)] -struct Wrapped<'a, T: ?Sized>(&'a T); - -impl<'a, M: Indexable> Equivalent> for Wrapped<'a, M::Type> -where - M::Type: Eq, -{ - #[inline] - fn equivalent(&self, key: &ValueRef) -> bool { - // safety: invariant of the struct - unsafe { key.equals(self.0) } - } -} - -pub struct ValueMap { - values: Pin>, - map: HashMap, K>, -} - -impl ValueMap { - pub fn try_empty(values: M) -> Result { - if !values.is_empty() { - return Err(Error::InvalidArgumentError( - "initializing value map with non-empty values array".into(), - )); - } - Ok(Self { - values: Box::pin(values), - map: HashMap::default(), - }) - } - - pub fn from_values(values: M) -> Result - where - M: Indexable, - M::Type: Eq + Hash, - { - let values = Box::pin(values); - let map = (0..values.len()) - .map(|i| { - let key = K::try_from(i).map_err(|_| Error::Overflow)?; - Ok((ValueRef::new(&values, i), key)) - }) - .collect::>()?; - Ok(Self { values, map }) - } - - pub fn data_type(&self) -> &DataType { - Pin::get_ref(self.values.as_ref()).data_type() - } - - pub fn into_boxed(self) -> Box { - // safety: we unpin the pointer but the value map is dropped along with all - // the value references that might refer to the pinned array - unsafe { Pin::into_inner_unchecked(self.values) } - } - - pub fn take_into(&mut self) -> Box { - // safety: we unpin the pointer but the value map is manually cleared - let arr = unsafe { self.values.as_mut().get_unchecked_mut().as_box() }; - self.map.clear(); - arr - } - - #[inline] - pub fn values(&self) -> &M { - &self.values - } - - /// Try to insert a value and return its index (it may or may not get inserted). - pub fn try_push_valid( - &mut self, - value: V, - mut push: impl FnMut(&mut M, V) -> Result<()>, - ) -> Result - where - M: Indexable, - V: AsIndexed, - M::Type: Eq + Hash, - { - if let Some(&key) = self.map.get(&Wrapped(value.as_indexed())) { - return Ok(key); - } - let index = self.values.len(); - let key = K::try_from(index).map_err(|_| Error::Overflow)?; - // safety: we don't move the data out of the mutable pinned reference - unsafe { - push(self.values.as_mut().get_unchecked_mut(), value)?; - } - debug_assert_eq!(self.values.len(), index + 1); - self.map.insert(ValueRef::new(&self.values, index), key); - debug_assert_eq!(self.values.len(), self.map.len()); - Ok(key) - } - - pub fn shrink_to_fit(&mut self) { - // safety: we don't move the data out of the mutable pinned reference - unsafe { - self.values.as_mut().get_unchecked_mut().shrink_to_fit(); - } - } -} - -impl Debug for ValueMap { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - Pin::get_ref(self.values.as_ref()).fmt(f) - } -} diff --git a/src/array/indexable.rs b/src/array/indexable.rs deleted file mode 100644 index 76001bfcf5c..00000000000 --- a/src/array/indexable.rs +++ /dev/null @@ -1,197 +0,0 @@ -use std::borrow::Borrow; - -use crate::{ - array::{ - MutableArray, MutableBinaryArray, MutableBinaryValuesArray, MutableBooleanArray, - MutableFixedSizeBinaryArray, MutablePrimitiveArray, MutableUtf8Array, - MutableUtf8ValuesArray, - }, - offset::Offset, - types::NativeType, -}; - -/// Trait for arrays that can be indexed directly to extract a value. -pub trait Indexable { - /// The type of the element at index `i`; may be a reference type or a value type. - type Value<'a>: Borrow - where - Self: 'a; - - type Type: ?Sized; - - /// Returns the element at index `i`. - /// # Panic - /// May panic if `i >= self.len()`. - fn value_at(&self, index: usize) -> Self::Value<'_>; - - /// Returns the element at index `i`. - /// # Safety - /// Assumes that the `i < self.len`. - #[inline] - unsafe fn value_unchecked_at(&self, index: usize) -> Self::Value<'_> { - self.value_at(index) - } -} - -pub trait AsIndexed { - fn as_indexed(&self) -> &M::Type; -} - -impl Indexable for MutableBooleanArray { - type Value<'a> = bool; - type Type = bool; - - #[inline] - fn value_at(&self, i: usize) -> Self::Value<'_> { - self.values().get(i) - } -} - -impl AsIndexed for bool { - #[inline] - fn as_indexed(&self) -> &bool { - self - } -} - -impl Indexable for MutableBinaryArray { - type Value<'a> = &'a [u8]; - type Type = [u8]; - - #[inline] - fn value_at(&self, i: usize) -> Self::Value<'_> { - // TODO: add .value() / .value_unchecked() to MutableBinaryArray? - assert!(i < self.len()); - unsafe { self.value_unchecked_at(i) } - } - - #[inline] - unsafe fn value_unchecked_at(&self, i: usize) -> Self::Value<'_> { - // TODO: add .value() / .value_unchecked() to MutableBinaryArray? - // soundness: the invariant of the function - let (start, end) = self.offsets().start_end_unchecked(i); - // soundness: the invariant of the struct - self.values().get_unchecked(start..end) - } -} - -impl AsIndexed> for &[u8] { - #[inline] - fn as_indexed(&self) -> &[u8] { - self - } -} - -impl Indexable for MutableBinaryValuesArray { - type Value<'a> = &'a [u8]; - type Type = [u8]; - - #[inline] - fn value_at(&self, i: usize) -> Self::Value<'_> { - self.value(i) - } - - #[inline] - unsafe fn value_unchecked_at(&self, i: usize) -> Self::Value<'_> { - self.value_unchecked(i) - } -} - -impl AsIndexed> for &[u8] { - #[inline] - fn as_indexed(&self) -> &[u8] { - self - } -} - -impl Indexable for MutableFixedSizeBinaryArray { - type Value<'a> = &'a [u8]; - type Type = [u8]; - - #[inline] - fn value_at(&self, i: usize) -> Self::Value<'_> { - self.value(i) - } - - #[inline] - unsafe fn value_unchecked_at(&self, i: usize) -> Self::Value<'_> { - // soundness: the invariant of the struct - self.value_unchecked(i) - } -} - -impl AsIndexed for &[u8] { - #[inline] - fn as_indexed(&self) -> &[u8] { - self - } -} - -// TODO: should NativeType derive from Hash? -impl Indexable for MutablePrimitiveArray { - type Value<'a> = T; - type Type = T; - - #[inline] - fn value_at(&self, i: usize) -> Self::Value<'_> { - assert!(i < self.len()); - // TODO: add Length trait? (for both Array and MutableArray) - unsafe { self.value_unchecked_at(i) } - } - - #[inline] - unsafe fn value_unchecked_at(&self, i: usize) -> Self::Value<'_> { - *self.values().get_unchecked(i) - } -} - -impl AsIndexed> for T { - #[inline] - fn as_indexed(&self) -> &T { - self - } -} - -impl Indexable for MutableUtf8Array { - type Value<'a> = &'a str; - type Type = str; - - #[inline] - fn value_at(&self, i: usize) -> Self::Value<'_> { - self.value(i) - } - - #[inline] - unsafe fn value_unchecked_at(&self, i: usize) -> Self::Value<'_> { - self.value_unchecked(i) - } -} - -impl> AsIndexed> for V { - #[inline] - fn as_indexed(&self) -> &str { - self.as_ref() - } -} - -impl Indexable for MutableUtf8ValuesArray { - type Value<'a> = &'a str; - type Type = str; - - #[inline] - fn value_at(&self, i: usize) -> Self::Value<'_> { - self.value(i) - } - - #[inline] - unsafe fn value_unchecked_at(&self, i: usize) -> Self::Value<'_> { - self.value_unchecked(i) - } -} - -impl> AsIndexed> for V { - #[inline] - fn as_indexed(&self) -> &str { - self.as_ref() - } -} diff --git a/src/array/mod.rs b/src/array/mod.rs index 1575130989d..04b7b2c8e35 100644 --- a/src/array/mod.rs +++ b/src/array/mod.rs @@ -720,10 +720,8 @@ mod utf8; mod equal; mod ffi; mod fmt; -mod indexable; -mod iterator; - pub mod growable; +mod iterator; pub mod ord; pub(crate) use iterator::ArrayAccessor; diff --git a/src/compute/cast/primitive_to.rs b/src/compute/cast/primitive_to.rs index 110288817a7..30b265e2d59 100644 --- a/src/compute/cast/primitive_to.rs +++ b/src/compute/cast/primitive_to.rs @@ -306,9 +306,9 @@ pub fn primitive_to_dictionary( from: &PrimitiveArray, ) -> Result> { let iter = from.iter().map(|x| x.copied()); - let mut array = MutableDictionaryArray::::try_empty(MutablePrimitiveArray::::from( + let mut array = MutableDictionaryArray::::from(MutablePrimitiveArray::::from( from.data_type().clone(), - ))?; + )); array.try_extend(iter)?; Ok(array.into()) diff --git a/tests/it/array/dictionary/mutable.rs b/tests/it/array/dictionary/mutable.rs index 1b54a926471..b6103dcccfc 100644 --- a/tests/it/array/dictionary/mutable.rs +++ b/tests/it/array/dictionary/mutable.rs @@ -1,5 +1,8 @@ use arrow2::array::*; use arrow2::error::Result; +use hash_hasher::HashedMap; +use std::collections::hash_map::DefaultHasher; +use std::hash::{Hash, Hasher}; #[test] fn primitive() -> Result<()> { @@ -58,4 +61,16 @@ fn push_utf8() { expected_keys.push(Some(0)); expected_keys.push(Some(1)); assert_eq!(*new.keys(), expected_keys); + + let expected_map = ["A", "B", "C"] + .iter() + .enumerate() + .map(|(index, value)| { + let mut hasher = DefaultHasher::new(); + value.hash(&mut hasher); + let hash = hasher.finish(); + (hash, index as i32) + }) + .collect::>(); + assert_eq!(*new.map(), expected_map); }