Skip to content
This repository has been archived by the owner on Feb 18, 2024. It is now read-only.

Commit

Permalink
Revert "MutableDictionaryArray rewrite: use values stored in the arra…
Browse files Browse the repository at this point in the history
…y instead of the hash->hash map" (#1559)
  • Loading branch information
ritchie46 authored Sep 3, 2023
1 parent cf9ec83 commit b2017d7
Show file tree
Hide file tree
Showing 8 changed files with 98 additions and 469 deletions.
4 changes: 2 additions & 2 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ hash_hasher = "^2.0.3"
simdutf8 = "0.1.4"

# A Rust port of SwissTable
hashbrown = { version = "0.14", default-features = false, features = ["ahash"] }
hashbrown = { version = "0.14", default-features = false, optional = true }

# for timezone support
chrono-tz = { version = "0.8", optional = true }
Expand Down Expand Up @@ -243,7 +243,7 @@ compute_merge_sort = ["itertools", "compute_sort"]
compute_nullif = ["compute_comparison"]
compute_partition = ["compute_sort"]
compute_regex_match = ["regex"]
compute_sort = ["compute_take"]
compute_sort = ["compute_take", "hashbrown"]
compute_substring = []
compute_take = []
compute_temporal = []
Expand Down
1 change: 0 additions & 1 deletion src/array/dictionary/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,6 @@ mod iterator;
mod mutable;
use crate::array::specification::check_indexes_unchecked;
mod typed_iterator;
mod value_map;

use crate::array::dictionary::typed_iterator::{DictValue, DictionaryValuesIterTyped};
pub use iterator::*;
Expand Down
135 changes: 78 additions & 57 deletions src/array/dictionary/mutable.rs
Original file line number Diff line number Diff line change
@@ -1,15 +1,15 @@
use std::hash::Hash;
use std::sync::Arc;
use std::hash::{Hash, Hasher};
use std::{collections::hash_map::DefaultHasher, sync::Arc};

use hash_hasher::HashedMap;

use crate::array::indexable::{AsIndexed, Indexable};
use crate::{
array::{primitive::MutablePrimitiveArray, Array, MutableArray, TryExtend, TryPush},
bitmap::MutableBitmap,
datatypes::DataType,
error::Result,
error::{Error, Result},
};

use super::value_map::ValueMap;
use super::{DictionaryArray, DictionaryKey};

/// A mutable, strong-typed version of [`DictionaryArray`].
Expand All @@ -30,29 +30,55 @@ use super::{DictionaryArray, DictionaryKey};
#[derive(Debug)]
pub struct MutableDictionaryArray<K: DictionaryKey, M: MutableArray> {
data_type: DataType,
map: ValueMap<K, M>,
// invariant: `max(keys) < map.values().len()`
keys: MutablePrimitiveArray<K>,
map: HashedMap<u64, K>,
// invariant: `keys.len() <= values.len()`
values: M,
}

impl<K: DictionaryKey, M: MutableArray> From<MutableDictionaryArray<K, M>> for DictionaryArray<K> {
fn from(other: MutableDictionaryArray<K, M>) -> Self {
fn from(mut other: MutableDictionaryArray<K, M>) -> Self {
// Safety - the invariant of this struct ensures that this is up-held
unsafe {
DictionaryArray::<K>::try_new_unchecked(
other.data_type,
other.keys.into(),
other.map.into_boxed().as_box(),
other.values.as_box(),
)
.unwrap()
}
}
}

impl<K: DictionaryKey, M: MutableArray> From<M> for MutableDictionaryArray<K, M> {
fn from(values: M) -> Self {
Self {
data_type: DataType::Dictionary(
K::KEY_TYPE,
Box::new(values.data_type().clone()),
false,
),
keys: MutablePrimitiveArray::<K>::new(),
map: HashedMap::default(),
values,
}
}
}

impl<K: DictionaryKey, M: MutableArray + Default> MutableDictionaryArray<K, M> {
/// Creates an empty [`MutableDictionaryArray`].
pub fn new() -> Self {
Self::try_empty(M::default()).unwrap()
let values = M::default();
Self {
data_type: DataType::Dictionary(
K::KEY_TYPE,
Box::new(values.data_type().clone()),
false,
),
keys: MutablePrimitiveArray::<K>::new(),
map: HashedMap::default(),
values,
}
}
}

Expand All @@ -63,34 +89,22 @@ impl<K: DictionaryKey, M: MutableArray + Default> Default for MutableDictionaryA
}

impl<K: DictionaryKey, M: MutableArray> MutableDictionaryArray<K, M> {
/// Creates an empty [`MutableDictionaryArray`] from a given empty values array.
/// # Errors
/// Errors if the array is non-empty.
pub fn try_empty(values: M) -> Result<Self> {
Ok(Self::from_value_map(ValueMap::<K, M>::try_empty(values)?))
}

/// Creates an empty [`MutableDictionaryArray`] preloaded with a given dictionary of values.
/// Indices associated with those values are automatically assigned based on the order of
/// the values.
/// # Errors
/// Errors if there's more values than the maximum value of `K`.
pub fn from_values(values: M) -> Result<Self>
where
M: Indexable,
M::Type: Eq + Hash,
{
Ok(Self::from_value_map(ValueMap::<K, M>::from_values(values)?))
}

fn from_value_map(value_map: ValueMap<K, M>) -> Self {
let keys = MutablePrimitiveArray::<K>::new();
let data_type =
DataType::Dictionary(K::KEY_TYPE, Box::new(value_map.data_type().clone()), false);
Self {
data_type,
map: value_map,
keys,
/// Returns whether the value should be pushed to the values or not
fn try_push_valid<T: Hash>(&mut self, value: &T) -> Result<bool> {
let mut hasher = DefaultHasher::new();
value.hash(&mut hasher);
let hash = hasher.finish();
match self.map.get(&hash) {
Some(key) => {
self.keys.push(Some(*key));
Ok(false)
}
None => {
let key = K::try_from(self.map.len()).map_err(|_| Error::Overflow)?;
self.map.insert(hash, key);
self.keys.push(Some(key));
Ok(true)
}
}
}

Expand All @@ -99,9 +113,14 @@ impl<K: DictionaryKey, M: MutableArray> MutableDictionaryArray<K, M> {
self.keys.push(None)
}

/// returns a mutable reference to the inner values.
fn mut_values(&mut self) -> &mut M {
&mut self.values
}

/// returns a reference to the inner values.
pub fn values(&self) -> &M {
self.map.values()
&self.values
}

/// converts itself into [`Arc<dyn Array>`]
Expand All @@ -123,10 +142,15 @@ impl<K: DictionaryKey, M: MutableArray> MutableDictionaryArray<K, M> {

/// Shrinks the capacity of the [`MutableDictionaryArray`] to fit its current length.
pub fn shrink_to_fit(&mut self) {
self.map.shrink_to_fit();
self.values.shrink_to_fit();
self.keys.shrink_to_fit();
}

/// Returns the dictionary map
pub fn map(&self) -> &HashedMap<u64, K> {
&self.map
}

/// Returns the dictionary keys
pub fn keys(&self) -> &MutablePrimitiveArray<K> {
&self.keys
Expand All @@ -136,7 +160,7 @@ impl<K: DictionaryKey, M: MutableArray> MutableDictionaryArray<K, M> {
DictionaryArray::<K>::try_new(
self.data_type.clone(),
std::mem::take(&mut self.keys).into(),
self.map.take_into(),
self.values.as_box(),
)
.unwrap()
}
Expand Down Expand Up @@ -184,20 +208,17 @@ impl<K: DictionaryKey, M: 'static + MutableArray> MutableArray for MutableDictio
}
}

impl<K, M, T> TryExtend<Option<T>> for MutableDictionaryArray<K, M>
impl<K, M, T: Hash> TryExtend<Option<T>> for MutableDictionaryArray<K, M>
where
K: DictionaryKey,
M: MutableArray + Indexable + TryExtend<Option<T>>,
T: AsIndexed<M>,
M::Type: Eq + Hash,
M: MutableArray + TryExtend<Option<T>>,
{
fn try_extend<II: IntoIterator<Item = Option<T>>>(&mut self, iter: II) -> Result<()> {
for value in iter {
if let Some(value) = value {
let key = self
.map
.try_push_valid(value, |arr, v| arr.try_extend(std::iter::once(Some(v))))?;
self.keys.try_push(Some(key))?;
if self.try_push_valid(&value)? {
self.mut_values().try_extend(std::iter::once(Some(value)))?;
}
} else {
self.push_null();
}
Expand All @@ -209,19 +230,19 @@ where
impl<K, M, T> TryPush<Option<T>> for MutableDictionaryArray<K, M>
where
K: DictionaryKey,
M: MutableArray + Indexable + TryPush<Option<T>>,
T: AsIndexed<M>,
M::Type: Eq + Hash,
M: MutableArray + TryPush<Option<T>>,
T: Hash,
{
fn try_push(&mut self, item: Option<T>) -> Result<()> {
if let Some(value) = item {
let key = self
.map
.try_push_valid(value, |arr, v| arr.try_push(Some(v)))?;
self.keys.try_push(Some(key))?;
if self.try_push_valid(&value)? {
self.values.try_push(Some(value))
} else {
Ok(())
}
} else {
self.push_null();
Ok(())
}
Ok(())
}
}
Loading

0 comments on commit b2017d7

Please sign in to comment.