From 34f58f96d421302aa2c67b689ff1a63ac77fa1a1 Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Fri, 22 Jul 2022 10:02:24 -0700 Subject: [PATCH] Add validation for Decimal256 (#2113) * Add validation for decimal256 * Add validation in array data --- arrow/src/array/array_decimal.rs | 2 +- arrow/src/array/builder/decimal_builder.rs | 67 +++++++++- arrow/src/array/data.rs | 17 ++- arrow/src/datatypes/datatype.rs | 144 ++++++++++++++++++++- 4 files changed, 225 insertions(+), 5 deletions(-) diff --git a/arrow/src/array/array_decimal.rs b/arrow/src/array/array_decimal.rs index f42192658789..494b4e9d1a99 100644 --- a/arrow/src/array/array_decimal.rs +++ b/arrow/src/array/array_decimal.rs @@ -352,7 +352,7 @@ impl From for Decimal256Array { assert_eq!( data.buffers().len(), 1, - "Decimal128Array data should contain 1 buffer only (values)" + "Decimal256Array data should contain 1 buffer only (values)" ); let values = data.buffers()[0].as_ptr(); let (precision, scale) = match data.data_type() { diff --git a/arrow/src/array/builder/decimal_builder.rs b/arrow/src/array/builder/decimal_builder.rs index 42623df95960..d015d3dcecda 100644 --- a/arrow/src/array/builder/decimal_builder.rs +++ b/arrow/src/array/builder/decimal_builder.rs @@ -15,6 +15,7 @@ // specific language governing permissions and limitations // under the License. +use num::BigInt; use std::any::Any; use std::sync::Arc; @@ -25,7 +26,7 @@ use crate::array::{ArrayBuilder, FixedSizeBinaryBuilder}; use crate::error::{ArrowError, Result}; -use crate::datatypes::validate_decimal_precision; +use crate::datatypes::{validate_decimal256_precision, validate_decimal_precision}; use crate::util::decimal::{BasicDecimal, Decimal256}; /// Array Builder for [`Decimal128Array`] @@ -51,6 +52,10 @@ pub struct Decimal256Builder { builder: FixedSizeBinaryBuilder, precision: usize, scale: usize, + + /// Should decimal values be validated for compatibility with scale and precision? + /// defaults to true + value_validation: bool, } impl Decimal128Builder { @@ -163,14 +168,35 @@ impl Decimal256Builder { builder: FixedSizeBinaryBuilder::new(capacity, Self::BYTE_LENGTH), precision, scale, + value_validation: true, } } + /// Disable validation + /// + /// # Safety + /// + /// After disabling validation, caller must ensure that appended values are compatible + /// for the specified precision and scale. + pub unsafe fn disable_value_validation(&mut self) { + self.value_validation = false; + } + /// Appends a [`Decimal256`] number into the builder. /// /// Returns an error if `value` has different precision, scale or length in bytes than this builder #[inline] pub fn append_value(&mut self, value: &Decimal256) -> Result<()> { + let value = if self.value_validation { + let raw_bytes = value.raw_value(); + let integer = BigInt::from_signed_bytes_le(raw_bytes); + let value_str = integer.to_string(); + validate_decimal256_precision(&value_str, self.precision)?; + value + } else { + value + }; + if self.precision != value.precision() || self.scale != value.scale() { return Err(ArrowError::InvalidArgumentError( "Decimal value does not have the same precision or scale as Decimal256Builder".to_string() @@ -206,9 +232,10 @@ impl Decimal256Builder { #[cfg(test)] mod tests { use super::*; + use num::Num; use crate::array::array_decimal::{BasicDecimalArray, Decimal128Array}; - use crate::array::Array; + use crate::array::{array_decimal, Array}; use crate::datatypes::DataType; use crate::util::decimal::Decimal128; @@ -298,4 +325,40 @@ mod tests { let value = Decimal256::try_new_from_bytes(40, 6, bytes.as_slice()).unwrap(); builder.append_value(&value).unwrap(); } + + #[test] + #[should_panic( + expected = "9999999999999999999999999999999999999999999999999999999999999999999999999999 is too large to store in a Decimal256 of precision 76. Max is 999999999999999999999999999999999999999999999999999999999999999999999999999" + )] + fn test_decimal256_builder_out_of_range_precision_scale() { + let mut builder = Decimal256Builder::new(30, 76, 6); + + let big_value = BigInt::from_str_radix("9999999999999999999999999999999999999999999999999999999999999999999999999999", 10).unwrap(); + let bytes = big_value.to_signed_bytes_le(); + let value = Decimal256::try_new_from_bytes(76, 6, &bytes).unwrap(); + builder.append_value(&value).unwrap(); + } + + #[test] + #[should_panic( + expected = "9999999999999999999999999999999999999999999999999999999999999999999999999999 is too large to store in a Decimal256 of precision 76. Max is 999999999999999999999999999999999999999999999999999999999999999999999999999" + )] + fn test_decimal256_data_validation() { + let mut builder = Decimal256Builder::new(30, 76, 6); + // Disable validation at builder + unsafe { + builder.disable_value_validation(); + } + + let big_value = BigInt::from_str_radix("9999999999999999999999999999999999999999999999999999999999999999999999999999", 10).unwrap(); + let bytes = big_value.to_signed_bytes_le(); + let value = Decimal256::try_new_from_bytes(76, 6, &bytes).unwrap(); + builder + .append_value(&value) + .expect("should not validate invalid value at builder"); + + let array = builder.finish(); + let array_data = array_decimal::BasicDecimalArray::data(&array); + array_data.validate_values().unwrap(); + } } diff --git a/arrow/src/array/data.rs b/arrow/src/array/data.rs index 3cd42e36619c..4ae7f069e2d2 100644 --- a/arrow/src/array/data.rs +++ b/arrow/src/array/data.rs @@ -18,7 +18,10 @@ //! Contains `ArrayData`, a generic representation of Arrow array data which encapsulates //! common attributes and operations for Arrow array. -use crate::datatypes::{validate_decimal_precision, DataType, IntervalUnit, UnionMode}; +use crate::datatypes::{ + validate_decimal256_precision, validate_decimal_precision, DataType, IntervalUnit, + UnionMode, +}; use crate::error::{ArrowError, Result}; use crate::{bitmap::Bitmap, datatypes::ArrowNativeType}; use crate::{ @@ -26,6 +29,7 @@ use crate::{ util::bit_util, }; use half::f16; +use num::BigInt; use std::convert::TryInto; use std::mem; use std::ops::Range; @@ -1018,6 +1022,17 @@ impl ArrayData { } Ok(()) } + DataType::Decimal256(p, _) => { + let values = self.buffers()[0].as_slice(); + for pos in 0..self.len() { + let offset = pos * 32; + let raw_bytes = &values[offset..offset + 32]; + let integer = BigInt::from_signed_bytes_le(raw_bytes); + let value_str = integer.to_string(); + validate_decimal256_precision(&value_str, *p)?; + } + Ok(()) + } DataType::Utf8 => self.validate_utf8::(), DataType::LargeUtf8 => self.validate_utf8::(), DataType::Binary => self.validate_offsets_full::(self.buffers[1].len()), diff --git a/arrow/src/datatypes/datatype.rs b/arrow/src/datatypes/datatype.rs index 8f787b97a901..74a2ab45080a 100644 --- a/arrow/src/datatypes/datatype.rs +++ b/arrow/src/datatypes/datatype.rs @@ -15,6 +15,7 @@ // specific language governing permissions and limitations // under the License. +use num::{BigInt, Num, ToPrimitive}; use std::fmt; use serde_derive::{Deserialize, Serialize}; @@ -300,6 +301,49 @@ pub const MAX_DECIMAL_FOR_EACH_PRECISION: [i128; 38] = [ 99999999999999999999999999999999999999, ]; +/// `MAX_DECIMAL_FOR_LARGER_PRECISION[p]` holds the maximum integer value +/// that can be stored in [DataType::Decimal256] value of precision `p` > 38 +pub const MAX_DECIMAL_FOR_LARGER_PRECISION: [&str; 38] = [ + "99999999999999999999999999999999999999", + "999999999999999999999999999999999999999", + "9999999999999999999999999999999999999999", + "99999999999999999999999999999999999999999", + "999999999999999999999999999999999999999999", + "9999999999999999999999999999999999999999999", + "99999999999999999999999999999999999999999999", + "999999999999999999999999999999999999999999999", + "9999999999999999999999999999999999999999999999", + "99999999999999999999999999999999999999999999999", + "999999999999999999999999999999999999999999999999", + "9999999999999999999999999999999999999999999999999", + "99999999999999999999999999999999999999999999999999", + "999999999999999999999999999999999999999999999999999", + "9999999999999999999999999999999999999999999999999999", + "99999999999999999999999999999999999999999999999999999", + "999999999999999999999999999999999999999999999999999999", + "9999999999999999999999999999999999999999999999999999999", + "99999999999999999999999999999999999999999999999999999999", + "999999999999999999999999999999999999999999999999999999999", + "9999999999999999999999999999999999999999999999999999999999", + "99999999999999999999999999999999999999999999999999999999999", + "999999999999999999999999999999999999999999999999999999999999", + "9999999999999999999999999999999999999999999999999999999999999", + "99999999999999999999999999999999999999999999999999999999999999", + "999999999999999999999999999999999999999999999999999999999999999", + "9999999999999999999999999999999999999999999999999999999999999999", + "99999999999999999999999999999999999999999999999999999999999999999", + "999999999999999999999999999999999999999999999999999999999999999999", + "9999999999999999999999999999999999999999999999999999999999999999999", + "99999999999999999999999999999999999999999999999999999999999999999999", + "999999999999999999999999999999999999999999999999999999999999999999999", + "9999999999999999999999999999999999999999999999999999999999999999999999", + "99999999999999999999999999999999999999999999999999999999999999999999999", + "999999999999999999999999999999999999999999999999999999999999999999999999", + "9999999999999999999999999999999999999999999999999999999999999999999999999", + "99999999999999999999999999999999999999999999999999999999999999999999999999", + "999999999999999999999999999999999999999999999999999999999999999999999999999", +]; + /// `MIN_DECIMAL_FOR_EACH_PRECISION[p]` holds the minimum `i128` value /// that can be stored in a [DataType::Decimal] value of precision `p` pub const MIN_DECIMAL_FOR_EACH_PRECISION: [i128; 38] = [ @@ -343,13 +387,62 @@ pub const MIN_DECIMAL_FOR_EACH_PRECISION: [i128; 38] = [ -99999999999999999999999999999999999999, ]; +/// `MIN_DECIMAL_FOR_LARGER_PRECISION[p]` holds the minimum integer value +/// that can be stored in a [DataType::Decimal256] value of precision `p` > 38 +pub const MIN_DECIMAL_FOR_LARGER_PRECISION: [&str; 38] = [ + "-99999999999999999999999999999999999999", + "-999999999999999999999999999999999999999", + "-9999999999999999999999999999999999999999", + "-99999999999999999999999999999999999999999", + "-999999999999999999999999999999999999999999", + "-9999999999999999999999999999999999999999999", + "-99999999999999999999999999999999999999999999", + "-999999999999999999999999999999999999999999999", + "-9999999999999999999999999999999999999999999999", + "-99999999999999999999999999999999999999999999999", + "-999999999999999999999999999999999999999999999999", + "-9999999999999999999999999999999999999999999999999", + "-99999999999999999999999999999999999999999999999999", + "-999999999999999999999999999999999999999999999999999", + "-9999999999999999999999999999999999999999999999999999", + "-99999999999999999999999999999999999999999999999999999", + "-999999999999999999999999999999999999999999999999999999", + "-9999999999999999999999999999999999999999999999999999999", + "-99999999999999999999999999999999999999999999999999999999", + "-999999999999999999999999999999999999999999999999999999999", + "-9999999999999999999999999999999999999999999999999999999999", + "-99999999999999999999999999999999999999999999999999999999999", + "-999999999999999999999999999999999999999999999999999999999999", + "-9999999999999999999999999999999999999999999999999999999999999", + "-99999999999999999999999999999999999999999999999999999999999999", + "-999999999999999999999999999999999999999999999999999999999999999", + "-9999999999999999999999999999999999999999999999999999999999999999", + "-99999999999999999999999999999999999999999999999999999999999999999", + "-999999999999999999999999999999999999999999999999999999999999999999", + "-9999999999999999999999999999999999999999999999999999999999999999999", + "-99999999999999999999999999999999999999999999999999999999999999999999", + "-999999999999999999999999999999999999999999999999999999999999999999999", + "-9999999999999999999999999999999999999999999999999999999999999999999999", + "-99999999999999999999999999999999999999999999999999999999999999999999999", + "-999999999999999999999999999999999999999999999999999999999999999999999999", + "-9999999999999999999999999999999999999999999999999999999999999999999999999", + "-99999999999999999999999999999999999999999999999999999999999999999999999999", + "-999999999999999999999999999999999999999999999999999999999999999999999999999", +]; + /// The maximum precision for [DataType::Decimal] values pub const DECIMAL_MAX_PRECISION: usize = 38; /// The maximum scale for [DataType::Decimal] values pub const DECIMAL_MAX_SCALE: usize = 38; -/// The default scale for [DataType::Decimal] values +/// The maximum precision for [DataType::Decimal256] values +pub const DECIMAL256_MAX_PRECISION: usize = 76; + +/// The maximum scale for [DataType::Decimal256] values +pub const DECIMAL256_MAX_SCALE: usize = 76; + +/// The default scale for [DataType::Decimal] and [DataType::Decimal256] values pub const DECIMAL_DEFAULT_SCALE: usize = 10; /// Validates that the specified `i128` value can be properly @@ -379,6 +472,55 @@ pub(crate) fn validate_decimal_precision(value: i128, precision: usize) -> Resul } } +/// Validates that the specified string value can be properly +/// interpreted as a Decimal256 number with precision `precision` +#[inline] +pub(crate) fn validate_decimal256_precision( + value: &str, + precision: usize, +) -> Result { + if precision > 38 { + let max_str = MAX_DECIMAL_FOR_LARGER_PRECISION[precision - 38 - 1]; + let min_str = MIN_DECIMAL_FOR_LARGER_PRECISION[precision - 38 - 1]; + + let max = BigInt::from_str_radix(max_str, 10).unwrap(); + let min = BigInt::from_str_radix(min_str, 10).unwrap(); + + let value = BigInt::from_str_radix(value, 10).unwrap(); + if value > max { + Err(ArrowError::InvalidArgumentError(format!( + "{} is too large to store in a Decimal256 of precision {}. Max is {}", + value, precision, max + ))) + } else if value < min { + Err(ArrowError::InvalidArgumentError(format!( + "{} is too small to store in a Decimal256 of precision {}. Min is {}", + value, precision, min + ))) + } else { + Ok(value) + } + } else { + let max = MAX_DECIMAL_FOR_EACH_PRECISION[precision - 1]; + let min = MIN_DECIMAL_FOR_EACH_PRECISION[precision - 1]; + let value = BigInt::from_str_radix(value, 10).unwrap(); + + if value.to_i128().unwrap() > max { + Err(ArrowError::InvalidArgumentError(format!( + "{} is too large to store in a Decimal256 of precision {}. Max is {}", + value, precision, max + ))) + } else if value.to_i128().unwrap() < min { + Err(ArrowError::InvalidArgumentError(format!( + "{} is too small to store in a Decimal256 of precision {}. Min is {}", + value, precision, min + ))) + } else { + Ok(value) + } + } +} + impl DataType { /// Parse a data type from a JSON representation. pub(crate) fn from(json: &Value) -> Result {