Skip to content

Commit

Permalink
Initial support for StringView, merge changes from string-view de…
Browse files Browse the repository at this point in the history
…velopment branch (#11402)

* Update `string-view` branch to arrow-rs main (#10966)

* Pin to arrow main

* Fix clippy with latest arrow

* Uncomment test that needs new arrow-rs to work

* Update datafusion-cli Cargo.lock

* Update Cargo.lock

* tapelo

* feat: Implement equality = and inequality <> support for StringView (#10985)

* feat: Implement equality = and inequality <> support for StringView

* chore: Add tests for the StringView

* chore

* chore: Update tests for NULL

* fix: Used build_array_string!

* chore: Update string_coercion function to handle Utf8View type in binary.rs

* chore: add tests

* chore: ci

* Add more StringView comparison test coverage (#10997)

* Add more StringView comparison test coverage

* add reference

* Add another test showing casting on columns works correctly

* feat: Implement equality = and inequality <> support for BinaryView (#11004)

* feat: Implement equality = and inequality <> support for BinaryView

Signed-off-by: Chojan Shang <[email protected]>

* chore: make fmt happy

Signed-off-by: Chojan Shang <[email protected]>

---------

Signed-off-by: Chojan Shang <[email protected]>

* Implement support for LargeString and LargeBinary for StringView and BinaryView (#11034)

* implement large binary

* add tests for large string

* better comments for string coercion

* Improve filter predicates with `Utf8View` literals (#11043)

* refactor: Improve type coercion logic in TypeCoercionRewriter

* refactor: Improve type coercion logic in TypeCoercionRewriter

* chore

* chore: Update test

* refactor: Improve type coercion logic in TypeCoercionRewriter

* refactor: Remove unused import and update code formatting in unwrap_cast_in_comparison.rs

* Remove arrow-patch

---------

Signed-off-by: Chojan Shang <[email protected]>
Co-authored-by: Alex Huang <[email protected]>
Co-authored-by: Chojan Shang <[email protected]>
Co-authored-by: Xiangpeng Hao <[email protected]>
  • Loading branch information
4 people authored Jul 16, 2024
1 parent f11bdf0 commit ccb4baf
Show file tree
Hide file tree
Showing 5 changed files with 566 additions and 32 deletions.
8 changes: 2 additions & 6 deletions datafusion/common/src/scalar/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1682,8 +1682,10 @@ impl ScalarValue {
DataType::UInt16 => build_array_primitive!(UInt16Array, UInt16),
DataType::UInt32 => build_array_primitive!(UInt32Array, UInt32),
DataType::UInt64 => build_array_primitive!(UInt64Array, UInt64),
DataType::Utf8View => build_array_string!(StringViewArray, Utf8View),
DataType::Utf8 => build_array_string!(StringArray, Utf8),
DataType::LargeUtf8 => build_array_string!(LargeStringArray, LargeUtf8),
DataType::BinaryView => build_array_string!(BinaryViewArray, BinaryView),
DataType::Binary => build_array_string!(BinaryArray, Binary),
DataType::LargeBinary => build_array_string!(LargeBinaryArray, LargeBinary),
DataType::Date32 => build_array_primitive!(Date32Array, Date32),
Expand Down Expand Up @@ -1841,8 +1843,6 @@ impl ScalarValue {
| DataType::Time64(TimeUnit::Millisecond)
| DataType::Map(_, _)
| DataType::RunEndEncoded(_, _)
| DataType::Utf8View
| DataType::BinaryView
| DataType::ListView(_)
| DataType::LargeListView(_) => {
return _internal_err!(
Expand Down Expand Up @@ -5695,16 +5695,12 @@ mod tests {
DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)),
);

// needs https://github.com/apache/arrow-rs/issues/5893
/*
check_scalar_cast(ScalarValue::Utf8(None), DataType::Utf8View);
check_scalar_cast(ScalarValue::from("foo"), DataType::Utf8View);
check_scalar_cast(
ScalarValue::from("larger than 12 bytes string"),
DataType::Utf8View,
);
*/
}

// mimics how casting work on scalar values by `casting` `scalar` to `desired_type`
Expand Down
36 changes: 26 additions & 10 deletions datafusion/expr/src/type_coercion/binary.rs
Original file line number Diff line number Diff line change
Expand Up @@ -919,16 +919,21 @@ fn string_concat_internal_coercion(
}
}

/// Coercion rules for string types (Utf8/LargeUtf8): If at least one argument is
/// a string type and both arguments can be coerced into a string type, coerce
/// to string type.
/// Coercion rules for string view types (Utf8/LargeUtf8/Utf8View):
/// If at least one argument is a string view, we coerce to string view
/// based on the observation that StringArray to StringViewArray is cheap but not vice versa.
///
/// Between Utf8 and LargeUtf8, we coerce to LargeUtf8.
fn string_coercion(lhs_type: &DataType, rhs_type: &DataType) -> Option<DataType> {
use arrow::datatypes::DataType::*;
match (lhs_type, rhs_type) {
// If Utf8View is in any side, we coerce to Utf8View.
(Utf8View, Utf8View | Utf8 | LargeUtf8) | (Utf8 | LargeUtf8, Utf8View) => {
Some(Utf8View)
}
// Then, if LargeUtf8 is in any side, we coerce to LargeUtf8.
(LargeUtf8, Utf8 | LargeUtf8) | (Utf8, LargeUtf8) => Some(LargeUtf8),
(Utf8, Utf8) => Some(Utf8),
(LargeUtf8, Utf8) => Some(LargeUtf8),
(Utf8, LargeUtf8) => Some(LargeUtf8),
(LargeUtf8, LargeUtf8) => Some(LargeUtf8),
_ => None,
}
}
Expand Down Expand Up @@ -975,15 +980,26 @@ fn binary_to_string_coercion(
}
}

/// Coercion rules for binary types (Binary/LargeBinary): If at least one argument is
/// Coercion rules for binary types (Binary/LargeBinary/BinaryView): If at least one argument is
/// a binary type and both arguments can be coerced into a binary type, coerce
/// to binary type.
fn binary_coercion(lhs_type: &DataType, rhs_type: &DataType) -> Option<DataType> {
use arrow::datatypes::DataType::*;
match (lhs_type, rhs_type) {
(Binary | Utf8, Binary) | (Binary, Utf8) => Some(Binary),
(LargeBinary | Binary | Utf8 | LargeUtf8, LargeBinary)
| (LargeBinary, Binary | Utf8 | LargeUtf8) => Some(LargeBinary),
// If BinaryView is in any side, we coerce to BinaryView.
(BinaryView, BinaryView | Binary | LargeBinary | Utf8 | LargeUtf8 | Utf8View)
| (LargeBinary | Binary | Utf8 | LargeUtf8 | Utf8View, BinaryView) => {
Some(BinaryView)
}
// Prefer LargeBinary over Binary
(LargeBinary | Binary | Utf8 | LargeUtf8 | Utf8View, LargeBinary)
| (LargeBinary, Binary | Utf8 | LargeUtf8 | Utf8View) => Some(LargeBinary),

// If Utf8View/LargeUtf8 presents need to be large Binary
(Utf8View | LargeUtf8, Binary) | (Binary, Utf8View | LargeUtf8) => {
Some(LargeBinary)
}
(Binary, Utf8) | (Utf8, Binary) => Some(Binary),
_ => None,
}
}
Expand Down
26 changes: 10 additions & 16 deletions datafusion/optimizer/src/unwrap_cast_in_comparison.rs
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ use datafusion_common::tree_node::{Transformed, TreeNode, TreeNodeRewriter};
use datafusion_common::{internal_err, DFSchema, DFSchemaRef, Result, ScalarValue};
use datafusion_expr::expr::{BinaryExpr, Cast, InList, TryCast};
use datafusion_expr::utils::merge_schema;
use datafusion_expr::{lit, Expr, ExprSchemable, LogicalPlan, Operator};
use datafusion_expr::{lit, Expr, ExprSchemable, LogicalPlan};

/// [`UnwrapCastInComparison`] attempts to remove casts from
/// comparisons to literals ([`ScalarValue`]s) by applying the casts
Expand Down Expand Up @@ -146,7 +146,7 @@ impl TreeNodeRewriter for UnwrapCastExprRewriter {
};
is_supported_type(&left_type)
&& is_supported_type(&right_type)
&& is_comparison_op(op)
&& op.is_comparison_operator()
} =>
{
match (left.as_mut(), right.as_mut()) {
Expand Down Expand Up @@ -262,18 +262,6 @@ impl TreeNodeRewriter for UnwrapCastExprRewriter {
}
}

fn is_comparison_op(op: &Operator) -> bool {
matches!(
op,
Operator::Eq
| Operator::NotEq
| Operator::Gt
| Operator::GtEq
| Operator::Lt
| Operator::LtEq
)
}

/// Returns true if [UnwrapCastExprRewriter] supports this data type
fn is_supported_type(data_type: &DataType) -> bool {
is_supported_numeric_type(data_type)
Expand All @@ -300,7 +288,10 @@ fn is_supported_numeric_type(data_type: &DataType) -> bool {

/// Returns true if [UnwrapCastExprRewriter] supports casting this value as a string
fn is_supported_string_type(data_type: &DataType) -> bool {
matches!(data_type, DataType::Utf8 | DataType::LargeUtf8)
matches!(
data_type,
DataType::Utf8 | DataType::LargeUtf8 | DataType::Utf8View
)
}

/// Returns true if [UnwrapCastExprRewriter] supports casting this value as a dictionary
Expand Down Expand Up @@ -473,12 +464,15 @@ fn try_cast_string_literal(
target_type: &DataType,
) -> Option<ScalarValue> {
let string_value = match lit_value {
ScalarValue::Utf8(s) | ScalarValue::LargeUtf8(s) => s.clone(),
ScalarValue::Utf8(s) | ScalarValue::LargeUtf8(s) | ScalarValue::Utf8View(s) => {
s.clone()
}
_ => return None,
};
let scalar_value = match target_type {
DataType::Utf8 => ScalarValue::Utf8(string_value),
DataType::LargeUtf8 => ScalarValue::LargeUtf8(string_value),
DataType::Utf8View => ScalarValue::Utf8View(string_value),
_ => return None,
};
Some(scalar_value)
Expand Down
202 changes: 202 additions & 0 deletions datafusion/sqllogictest/test_files/binary_view.slt
Original file line number Diff line number Diff line change
@@ -0,0 +1,202 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at

# http://www.apache.org/licenses/LICENSE-2.0

# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

########
## Test setup
########

statement ok
create table test_source as values
('Andrew', 'X'),
('Xiangpeng', 'Xiangpeng'),
('Raphael', 'R'),
(NULL, 'R')
;

# Table with the different combination of column types
statement ok
CREATE TABLE test AS
SELECT
arrow_cast(column1, 'Utf8') as column1_utf8,
arrow_cast(column2, 'Utf8') as column2_utf8,
arrow_cast(column1, 'Binary') AS column1_binary,
arrow_cast(column2, 'Binary') AS column2_binary,
arrow_cast(column1, 'LargeBinary') AS column1_large_binary,
arrow_cast(column2, 'LargeBinary') AS column2_large_binary,
arrow_cast(arrow_cast(column1, 'Binary'), 'BinaryView') AS column1_binaryview,
arrow_cast(arrow_cast(column2, 'Binary'), 'BinaryView') AS column2_binaryview,
arrow_cast(column1, 'Dictionary(Int32, Binary)') AS column1_dict,
arrow_cast(column2, 'Dictionary(Int32, Binary)') AS column2_dict
FROM test_source;

statement ok
drop table test_source

########
## BinaryView to BinaryView
########

# BinaryView scalar to BinaryView scalar

query BBBB
SELECT
arrow_cast(arrow_cast('NULL', 'Binary'), 'BinaryView') = arrow_cast(arrow_cast('Andrew', 'Binary'), 'BinaryView') AS comparison1,
arrow_cast(arrow_cast('NULL', 'Binary'), 'BinaryView') <> arrow_cast(arrow_cast('Andrew', 'Binary'), 'BinaryView') AS comparison2,
arrow_cast(arrow_cast('Andrew', 'Binary'), 'BinaryView') = arrow_cast(arrow_cast('Andrew', 'Binary'), 'BinaryView') AS comparison3,
arrow_cast(arrow_cast('Xiangpeng', 'Binary'), 'BinaryView') <> arrow_cast(arrow_cast('Andrew', 'Binary'), 'BinaryView') AS comparison4;
----
false true true true


# BinaryView column to BinaryView column comparison as filters

query TT
select column1_utf8, column2_utf8 from test where column1_binaryview = column2_binaryview;
----
Xiangpeng Xiangpeng

query TT
select column1_utf8, column2_utf8 from test where column1_binaryview <> column2_binaryview;
----
Andrew X
Raphael R

# BinaryView column to BinaryView column
query TTBB
select
column1_utf8, column2_utf8,
column1_binaryview = column2_binaryview,
column1_binaryview <> column2_binaryview
from test;
----
Andrew X false true
Xiangpeng Xiangpeng true false
Raphael R false true
NULL R NULL NULL

# BinaryView column to BinaryView scalar comparison
query TTBBBB
select
column1_utf8, column2_utf8,
column1_binaryview = arrow_cast(arrow_cast('Andrew', 'Binary'), 'BinaryView'),
arrow_cast(arrow_cast('Andrew', 'Binary'), 'BinaryView') = column1_binaryview,
column1_binaryview <> arrow_cast(arrow_cast('Andrew', 'Binary'), 'BinaryView'),
arrow_cast(arrow_cast('Andrew', 'Binary'), 'BinaryView') <> column1_binaryview
from test;
----
Andrew X true true false false
Xiangpeng Xiangpeng false false true true
Raphael R false false true true
NULL R NULL NULL NULL NULL

########
## BinaryView to Binary
########

# test BinaryViewArray with Binary columns
query TTBBBB
select
column1_utf8, column2_utf8,
column1_binaryview = column2_binary,
column2_binary = column1_binaryview,
column1_binaryview <> column2_binary,
column2_binary <> column1_binaryview
from test;
----
Andrew X false false true true
Xiangpeng Xiangpeng true true false false
Raphael R false false true true
NULL R NULL NULL NULL NULL

# test BinaryViewArray with LargeBinary columns
query TTBBBB
select
column1_utf8, column2_utf8,
column1_binaryview = column2_large_binary,
column2_large_binary = column1_binaryview,
column1_binaryview <> column2_large_binary,
column2_large_binary <> column1_binaryview
from test;
----
Andrew X false false true true
Xiangpeng Xiangpeng true true false false
Raphael R false false true true
NULL R NULL NULL NULL NULL

# BinaryView column to Binary scalar
query TTBBBB
select
column1_utf8, column2_utf8,
column1_binaryview = arrow_cast('Andrew', 'Binary'),
arrow_cast('Andrew', 'Binary') = column1_binaryview,
column1_binaryview <> arrow_cast('Andrew', 'Binary'),
arrow_cast('Andrew', 'Binary') <> column1_binaryview
from test;
----
Andrew X true true false false
Xiangpeng Xiangpeng false false true true
Raphael R false false true true
NULL R NULL NULL NULL NULL

# BinaryView column to LargeBinary scalar
query TTBBBB
select
column1_utf8, column2_utf8,
column1_binaryview = arrow_cast('Andrew', 'LargeBinary'),
arrow_cast('Andrew', 'LargeBinary') = column1_binaryview,
column1_binaryview <> arrow_cast('Andrew', 'LargeBinary'),
arrow_cast('Andrew', 'LargeBinary') <> column1_binaryview
from test;
----
Andrew X true true false false
Xiangpeng Xiangpeng false false true true
Raphael R false false true true
NULL R NULL NULL NULL NULL

# Binary column to BinaryView scalar
query TTBBBB
select
column1_utf8, column2_utf8,
column1_binary = arrow_cast(arrow_cast('Andrew', 'Binary'), 'BinaryView'),
arrow_cast(arrow_cast('Andrew', 'Binary'), 'BinaryView') = column1_binary,
column1_binary <> arrow_cast(arrow_cast('Andrew', 'Binary'), 'BinaryView'),
arrow_cast(arrow_cast('Andrew', 'Binary'), 'BinaryView') <> column1_binary
from test;
----
Andrew X true true false false
Xiangpeng Xiangpeng false false true true
Raphael R false false true true
NULL R NULL NULL NULL NULL


# LargeBinary column to BinaryView scalar
query TTBBBB
select
column1_utf8, column2_utf8,
column1_large_binary = arrow_cast(arrow_cast('Andrew', 'Binary'), 'BinaryView'),
arrow_cast(arrow_cast('Andrew', 'Binary'), 'BinaryView') = column1_large_binary,
column1_large_binary <> arrow_cast(arrow_cast('Andrew', 'Binary'), 'BinaryView'),
arrow_cast(arrow_cast('Andrew', 'Binary'), 'BinaryView') <> column1_large_binary
from test;
----
Andrew X true true false false
Xiangpeng Xiangpeng false false true true
Raphael R false false true true
NULL R NULL NULL NULL NULL

statement ok
drop table test;
Loading

0 comments on commit ccb4baf

Please sign in to comment.