diff --git a/connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/v2/DB2IntegrationSuite.scala b/connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/v2/DB2IntegrationSuite.scala index 4b2bbbdd84..1a25cd2802 100644 --- a/connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/v2/DB2IntegrationSuite.scala +++ b/connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/v2/DB2IntegrationSuite.scala @@ -106,4 +106,8 @@ class DB2IntegrationSuite extends DockerJDBCIntegrationV2Suite with V2JDBCTest { testStddevSamp(true) testCovarPop() testCovarSamp() + testRegrIntercept() + testRegrSlope() + testRegrR2() + testRegrSXY() } diff --git a/connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/v2/OracleIntegrationSuite.scala b/connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/v2/OracleIntegrationSuite.scala index 8bc79a244e..5de7608918 100644 --- a/connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/v2/OracleIntegrationSuite.scala +++ b/connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/v2/OracleIntegrationSuite.scala @@ -111,4 +111,8 @@ class OracleIntegrationSuite extends DockerJDBCIntegrationV2Suite with V2JDBCTes testCovarPop() testCovarSamp() testCorr() + testRegrIntercept() + testRegrSlope() + testRegrR2() + testRegrSXY() } diff --git a/connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/v2/PostgresIntegrationSuite.scala b/connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/v2/PostgresIntegrationSuite.scala index 77ace3f3f4..1ff7527c97 100644 --- a/connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/v2/PostgresIntegrationSuite.scala +++ b/connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/v2/PostgresIntegrationSuite.scala @@ -104,4 +104,12 @@ class PostgresIntegrationSuite extends 
DockerJDBCIntegrationV2Suite with V2JDBCT testCovarSamp(true) testCorr() testCorr(true) + testRegrIntercept() + testRegrIntercept(true) + testRegrSlope() + testRegrSlope(true) + testRegrR2() + testRegrR2(true) + testRegrSXY() + testRegrSXY(true) } diff --git a/connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/v2/V2JDBCTest.scala b/connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/v2/V2JDBCTest.scala index 0f85bd534c..543c8465ed 100644 --- a/connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/v2/V2JDBCTest.scala +++ b/connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/v2/V2JDBCTest.scala @@ -406,9 +406,11 @@ private[v2] trait V2JDBCTest extends SharedSparkSession with DockerIntegrationFu protected def caseConvert(tableName: String): String = tableName + private def withOrWithout(isDistinct: Boolean): String = if (isDistinct) "with" else "without" + protected def testVarPop(isDistinct: Boolean = false): Unit = { val distinct = if (isDistinct) "DISTINCT " else "" - test(s"scan with aggregate push-down: VAR_POP with distinct: $isDistinct") { + test(s"scan with aggregate push-down: VAR_POP ${withOrWithout(isDistinct)} DISTINCT") { val df = sql(s"SELECT VAR_POP(${distinct}bonus) FROM $catalogAndNamespace." 
+ s"${caseConvert("employee")} WHERE dept > 0 GROUP BY dept ORDER BY dept") checkFilterPushed(df) @@ -416,15 +418,15 @@ private[v2] trait V2JDBCTest extends SharedSparkSession with DockerIntegrationFu checkAggregatePushed(df, "VAR_POP") val row = df.collect() assert(row.length === 3) - assert(row(0).getDouble(0) === 10000d) - assert(row(1).getDouble(0) === 2500d) - assert(row(2).getDouble(0) === 0d) + assert(row(0).getDouble(0) === 10000.0) + assert(row(1).getDouble(0) === 2500.0) + assert(row(2).getDouble(0) === 0.0) } } protected def testVarSamp(isDistinct: Boolean = false): Unit = { val distinct = if (isDistinct) "DISTINCT " else "" - test(s"scan with aggregate push-down: VAR_SAMP with distinct: $isDistinct") { + test(s"scan with aggregate push-down: VAR_SAMP ${withOrWithout(isDistinct)} DISTINCT") { val df = sql( s"SELECT VAR_SAMP(${distinct}bonus) FROM $catalogAndNamespace." + s"${caseConvert("employee")} WHERE dept > 0 GROUP BY dept ORDER BY dept") @@ -433,15 +435,15 @@ private[v2] trait V2JDBCTest extends SharedSparkSession with DockerIntegrationFu checkAggregatePushed(df, "VAR_SAMP") val row = df.collect() assert(row.length === 3) - assert(row(0).getDouble(0) === 20000d) - assert(row(1).getDouble(0) === 5000d) + assert(row(0).getDouble(0) === 20000.0) + assert(row(1).getDouble(0) === 5000.0) assert(row(2).isNullAt(0)) } } protected def testStddevPop(isDistinct: Boolean = false): Unit = { val distinct = if (isDistinct) "DISTINCT " else "" - test(s"scan with aggregate push-down: STDDEV_POP with distinct: $isDistinct") { + test(s"scan with aggregate push-down: STDDEV_POP ${withOrWithout(isDistinct)} DISTINCT") { val df = sql( s"SELECT STDDEV_POP(${distinct}bonus) FROM $catalogAndNamespace." 
+ s"${caseConvert("employee")} WHERE dept > 0 GROUP BY dept ORDER BY dept") @@ -450,15 +452,15 @@ private[v2] trait V2JDBCTest extends SharedSparkSession with DockerIntegrationFu checkAggregatePushed(df, "STDDEV_POP") val row = df.collect() assert(row.length === 3) - assert(row(0).getDouble(0) === 100d) - assert(row(1).getDouble(0) === 50d) - assert(row(2).getDouble(0) === 0d) + assert(row(0).getDouble(0) === 100.0) + assert(row(1).getDouble(0) === 50.0) + assert(row(2).getDouble(0) === 0.0) } } protected def testStddevSamp(isDistinct: Boolean = false): Unit = { val distinct = if (isDistinct) "DISTINCT " else "" - test(s"scan with aggregate push-down: STDDEV_SAMP with distinct: $isDistinct") { + test(s"scan with aggregate push-down: STDDEV_SAMP ${withOrWithout(isDistinct)} DISTINCT") { val df = sql( s"SELECT STDDEV_SAMP(${distinct}bonus) FROM $catalogAndNamespace." + s"${caseConvert("employee")} WHERE dept > 0 GROUP BY dept ORDER BY dept") @@ -467,15 +469,15 @@ private[v2] trait V2JDBCTest extends SharedSparkSession with DockerIntegrationFu checkAggregatePushed(df, "STDDEV_SAMP") val row = df.collect() assert(row.length === 3) - assert(row(0).getDouble(0) === 141.4213562373095d) - assert(row(1).getDouble(0) === 70.71067811865476d) + assert(row(0).getDouble(0) === 141.4213562373095) + assert(row(1).getDouble(0) === 70.71067811865476) assert(row(2).isNullAt(0)) } } protected def testCovarPop(isDistinct: Boolean = false): Unit = { val distinct = if (isDistinct) "DISTINCT " else "" - test(s"scan with aggregate push-down: COVAR_POP with distinct: $isDistinct") { + test(s"scan with aggregate push-down: COVAR_POP ${withOrWithout(isDistinct)} DISTINCT") { val df = sql( s"SELECT COVAR_POP(${distinct}bonus, bonus) FROM $catalogAndNamespace." 
+ s"${caseConvert("employee")} WHERE dept > 0 GROUP BY dept ORDER BY dept") @@ -484,15 +486,15 @@ private[v2] trait V2JDBCTest extends SharedSparkSession with DockerIntegrationFu checkAggregatePushed(df, "COVAR_POP") val row = df.collect() assert(row.length === 3) - assert(row(0).getDouble(0) === 10000d) - assert(row(1).getDouble(0) === 2500d) - assert(row(2).getDouble(0) === 0d) + assert(row(0).getDouble(0) === 10000.0) + assert(row(1).getDouble(0) === 2500.0) + assert(row(2).getDouble(0) === 0.0) } } protected def testCovarSamp(isDistinct: Boolean = false): Unit = { val distinct = if (isDistinct) "DISTINCT " else "" - test(s"scan with aggregate push-down: COVAR_SAMP with distinct: $isDistinct") { + test(s"scan with aggregate push-down: COVAR_SAMP ${withOrWithout(isDistinct)} DISTINCT") { val df = sql( s"SELECT COVAR_SAMP(${distinct}bonus, bonus) FROM $catalogAndNamespace." + s"${caseConvert("employee")} WHERE dept > 0 GROUP BY dept ORDER BY dept") @@ -501,15 +503,15 @@ private[v2] trait V2JDBCTest extends SharedSparkSession with DockerIntegrationFu checkAggregatePushed(df, "COVAR_SAMP") val row = df.collect() assert(row.length === 3) - assert(row(0).getDouble(0) === 20000d) - assert(row(1).getDouble(0) === 5000d) + assert(row(0).getDouble(0) === 20000.0) + assert(row(1).getDouble(0) === 5000.0) assert(row(2).isNullAt(0)) } } protected def testCorr(isDistinct: Boolean = false): Unit = { val distinct = if (isDistinct) "DISTINCT " else "" - test(s"scan with aggregate push-down: CORR with distinct: $isDistinct") { + test(s"scan with aggregate push-down: CORR ${withOrWithout(isDistinct)} DISTINCT") { val df = sql( s"SELECT CORR(${distinct}bonus, bonus) FROM $catalogAndNamespace." 
+ s"${caseConvert("employee")} WHERE dept > 0 GROUP BY dept ORDER BY dept") @@ -518,9 +520,77 @@ private[v2] trait V2JDBCTest extends SharedSparkSession with DockerIntegrationFu checkAggregatePushed(df, "CORR") val row = df.collect() assert(row.length === 3) - assert(row(0).getDouble(0) === 1d) - assert(row(1).getDouble(0) === 1d) + assert(row(0).getDouble(0) === 1.0) + assert(row(1).getDouble(0) === 1.0) + assert(row(2).isNullAt(0)) + } + } + + protected def testRegrIntercept(isDistinct: Boolean = false): Unit = { + val distinct = if (isDistinct) "DISTINCT " else "" + test(s"scan with aggregate push-down: REGR_INTERCEPT ${withOrWithout(isDistinct)} DISTINCT") { + val df = sql( + s"SELECT REGR_INTERCEPT(${distinct}bonus, bonus) FROM $catalogAndNamespace." + + s"${caseConvert("employee")} WHERE dept > 0 GROUP BY dept ORDER BY dept") + checkFilterPushed(df) + checkAggregateRemoved(df) + checkAggregatePushed(df, "REGR_INTERCEPT") + val row = df.collect() + assert(row.length === 3) + assert(row(0).getDouble(0) === 0.0) + assert(row(1).getDouble(0) === 0.0) + assert(row(2).isNullAt(0)) + } + } + + protected def testRegrSlope(isDistinct: Boolean = false): Unit = { + val distinct = if (isDistinct) "DISTINCT " else "" + test(s"scan with aggregate push-down: REGR_SLOPE ${withOrWithout(isDistinct)} DISTINCT") { + val df = sql( + s"SELECT REGR_SLOPE(${distinct}bonus, bonus) FROM $catalogAndNamespace." 
+ + s"${caseConvert("employee")} WHERE dept > 0 GROUP BY dept ORDER BY dept") + checkFilterPushed(df) + checkAggregateRemoved(df) + checkAggregatePushed(df, "REGR_SLOPE") + val row = df.collect() + assert(row.length === 3) + assert(row(0).getDouble(0) === 1.0) + assert(row(1).getDouble(0) === 1.0) + assert(row(2).isNullAt(0)) + } + } + + protected def testRegrR2(isDistinct: Boolean = false): Unit = { + val distinct = if (isDistinct) "DISTINCT " else "" + test(s"scan with aggregate push-down: REGR_R2 ${withOrWithout(isDistinct)} DISTINCT") { + val df = sql( + s"SELECT REGR_R2(${distinct}bonus, bonus) FROM $catalogAndNamespace." + + s"${caseConvert("employee")} WHERE dept > 0 GROUP BY dept ORDER BY dept") + checkFilterPushed(df) + checkAggregateRemoved(df) + checkAggregatePushed(df, "REGR_R2") + val row = df.collect() + assert(row.length === 3) + assert(row(0).getDouble(0) === 1.0) + assert(row(1).getDouble(0) === 1.0) assert(row(2).isNullAt(0)) } } + + protected def testRegrSXY(isDistinct: Boolean = false): Unit = { + val distinct = if (isDistinct) "DISTINCT " else "" + test(s"scan with aggregate push-down: REGR_SXY ${withOrWithout(isDistinct)} DISTINCT") { + val df = sql( + s"SELECT REGR_SXY(${distinct}bonus, bonus) FROM $catalogAndNamespace." 
+ + s"${caseConvert("employee")} WHERE dept > 0 GROUP BY dept ORDER BY dept") + checkFilterPushed(df) + checkAggregateRemoved(df) + checkAggregatePushed(df, "REGR_SXY") + val row = df.collect() + assert(row.length === 3) + assert(row(0).getDouble(0) === 20000.0) + assert(row(1).getDouble(0) === 5000.0) + assert(row(2).getDouble(0) === 0.0) + } + } } diff --git a/dev/deps/spark-deps-hadoop-2-hive-2.3 b/dev/deps/spark-deps-hadoop-2-hive-2.3 index ccc0d60701..af1db05ff4 100644 --- a/dev/deps/spark-deps-hadoop-2-hive-2.3 +++ b/dev/deps/spark-deps-hadoop-2-hive-2.3 @@ -39,7 +39,7 @@ commons-cli/1.5.0//commons-cli-1.5.0.jar commons-codec/1.15//commons-codec-1.15.jar commons-collections/3.2.2//commons-collections-3.2.2.jar commons-collections4/4.4//commons-collections4-4.4.jar -commons-compiler/3.0.16//commons-compiler-3.0.16.jar +commons-compiler/3.1.7//commons-compiler-3.1.7.jar commons-compress/1.21//commons-compress-1.21.jar commons-configuration/1.6//commons-configuration-1.6.jar commons-crypto/1.1.0//commons-crypto-1.1.0.jar @@ -128,7 +128,7 @@ jakarta.servlet-api/4.0.3//jakarta.servlet-api-4.0.3.jar jakarta.validation-api/2.0.2//jakarta.validation-api-2.0.2.jar jakarta.ws.rs-api/2.1.6//jakarta.ws.rs-api-2.1.6.jar jakarta.xml.bind-api/2.3.2//jakarta.xml.bind-api-2.3.2.jar -janino/3.0.16//janino-3.0.16.jar +janino/3.1.7//janino-3.1.7.jar javassist/3.25.0-GA//javassist-3.25.0-GA.jar javax.inject/1//javax.inject-1.jar javax.jdo/3.2.0-m3//javax.jdo-3.2.0-m3.jar diff --git a/dev/deps/spark-deps-hadoop-3-hive-2.3 b/dev/deps/spark-deps-hadoop-3-hive-2.3 index 31ada151e2..2864498928 100644 --- a/dev/deps/spark-deps-hadoop-3-hive-2.3 +++ b/dev/deps/spark-deps-hadoop-3-hive-2.3 @@ -40,7 +40,7 @@ commons-cli/1.5.0//commons-cli-1.5.0.jar commons-codec/1.15//commons-codec-1.15.jar commons-collections/3.2.2//commons-collections-3.2.2.jar commons-collections4/4.4//commons-collections4-4.4.jar -commons-compiler/3.0.16//commons-compiler-3.0.16.jar 
+commons-compiler/3.1.7//commons-compiler-3.1.7.jar commons-compress/1.21//commons-compress-1.21.jar commons-crypto/1.1.0//commons-crypto-1.1.0.jar commons-dbcp/1.4//commons-dbcp-1.4.jar @@ -116,7 +116,7 @@ jakarta.servlet-api/4.0.3//jakarta.servlet-api-4.0.3.jar jakarta.validation-api/2.0.2//jakarta.validation-api-2.0.2.jar jakarta.ws.rs-api/2.1.6//jakarta.ws.rs-api-2.1.6.jar jakarta.xml.bind-api/2.3.2//jakarta.xml.bind-api-2.3.2.jar -janino/3.0.16//janino-3.0.16.jar +janino/3.1.7//janino-3.1.7.jar javassist/3.25.0-GA//javassist-3.25.0-GA.jar javax.jdo/3.2.0-m3//javax.jdo-3.2.0-m3.jar javolution/5.5.1//javolution-5.5.1.jar diff --git a/dev/sparktestsupport/modules.py b/dev/sparktestsupport/modules.py index 8b06965332..d776410fd2 100644 --- a/dev/sparktestsupport/modules.py +++ b/dev/sparktestsupport/modules.py @@ -647,6 +647,7 @@ def __hash__(self): "pyspark.pandas.tests.test_resample", "pyspark.pandas.tests.test_reshape", "pyspark.pandas.tests.test_rolling", + "pyspark.pandas.tests.test_scalars", "pyspark.pandas.tests.test_series_conversion", "pyspark.pandas.tests.test_series_datetime", "pyspark.pandas.tests.test_series_string", diff --git a/docs/sql-ref-ansi-compliance.md b/docs/sql-ref-ansi-compliance.md index bb55cec52f..6ad8210ed7 100644 --- a/docs/sql-ref-ansi-compliance.md +++ b/docs/sql-ref-ansi-compliance.md @@ -319,6 +319,7 @@ When ANSI mode is on, it throws exceptions for invalid operations. You can use t - `try_sum`: identical to the function `sum`, except that it returns `NULL` result instead of throwing an exception on integral/decimal/interval value overflow. - `try_avg`: identical to the function `avg`, except that it returns `NULL` result instead of throwing an exception on decimal/interval value overflow. - `try_element_at`: identical to the function `element_at`, except that it returns `NULL` result instead of throwing an exception on array's index out of bound or map's key not found. 
+ - `try_to_timestamp`: identical to the function `to_timestamp`, except that it returns `NULL` result instead of throwing an exception on string parsing error. ### SQL Keywords (optional, disabled by default) diff --git a/pom.xml b/pom.xml index 1976ea9db9..ffd2baae8a 100644 --- a/pom.xml +++ b/pom.xml @@ -189,7 +189,7 @@ 2.11.1 4.1.17 14.0.1 - 3.0.16 + 3.1.7 2.35 2.10.14 3.5.2 diff --git a/python/docs/source/reference/pyspark.sql/data_types.rst b/python/docs/source/reference/pyspark.sql/data_types.rst index d146c64047..775f0bf394 100644 --- a/python/docs/source/reference/pyspark.sql/data_types.rst +++ b/python/docs/source/reference/pyspark.sql/data_types.rst @@ -40,6 +40,7 @@ Data Types NullType ShortType StringType + VarcharType StructField StructType TimestampType diff --git a/python/pyspark/pandas/__init__.py b/python/pyspark/pandas/__init__.py index e367ef5e25..518326c0c5 100644 --- a/python/pyspark/pandas/__init__.py +++ b/python/pyspark/pandas/__init__.py @@ -27,6 +27,7 @@ from typing import Any from pyspark.pandas.missing.general_functions import _MissingPandasLikeGeneralFunctions +from pyspark.pandas.missing.scalars import _MissingPandasLikeScalars from pyspark.sql.pandas.utils import require_minimum_pandas_version, require_minimum_pyarrow_version try: @@ -158,6 +159,8 @@ def _auto_patch_pandas() -> None: def __getattr__(key: str) -> Any: if key.startswith("__"): raise AttributeError(key) + if hasattr(_MissingPandasLikeScalars, key): + raise getattr(_MissingPandasLikeScalars, key) if hasattr(_MissingPandasLikeGeneralFunctions, key): return getattr(_MissingPandasLikeGeneralFunctions, key) else: diff --git a/python/pyspark/pandas/exceptions.py b/python/pyspark/pandas/exceptions.py index 829c753769..d93f0bf0b6 100644 --- a/python/pyspark/pandas/exceptions.py +++ b/python/pyspark/pandas/exceptions.py @@ -69,10 +69,13 @@ def __init__( method_name: Optional[str] = None, arg_name: Optional[str] = None, property_name: Optional[str] = None, + scalar_name: 
Optional[str] = None, deprecated: bool = False, reason: str = "", ): - assert (method_name is None) != (property_name is None) + assert [method_name is not None, property_name is not None, scalar_name is not None].count( + True + ) == 1 self.class_name = class_name self.method_name = method_name self.arg_name = arg_name @@ -95,6 +98,11 @@ def __init__( msg = "The method `{0}.{1}()` is not implemented{2}".format( class_name, method_name, reason ) + elif scalar_name is not None: + msg = ( + "The scalar `{0}.{1}` is not reimplemented in pyspark.pandas;" + " use `pd.{1}`.".format(class_name, scalar_name) + ) else: if deprecated: msg = ( diff --git a/python/pyspark/pandas/missing/scalars.py b/python/pyspark/pandas/missing/scalars.py new file mode 100644 index 0000000000..a9c0277be0 --- /dev/null +++ b/python/pyspark/pandas/missing/scalars.py @@ -0,0 +1,29 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +from pyspark.pandas.exceptions import PandasNotImplementedError + + +def _unsupported_scalar(scalar_name): + return PandasNotImplementedError(class_name="ps", scalar_name=scalar_name) + + +class _MissingPandasLikeScalars: + Timestamp = _unsupported_scalar("Timestamp") + Timedelta = _unsupported_scalar("Timedelta") + Period = _unsupported_scalar("Period") + Interval = _unsupported_scalar("Interval") + Categorical = _unsupported_scalar("Categorical") diff --git a/python/pyspark/pandas/supported_api_gen.py b/python/pyspark/pandas/supported_api_gen.py index 019ed0ce25..df4e11ebd8 100644 --- a/python/pyspark/pandas/supported_api_gen.py +++ b/python/pyspark/pandas/supported_api_gen.py @@ -27,6 +27,7 @@ import pyspark.pandas as ps import pyspark.pandas.groupby as psg import pyspark.pandas.window as psw +from pyspark.pandas.exceptions import PandasNotImplementedError import pandas as pd import pandas.core.groupby as pdg @@ -131,7 +132,7 @@ def _create_supported_by_module( pd_module = getattr(pd_module_group, module_name) if module_name else pd_module_group try: ps_module = getattr(ps_module_group, module_name) if module_name else ps_module_group - except AttributeError: + except (AttributeError, PandasNotImplementedError): # module not implemented return {} @@ -262,7 +263,7 @@ def _transform_missing( def _get_pd_modules(pd_module_group: Any) -> List[str]: """ - Returns sorted pandas memeber list from pandas module path. + Returns sorted pandas member list from pandas module path. Parameters ---------- diff --git a/python/pyspark/pandas/tests/test_scalars.py b/python/pyspark/pandas/tests/test_scalars.py new file mode 100644 index 0000000000..47efde46b2 --- /dev/null +++ b/python/pyspark/pandas/tests/test_scalars.py @@ -0,0 +1,55 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. 
+# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import inspect + +import pyspark.pandas as ps +from pyspark.pandas.exceptions import PandasNotImplementedError +from pyspark.pandas.missing.scalars import _MissingPandasLikeScalars +from pyspark.testing.pandasutils import PandasOnSparkTestCase + + +class ScalarTest(PandasOnSparkTestCase): + def test_missing(self): + missing_scalars = inspect.getmembers(_MissingPandasLikeScalars) + + missing_scalars = [ + name + for (name, type_) in missing_scalars + if isinstance(type_, PandasNotImplementedError) + ] + + for scalar_name in missing_scalars: + with self.assertRaisesRegex( + PandasNotImplementedError, + "The scalar `ps.{0}` is not reimplemented in pyspark.pandas;" + " use `pd.{0}`.".format(scalar_name), + ): + getattr(ps, scalar_name) + + +if __name__ == "__main__": + import unittest + from pyspark.pandas.tests.test_scalars import * # noqa: F401 + + try: + import xmlrunner # type: ignore[import] + + testRunner = xmlrunner.XMLTestRunner(output="target/test-reports", verbosity=2) + except ImportError: + testRunner = None + unittest.main(testRunner=testRunner, verbosity=2) diff --git a/python/pyspark/sql/tests/test_types.py b/python/pyspark/sql/tests/test_types.py index ef0ad82dbb..218cfc413d 100644 --- a/python/pyspark/sql/tests/test_types.py +++ b/python/pyspark/sql/tests/test_types.py @@ -38,6 +38,7 @@ DayTimeIntervalType, MapType, StringType, + VarcharType, StructType, StructField, ArrayType, @@ 
-739,8 +740,12 @@ def test_parse_datatype_string(self): from pyspark.sql.types import _all_atomic_types, _parse_datatype_string for k, t in _all_atomic_types.items(): - self.assertEqual(t(), _parse_datatype_string(k)) + if k != "varchar": + self.assertEqual(t(), _parse_datatype_string(k)) self.assertEqual(IntegerType(), _parse_datatype_string("int")) + self.assertEqual(VarcharType(1), _parse_datatype_string("varchar(1)")) + self.assertEqual(VarcharType(10), _parse_datatype_string("varchar( 10 )")) + self.assertEqual(VarcharType(11), _parse_datatype_string("varchar( 11)")) self.assertEqual(DecimalType(1, 1), _parse_datatype_string("decimal(1 ,1)")) self.assertEqual(DecimalType(10, 1), _parse_datatype_string("decimal( 10,1 )")) self.assertEqual(DecimalType(11, 1), _parse_datatype_string("decimal(11,1)")) @@ -1028,6 +1033,7 @@ def test_repr(self): instances = [ NullType(), StringType(), + VarcharType(10), BinaryType(), BooleanType(), DateType(), @@ -1132,6 +1138,15 @@ def test_decimal_type(self): t3 = DecimalType(8) self.assertNotEqual(t2, t3) + def test_varchar_type(self): + v1 = VarcharType(10) + v2 = VarcharType(20) + self.assertTrue(v2 is not v1) + self.assertNotEqual(v1, v2) + v3 = VarcharType(10) + self.assertEqual(v1, v3) + self.assertFalse(v1 is v3) + # regression test for SPARK-10392 def test_datetype_equal_zero(self): dt = DateType() @@ -1211,6 +1226,13 @@ def __init__(self, **kwargs): (1.0, StringType()), ([], StringType()), ({}, StringType()), + # Varchar + ("", VarcharType(10)), + ("", VarcharType(10)), + (1, VarcharType(10)), + (1.0, VarcharType(10)), + ([], VarcharType(10)), + ({}, VarcharType(10)), # UDT (ExamplePoint(1.0, 2.0), ExamplePointUDT()), # Boolean @@ -1267,6 +1289,8 @@ def __init__(self, **kwargs): failure_spec = [ # String (match anything but None) (None, StringType(), ValueError), + # VarcharType (match anything but None) + (None, VarcharType(10), ValueError), # UDT (ExamplePoint(1.0, 2.0), PythonOnlyUDT(), ValueError), # Boolean diff 
--git a/python/pyspark/sql/types.py b/python/pyspark/sql/types.py index fa3f3dd7d8..7ab8f7c9c2 100644 --- a/python/pyspark/sql/types.py +++ b/python/pyspark/sql/types.py @@ -57,6 +57,7 @@ "DataType", "NullType", "StringType", + "VarcharType", "BinaryType", "BooleanType", "DateType", @@ -181,6 +182,28 @@ class StringType(AtomicType, metaclass=DataTypeSingleton): pass +class VarcharType(AtomicType): + """Varchar data type + + Parameters + ---------- + length : int + the length limitation. + """ + + def __init__(self, length: int): + self.length = length + + def simpleString(self) -> str: + return "varchar(%d)" % (self.length) + + def jsonValue(self) -> str: + return "varchar(%d)" % (self.length) + + def __repr__(self) -> str: + return "VarcharType(%d)" % (self.length) + + class BinaryType(AtomicType, metaclass=DataTypeSingleton): """Binary (byte array) data type.""" @@ -625,6 +648,10 @@ class StructType(DataType): >>> struct2 = StructType([StructField("f1", StringType(), True)]) >>> struct1 == struct2 True + >>> struct1 = StructType([StructField("f1", VarcharType(10), True)]) + >>> struct2 = StructType([StructField("f1", VarcharType(10), True)]) + >>> struct1 == struct2 + True >>> struct1 = StructType([StructField("f1", StringType(), True)]) >>> struct2 = StructType([StructField("f1", StringType(), True), ... 
StructField("f2", IntegerType(), False)]) @@ -944,6 +971,7 @@ def __eq__(self, other: Any) -> bool: _atomic_types: List[Type[DataType]] = [ StringType, + VarcharType, BinaryType, BooleanType, DecimalType, @@ -965,7 +993,7 @@ def __eq__(self, other: Any) -> bool: (v.typeName(), v) for v in _complex_types ) - +_LENGTH_VARCHAR = re.compile(r"varchar\(\s*(\d+)\s*\)") _FIXED_DECIMAL = re.compile(r"decimal\(\s*(\d+)\s*,\s*(-?\d+)\s*\)") _INTERVAL_DAYTIME = re.compile(r"interval (day|hour|minute|second)( to (day|hour|minute|second))?") @@ -987,6 +1015,8 @@ def _parse_datatype_string(s: str) -> DataType: StructType([StructField('a', ByteType(), True), StructField('b', DecimalType(16,8), True)]) >>> _parse_datatype_string("a DOUBLE, b STRING") StructType([StructField('a', DoubleType(), True), StructField('b', StringType(), True)]) + >>> _parse_datatype_string("a DOUBLE, b VARCHAR( 50 )") + StructType([StructField('a', DoubleType(), True), StructField('b', VarcharType(50), True)]) >>> _parse_datatype_string("a: array< short>") StructType([StructField('a', ArrayType(ShortType(), True), True)]) >>> _parse_datatype_string(" map ") @@ -1055,7 +1085,10 @@ def _parse_datatype_json_string(json_string: str) -> DataType: ... python_datatype = _parse_datatype_json_string(scala_datatype.json()) ... assert datatype == python_datatype >>> for cls in _all_atomic_types.values(): - ... check_datatype(cls()) + ... if cls is not VarcharType: + ... check_datatype(cls()) + ... else: + ... check_datatype(cls(1)) >>> # Simple ArrayType. >>> simple_arraytype = ArrayType(StringType(), True) @@ -1079,6 +1112,7 @@ def _parse_datatype_json_string(json_string: str) -> DataType: ... StructField("simpleMap", simple_maptype, True), ... StructField("simpleStruct", simple_structtype, True), ... StructField("boolean", BooleanType(), False), + ... StructField("words", VarcharType(10), False), ... 
StructField("withMeta", DoubleType(), False, {"name": "age"})]) >>> check_datatype(complex_structtype) @@ -1111,6 +1145,9 @@ def _parse_datatype_json_value(json_value: Union[dict, str]) -> DataType: if first_field is not None and second_field is None: return DayTimeIntervalType(first_field) return DayTimeIntervalType(first_field, second_field) + elif _LENGTH_VARCHAR.match(json_value): + m = _LENGTH_VARCHAR.match(json_value) + return VarcharType(int(m.group(1))) # type: ignore[union-attr] else: raise ValueError("Could not parse datatype: %s" % json_value) else: @@ -1549,6 +1586,7 @@ def convert_struct(obj: Any) -> Optional[Tuple]: DoubleType: (float,), DecimalType: (decimal.Decimal,), StringType: (str,), + VarcharType: (str,), BinaryType: (bytearray, bytes), DateType: (datetime.date, datetime.datetime), TimestampType: (datetime.datetime,), @@ -1659,8 +1697,8 @@ def verify_acceptable_types(obj: Any) -> None: new_msg("%s can not accept object %r in type %s" % (dataType, obj, type(obj))) ) - if isinstance(dataType, StringType): - # StringType can work with any types + if isinstance(dataType, (StringType, VarcharType)): + # StringType and VarcharType can work with any types def verify_value(obj: Any) -> None: pass diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala index d97b344d16..858f2841dc 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala @@ -456,6 +456,7 @@ object FunctionRegistry { expression[TryAverage]("try_avg"), expression[TrySum]("try_sum"), expression[TryToBinary]("try_to_binary"), + expressionBuilder("try_to_timestamp", TryToTimestampExpressionBuilder, setAlias = true), // aggregate functions expression[HyperLogLogPlusPlus]("approx_count_distinct"), diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala index 592783a54e..3f3e9f75cf 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala @@ -18,7 +18,6 @@ package org.apache.spark.sql.catalyst.expressions.codegen import java.io.ByteArrayInputStream -import java.util.{Map => JavaMap} import scala.annotation.tailrec import scala.collection.JavaConverters._ @@ -28,8 +27,8 @@ import scala.util.control.NonFatal import com.google.common.cache.{CacheBuilder, CacheLoader} import com.google.common.util.concurrent.{ExecutionError, UncheckedExecutionException} -import org.codehaus.commons.compiler.CompileException -import org.codehaus.janino.{ByteArrayClassLoader, ClassBodyEvaluator, InternalCompilerException, SimpleCompiler} +import org.codehaus.commons.compiler.{CompileException, InternalCompilerException} +import org.codehaus.janino.ClassBodyEvaluator import org.codehaus.janino.util.ClassFile import org.apache.spark.{TaskContext, TaskKilledException} @@ -1524,14 +1523,7 @@ object CodeGenerator extends Logging { */ private def updateAndGetCompilationStats(evaluator: ClassBodyEvaluator): ByteCodeStats = { // First retrieve the generated classes. - val classes = { - val resultField = classOf[SimpleCompiler].getDeclaredField("result") - resultField.setAccessible(true) - val loader = resultField.get(evaluator).asInstanceOf[ByteArrayClassLoader] - val classesField = loader.getClass.getDeclaredField("classes") - classesField.setAccessible(true) - classesField.get(loader).asInstanceOf[JavaMap[String, Array[Byte]]].asScala - } + val classes = evaluator.getBytecodes.asScala // Then walk the classes to get at the method bytecode. 
val codeAttr = Utils.classForName("org.codehaus.janino.util.ClassFile$CodeAttribute") diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala index c965d50eab..98b23b1c6a 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala @@ -1161,6 +1161,50 @@ object ParseToTimestampLTZExpressionBuilder extends ExpressionBuilder { } } +/** + * Parses a column to a timestamp based on the supplied format. + */ +// scalastyle:off line.size.limit +@ExpressionDescription( + usage = """ + _FUNC_(timestamp_str[, fmt]) - Parses the `timestamp_str` expression with the `fmt` expression + to a timestamp. The function always returns null on an invalid input with/without ANSI SQL + mode enabled. By default, it follows casting rules to a timestamp if the `fmt` is omitted. + The result data type is consistent with the value of configuration `spark.sql.timestampType`. + """, + arguments = """ + Arguments: + * timestamp_str - A string to be parsed to timestamp. + * fmt - Timestamp format pattern to follow. See Datetime Patterns for valid + date and time format patterns. 
+ """, + examples = """ + Examples: + > SELECT _FUNC_('2016-12-31 00:12:00'); + 2016-12-31 00:12:00 + > SELECT _FUNC_('2016-12-31', 'yyyy-MM-dd'); + 2016-12-31 00:00:00 + > SELECT _FUNC_('foo', 'yyyy-MM-dd'); + NULL + """, + group = "datetime_funcs", + since = "3.4.0") +// scalastyle:on line.size.limit +object TryToTimestampExpressionBuilder extends ExpressionBuilder { + override def build(funcName: String, expressions: Seq[Expression]): Expression = { + val numArgs = expressions.length + if (numArgs == 1 || numArgs == 2) { + ParseToTimestamp( + expressions.head, + expressions.drop(1).lastOption, + SQLConf.get.timestampType, + failOnError = false) + } else { + throw QueryCompilationErrors.invalidFunctionArgumentNumberError(Seq(1, 2), funcName, numArgs) + } + } +} + abstract class ToTimestamp extends BinaryExpression with TimestampFormatterHelper with ExpectsInputTypes { @@ -2048,12 +2092,13 @@ case class ParseToTimestamp( left: Expression, format: Option[Expression], override val dataType: DataType, - timeZoneId: Option[String] = None) + timeZoneId: Option[String] = None, + failOnError: Boolean = SQLConf.get.ansiEnabled) extends RuntimeReplaceable with ImplicitCastInputTypes with TimeZoneAwareExpression { override lazy val replacement: Expression = format.map { f => - GetTimestamp(left, f, dataType, timeZoneId) - }.getOrElse(Cast(left, dataType, timeZoneId)) + GetTimestamp(left, f, dataType, timeZoneId, failOnError = failOnError) + }.getOrElse(Cast(left, dataType, timeZoneId, ansiEnabled = failOnError)) def this(left: Expression, format: Expression) = { this(left, Option(format), SQLConf.get.timestampType) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryExecutionErrors.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryExecutionErrors.scala index 4c030a8ac0..64e6283c0e 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryExecutionErrors.scala +++ 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryExecutionErrors.scala @@ -30,8 +30,7 @@ import java.util.concurrent.TimeoutException import com.fasterxml.jackson.core.{JsonParser, JsonToken} import org.apache.hadoop.fs.{FileAlreadyExistsException, FileStatus, Path} import org.apache.hadoop.fs.permission.FsPermission -import org.codehaus.commons.compiler.CompileException -import org.codehaus.janino.InternalCompilerException +import org.codehaus.commons.compiler.{CompileException, InternalCompilerException} import org.apache.spark.{Partition, SparkArithmeticException, SparkArrayIndexOutOfBoundsException, SparkClassNotFoundException, SparkConcurrentModificationException, SparkDateTimeException, SparkException, SparkFileAlreadyExistsException, SparkFileNotFoundException, SparkIllegalArgumentException, SparkIndexOutOfBoundsException, SparkNoSuchElementException, SparkNoSuchMethodException, SparkNumberFormatException, SparkRuntimeException, SparkSecurityException, SparkSQLException, SparkSQLFeatureNotSupportedException, SparkUnsupportedOperationException, SparkUpgradeException} import org.apache.spark.executor.CommitDeniedException diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CodeGenerationSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CodeGenerationSuite.scala index 1e4499a0ee..737fcb1bad 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CodeGenerationSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CodeGenerationSuite.scala @@ -568,7 +568,6 @@ class CodeGenerationSuite extends SparkFunSuite with ExpressionEvalHelper { assert(refTerm.contains("scala.math.LowPriorityOrderingImplicits$$anon$")) } - // TODO (SPARK-35579): Fix this bug in janino and upgrade janino in Spark. 
test("SPARK-35578: final local variable bug in janino") { val code = """ diff --git a/sql/core/benchmarks/FilterPushdownBenchmark-jdk11-results.txt b/sql/core/benchmarks/FilterPushdownBenchmark-jdk11-results.txt index d30b3327ff..5d8687779d 100644 --- a/sql/core/benchmarks/FilterPushdownBenchmark-jdk11-results.txt +++ b/sql/core/benchmarks/FilterPushdownBenchmark-jdk11-results.txt @@ -3,166 +3,166 @@ Pushdown for many distinct value case ================================================================================================ OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure -Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Select 0 string row (value IS NULL): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Parquet Vectorized 10439 10509 98 1.5 663.7 1.0X -Parquet Vectorized (Pushdown) 578 602 31 27.2 36.7 18.1X -Native ORC Vectorized 7076 7112 44 2.2 449.9 1.5X -Native ORC Vectorized (Pushdown) 487 496 7 32.3 31.0 21.4X +Parquet Vectorized 10211 10371 147 1.5 649.2 1.0X +Parquet Vectorized (Pushdown) 547 579 18 28.8 34.8 18.7X +Native ORC Vectorized 6910 6988 64 2.3 439.3 1.5X +Native ORC Vectorized (Pushdown) 522 531 8 30.1 33.2 19.6X OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure -Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Select 0 string row ('7864320' < value < '7864320'): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ----------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 10580 10604 17 1.5 672.7 1.0X -Parquet Vectorized (Pushdown) 559 571 8 28.1 35.6 18.9X -Native ORC Vectorized 7188 7208 21 2.2 457.0 1.5X -Native ORC Vectorized (Pushdown) 473 481 9 33.3 30.0 22.4X 
+Parquet Vectorized 10286 10420 101 1.5 653.9 1.0X +Parquet Vectorized (Pushdown) 543 563 16 29.0 34.5 18.9X +Native ORC Vectorized 6972 7046 73 2.3 443.3 1.5X +Native ORC Vectorized (Pushdown) 489 512 15 32.1 31.1 21.0X OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure -Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Select 1 string row (value = '7864320'): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Parquet Vectorized 10526 10540 20 1.5 669.2 1.0X -Parquet Vectorized (Pushdown) 545 554 9 28.9 34.6 19.3X -Native ORC Vectorized 7144 7189 33 2.2 454.2 1.5X -Native ORC Vectorized (Pushdown) 448 462 8 35.1 28.5 23.5X +Parquet Vectorized 10242 10359 122 1.5 651.2 1.0X +Parquet Vectorized (Pushdown) 540 550 7 29.1 34.3 19.0X +Native ORC Vectorized 6892 7029 112 2.3 438.2 1.5X +Native ORC Vectorized (Pushdown) 454 483 24 34.7 28.8 22.6X OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure -Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Select 1 string row (value <=> '7864320'): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 10506 10515 9 1.5 667.9 1.0X -Parquet Vectorized (Pushdown) 529 537 10 29.8 33.6 19.9X -Native ORC Vectorized 7118 7156 23 2.2 452.6 1.5X -Native ORC Vectorized (Pushdown) 436 445 10 36.0 27.7 24.1X +Parquet Vectorized 10342 10448 69 1.5 657.5 1.0X +Parquet Vectorized (Pushdown) 505 521 11 31.1 32.1 20.5X +Native ORC Vectorized 6854 6949 60 2.3 435.8 1.5X +Native ORC Vectorized (Pushdown) 455 474 26 34.6 28.9 22.7X OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure -Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz 
+Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Select 1 string row ('7864320' <= value <= '7864320'): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 10505 10519 12 1.5 667.9 1.0X -Parquet Vectorized (Pushdown) 517 528 8 30.4 32.9 20.3X -Native ORC Vectorized 7232 7271 43 2.2 459.8 1.5X -Native ORC Vectorized (Pushdown) 439 448 7 35.8 27.9 23.9X +Parquet Vectorized 10221 10296 43 1.5 649.9 1.0X +Parquet Vectorized (Pushdown) 503 526 20 31.3 32.0 20.3X +Native ORC Vectorized 6917 6972 93 2.3 439.8 1.5X +Native ORC Vectorized (Pushdown) 437 467 32 36.0 27.8 23.4X OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure -Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Select all string rows (value IS NOT NULL): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 19062 19147 82 0.8 1211.9 1.0X -Parquet Vectorized (Pushdown) 19151 19167 13 0.8 1217.6 1.0X -Native ORC Vectorized 15455 15470 15 1.0 982.6 1.2X -Native ORC Vectorized (Pushdown) 15605 15614 9 1.0 992.2 1.2X +Parquet Vectorized 17553 17696 115 0.9 1116.0 1.0X +Parquet Vectorized (Pushdown) 17779 17897 117 0.9 1130.4 1.0X +Native ORC Vectorized 13923 14179 173 1.1 885.2 1.3X +Native ORC Vectorized (Pushdown) 14119 14346 151 1.1 897.7 1.2X OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure -Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Select 0 int row (value IS NULL): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Parquet 
Vectorized 9694 9721 30 1.6 616.3 1.0X -Parquet Vectorized (Pushdown) 475 484 10 33.1 30.2 20.4X -Native ORC Vectorized 6570 6598 32 2.4 417.7 1.5X -Native ORC Vectorized (Pushdown) 413 421 7 38.1 26.3 23.5X +Parquet Vectorized 9466 9599 109 1.7 601.8 1.0X +Parquet Vectorized (Pushdown) 468 485 14 33.6 29.8 20.2X +Native ORC Vectorized 6399 6545 98 2.5 406.8 1.5X +Native ORC Vectorized (Pushdown) 418 432 15 37.7 26.6 22.7X OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure -Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Select 0 int row (7864320 < value < 7864320): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ---------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 9684 9695 15 1.6 615.7 1.0X -Parquet Vectorized (Pushdown) 494 501 7 31.8 31.4 19.6X -Native ORC Vectorized 6590 6607 23 2.4 419.0 1.5X -Native ORC Vectorized (Pushdown) 418 432 11 37.6 26.6 23.2X +Parquet Vectorized 9384 9491 60 1.7 596.6 1.0X +Parquet Vectorized (Pushdown) 478 509 23 32.9 30.4 19.6X +Native ORC Vectorized 6197 6328 112 2.5 394.0 1.5X +Native ORC Vectorized (Pushdown) 417 438 14 37.7 26.5 22.5X OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure -Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Select 1 int row (value = 7864320): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Parquet Vectorized 9720 9736 24 1.6 618.0 1.0X -Parquet Vectorized (Pushdown) 489 498 8 32.2 31.1 19.9X -Native ORC Vectorized 6665 6680 12 2.4 423.8 1.5X -Native ORC Vectorized (Pushdown) 423 427 6 37.2 26.9 23.0X +Parquet Vectorized 9538 9619 125 1.6 606.4 1.0X +Parquet Vectorized (Pushdown) 486 494 12 32.3 30.9 19.6X +Native ORC Vectorized 
6321 6433 69 2.5 401.9 1.5X +Native ORC Vectorized (Pushdown) 432 456 17 36.4 27.5 22.1X OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure -Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Select 1 int row (value <=> 7864320): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Parquet Vectorized 9728 9738 15 1.6 618.5 1.0X -Parquet Vectorized (Pushdown) 484 492 9 32.5 30.8 20.1X -Native ORC Vectorized 6672 6685 12 2.4 424.2 1.5X -Native ORC Vectorized (Pushdown) 418 426 8 37.6 26.6 23.3X +Parquet Vectorized 9667 9731 54 1.6 614.6 1.0X +Parquet Vectorized (Pushdown) 460 492 31 34.2 29.2 21.0X +Native ORC Vectorized 6262 6354 59 2.5 398.1 1.5X +Native ORC Vectorized (Pushdown) 433 460 19 36.3 27.5 22.3X OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure -Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Select 1 int row (7864320 <= value <= 7864320): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------ -Parquet Vectorized 9732 9741 6 1.6 618.8 1.0X -Parquet Vectorized (Pushdown) 489 494 3 32.2 31.1 19.9X -Native ORC Vectorized 6667 6682 15 2.4 423.9 1.5X -Native ORC Vectorized (Pushdown) 419 426 6 37.6 26.6 23.2X +Parquet Vectorized 9726 9776 37 1.6 618.4 1.0X +Parquet Vectorized (Pushdown) 496 509 12 31.7 31.5 19.6X +Native ORC Vectorized 6157 6171 10 2.6 391.4 1.6X +Native ORC Vectorized (Pushdown) 417 426 10 37.7 26.5 23.3X OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure -Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Select 1 int row (7864319 < value < 7864321): Best Time(ms) Avg Time(ms) Stdev(ms) 
Rate(M/s) Per Row(ns) Relative ---------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 9733 9739 9 1.6 618.8 1.0X -Parquet Vectorized (Pushdown) 484 515 46 32.5 30.8 20.1X -Native ORC Vectorized 6563 6578 15 2.4 417.2 1.5X -Native ORC Vectorized (Pushdown) 422 428 6 37.2 26.9 23.0X +Parquet Vectorized 9061 9067 6 1.7 576.1 1.0X +Parquet Vectorized (Pushdown) 463 473 10 34.0 29.4 19.6X +Native ORC Vectorized 6135 6273 250 2.6 390.0 1.5X +Native ORC Vectorized (Pushdown) 419 429 9 37.5 26.7 21.6X OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure -Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Select 10% int rows (value < 1572864): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Parquet Vectorized 10574 10600 33 1.5 672.3 1.0X -Parquet Vectorized (Pushdown) 2263 2273 14 7.0 143.9 4.7X -Native ORC Vectorized 7420 7462 24 2.1 471.8 1.4X -Native ORC Vectorized (Pushdown) 1866 1876 8 8.4 118.6 5.7X +Parquet Vectorized 9768 9811 37 1.6 621.0 1.0X +Parquet Vectorized (Pushdown) 2046 2073 28 7.7 130.1 4.8X +Native ORC Vectorized 6796 6836 28 2.3 432.1 1.4X +Native ORC Vectorized (Pushdown) 1701 1717 16 9.2 108.1 5.7X OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure -Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Select 50% int rows (value < 7864320): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Parquet Vectorized 13597 13631 26 1.2 864.4 1.0X -Parquet Vectorized (Pushdown) 9042 9050 7 1.7 574.9 1.5X -Native ORC Vectorized 10326 10368 45 1.5 656.5 1.3X -Native ORC Vectorized 
(Pushdown) 7330 7345 16 2.1 466.0 1.9X +Parquet Vectorized 12293 12318 24 1.3 781.6 1.0X +Parquet Vectorized (Pushdown) 8048 8063 16 2.0 511.7 1.5X +Native ORC Vectorized 9291 9306 10 1.7 590.7 1.3X +Native ORC Vectorized (Pushdown) 6501 6524 19 2.4 413.3 1.9X OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure -Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Select 90% int rows (value < 14155776): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Parquet Vectorized 16661 16679 13 0.9 1059.3 1.0X -Parquet Vectorized (Pushdown) 15807 15821 17 1.0 1005.0 1.1X -Native ORC Vectorized 13294 13314 34 1.2 845.2 1.3X -Native ORC Vectorized (Pushdown) 12779 12788 9 1.2 812.5 1.3X +Parquet Vectorized 14801 14824 21 1.1 941.0 1.0X +Parquet Vectorized (Pushdown) 13992 14018 35 1.1 889.6 1.1X +Native ORC Vectorized 11720 11755 27 1.3 745.1 1.3X +Native ORC Vectorized (Pushdown) 11304 11349 34 1.4 718.7 1.3X OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure -Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Select all int rows (value IS NOT NULL): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Parquet Vectorized 17368 17394 23 0.9 1104.2 1.0X -Parquet Vectorized (Pushdown) 17456 17466 8 0.9 1109.8 1.0X -Native ORC Vectorized 13925 13947 24 1.1 885.3 1.2X -Native ORC Vectorized (Pushdown) 14073 14084 10 1.1 894.8 1.2X +Parquet Vectorized 15438 15464 18 1.0 981.5 1.0X +Parquet Vectorized (Pushdown) 15523 15566 43 1.0 986.9 1.0X +Native ORC Vectorized 12376 12393 14 1.3 786.8 1.2X +Native ORC Vectorized (Pushdown) 12522 12539 15 1.3 796.1 1.2X OpenJDK 64-Bit Server VM 11.0.15+10-LTS 
on Linux 5.13.0-1031-azure -Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Select all int rows (value > -1): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Parquet Vectorized 17396 17408 9 0.9 1106.0 1.0X -Parquet Vectorized (Pushdown) 17443 17458 12 0.9 1109.0 1.0X -Native ORC Vectorized 13993 13999 5 1.1 889.6 1.2X -Native ORC Vectorized (Pushdown) 14129 14136 7 1.1 898.3 1.2X +Parquet Vectorized 16275 16402 100 1.0 1034.7 1.0X +Parquet Vectorized (Pushdown) 16314 16375 68 1.0 1037.2 1.0X +Native ORC Vectorized 12778 12997 178 1.2 812.4 1.3X +Native ORC Vectorized (Pushdown) 12868 13051 140 1.2 818.1 1.3X OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure -Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Select all int rows (value != -1): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Parquet Vectorized 17392 17425 24 0.9 1105.8 1.0X -Parquet Vectorized (Pushdown) 17477 17483 5 0.9 1111.2 1.0X -Native ORC Vectorized 13960 14024 75 1.1 887.6 1.2X -Native ORC Vectorized (Pushdown) 14089 14106 20 1.1 895.8 1.2X +Parquet Vectorized 16147 16263 116 1.0 1026.6 1.0X +Parquet Vectorized (Pushdown) 16195 16288 92 1.0 1029.6 1.0X +Native ORC Vectorized 12616 12881 162 1.2 802.1 1.3X +Native ORC Vectorized (Pushdown) 13134 13262 97 1.2 835.0 1.2X ================================================================================================ @@ -170,58 +170,58 @@ Pushdown for few distinct value case (use dictionary encoding) ================================================================================================ OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure 
-Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Select 0 distinct string row (value IS NULL): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ---------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 8190 8235 67 1.9 520.7 1.0X -Parquet Vectorized (Pushdown) 420 426 7 37.4 26.7 19.5X -Native ORC Vectorized 7879 7890 8 2.0 500.9 1.0X -Native ORC Vectorized (Pushdown) 774 782 8 20.3 49.2 10.6X +Parquet Vectorized 7191 7231 39 2.2 457.2 1.0X +Parquet Vectorized (Pushdown) 407 414 8 38.7 25.9 17.7X +Native ORC Vectorized 7215 7373 242 2.2 458.7 1.0X +Native ORC Vectorized (Pushdown) 755 770 13 20.8 48.0 9.5X OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure -Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Select 0 distinct string row ('100' < value < '100'): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------------ -Parquet Vectorized 8292 8310 15 1.9 527.2 1.0X -Parquet Vectorized (Pushdown) 424 431 6 37.1 27.0 19.5X -Native ORC Vectorized 8107 8117 10 1.9 515.4 1.0X -Native ORC Vectorized (Pushdown) 776 786 7 20.3 49.3 10.7X +Parquet Vectorized 7354 7380 33 2.1 467.5 1.0X +Parquet Vectorized (Pushdown) 407 414 7 38.6 25.9 18.1X +Native ORC Vectorized 7501 7521 13 2.1 476.9 1.0X +Native ORC Vectorized (Pushdown) 768 777 9 20.5 48.8 9.6X OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure -Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Select 1 distinct string row (value = '100'): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
---------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 8232 8245 8 1.9 523.4 1.0X -Parquet Vectorized (Pushdown) 494 501 8 31.8 31.4 16.7X -Native ORC Vectorized 8073 8094 20 1.9 513.3 1.0X -Native ORC Vectorized (Pushdown) 833 843 6 18.9 53.0 9.9X +Parquet Vectorized 7261 7286 19 2.2 461.7 1.0X +Parquet Vectorized (Pushdown) 466 478 10 33.7 29.6 15.6X +Native ORC Vectorized 7460 7489 37 2.1 474.3 1.0X +Native ORC Vectorized (Pushdown) 813 832 16 19.4 51.7 8.9X OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure -Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Select 1 distinct string row (value <=> '100'): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------ -Parquet Vectorized 8209 8232 21 1.9 521.9 1.0X -Parquet Vectorized (Pushdown) 481 494 10 32.7 30.6 17.1X -Native ORC Vectorized 8058 8068 6 2.0 512.3 1.0X -Native ORC Vectorized (Pushdown) 831 840 7 18.9 52.9 9.9X +Parquet Vectorized 7285 7291 7 2.2 463.2 1.0X +Parquet Vectorized (Pushdown) 466 475 11 33.8 29.6 15.6X +Native ORC Vectorized 7439 7468 28 2.1 473.0 1.0X +Native ORC Vectorized (Pushdown) 822 828 6 19.1 52.3 8.9X OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure -Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Select 1 distinct string row ('100' <= value <= '100'): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 8321 8330 12 1.9 529.0 1.0X -Parquet Vectorized (Pushdown) 488 495 4 32.2 31.0 17.0X -Native ORC Vectorized 8149 8156 9 1.9 518.1 1.0X -Native ORC Vectorized 
(Pushdown) 841 846 5 18.7 53.5 9.9X +Parquet Vectorized 7398 7416 18 2.1 470.3 1.0X +Parquet Vectorized (Pushdown) 468 473 8 33.6 29.8 15.8X +Native ORC Vectorized 7543 7557 9 2.1 479.6 1.0X +Native ORC Vectorized (Pushdown) 819 836 14 19.2 52.1 9.0X OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure -Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Select all distinct string rows (value IS NOT NULL): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ----------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 18379 18399 18 0.9 1168.5 1.0X -Parquet Vectorized (Pushdown) 18453 18469 19 0.9 1173.2 1.0X -Native ORC Vectorized 17678 17685 8 0.9 1123.9 1.0X -Native ORC Vectorized (Pushdown) 17936 17943 4 0.9 1140.4 1.0X +Parquet Vectorized 15376 15392 19 1.0 977.6 1.0X +Parquet Vectorized (Pushdown) 15450 15476 25 1.0 982.3 1.0X +Native ORC Vectorized 15517 15592 57 1.0 986.5 1.0X +Native ORC Vectorized (Pushdown) 15823 15909 114 1.0 1006.0 1.0X ================================================================================================ @@ -229,31 +229,31 @@ Pushdown benchmark for StringStartsWith ================================================================================================ OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure -Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz StringStartsWith filter: (value like '10%'): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 9928 9964 43 1.6 631.2 1.0X -Parquet Vectorized (Pushdown) 1349 1367 25 11.7 85.8 7.4X -Native ORC Vectorized 7484 7554 81 2.1 475.8 1.3X -Native ORC Vectorized (Pushdown) 7588 7598 8 2.1 482.4 1.3X 
+Parquet Vectorized 8553 8633 92 1.8 543.8 1.0X +Parquet Vectorized (Pushdown) 1250 1278 31 12.6 79.5 6.8X +Native ORC Vectorized 7203 7222 20 2.2 458.0 1.2X +Native ORC Vectorized (Pushdown) 7335 7345 16 2.1 466.3 1.2X OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure -Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz StringStartsWith filter: (value like '1000%'): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ----------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 9751 9781 30 1.6 620.0 1.0X -Parquet Vectorized (Pushdown) 493 500 6 31.9 31.4 19.8X -Native ORC Vectorized 7231 7254 15 2.2 459.7 1.3X -Native ORC Vectorized (Pushdown) 7359 7398 25 2.1 467.9 1.3X +Parquet Vectorized 8836 8876 38 1.8 561.8 1.0X +Parquet Vectorized (Pushdown) 472 483 11 33.3 30.0 18.7X +Native ORC Vectorized 6921 6961 55 2.3 440.0 1.3X +Native ORC Vectorized (Pushdown) 7065 7104 32 2.2 449.2 1.3X OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure -Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz StringStartsWith filter: (value like '786432%'): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 9755 9772 15 1.6 620.2 1.0X -Parquet Vectorized (Pushdown) 483 492 8 32.6 30.7 20.2X -Native ORC Vectorized 7212 7341 230 2.2 458.5 1.4X -Native ORC Vectorized (Pushdown) 7356 7393 21 2.1 467.7 1.3X +Parquet Vectorized 8757 8796 38 1.8 556.7 1.0X +Parquet Vectorized (Pushdown) 463 470 7 34.0 29.4 18.9X +Native ORC Vectorized 6837 6876 27 2.3 434.7 1.3X +Native ORC Vectorized (Pushdown) 7021 7039 13 2.2 446.4 1.2X 
================================================================================================ @@ -261,31 +261,31 @@ Pushdown benchmark for StringEndsWith ================================================================================================ OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure -Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz StringEndsWith filter: (value like '%10'): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 8261 8282 27 1.9 525.2 1.0X -Parquet Vectorized (Pushdown) 595 603 6 26.4 37.8 13.9X -Native ORC Vectorized 8112 8128 13 1.9 515.7 1.0X -Native ORC Vectorized (Pushdown) 8368 8375 5 1.9 532.1 1.0X +Parquet Vectorized 7297 7372 156 2.2 463.9 1.0X +Parquet Vectorized (Pushdown) 576 589 15 27.3 36.6 12.7X +Native ORC Vectorized 7444 7463 24 2.1 473.3 1.0X +Native ORC Vectorized (Pushdown) 7713 7722 7 2.0 490.4 0.9X OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure -Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz StringEndsWith filter: (value like '%1000'): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 8208 8223 13 1.9 521.9 1.0X -Parquet Vectorized (Pushdown) 471 478 5 33.4 30.0 17.4X -Native ORC Vectorized 8043 8056 11 2.0 511.4 1.0X -Native ORC Vectorized (Pushdown) 8305 8314 11 1.9 528.0 1.0X +Parquet Vectorized 7211 7229 14 2.2 458.5 1.0X +Parquet Vectorized (Pushdown) 464 474 8 33.9 29.5 15.5X +Native ORC Vectorized 7383 7468 111 2.1 469.4 1.0X +Native ORC Vectorized (Pushdown) 7623 7638 16 2.1 484.7 0.9X OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure -Intel(R) 
Xeon(R) Platinum 8370C CPU @ 2.80GHz +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz StringEndsWith filter: (value like '%786432'): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ----------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 8197 8221 24 1.9 521.2 1.0X -Parquet Vectorized (Pushdown) 465 475 11 33.8 29.6 17.6X -Native ORC Vectorized 8037 8049 15 2.0 511.0 1.0X -Native ORC Vectorized (Pushdown) 8308 8313 4 1.9 528.2 1.0X +Parquet Vectorized 7197 7211 12 2.2 457.6 1.0X +Parquet Vectorized (Pushdown) 465 473 5 33.8 29.6 15.5X +Native ORC Vectorized 7350 7356 6 2.1 467.3 1.0X +Native ORC Vectorized (Pushdown) 7597 7632 26 2.1 483.0 0.9X ================================================================================================ @@ -293,31 +293,31 @@ Pushdown benchmark for StringContains ================================================================================================ OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure -Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz StringContains filter: (value like '%10%'): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 8512 8545 42 1.8 541.2 1.0X -Parquet Vectorized (Pushdown) 1178 1198 22 13.4 74.9 7.2X -Native ORC Vectorized 8335 8349 13 1.9 529.9 1.0X -Native ORC Vectorized (Pushdown) 8590 8604 12 1.8 546.1 1.0X +Parquet Vectorized 7463 7509 58 2.1 474.5 1.0X +Parquet Vectorized (Pushdown) 1098 1118 12 14.3 69.8 6.8X +Native ORC Vectorized 7634 7678 30 2.1 485.4 1.0X +Native ORC Vectorized (Pushdown) 7891 7901 13 2.0 501.7 0.9X OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure -Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +Intel(R) Xeon(R) Platinum 8272CL 
CPU @ 2.60GHz StringContains filter: (value like '%1000%'): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ---------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 8189 8215 26 1.9 520.7 1.0X -Parquet Vectorized (Pushdown) 463 473 8 34.0 29.4 17.7X -Native ORC Vectorized 8036 8041 5 2.0 510.9 1.0X -Native ORC Vectorized (Pushdown) 8292 8301 9 1.9 527.2 1.0X +Parquet Vectorized 7209 7250 32 2.2 458.4 1.0X +Parquet Vectorized (Pushdown) 461 468 6 34.1 29.3 15.6X +Native ORC Vectorized 7390 7402 13 2.1 469.8 1.0X +Native ORC Vectorized (Pushdown) 7631 7648 29 2.1 485.2 0.9X OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure -Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz StringContains filter: (value like '%786432%'): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------ -Parquet Vectorized 8203 8216 12 1.9 521.5 1.0X -Parquet Vectorized (Pushdown) 464 472 9 33.9 29.5 17.7X -Native ORC Vectorized 8027 8036 5 2.0 510.3 1.0X -Native ORC Vectorized (Pushdown) 8301 8311 10 1.9 527.8 1.0X +Parquet Vectorized 7201 7455 367 2.2 457.8 1.0X +Parquet Vectorized (Pushdown) 472 496 25 33.3 30.0 15.3X +Native ORC Vectorized 7613 7755 85 2.1 484.0 0.9X +Native ORC Vectorized (Pushdown) 7664 7867 186 2.1 487.2 0.9X ================================================================================================ @@ -325,112 +325,112 @@ Pushdown benchmark for decimal ================================================================================================ OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure -Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Select 1 decimal(9, 2) row (value = 7864320): Best 
Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ---------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 3552 3561 14 4.4 225.8 1.0X -Parquet Vectorized (Pushdown) 113 118 5 138.8 7.2 31.3X -Native ORC Vectorized 4946 4953 8 3.2 314.5 0.7X -Native ORC Vectorized (Pushdown) 142 150 7 110.6 9.0 25.0X +Parquet Vectorized 3208 3212 7 4.9 203.9 1.0X +Parquet Vectorized (Pushdown) 114 119 6 138.5 7.2 28.2X +Native ORC Vectorized 4553 4575 28 3.5 289.4 0.7X +Native ORC Vectorized (Pushdown) 144 150 7 109.3 9.1 22.3X OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure -Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Select 10% decimal(9, 2) rows (value < 1572864): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 5246 5255 6 3.0 333.5 1.0X -Parquet Vectorized (Pushdown) 2472 2480 8 6.4 157.2 2.1X -Native ORC Vectorized 6648 6652 4 2.4 422.7 0.8X -Native ORC Vectorized (Pushdown) 2797 2808 13 5.6 177.8 1.9X +Parquet Vectorized 4653 4657 3 3.4 295.8 1.0X +Parquet Vectorized (Pushdown) 2185 2212 33 7.2 138.9 2.1X +Native ORC Vectorized 6032 6205 187 2.6 383.5 0.8X +Native ORC Vectorized (Pushdown) 2461 2476 18 6.4 156.5 1.9X OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure -Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Select 50% decimal(9, 2) rows (value < 7864320): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 10788 10805 16 1.5 685.9 1.0X -Parquet Vectorized (Pushdown) 10330 10339 6 1.5 656.8 1.0X -Native ORC 
Vectorized 12284 12352 132 1.3 781.0 0.9X -Native ORC Vectorized (Pushdown) 11707 11723 12 1.3 744.3 0.9X +Parquet Vectorized 9433 9447 12 1.7 599.7 1.0X +Parquet Vectorized (Pushdown) 9060 9067 6 1.7 576.0 1.0X +Native ORC Vectorized 10720 10734 12 1.5 681.6 0.9X +Native ORC Vectorized (Pushdown) 10154 10174 12 1.5 645.6 0.9X OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure -Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Select 90% decimal(9, 2) rows (value < 14155776): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 12126 12143 17 1.3 771.0 1.0X -Parquet Vectorized (Pushdown) 12179 12197 19 1.3 774.3 1.0X -Native ORC Vectorized 13729 13745 12 1.1 872.8 0.9X -Native ORC Vectorized (Pushdown) 13791 13814 17 1.1 876.8 0.9X +Parquet Vectorized 10574 10580 10 1.5 672.3 1.0X +Parquet Vectorized (Pushdown) 10631 10642 17 1.5 675.9 1.0X +Native ORC Vectorized 11899 11915 11 1.3 756.5 0.9X +Native ORC Vectorized (Pushdown) 11963 12454 610 1.3 760.6 0.9X OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure -Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Select 1 decimal(18, 2) row (value = 7864320): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ----------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 3764 3784 16 4.2 239.3 1.0X -Parquet Vectorized (Pushdown) 115 121 7 137.1 7.3 32.8X -Native ORC Vectorized 4941 4960 21 3.2 314.2 0.8X -Native ORC Vectorized (Pushdown) 139 145 5 112.9 8.9 27.0X +Parquet Vectorized 3363 3387 38 4.7 213.8 1.0X +Parquet Vectorized (Pushdown) 113 118 5 138.9 7.2 29.7X +Native ORC Vectorized 4569 4595 22 3.4 290.5 0.7X +Native ORC Vectorized 
(Pushdown) 140 146 8 112.3 8.9 24.0X OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure -Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Select 10% decimal(18, 2) rows (value < 1572864): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 4659 4674 11 3.4 296.2 1.0X -Parquet Vectorized (Pushdown) 1356 1368 16 11.6 86.2 3.4X -Native ORC Vectorized 5857 5876 15 2.7 372.4 0.8X -Native ORC Vectorized (Pushdown) 1501 1509 11 10.5 95.4 3.1X +Parquet Vectorized 4116 4143 28 3.8 261.7 1.0X +Parquet Vectorized (Pushdown) 1211 1221 9 13.0 77.0 3.4X +Native ORC Vectorized 5347 5360 9 2.9 339.9 0.8X +Native ORC Vectorized (Pushdown) 1299 1316 18 12.1 82.6 3.2X OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure -Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Select 50% decimal(18, 2) rows (value < 7864320): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 8098 8110 8 1.9 514.9 1.0X -Parquet Vectorized (Pushdown) 6292 6301 8 2.5 400.0 1.3X -Native ORC Vectorized 9416 9433 14 1.7 598.7 0.9X -Native ORC Vectorized (Pushdown) 7014 7032 11 2.2 445.9 1.2X +Parquet Vectorized 7039 7056 12 2.2 447.5 1.0X +Parquet Vectorized (Pushdown) 5425 5453 27 2.9 344.9 1.3X +Native ORC Vectorized 8154 8186 28 1.9 518.4 0.9X +Native ORC Vectorized (Pushdown) 5942 5953 8 2.6 377.8 1.2X OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure -Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Select 90% decimal(18, 2) rows (value < 14155776): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per 
Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 11443 11455 11 1.4 727.5 1.0X -Parquet Vectorized (Pushdown) 11093 11118 17 1.4 705.3 1.0X -Native ORC Vectorized 12868 12886 23 1.2 818.1 0.9X -Native ORC Vectorized (Pushdown) 12401 12422 14 1.3 788.5 0.9X +Parquet Vectorized 9870 9888 14 1.6 627.5 1.0X +Parquet Vectorized (Pushdown) 9582 9601 23 1.6 609.2 1.0X +Native ORC Vectorized 10918 10936 22 1.4 694.1 0.9X +Native ORC Vectorized (Pushdown) 10525 10539 20 1.5 669.2 0.9X OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure -Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Select 1 decimal(38, 2) row (value = 7864320): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ----------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 5430 5440 6 2.9 345.2 1.0X -Parquet Vectorized (Pushdown) 126 130 4 124.7 8.0 43.0X -Native ORC Vectorized 4972 4986 27 3.2 316.1 1.1X -Native ORC Vectorized (Pushdown) 138 143 4 114.4 8.7 39.5X +Parquet Vectorized 5161 5181 36 3.0 328.1 1.0X +Parquet Vectorized (Pushdown) 124 130 6 127.1 7.9 41.7X +Native ORC Vectorized 4590 4607 15 3.4 291.8 1.1X +Native ORC Vectorized (Pushdown) 138 146 7 113.8 8.8 37.3X OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure -Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Select 10% decimal(38, 2) rows (value < 1572864): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 6513 6531 23 2.4 414.1 1.0X -Parquet Vectorized (Pushdown) 1765 1772 6 8.9 112.2 3.7X -Native ORC Vectorized 6049 6053 3 2.6 384.6 1.1X 
-Native ORC Vectorized (Pushdown) 1675 1684 9 9.4 106.5 3.9X +Parquet Vectorized 6043 6064 28 2.6 384.2 1.0X +Parquet Vectorized (Pushdown) 1516 1543 28 10.4 96.4 4.0X +Native ORC Vectorized 5451 5477 22 2.9 346.6 1.1X +Native ORC Vectorized (Pushdown) 1435 1452 23 11.0 91.2 4.2X OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure -Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Select 50% decimal(38, 2) rows (value < 7864320): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 10962 10974 9 1.4 697.0 1.0X -Parquet Vectorized (Pushdown) 8358 8368 9 1.9 531.4 1.3X -Native ORC Vectorized 10319 10343 31 1.5 656.1 1.1X -Native ORC Vectorized (Pushdown) 7923 7933 7 2.0 503.7 1.4X +Parquet Vectorized 9566 9662 197 1.6 608.2 1.0X +Parquet Vectorized (Pushdown) 7071 7227 254 2.2 449.6 1.4X +Native ORC Vectorized 8834 8871 29 1.8 561.6 1.1X +Native ORC Vectorized (Pushdown) 6618 6652 31 2.4 420.8 1.4X OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure -Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Select 90% decimal(38, 2) rows (value < 14155776): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 15327 15330 2 1.0 974.5 1.0X -Parquet Vectorized (Pushdown) 14830 14844 9 1.1 942.9 1.0X -Native ORC Vectorized 14520 14568 91 1.1 923.2 1.1X -Native ORC Vectorized (Pushdown) 14066 14080 10 1.1 894.3 1.1X +Parquet Vectorized 13014 13039 25 1.2 827.4 1.0X +Parquet Vectorized (Pushdown) 12576 12585 9 1.3 799.5 1.0X +Native ORC Vectorized 12165 12176 15 1.3 773.5 1.1X +Native ORC Vectorized (Pushdown) 11757 11799 25 1.3 747.5 1.1X 
================================================================================================ @@ -438,112 +438,112 @@ Pushdown benchmark for InSet -> InFilters ================================================================================================ OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure -Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz InSet -> InFilters (values count: 5, distribution: 10): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 9158 9202 40 1.7 582.3 1.0X -Parquet Vectorized (Pushdown) 494 502 7 31.9 31.4 18.6X -Native ORC Vectorized 6735 6795 81 2.3 428.2 1.4X -Native ORC Vectorized (Pushdown) 427 430 6 36.8 27.2 21.4X +Parquet Vectorized 8131 8157 25 1.9 517.0 1.0X +Parquet Vectorized (Pushdown) 471 480 11 33.4 30.0 17.3X +Native ORC Vectorized 6203 6290 126 2.5 394.4 1.3X +Native ORC Vectorized (Pushdown) 421 427 6 37.4 26.8 19.3X OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure -Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz InSet -> InFilters (values count: 5, distribution: 50): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 9194 9207 12 1.7 584.5 1.0X -Parquet Vectorized (Pushdown) 494 500 6 31.8 31.4 18.6X -Native ORC Vectorized 6727 6755 22 2.3 427.7 1.4X -Native ORC Vectorized (Pushdown) 427 434 10 36.9 27.1 21.6X +Parquet Vectorized 7985 7995 9 2.0 507.7 1.0X +Parquet Vectorized (Pushdown) 468 475 9 33.6 29.8 17.1X +Native ORC Vectorized 6211 6264 32 2.5 394.9 1.3X +Native ORC Vectorized (Pushdown) 422 431 12 37.3 26.8 18.9X OpenJDK 64-Bit Server VM 
11.0.15+10-LTS on Linux 5.13.0-1031-azure -Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz InSet -> InFilters (values count: 5, distribution: 90): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 9190 9207 15 1.7 584.3 1.0X -Parquet Vectorized (Pushdown) 502 507 4 31.4 31.9 18.3X -Native ORC Vectorized 6717 6747 20 2.3 427.0 1.4X -Native ORC Vectorized (Pushdown) 429 437 7 36.7 27.3 21.4X +Parquet Vectorized 8147 8159 12 1.9 518.0 1.0X +Parquet Vectorized (Pushdown) 470 477 8 33.5 29.9 17.4X +Native ORC Vectorized 6085 6121 36 2.6 386.9 1.3X +Native ORC Vectorized (Pushdown) 418 430 9 37.6 26.6 19.5X OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure -Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz InSet -> InFilters (values count: 10, distribution: 10): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 9243 9264 25 1.7 587.6 1.0X -Parquet Vectorized (Pushdown) 525 535 7 29.9 33.4 17.6X -Native ORC Vectorized 6741 6761 14 2.3 428.6 1.4X -Native ORC Vectorized (Pushdown) 452 459 7 34.8 28.7 20.5X +Parquet Vectorized 7989 8021 24 2.0 507.9 1.0X +Parquet Vectorized (Pushdown) 489 497 10 32.2 31.1 16.4X +Native ORC Vectorized 6204 6264 67 2.5 394.4 1.3X +Native ORC Vectorized (Pushdown) 438 444 9 35.9 27.9 18.2X OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure -Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz InSet -> InFilters (values count: 10, distribution: 50): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
--------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 9238 9279 81 1.7 587.3 1.0X -Parquet Vectorized (Pushdown) 519 530 12 30.3 33.0 17.8X -Native ORC Vectorized 6741 6784 48 2.3 428.6 1.4X -Native ORC Vectorized (Pushdown) 449 457 6 35.0 28.6 20.6X +Parquet Vectorized 8138 8151 9 1.9 517.4 1.0X +Parquet Vectorized (Pushdown) 494 500 8 31.9 31.4 16.5X +Native ORC Vectorized 6124 6131 4 2.6 389.4 1.3X +Native ORC Vectorized (Pushdown) 442 452 11 35.6 28.1 18.4X OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure -Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz InSet -> InFilters (values count: 10, distribution: 90): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 9224 9250 26 1.7 586.5 1.0X -Parquet Vectorized (Pushdown) 519 526 6 30.3 33.0 17.8X -Native ORC Vectorized 6734 6766 20 2.3 428.1 1.4X -Native ORC Vectorized (Pushdown) 449 456 6 35.0 28.5 20.6X +Parquet Vectorized 8133 8147 15 1.9 517.1 1.0X +Parquet Vectorized (Pushdown) 492 505 14 32.0 31.3 16.5X +Native ORC Vectorized 6213 6247 28 2.5 395.0 1.3X +Native ORC Vectorized (Pushdown) 438 449 11 35.9 27.9 18.6X OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure -Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz InSet -> InFilters (values count: 50, distribution: 10): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 9491 9500 9 1.7 603.4 1.0X -Parquet Vectorized (Pushdown) 1294 1329 31 12.2 82.3 7.3X -Native ORC Vectorized 7033 7051 21 2.2 447.1 
1.3X -Native ORC Vectorized (Pushdown) 582 588 7 27.0 37.0 16.3X +Parquet Vectorized 8441 8458 13 1.9 536.6 1.0X +Parquet Vectorized (Pushdown) 1212 1251 33 13.0 77.1 7.0X +Native ORC Vectorized 6510 6563 34 2.4 413.9 1.3X +Native ORC Vectorized (Pushdown) 578 587 10 27.2 36.7 14.6X OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure -Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz InSet -> InFilters (values count: 50, distribution: 50): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 9503 9520 16 1.7 604.2 1.0X -Parquet Vectorized (Pushdown) 4804 4842 37 3.3 305.5 2.0X -Native ORC Vectorized 7057 7085 24 2.2 448.7 1.3X -Native ORC Vectorized (Pushdown) 612 618 7 25.7 38.9 15.5X +Parquet Vectorized 8427 8449 24 1.9 535.8 1.0X +Parquet Vectorized (Pushdown) 4153 4246 87 3.8 264.1 2.0X +Native ORC Vectorized 6542 6565 22 2.4 415.9 1.3X +Native ORC Vectorized (Pushdown) 585 597 10 26.9 37.2 14.4X OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure -Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz InSet -> InFilters (values count: 50, distribution: 90): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 9501 9510 10 1.7 604.0 1.0X -Parquet Vectorized (Pushdown) 8354 8370 11 1.9 531.1 1.1X -Native ORC Vectorized 7058 7090 27 2.2 448.7 1.3X -Native ORC Vectorized (Pushdown) 627 631 3 25.1 39.9 15.2X +Parquet Vectorized 8442 8469 31 1.9 536.7 1.0X +Parquet Vectorized (Pushdown) 7538 7567 17 2.1 479.2 1.1X +Native ORC Vectorized 6466 6531 44 2.4 411.1 1.3X +Native ORC Vectorized (Pushdown) 594 616 19 26.5 37.7 
14.2X OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure -Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz InSet -> InFilters (values count: 100, distribution: 10): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ---------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 9423 9450 18 1.7 599.1 1.0X -Parquet Vectorized (Pushdown) 1338 1375 30 11.8 85.1 7.0X -Native ORC Vectorized 6988 7010 13 2.3 444.3 1.3X -Native ORC Vectorized (Pushdown) 738 752 20 21.3 46.9 12.8X +Parquet Vectorized 8797 8882 77 1.8 559.3 1.0X +Parquet Vectorized (Pushdown) 1230 1290 46 12.8 78.2 7.2X +Native ORC Vectorized 6641 6754 80 2.4 422.2 1.3X +Native ORC Vectorized (Pushdown) 711 761 33 22.1 45.2 12.4X OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure -Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz InSet -> InFilters (values count: 100, distribution: 50): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ---------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 9435 9468 23 1.7 599.9 1.0X -Parquet Vectorized (Pushdown) 4934 4972 22 3.2 313.7 1.9X -Native ORC Vectorized 6969 7001 24 2.3 443.1 1.4X -Native ORC Vectorized (Pushdown) 800 813 21 19.7 50.9 11.8X +Parquet Vectorized 8384 8595 130 1.9 533.0 1.0X +Parquet Vectorized (Pushdown) 4384 4415 20 3.6 278.7 1.9X +Native ORC Vectorized 6463 6483 30 2.4 410.9 1.3X +Native ORC Vectorized (Pushdown) 762 775 13 20.6 48.4 11.0X OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure -Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz InSet -> InFilters (values count: 100, distribution: 90): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) 
Per Row(ns) Relative ---------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 9460 9466 9 1.7 601.4 1.0X -Parquet Vectorized (Pushdown) 8537 8562 18 1.8 542.8 1.1X -Native ORC Vectorized 6991 7015 17 2.2 444.5 1.4X -Native ORC Vectorized (Pushdown) 826 833 6 19.0 52.5 11.4X +Parquet Vectorized 8366 8394 19 1.9 531.9 1.0X +Parquet Vectorized (Pushdown) 7576 7599 23 2.1 481.7 1.1X +Native ORC Vectorized 6424 6467 28 2.4 408.4 1.3X +Native ORC Vectorized (Pushdown) 767 778 13 20.5 48.8 10.9X ================================================================================================ @@ -551,40 +551,40 @@ Pushdown benchmark for tinyint ================================================================================================ OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure -Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Select 1 tinyint row (value = CAST(63 AS tinyint)): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ---------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 3993 4003 11 3.9 253.9 1.0X -Parquet Vectorized (Pushdown) 163 169 7 96.5 10.4 24.5X -Native ORC Vectorized 3381 3411 26 4.7 214.9 1.2X -Native ORC Vectorized (Pushdown) 193 201 6 81.4 12.3 20.7X +Parquet Vectorized 3539 3551 16 4.4 225.0 1.0X +Parquet Vectorized (Pushdown) 157 163 7 100.4 10.0 22.6X +Native ORC Vectorized 2811 2875 60 5.6 178.7 1.3X +Native ORC Vectorized (Pushdown) 190 200 7 82.7 12.1 18.6X OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure -Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Select 10% tinyint rows (value < CAST(12 AS tinyint)): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 4725 4747 42 3.3 300.4 1.0X -Parquet Vectorized (Pushdown) 1244 1254 10 12.6 79.1 3.8X -Native ORC Vectorized 4060 4087 34 3.9 258.1 1.2X -Native ORC Vectorized (Pushdown) 1163 1177 10 13.5 73.9 4.1X +Parquet Vectorized 4214 4224 10 3.7 267.9 1.0X +Parquet Vectorized (Pushdown) 1107 1133 19 14.2 70.4 3.8X +Native ORC Vectorized 3391 3456 76 4.6 215.6 1.2X +Native ORC Vectorized (Pushdown) 1058 1095 32 14.9 67.3 4.0X OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure -Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Select 50% tinyint rows (value < CAST(63 AS tinyint)): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 7870 7878 8 2.0 500.4 1.0X -Parquet Vectorized (Pushdown) 5961 5966 3 2.6 379.0 1.3X -Native ORC Vectorized 6851 6884 29 2.3 435.6 1.1X -Native ORC Vectorized (Pushdown) 5311 5339 29 3.0 337.7 1.5X +Parquet Vectorized 7324 7391 60 2.1 465.6 1.0X +Parquet Vectorized (Pushdown) 5544 5594 62 2.8 352.5 1.3X +Native ORC Vectorized 6125 6175 50 2.6 389.4 1.2X +Native ORC Vectorized (Pushdown) 4798 4904 88 3.3 305.0 1.5X OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure -Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Select 90% tinyint rows (value < CAST(114 AS tinyint)): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 10976 11000 14 1.4 697.8 1.0X -Parquet Vectorized (Pushdown) 10637 10661 22 1.5 676.3 1.0X -Native ORC Vectorized 9730 9749 16 
1.6 618.6 1.1X -Native ORC Vectorized (Pushdown) 9488 9494 3 1.7 603.3 1.2X +Parquet Vectorized 9509 10049 337 1.7 604.5 1.0X +Parquet Vectorized (Pushdown) 9224 9246 16 1.7 586.4 1.0X +Native ORC Vectorized 8402 8589 179 1.9 534.2 1.1X +Native ORC Vectorized (Pushdown) 8205 8219 16 1.9 521.6 1.2X ================================================================================================ @@ -592,112 +592,112 @@ Pushdown benchmark for Timestamp ================================================================================================ OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure -Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Select 1 timestamp stored as INT96 row (value = timestamp_seconds(7864320)): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ----------------------------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 4024 4033 8 3.9 255.9 1.0X -Parquet Vectorized (Pushdown) 4038 4058 15 3.9 256.7 1.0X -Native ORC Vectorized 3374 3392 20 4.7 214.5 1.2X -Native ORC Vectorized (Pushdown) 117 122 5 134.9 7.4 34.5X +Parquet Vectorized 3873 3987 91 4.1 246.2 1.0X +Parquet Vectorized (Pushdown) 3882 3956 43 4.1 246.8 1.0X +Native ORC Vectorized 2790 2830 43 5.6 177.4 1.4X +Native ORC Vectorized (Pushdown) 119 130 6 131.7 7.6 32.4X OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure -Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Select 10% timestamp stored as INT96 rows (value < timestamp_seconds(1572864)): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 4876 4889 10 3.2 310.0 1.0X -Parquet Vectorized (Pushdown) 
4889 4895 5 3.2 310.9 1.0X -Native ORC Vectorized 4155 4165 13 3.8 264.2 1.2X -Native ORC Vectorized (Pushdown) 1240 1252 8 12.7 78.8 3.9X +Parquet Vectorized 4646 4708 36 3.4 295.4 1.0X +Parquet Vectorized (Pushdown) 4406 4559 139 3.6 280.2 1.1X +Native ORC Vectorized 3433 3449 18 4.6 218.3 1.4X +Native ORC Vectorized (Pushdown) 1075 1092 15 14.6 68.4 4.3X OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure -Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Select 50% timestamp stored as INT96 rows (value < timestamp_seconds(7864320)): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 8161 8185 16 1.9 518.9 1.0X -Parquet Vectorized (Pushdown) 8167 8179 8 1.9 519.2 1.0X -Native ORC Vectorized 7253 7265 9 2.2 461.1 1.1X -Native ORC Vectorized (Pushdown) 5636 5644 10 2.8 358.4 1.4X +Parquet Vectorized 7207 7467 211 2.2 458.2 1.0X +Parquet Vectorized (Pushdown) 7161 7242 147 2.2 455.3 1.0X +Native ORC Vectorized 6035 6055 26 2.6 383.7 1.2X +Native ORC Vectorized (Pushdown) 4767 4879 140 3.3 303.1 1.5X OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure -Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Select 90% timestamp stored as INT96 rows (value < timestamp_seconds(14155776)): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 11364 11372 10 1.4 722.5 1.0X -Parquet Vectorized (Pushdown) 11372 11376 4 1.4 723.0 1.0X -Native ORC Vectorized 10310 10321 8 1.5 655.5 1.1X -Native ORC Vectorized (Pushdown) 10029 10044 12 1.6 637.7 1.1X +Parquet Vectorized 9914 10065 
275 1.6 630.3 1.0X +Parquet Vectorized (Pushdown) 10378 10488 98 1.5 659.8 1.0X +Native ORC Vectorized 9198 9348 190 1.7 584.8 1.1X +Native ORC Vectorized (Pushdown) 8597 8789 138 1.8 546.6 1.2X OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure -Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Select 1 timestamp stored as TIMESTAMP_MICROS row (value = timestamp_seconds(7864320)): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ---------------------------------------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 3688 3702 10 4.3 234.5 1.0X -Parquet Vectorized (Pushdown) 118 122 5 132.9 7.5 31.2X -Native ORC Vectorized 3358 3369 19 4.7 213.5 1.1X -Native ORC Vectorized (Pushdown) 116 120 5 136.0 7.4 31.9X +Parquet Vectorized 3365 3442 69 4.7 213.9 1.0X +Parquet Vectorized (Pushdown) 116 126 6 136.0 7.4 29.1X +Native ORC Vectorized 2748 2766 20 5.7 174.7 1.2X +Native ORC Vectorized (Pushdown) 116 123 7 135.1 7.4 28.9X OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure -Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Select 10% timestamp stored as TIMESTAMP_MICROS rows (value < timestamp_seconds(1572864)): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 4515 4529 10 3.5 287.1 1.0X -Parquet Vectorized (Pushdown) 1298 1312 11 12.1 82.5 3.5X -Native ORC Vectorized 4159 4167 4 3.8 264.4 1.1X -Native ORC Vectorized (Pushdown) 1234 1235 1 12.7 78.5 3.7X +Parquet Vectorized 3972 3979 9 4.0 252.6 1.0X +Parquet Vectorized (Pushdown) 1139 1161 19 13.8 72.4 3.5X +Native ORC Vectorized 3445 3461 21 4.6 219.1 1.2X +Native ORC 
Vectorized (Pushdown) 1068 1081 15 14.7 67.9 3.7X OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure -Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Select 50% timestamp stored as TIMESTAMP_MICROS rows (value < timestamp_seconds(7864320)): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 7786 7806 16 2.0 495.0 1.0X -Parquet Vectorized (Pushdown) 6041 6047 4 2.6 384.1 1.3X -Native ORC Vectorized 7243 7256 8 2.2 460.5 1.1X -Native ORC Vectorized (Pushdown) 5638 5645 9 2.8 358.5 1.4X +Parquet Vectorized 7042 7208 130 2.2 447.7 1.0X +Parquet Vectorized (Pushdown) 5472 5566 93 2.9 347.9 1.3X +Native ORC Vectorized 6205 6368 95 2.5 394.5 1.1X +Native ORC Vectorized (Pushdown) 5021 5045 28 3.1 319.2 1.4X OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure -Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Select 90% timestamp stored as TIMESTAMP_MICROS rows (value < timestamp_seconds(14155776)): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 10990 10999 11 1.4 698.7 1.0X -Parquet Vectorized (Pushdown) 10674 10680 7 1.5 678.7 1.0X -Native ORC Vectorized 10307 10319 12 1.5 655.3 1.1X -Native ORC Vectorized (Pushdown) 10041 10048 4 1.6 638.4 1.1X +Parquet Vectorized 9466 9791 249 1.7 601.8 1.0X +Parquet Vectorized (Pushdown) 9680 9759 49 1.6 615.4 1.0X +Native ORC Vectorized 9009 9066 39 1.7 572.8 1.1X +Native ORC Vectorized (Pushdown) 8660 8817 94 1.8 550.6 1.1X OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure -Intel(R) 
Xeon(R) Platinum 8370C CPU @ 2.80GHz +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Select 1 timestamp stored as TIMESTAMP_MILLIS row (value = timestamp_seconds(7864320)): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ---------------------------------------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 3716 3755 71 4.2 236.3 1.0X -Parquet Vectorized (Pushdown) 114 118 4 138.5 7.2 32.7X -Native ORC Vectorized 3347 3355 8 4.7 212.8 1.1X -Native ORC Vectorized (Pushdown) 113 117 4 138.8 7.2 32.8X +Parquet Vectorized 3707 3719 18 4.2 235.7 1.0X +Parquet Vectorized (Pushdown) 125 131 8 125.7 8.0 29.6X +Native ORC Vectorized 2950 2961 13 5.3 187.5 1.3X +Native ORC Vectorized (Pushdown) 118 126 6 133.4 7.5 31.4X OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure -Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Select 10% timestamp stored as TIMESTAMP_MILLIS rows (value < timestamp_seconds(1572864)): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 4546 4557 13 3.5 289.0 1.0X -Parquet Vectorized (Pushdown) 1298 1316 13 12.1 82.5 3.5X -Native ORC Vectorized 4140 4148 10 3.8 263.2 1.1X -Native ORC Vectorized (Pushdown) 1230 1242 8 12.8 78.2 3.7X +Parquet Vectorized 4166 4283 74 3.8 264.9 1.0X +Parquet Vectorized (Pushdown) 1180 1241 59 13.3 75.1 3.5X +Native ORC Vectorized 3728 3732 4 4.2 237.0 1.1X +Native ORC Vectorized (Pushdown) 1172 1189 15 13.4 74.5 3.6X OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure -Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Select 50% timestamp stored as TIMESTAMP_MILLIS rows (value < 
timestamp_seconds(7864320)): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 7826 7841 17 2.0 497.6 1.0X -Parquet Vectorized (Pushdown) 6053 6058 5 2.6 384.8 1.3X -Native ORC Vectorized 7235 7238 5 2.2 460.0 1.1X -Native ORC Vectorized (Pushdown) 5642 5653 8 2.8 358.7 1.4X +Parquet Vectorized 7631 7638 7 2.1 485.1 1.0X +Parquet Vectorized (Pushdown) 5866 5872 6 2.7 372.9 1.3X +Native ORC Vectorized 6638 6644 6 2.4 422.0 1.1X +Native ORC Vectorized (Pushdown) 5237 5250 20 3.0 332.9 1.5X OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure -Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Select 90% timestamp stored as TIMESTAMP_MILLIS rows (value < timestamp_seconds(14155776)): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 10998 11011 8 1.4 699.2 1.0X -Parquet Vectorized (Pushdown) 10684 10693 6 1.5 679.3 1.0X -Native ORC Vectorized 10296 10307 9 1.5 654.6 1.1X -Native ORC Vectorized (Pushdown) 10024 10028 6 1.6 637.3 1.1X +Parquet Vectorized 10677 10697 14 1.5 678.8 1.0X +Parquet Vectorized (Pushdown) 9233 9588 317 1.7 587.0 1.2X +Native ORC Vectorized 8630 8634 5 1.8 548.7 1.2X +Native ORC Vectorized (Pushdown) 8399 8410 14 1.9 534.0 1.3X ================================================================================================ @@ -705,30 +705,30 @@ Pushdown benchmark with many filters ================================================================================================ OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure -Intel(R) Xeon(R) Platinum 8370C CPU @ 
2.80GHz +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Select 1 row with 1 filters: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Parquet Vectorized 141 147 5 0.0 141156296.0 1.0X -Parquet Vectorized (Pushdown) 141 146 4 0.0 141219096.0 1.0X -Native ORC Vectorized 135 139 5 0.0 135256253.0 1.0X -Native ORC Vectorized (Pushdown) 145 150 5 0.0 144968981.0 1.0X +Parquet Vectorized 141 148 6 0.0 140547292.0 1.0X +Parquet Vectorized (Pushdown) 142 148 7 0.0 142378538.0 1.0X +Native ORC Vectorized 133 137 6 0.0 132552704.0 1.1X +Native ORC Vectorized (Pushdown) 141 146 6 0.0 141013626.0 1.0X OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure -Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Select 1 row with 250 filters: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Parquet Vectorized 1547 1593 60 0.0 1546942527.0 1.0X -Parquet Vectorized (Pushdown) 1613 1648 37 0.0 1612664638.0 1.0X -Native ORC Vectorized 1553 1593 54 0.0 1552761511.0 1.0X -Native ORC Vectorized (Pushdown) 1575 1619 47 0.0 1574744637.0 1.0X +Parquet Vectorized 1457 1513 76 0.0 1457117756.0 1.0X +Parquet Vectorized (Pushdown) 1504 1560 100 0.0 1503940716.0 1.0X +Native ORC Vectorized 1457 1509 53 0.0 1456613526.0 1.0X +Native ORC Vectorized (Pushdown) 1472 1534 80 0.0 1472108739.0 1.0X OpenJDK 64-Bit Server VM 11.0.15+10-LTS on Linux 5.13.0-1031-azure -Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Select 1 row with 500 filters: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ 
-Parquet Vectorized 6791 6927 175 0.0 6790722699.0 1.0X -Parquet Vectorized (Pushdown) 6996 7118 208 0.0 6996423296.0 1.0X -Native ORC Vectorized 6785 6897 203 0.0 6785074307.0 1.0X -Native ORC Vectorized (Pushdown) 6859 6945 159 0.0 6859258499.0 1.0X +Parquet Vectorized 6439 6519 144 0.0 6439334268.0 1.0X +Parquet Vectorized (Pushdown) 6584 6693 167 0.0 6583512270.0 1.0X +Native ORC Vectorized 6414 6541 178 0.0 6413605277.0 1.0X +Native ORC Vectorized (Pushdown) 6496 6600 204 0.0 6496433640.0 1.0X diff --git a/sql/core/benchmarks/FilterPushdownBenchmark-jdk17-results.txt b/sql/core/benchmarks/FilterPushdownBenchmark-jdk17-results.txt index 6663cc6d10..07ee80fa4f 100644 --- a/sql/core/benchmarks/FilterPushdownBenchmark-jdk17-results.txt +++ b/sql/core/benchmarks/FilterPushdownBenchmark-jdk17-results.txt @@ -3,166 +3,166 @@ Pushdown for many distinct value case ================================================================================================ OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz Select 0 string row (value IS NULL): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Parquet Vectorized 8574 8745 248 1.8 545.1 1.0X -Parquet Vectorized (Pushdown) 521 540 14 30.2 33.1 16.4X -Native ORC Vectorized 7117 7171 76 2.2 452.5 1.2X -Native ORC Vectorized (Pushdown) 499 507 15 31.6 31.7 17.2X +Parquet Vectorized 9638 9872 198 1.6 612.8 1.0X +Parquet Vectorized (Pushdown) 687 710 23 22.9 43.7 14.0X +Native ORC Vectorized 7239 7353 107 2.2 460.2 1.3X +Native ORC Vectorized (Pushdown) 581 617 23 27.1 37.0 16.6X OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz Select 0 string row ('7864320' < value < '7864320'): 
Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ----------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 8658 8673 12 1.8 550.4 1.0X -Parquet Vectorized (Pushdown) 498 509 17 31.6 31.6 17.4X -Native ORC Vectorized 7231 7238 8 2.2 459.7 1.2X -Native ORC Vectorized (Pushdown) 494 506 13 31.8 31.4 17.5X +Parquet Vectorized 9870 10102 312 1.6 627.5 1.0X +Parquet Vectorized (Pushdown) 620 659 28 25.4 39.4 15.9X +Native ORC Vectorized 7417 7488 78 2.1 471.5 1.3X +Native ORC Vectorized (Pushdown) 582 630 43 27.0 37.0 17.0X OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz Select 1 string row (value = '7864320'): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Parquet Vectorized 8627 8642 13 1.8 548.5 1.0X -Parquet Vectorized (Pushdown) 496 509 13 31.7 31.5 17.4X -Native ORC Vectorized 7203 7212 11 2.2 457.9 1.2X -Native ORC Vectorized (Pushdown) 494 498 8 31.9 31.4 17.5X +Parquet Vectorized 9776 10028 278 1.6 621.5 1.0X +Parquet Vectorized (Pushdown) 672 703 40 23.4 42.7 14.5X +Native ORC Vectorized 7540 7756 209 2.1 479.4 1.3X +Native ORC Vectorized (Pushdown) 604 627 20 26.0 38.4 16.2X OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz Select 1 string row (value <=> '7864320'): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 8579 8612 32 1.8 545.4 1.0X -Parquet Vectorized (Pushdown) 479 487 7 32.8 30.5 17.9X -Native ORC Vectorized 7194 7211 17 2.2 
457.4 1.2X -Native ORC Vectorized (Pushdown) 474 480 6 33.2 30.1 18.1X +Parquet Vectorized 9710 9958 311 1.6 617.3 1.0X +Parquet Vectorized (Pushdown) 590 612 23 26.6 37.5 16.4X +Native ORC Vectorized 7349 7416 90 2.1 467.2 1.3X +Native ORC Vectorized (Pushdown) 571 587 12 27.6 36.3 17.0X OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz Select 1 string row ('7864320' <= value <= '7864320'): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 8637 8644 10 1.8 549.1 1.0X -Parquet Vectorized (Pushdown) 485 500 10 32.5 30.8 17.8X -Native ORC Vectorized 7182 7192 8 2.2 456.6 1.2X -Native ORC Vectorized (Pushdown) 469 474 4 33.5 29.8 18.4X +Parquet Vectorized 9756 10049 264 1.6 620.3 1.0X +Parquet Vectorized (Pushdown) 597 628 52 26.3 38.0 16.3X +Native ORC Vectorized 7314 7405 57 2.2 465.0 1.3X +Native ORC Vectorized (Pushdown) 562 584 14 28.0 35.7 17.4X OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz Select all string rows (value IS NOT NULL): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 16691 16738 33 0.9 1061.2 1.0X -Parquet Vectorized (Pushdown) 16714 16745 26 0.9 1062.6 1.0X -Native ORC Vectorized 15315 15347 26 1.0 973.7 1.1X -Native ORC Vectorized (Pushdown) 15495 15524 35 1.0 985.1 1.1X +Parquet Vectorized 18640 18879 152 0.8 1185.1 1.0X +Parquet Vectorized (Pushdown) 18579 18806 167 0.8 1181.3 1.0X +Native ORC Vectorized 16363 16693 221 1.0 1040.3 1.1X +Native ORC Vectorized (Pushdown) 16505 16707 178 1.0 1049.4 1.1X 
OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz Select 0 int row (value IS NULL): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Parquet Vectorized 8048 8096 82 2.0 511.7 1.0X -Parquet Vectorized (Pushdown) 462 473 8 34.1 29.4 17.4X -Native ORC Vectorized 6436 6479 47 2.4 409.2 1.3X -Native ORC Vectorized (Pushdown) 445 453 6 35.4 28.3 18.1X +Parquet Vectorized 9242 9419 119 1.7 587.6 1.0X +Parquet Vectorized (Pushdown) 583 609 28 27.0 37.1 15.9X +Native ORC Vectorized 6669 6786 101 2.4 424.0 1.4X +Native ORC Vectorized (Pushdown) 510 536 17 30.8 32.4 18.1X OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz Select 0 int row (7864320 < value < 7864320): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ---------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 8055 8072 13 2.0 512.1 1.0X -Parquet Vectorized (Pushdown) 471 485 11 33.4 29.9 17.1X -Native ORC Vectorized 6425 6447 15 2.4 408.5 1.3X -Native ORC Vectorized (Pushdown) 456 459 2 34.5 29.0 17.7X +Parquet Vectorized 9179 9234 49 1.7 583.6 1.0X +Parquet Vectorized (Pushdown) 565 619 38 27.8 35.9 16.2X +Native ORC Vectorized 6652 6715 79 2.4 422.9 1.4X +Native ORC Vectorized (Pushdown) 523 556 25 30.0 33.3 17.5X OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz Select 1 int row (value = 7864320): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
------------------------------------------------------------------------------------------------------------------------ -Parquet Vectorized 8119 8128 11 1.9 516.2 1.0X -Parquet Vectorized (Pushdown) 473 479 4 33.2 30.1 17.1X -Native ORC Vectorized 6488 6497 9 2.4 412.5 1.3X -Native ORC Vectorized (Pushdown) 446 455 6 35.2 28.4 18.2X +Parquet Vectorized 9148 9242 109 1.7 581.6 1.0X +Parquet Vectorized (Pushdown) 578 616 25 27.2 36.8 15.8X +Native ORC Vectorized 6635 6770 98 2.4 421.8 1.4X +Native ORC Vectorized (Pushdown) 530 541 11 29.7 33.7 17.3X OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz Select 1 int row (value <=> 7864320): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Parquet Vectorized 8098 8105 8 1.9 514.8 1.0X -Parquet Vectorized (Pushdown) 458 464 5 34.3 29.1 17.7X -Native ORC Vectorized 6504 6512 6 2.4 413.5 1.2X -Native ORC Vectorized (Pushdown) 455 461 4 34.6 28.9 17.8X +Parquet Vectorized 9416 9724 263 1.7 598.6 1.0X +Parquet Vectorized (Pushdown) 563 606 36 27.9 35.8 16.7X +Native ORC Vectorized 6714 6785 47 2.3 426.9 1.4X +Native ORC Vectorized (Pushdown) 540 552 11 29.1 34.3 17.4X OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz Select 1 int row (7864320 <= value <= 7864320): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------ -Parquet Vectorized 8088 8110 20 1.9 514.2 1.0X -Parquet Vectorized (Pushdown) 467 472 4 33.7 29.7 17.3X -Native ORC Vectorized 6486 6497 10 2.4 412.4 1.2X -Native ORC Vectorized (Pushdown) 446 450 3 35.3 28.4 18.1X +Parquet 
Vectorized 9244 9315 77 1.7 587.7 1.0X +Parquet Vectorized (Pushdown) 587 600 13 26.8 37.3 15.8X +Native ORC Vectorized 6729 6889 123 2.3 427.8 1.4X +Native ORC Vectorized (Pushdown) 536 556 14 29.3 34.1 17.2X OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz Select 1 int row (7864319 < value < 7864321): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ---------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 8117 8135 18 1.9 516.0 1.0X -Parquet Vectorized (Pushdown) 457 465 11 34.4 29.1 17.8X -Native ORC Vectorized 6533 6540 7 2.4 415.4 1.2X -Native ORC Vectorized (Pushdown) 445 450 4 35.3 28.3 18.2X +Parquet Vectorized 8971 9207 146 1.8 570.4 1.0X +Parquet Vectorized (Pushdown) 588 602 17 26.7 37.4 15.2X +Native ORC Vectorized 6633 6746 120 2.4 421.7 1.4X +Native ORC Vectorized (Pushdown) 553 558 4 28.4 35.2 16.2X OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz Select 10% int rows (value < 1572864): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Parquet Vectorized 8924 8937 18 1.8 567.4 1.0X -Parquet Vectorized (Pushdown) 2043 2104 128 7.7 129.9 4.4X -Native ORC Vectorized 7331 7338 6 2.1 466.1 1.2X -Native ORC Vectorized (Pushdown) 1891 1898 4 8.3 120.2 4.7X +Parquet Vectorized 10054 10119 48 1.6 639.2 1.0X +Parquet Vectorized (Pushdown) 2427 2454 21 6.5 154.3 4.1X +Native ORC Vectorized 7586 7681 69 2.1 482.3 1.3X +Native ORC Vectorized (Pushdown) 2044 2117 62 7.7 129.9 4.9X OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +Intel(R) Xeon(R) CPU 
E5-2673 v4 @ 2.30GHz Select 50% int rows (value < 7864320): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Parquet Vectorized 11730 11775 49 1.3 745.8 1.0X -Parquet Vectorized (Pushdown) 7984 7998 10 2.0 507.6 1.5X -Native ORC Vectorized 10196 10222 16 1.5 648.3 1.2X -Native ORC Vectorized (Pushdown) 7270 7288 26 2.2 462.2 1.6X +Parquet Vectorized 13052 13180 122 1.2 829.8 1.0X +Parquet Vectorized (Pushdown) 8910 8967 37 1.8 566.5 1.5X +Native ORC Vectorized 10745 11415 394 1.5 683.2 1.2X +Native ORC Vectorized (Pushdown) 7832 7948 155 2.0 497.9 1.7X OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz Select 90% int rows (value < 14155776): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Parquet Vectorized 14566 14601 26 1.1 926.1 1.0X -Parquet Vectorized (Pushdown) 13905 13931 28 1.1 884.0 1.0X -Native ORC Vectorized 13103 13143 32 1.2 833.1 1.1X -Native ORC Vectorized (Pushdown) 12662 12673 15 1.2 805.0 1.2X +Parquet Vectorized 16046 16381 283 1.0 1020.2 1.0X +Parquet Vectorized (Pushdown) 15424 15643 158 1.0 980.6 1.0X +Native ORC Vectorized 13950 14093 172 1.1 886.9 1.2X +Native ORC Vectorized (Pushdown) 13315 13460 203 1.2 846.5 1.2X OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz Select all int rows (value IS NOT NULL): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Parquet Vectorized 15339 15362 13 1.0 975.3 1.0X -Parquet 
Vectorized (Pushdown) 15433 15446 12 1.0 981.2 1.0X -Native ORC Vectorized 13781 13797 12 1.1 876.1 1.1X -Native ORC Vectorized (Pushdown) 13946 13953 8 1.1 886.7 1.1X +Parquet Vectorized 16964 17054 94 0.9 1078.5 1.0X +Parquet Vectorized (Pushdown) 16894 17307 320 0.9 1074.1 1.0X +Native ORC Vectorized 14383 14560 163 1.1 914.4 1.2X +Native ORC Vectorized (Pushdown) 14472 14879 279 1.1 920.1 1.2X OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz Select all int rows (value > -1): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Parquet Vectorized 15314 15371 48 1.0 973.6 1.0X -Parquet Vectorized (Pushdown) 15440 15451 9 1.0 981.6 1.0X -Native ORC Vectorized 13805 13832 28 1.1 877.7 1.1X -Native ORC Vectorized (Pushdown) 13950 13975 22 1.1 886.9 1.1X +Parquet Vectorized 16890 17099 221 0.9 1073.8 1.0X +Parquet Vectorized (Pushdown) 17008 17148 118 0.9 1081.3 1.0X +Native ORC Vectorized 14596 14805 200 1.1 928.0 1.2X +Native ORC Vectorized (Pushdown) 14714 15031 267 1.1 935.5 1.1X OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz Select all int rows (value != -1): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Parquet Vectorized 15288 15327 29 1.0 972.0 1.0X -Parquet Vectorized (Pushdown) 15412 15434 13 1.0 979.9 1.0X -Native ORC Vectorized 13805 13824 16 1.1 877.7 1.1X -Native ORC Vectorized (Pushdown) 13997 14021 24 1.1 889.9 1.1X +Parquet Vectorized 16833 17056 193 0.9 1070.2 1.0X +Parquet Vectorized (Pushdown) 16929 17179 224 0.9 1076.3 1.0X +Native ORC Vectorized 14380 14677 
221 1.1 914.2 1.2X +Native ORC Vectorized (Pushdown) 14615 14807 151 1.1 929.2 1.2X ================================================================================================ @@ -170,58 +170,58 @@ Pushdown for few distinct value case (use dictionary encoding) ================================================================================================ OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz Select 0 distinct string row (value IS NULL): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ---------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 7565 7605 35 2.1 481.0 1.0X -Parquet Vectorized (Pushdown) 403 415 13 39.0 25.6 18.8X -Native ORC Vectorized 7768 7781 19 2.0 493.8 1.0X -Native ORC Vectorized (Pushdown) 818 823 3 19.2 52.0 9.2X +Parquet Vectorized 8479 8614 162 1.9 539.1 1.0X +Parquet Vectorized (Pushdown) 489 512 23 32.2 31.1 17.3X +Native ORC Vectorized 7913 7974 55 2.0 503.1 1.1X +Native ORC Vectorized (Pushdown) 984 1015 34 16.0 62.6 8.6X OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz Select 0 distinct string row ('100' < value < '100'): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------------ -Parquet Vectorized 7742 7755 10 2.0 492.2 1.0X -Parquet Vectorized (Pushdown) 406 413 5 38.8 25.8 19.1X -Native ORC Vectorized 8019 8030 11 2.0 509.8 1.0X -Native ORC Vectorized (Pushdown) 822 826 3 19.1 52.2 9.4X +Parquet Vectorized 8611 8711 94 1.8 547.4 1.0X +Parquet Vectorized (Pushdown) 465 493 19 33.8 29.5 18.5X +Native ORC Vectorized 8239 8358 101 1.9 523.8 1.0X +Native ORC Vectorized 
(Pushdown) 999 1024 17 15.7 63.5 8.6X OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz Select 1 distinct string row (value = '100'): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ---------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 7575 7582 6 2.1 481.6 1.0X -Parquet Vectorized (Pushdown) 459 468 7 34.3 29.2 16.5X -Native ORC Vectorized 7934 7944 10 2.0 504.4 1.0X -Native ORC Vectorized (Pushdown) 879 889 16 17.9 55.9 8.6X +Parquet Vectorized 8523 8783 196 1.8 541.9 1.0X +Parquet Vectorized (Pushdown) 554 577 17 28.4 35.2 15.4X +Native ORC Vectorized 8119 8302 123 1.9 516.2 1.0X +Native ORC Vectorized (Pushdown) 1042 1064 17 15.1 66.2 8.2X OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz Select 1 distinct string row (value <=> '100'): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------ -Parquet Vectorized 7590 7611 16 2.1 482.6 1.0X -Parquet Vectorized (Pushdown) 457 463 5 34.4 29.0 16.6X -Native ORC Vectorized 7936 7942 6 2.0 504.6 1.0X -Native ORC Vectorized (Pushdown) 873 879 4 18.0 55.5 8.7X +Parquet Vectorized 8567 8681 154 1.8 544.6 1.0X +Parquet Vectorized (Pushdown) 542 568 19 29.0 34.5 15.8X +Native ORC Vectorized 8156 8229 66 1.9 518.5 1.1X +Native ORC Vectorized (Pushdown) 1046 1059 9 15.0 66.5 8.2X OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz Select 1 distinct string row ('100' <= value <= '100'): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
-------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 7715 7724 8 2.0 490.5 1.0X -Parquet Vectorized (Pushdown) 463 471 5 33.9 29.5 16.6X -Native ORC Vectorized 8038 8043 7 2.0 511.1 1.0X -Native ORC Vectorized (Pushdown) 874 884 8 18.0 55.6 8.8X +Parquet Vectorized 8710 8831 137 1.8 553.8 1.0X +Parquet Vectorized (Pushdown) 562 578 15 28.0 35.7 15.5X +Native ORC Vectorized 8415 8547 99 1.9 535.0 1.0X +Native ORC Vectorized (Pushdown) 1048 1076 21 15.0 66.6 8.3X OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz Select all distinct string rows (value IS NOT NULL): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ----------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 16502 16555 40 1.0 1049.1 1.0X -Parquet Vectorized (Pushdown) 16597 16624 20 0.9 1055.2 1.0X -Native ORC Vectorized 17396 17568 116 0.9 1106.0 0.9X -Native ORC Vectorized (Pushdown) 17437 17572 188 0.9 1108.6 0.9X +Parquet Vectorized 18246 18497 246 0.9 1160.1 1.0X +Parquet Vectorized (Pushdown) 18279 18441 122 0.9 1162.2 1.0X +Native ORC Vectorized 17839 17964 141 0.9 1134.2 1.0X +Native ORC Vectorized (Pushdown) 18524 18868 253 0.8 1177.8 1.0X ================================================================================================ @@ -229,31 +229,31 @@ Pushdown benchmark for StringStartsWith ================================================================================================ OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz StringStartsWith filter: (value like '10%'): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
--------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 8868 8996 74 1.8 563.8 1.0X -Parquet Vectorized (Pushdown) 1235 1272 36 12.7 78.5 7.2X -Native ORC Vectorized 7295 7418 112 2.2 463.8 1.2X -Native ORC Vectorized (Pushdown) 7519 7620 82 2.1 478.0 1.2X +Parquet Vectorized 10031 10145 123 1.6 637.8 1.0X +Parquet Vectorized (Pushdown) 1475 1500 26 10.7 93.7 6.8X +Native ORC Vectorized 7489 7702 186 2.1 476.1 1.3X +Native ORC Vectorized (Pushdown) 7805 8007 182 2.0 496.3 1.3X OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz StringStartsWith filter: (value like '1000%'): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ----------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 8767 8846 67 1.8 557.4 1.0X -Parquet Vectorized (Pushdown) 469 477 5 33.5 29.8 18.7X -Native ORC Vectorized 7156 7273 78 2.2 454.9 1.2X -Native ORC Vectorized (Pushdown) 7483 7494 10 2.1 475.7 1.2X +Parquet Vectorized 9632 9704 67 1.6 612.4 1.0X +Parquet Vectorized (Pushdown) 550 568 16 28.6 35.0 17.5X +Native ORC Vectorized 7269 7378 89 2.2 462.1 1.3X +Native ORC Vectorized (Pushdown) 7501 7574 82 2.1 476.9 1.3X OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz StringStartsWith filter: (value like '786432%'): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 8672 8843 108 1.8 551.4 1.0X -Parquet Vectorized (Pushdown) 461 466 5 34.1 29.3 18.8X -Native ORC Vectorized 7029 7099 54 2.2 446.9 1.2X -Native ORC Vectorized (Pushdown) 
7297 7374 63 2.2 464.0 1.2X +Parquet Vectorized 9727 9823 91 1.6 618.4 1.0X +Parquet Vectorized (Pushdown) 545 561 11 28.8 34.7 17.8X +Native ORC Vectorized 7255 7312 49 2.2 461.2 1.3X +Native ORC Vectorized (Pushdown) 7446 7635 200 2.1 473.4 1.3X ================================================================================================ @@ -261,31 +261,31 @@ Pushdown benchmark for StringEndsWith ================================================================================================ OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz StringEndsWith filter: (value like '%10'): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 7611 7660 63 2.1 483.9 1.0X -Parquet Vectorized (Pushdown) 551 581 29 28.6 35.0 13.8X -Native ORC Vectorized 7859 7965 103 2.0 499.6 1.0X -Native ORC Vectorized (Pushdown) 8131 8226 63 1.9 516.9 0.9X +Parquet Vectorized 8598 8782 159 1.8 546.7 1.0X +Parquet Vectorized (Pushdown) 701 717 15 22.4 44.6 12.3X +Native ORC Vectorized 8186 8256 66 1.9 520.5 1.1X +Native ORC Vectorized (Pushdown) 8461 8575 80 1.9 538.0 1.0X OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz StringEndsWith filter: (value like '%1000'): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 7386 7511 72 2.1 469.6 1.0X -Parquet Vectorized (Pushdown) 461 469 5 34.1 29.3 16.0X -Native ORC Vectorized 7813 7886 47 2.0 496.7 0.9X -Native ORC Vectorized (Pushdown) 7979 8222 312 2.0 507.3 0.9X +Parquet Vectorized 8504 8602 156 1.8 540.6 1.0X 
+Parquet Vectorized (Pushdown) 568 594 21 27.7 36.1 15.0X +Native ORC Vectorized 8087 8165 69 1.9 514.2 1.1X +Native ORC Vectorized (Pushdown) 8464 8546 93 1.9 538.2 1.0X OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz StringEndsWith filter: (value like '%786432'): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ----------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 7521 7625 180 2.1 478.1 1.0X -Parquet Vectorized (Pushdown) 464 466 2 33.9 29.5 16.2X -Native ORC Vectorized 7780 7873 60 2.0 494.6 1.0X -Native ORC Vectorized (Pushdown) 8166 8205 46 1.9 519.2 0.9X +Parquet Vectorized 8492 8627 163 1.9 539.9 1.0X +Parquet Vectorized (Pushdown) 594 603 11 26.5 37.8 14.3X +Native ORC Vectorized 8075 8152 91 1.9 513.4 1.1X +Native ORC Vectorized (Pushdown) 8429 8677 184 1.9 535.9 1.0X ================================================================================================ @@ -293,31 +293,31 @@ Pushdown benchmark for StringContains ================================================================================================ OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz StringContains filter: (value like '%10%'): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 7883 7893 16 2.0 501.2 1.0X -Parquet Vectorized (Pushdown) 1128 1132 7 13.9 71.7 7.0X -Native ORC Vectorized 8256 8280 27 1.9 524.9 1.0X -Native ORC Vectorized (Pushdown) 8522 8533 7 1.8 541.8 0.9X +Parquet Vectorized 8839 8943 114 1.8 562.0 1.0X +Parquet Vectorized (Pushdown) 1314 1329 12 12.0 83.5 6.7X +Native ORC 
Vectorized 8460 8553 69 1.9 537.9 1.0X +Native ORC Vectorized (Pushdown) 8662 8800 102 1.8 550.7 1.0X OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz StringContains filter: (value like '%1000%'): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ---------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 7576 7581 7 2.1 481.7 1.0X -Parquet Vectorized (Pushdown) 468 477 8 33.6 29.8 16.2X -Native ORC Vectorized 7772 7899 99 2.0 494.1 1.0X -Native ORC Vectorized (Pushdown) 8069 8175 74 1.9 513.0 0.9X +Parquet Vectorized 8409 8546 112 1.9 534.6 1.0X +Parquet Vectorized (Pushdown) 538 566 18 29.3 34.2 15.6X +Native ORC Vectorized 7943 8061 106 2.0 505.0 1.1X +Native ORC Vectorized (Pushdown) 8450 8675 174 1.9 537.3 1.0X OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz StringContains filter: (value like '%786432%'): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------ -Parquet Vectorized 7401 7461 36 2.1 470.5 1.0X -Parquet Vectorized (Pushdown) 464 471 5 33.9 29.5 16.0X -Native ORC Vectorized 7720 7812 90 2.0 490.9 1.0X -Native ORC Vectorized (Pushdown) 8031 8176 82 2.0 510.6 0.9X +Parquet Vectorized 8401 8562 182 1.9 534.1 1.0X +Parquet Vectorized (Pushdown) 558 582 36 28.2 35.5 15.1X +Native ORC Vectorized 8062 8157 96 2.0 512.6 1.0X +Native ORC Vectorized (Pushdown) 8307 8428 93 1.9 528.2 1.0X ================================================================================================ @@ -325,112 +325,112 @@ Pushdown benchmark for decimal 
================================================================================================ OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz Select 1 decimal(9, 2) row (value = 7864320): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ---------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 3601 3623 25 4.4 229.0 1.0X -Parquet Vectorized (Pushdown) 118 122 5 132.9 7.5 30.4X -Native ORC Vectorized 4979 4997 26 3.2 316.6 0.7X -Native ORC Vectorized (Pushdown) 155 159 5 101.7 9.8 23.3X +Parquet Vectorized 4101 4158 32 3.8 260.7 1.0X +Parquet Vectorized (Pushdown) 130 139 8 121.3 8.2 31.6X +Native ORC Vectorized 5011 5072 70 3.1 318.6 0.8X +Native ORC Vectorized (Pushdown) 183 192 6 85.8 11.7 22.4X OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz Select 10% decimal(9, 2) rows (value < 1572864): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 5223 5232 9 3.0 332.0 1.0X -Parquet Vectorized (Pushdown) 2442 2453 13 6.4 155.3 2.1X -Native ORC Vectorized 6650 6651 2 2.4 422.8 0.8X -Native ORC Vectorized (Pushdown) 2777 2781 3 5.7 176.6 1.9X +Parquet Vectorized 5816 5953 135 2.7 369.7 1.0X +Parquet Vectorized (Pushdown) 2571 2634 85 6.1 163.4 2.3X +Native ORC Vectorized 6632 6767 86 2.4 421.6 0.9X +Native ORC Vectorized (Pushdown) 2765 2842 54 5.7 175.8 2.1X OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz Select 50% decimal(9, 2) rows (value < 7864320): Best Time(ms) Avg Time(ms) 
Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 10591 10617 33 1.5 673.3 1.0X -Parquet Vectorized (Pushdown) 10190 10200 8 1.5 647.9 1.0X -Native ORC Vectorized 12103 12126 16 1.3 769.5 0.9X -Native ORC Vectorized (Pushdown) 11524 11533 7 1.4 732.6 0.9X +Parquet Vectorized 11171 11263 158 1.4 710.2 1.0X +Parquet Vectorized (Pushdown) 10693 10906 194 1.5 679.9 1.0X +Native ORC Vectorized 12338 12951 544 1.3 784.5 0.9X +Native ORC Vectorized (Pushdown) 11749 11878 75 1.3 747.0 1.0X OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz Select 90% decimal(9, 2) rows (value < 14155776): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 11946 12068 197 1.3 759.5 1.0X -Parquet Vectorized (Pushdown) 12013 12030 15 1.3 763.7 1.0X -Native ORC Vectorized 13455 13478 19 1.2 855.5 0.9X -Native ORC Vectorized (Pushdown) 13553 13577 21 1.2 861.7 0.9X +Parquet Vectorized 12740 12872 185 1.2 810.0 1.0X +Parquet Vectorized (Pushdown) 12706 12928 130 1.2 807.8 1.0X +Native ORC Vectorized 13955 14143 150 1.1 887.2 0.9X +Native ORC Vectorized (Pushdown) 13893 14274 434 1.1 883.3 0.9X OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz Select 1 decimal(18, 2) row (value = 7864320): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ----------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 3789 3802 14 4.2 240.9 1.0X -Parquet Vectorized (Pushdown) 118 123 5 133.3 7.5 32.1X -Native 
ORC Vectorized 4973 4985 25 3.2 316.2 0.8X -Native ORC Vectorized (Pushdown) 150 155 5 104.9 9.5 25.3X +Parquet Vectorized 4293 4339 40 3.7 273.0 1.0X +Parquet Vectorized (Pushdown) 136 148 6 116.0 8.6 31.7X +Native ORC Vectorized 5006 5173 113 3.1 318.3 0.9X +Native ORC Vectorized (Pushdown) 174 187 11 90.6 11.0 24.7X OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz Select 10% decimal(18, 2) rows (value < 1572864): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 4605 4623 20 3.4 292.8 1.0X -Parquet Vectorized (Pushdown) 1309 1320 20 12.0 83.2 3.5X -Native ORC Vectorized 5829 5836 8 2.7 370.6 0.8X -Native ORC Vectorized (Pushdown) 1468 1473 4 10.7 93.3 3.1X +Parquet Vectorized 5357 5437 70 2.9 340.6 1.0X +Parquet Vectorized (Pushdown) 1406 1444 36 11.2 89.4 3.8X +Native ORC Vectorized 5919 5977 57 2.7 376.3 0.9X +Native ORC Vectorized (Pushdown) 1490 1518 17 10.6 94.7 3.6X OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz Select 50% decimal(18, 2) rows (value < 7864320): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 7862 7871 8 2.0 499.8 1.0X -Parquet Vectorized (Pushdown) 6056 6064 9 2.6 385.0 1.3X -Native ORC Vectorized 9165 9179 11 1.7 582.7 0.9X -Native ORC Vectorized (Pushdown) 6774 6781 6 2.3 430.7 1.2X +Parquet Vectorized 8605 8935 268 1.8 547.1 1.0X +Parquet Vectorized (Pushdown) 6485 6558 62 2.4 412.3 1.3X +Native ORC Vectorized 9255 9393 101 1.7 588.4 0.9X +Native ORC Vectorized (Pushdown) 6975 7086 82 2.3 443.5 
1.2X OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz Select 90% decimal(18, 2) rows (value < 14155776): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 11016 11026 8 1.4 700.4 1.0X -Parquet Vectorized (Pushdown) 10730 10744 12 1.5 682.2 1.0X -Native ORC Vectorized 12419 12427 9 1.3 789.6 0.9X -Native ORC Vectorized (Pushdown) 12015 12030 12 1.3 763.9 0.9X +Parquet Vectorized 11888 12166 209 1.3 755.8 1.0X +Parquet Vectorized (Pushdown) 11651 11763 143 1.4 740.7 1.0X +Native ORC Vectorized 12875 13100 202 1.2 818.6 0.9X +Native ORC Vectorized (Pushdown) 12226 12375 98 1.3 777.3 1.0X OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz Select 1 decimal(38, 2) row (value = 7864320): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ----------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 5415 5429 17 2.9 344.3 1.0X -Parquet Vectorized (Pushdown) 127 132 5 123.9 8.1 42.7X -Native ORC Vectorized 4982 4998 14 3.2 316.7 1.1X -Native ORC Vectorized (Pushdown) 152 156 4 103.2 9.7 35.5X +Parquet Vectorized 6207 6501 189 2.5 394.7 1.0X +Parquet Vectorized (Pushdown) 140 154 8 112.0 8.9 44.2X +Native ORC Vectorized 4940 5018 57 3.2 314.1 1.3X +Native ORC Vectorized (Pushdown) 163 182 13 96.7 10.3 38.1X OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz Select 10% decimal(38, 2) rows (value < 1572864): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
-------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 6415 6445 31 2.5 407.9 1.0X -Parquet Vectorized (Pushdown) 1661 1663 3 9.5 105.6 3.9X -Native ORC Vectorized 5923 5933 7 2.7 376.6 1.1X -Native ORC Vectorized (Pushdown) 1593 1594 1 9.9 101.3 4.0X +Parquet Vectorized 7271 7460 198 2.2 462.3 1.0X +Parquet Vectorized (Pushdown) 1813 1863 43 8.7 115.3 4.0X +Native ORC Vectorized 5991 6109 105 2.6 380.9 1.2X +Native ORC Vectorized (Pushdown) 1632 1667 25 9.6 103.8 4.5X OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz Select 50% decimal(38, 2) rows (value < 7864320): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 10410 10426 16 1.5 661.9 1.0X -Parquet Vectorized (Pushdown) 7809 7819 9 2.0 496.5 1.3X -Native ORC Vectorized 9825 9834 9 1.6 624.6 1.1X -Native ORC Vectorized (Pushdown) 7419 7426 9 2.1 471.7 1.4X +Parquet Vectorized 11381 11538 95 1.4 723.6 1.0X +Parquet Vectorized (Pushdown) 8308 8395 77 1.9 528.2 1.4X +Native ORC Vectorized 10033 10136 66 1.6 637.9 1.1X +Native ORC Vectorized (Pushdown) 7558 7632 44 2.1 480.5 1.5X OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz Select 90% decimal(38, 2) rows (value < 14155776): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 14383 14398 16 1.1 914.5 1.0X -Parquet Vectorized (Pushdown) 13876 13900 14 1.1 882.2 1.0X -Native ORC Vectorized 13625 13638 12 1.2 866.2 1.1X -Native ORC 
Vectorized (Pushdown) 13194 13226 19 1.2 838.9 1.1X +Parquet Vectorized 15348 15725 247 1.0 975.8 1.0X +Parquet Vectorized (Pushdown) 14989 15656 537 1.0 953.0 1.0X +Native ORC Vectorized 13974 14162 165 1.1 888.4 1.1X +Native ORC Vectorized (Pushdown) 13661 13746 63 1.2 868.6 1.1X ================================================================================================ @@ -438,112 +438,112 @@ Pushdown benchmark for InSet -> InFilters ================================================================================================ OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz InSet -> InFilters (values count: 5, distribution: 10): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 8086 8129 75 1.9 514.1 1.0X -Parquet Vectorized (Pushdown) 472 484 9 33.3 30.0 17.1X -Native ORC Vectorized 6513 6533 32 2.4 414.1 1.2X -Native ORC Vectorized (Pushdown) 443 472 50 35.5 28.1 18.3X +Parquet Vectorized 8940 9114 120 1.8 568.4 1.0X +Parquet Vectorized (Pushdown) 564 595 31 27.9 35.8 15.9X +Native ORC Vectorized 6558 6635 64 2.4 417.0 1.4X +Native ORC Vectorized (Pushdown) 517 532 10 30.4 32.9 17.3X OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz InSet -> InFilters (values count: 5, distribution: 50): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 8079 8103 22 1.9 513.6 1.0X -Parquet Vectorized (Pushdown) 468 476 6 33.6 29.7 17.3X -Native ORC Vectorized 6487 6495 5 2.4 412.4 1.2X -Native ORC Vectorized (Pushdown) 
446 454 6 35.3 28.4 18.1X +Parquet Vectorized 8805 8989 133 1.8 559.8 1.0X +Parquet Vectorized (Pushdown) 562 579 16 28.0 35.7 15.7X +Native ORC Vectorized 6417 6562 110 2.5 408.0 1.4X +Native ORC Vectorized (Pushdown) 540 573 24 29.2 34.3 16.3X OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz InSet -> InFilters (values count: 5, distribution: 90): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 8016 8028 8 2.0 509.7 1.0X -Parquet Vectorized (Pushdown) 473 477 5 33.3 30.1 17.0X -Native ORC Vectorized 6488 6492 3 2.4 412.5 1.2X -Native ORC Vectorized (Pushdown) 448 456 5 35.1 28.5 17.9X +Parquet Vectorized 9052 9188 92 1.7 575.5 1.0X +Parquet Vectorized (Pushdown) 580 604 15 27.1 36.8 15.6X +Native ORC Vectorized 6528 6596 91 2.4 415.1 1.4X +Native ORC Vectorized (Pushdown) 561 571 10 28.0 35.7 16.1X OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz InSet -> InFilters (values count: 10, distribution: 10): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 8057 8068 8 2.0 512.2 1.0X -Parquet Vectorized (Pushdown) 484 491 5 32.5 30.8 16.7X -Native ORC Vectorized 6519 6525 5 2.4 414.5 1.2X -Native ORC Vectorized (Pushdown) 476 481 4 33.1 30.2 16.9X +Parquet Vectorized 8904 9049 119 1.8 566.1 1.0X +Parquet Vectorized (Pushdown) 593 614 21 26.5 37.7 15.0X +Native ORC Vectorized 6535 6601 60 2.4 415.5 1.4X +Native ORC Vectorized (Pushdown) 544 566 14 28.9 34.6 16.4X OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 
5.13.0-1031-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz InSet -> InFilters (values count: 10, distribution: 50): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 8126 8147 20 1.9 516.6 1.0X -Parquet Vectorized (Pushdown) 495 501 5 31.8 31.5 16.4X -Native ORC Vectorized 6501 6508 8 2.4 413.3 1.2X -Native ORC Vectorized (Pushdown) 470 473 3 33.5 29.9 17.3X +Parquet Vectorized 8888 9037 130 1.8 565.1 1.0X +Parquet Vectorized (Pushdown) 585 621 31 26.9 37.2 15.2X +Native ORC Vectorized 6578 6614 40 2.4 418.2 1.4X +Native ORC Vectorized (Pushdown) 550 568 16 28.6 34.9 16.2X OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz InSet -> InFilters (values count: 10, distribution: 90): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 8088 8096 6 1.9 514.2 1.0X -Parquet Vectorized (Pushdown) 498 500 2 31.6 31.7 16.2X -Native ORC Vectorized 6489 6499 8 2.4 412.6 1.2X -Native ORC Vectorized (Pushdown) 467 472 5 33.7 29.7 17.3X +Parquet Vectorized 8966 9164 164 1.8 570.0 1.0X +Parquet Vectorized (Pushdown) 596 616 19 26.4 37.9 15.0X +Native ORC Vectorized 6630 6661 58 2.4 421.5 1.4X +Native ORC Vectorized (Pushdown) 533 572 40 29.5 33.9 16.8X OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz InSet -> InFilters (values count: 50, distribution: 10): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
--------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 8388 8396 7 1.9 533.3 1.0X -Parquet Vectorized (Pushdown) 1251 1259 7 12.6 79.5 6.7X -Native ORC Vectorized 6763 6768 3 2.3 430.0 1.2X -Native ORC Vectorized (Pushdown) 603 608 4 26.1 38.3 13.9X +Parquet Vectorized 9098 9204 96 1.7 578.4 1.0X +Parquet Vectorized (Pushdown) 1295 1311 9 12.1 82.3 7.0X +Native ORC Vectorized 6829 6875 40 2.3 434.2 1.3X +Native ORC Vectorized (Pushdown) 689 702 11 22.8 43.8 13.2X OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz InSet -> InFilters (values count: 50, distribution: 50): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 8422 8459 25 1.9 535.5 1.0X -Parquet Vectorized (Pushdown) 4326 4329 3 3.6 275.0 1.9X -Native ORC Vectorized 6833 6839 4 2.3 434.4 1.2X -Native ORC Vectorized (Pushdown) 632 636 5 24.9 40.2 13.3X +Parquet Vectorized 9131 9294 137 1.7 580.5 1.0X +Parquet Vectorized (Pushdown) 4731 4793 42 3.3 300.8 1.9X +Native ORC Vectorized 6771 7002 161 2.3 430.5 1.3X +Native ORC Vectorized (Pushdown) 732 755 16 21.5 46.6 12.5X OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz InSet -> InFilters (values count: 50, distribution: 90): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 8395 8399 3 1.9 533.8 1.0X -Parquet Vectorized (Pushdown) 7601 7780 162 2.1 483.3 1.1X -Native ORC Vectorized 6768 6777 9 2.3 430.3 1.2X 
-Native ORC Vectorized (Pushdown) 638 641 3 24.7 40.6 13.2X +Parquet Vectorized 9416 9534 102 1.7 598.7 1.0X +Parquet Vectorized (Pushdown) 8230 8347 76 1.9 523.3 1.1X +Native ORC Vectorized 6759 6910 101 2.3 429.7 1.4X +Native ORC Vectorized (Pushdown) 739 758 21 21.3 47.0 12.7X OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz InSet -> InFilters (values count: 100, distribution: 10): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ---------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 8731 8759 22 1.8 555.1 1.0X -Parquet Vectorized (Pushdown) 1282 1286 4 12.3 81.5 6.8X -Native ORC Vectorized 6718 6762 38 2.3 427.1 1.3X -Native ORC Vectorized (Pushdown) 724 730 8 21.7 46.0 12.1X +Parquet Vectorized 9065 9179 86 1.7 576.3 1.0X +Parquet Vectorized (Pushdown) 1402 1441 30 11.2 89.1 6.5X +Native ORC Vectorized 6656 6920 179 2.4 423.2 1.4X +Native ORC Vectorized (Pushdown) 871 888 19 18.1 55.3 10.4X OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz InSet -> InFilters (values count: 100, distribution: 50): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ---------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 8738 8745 5 1.8 555.6 1.0X -Parquet Vectorized (Pushdown) 4570 4588 12 3.4 290.5 1.9X -Native ORC Vectorized 6759 6792 21 2.3 429.7 1.3X -Native ORC Vectorized (Pushdown) 853 859 4 18.4 54.2 10.2X +Parquet Vectorized 9036 9109 59 1.7 574.5 1.0X +Parquet Vectorized (Pushdown) 4914 5102 132 3.2 312.4 1.8X +Native ORC Vectorized 6671 6835 103 2.4 424.1 1.4X +Native ORC Vectorized (Pushdown) 892 914 18 17.6 56.7 10.1X 
OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz InSet -> InFilters (values count: 100, distribution: 90): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ---------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 8671 8681 7 1.8 551.3 1.0X -Parquet Vectorized (Pushdown) 7576 7581 6 2.1 481.7 1.1X -Native ORC Vectorized 6710 6720 8 2.3 426.6 1.3X -Native ORC Vectorized (Pushdown) 837 840 3 18.8 53.2 10.4X +Parquet Vectorized 9071 9236 148 1.7 576.7 1.0X +Parquet Vectorized (Pushdown) 8291 8363 82 1.9 527.1 1.1X +Native ORC Vectorized 6702 6763 57 2.3 426.1 1.4X +Native ORC Vectorized (Pushdown) 912 939 21 17.2 58.0 9.9X ================================================================================================ @@ -551,40 +551,40 @@ Pushdown benchmark for tinyint ================================================================================================ OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz Select 1 tinyint row (value = CAST(63 AS tinyint)): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ---------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 3958 3978 26 4.0 251.6 1.0X -Parquet Vectorized (Pushdown) 166 171 6 94.8 10.5 23.9X -Native ORC Vectorized 2992 3000 12 5.3 190.2 1.3X -Native ORC Vectorized (Pushdown) 205 208 3 76.7 13.0 19.3X +Parquet Vectorized 4501 4603 85 3.5 286.2 1.0X +Parquet Vectorized (Pushdown) 191 204 9 82.5 12.1 23.6X +Native ORC Vectorized 3012 3050 39 5.2 191.5 1.5X +Native ORC Vectorized (Pushdown) 239 246 7 65.8 15.2 18.8X OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 
5.13.0-1031-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz Select 10% tinyint rows (value < CAST(12 AS tinyint)): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 4705 4714 11 3.3 299.1 1.0X -Parquet Vectorized (Pushdown) 1235 1239 3 12.7 78.5 3.8X -Native ORC Vectorized 3685 3691 4 4.3 234.3 1.3X -Native ORC Vectorized (Pushdown) 1155 1165 10 13.6 73.4 4.1X +Parquet Vectorized 5123 5194 92 3.1 325.7 1.0X +Parquet Vectorized (Pushdown) 1324 1357 29 11.9 84.2 3.9X +Native ORC Vectorized 3750 3817 88 4.2 238.4 1.4X +Native ORC Vectorized (Pushdown) 1213 1256 34 13.0 77.1 4.2X OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz Select 50% tinyint rows (value < CAST(63 AS tinyint)): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 7695 7703 6 2.0 489.2 1.0X -Parquet Vectorized (Pushdown) 5820 5830 14 2.7 370.0 1.3X -Native ORC Vectorized 6567 6574 4 2.4 417.5 1.2X -Native ORC Vectorized (Pushdown) 5187 5193 5 3.0 329.8 1.5X +Parquet Vectorized 8231 8443 149 1.9 523.3 1.0X +Parquet Vectorized (Pushdown) 6239 6310 69 2.5 396.7 1.3X +Native ORC Vectorized 6820 6854 20 2.3 433.6 1.2X +Native ORC Vectorized (Pushdown) 5337 5457 137 2.9 339.3 1.5X OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz Select 90% tinyint rows (value < CAST(114 AS tinyint)): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
-------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 10714 10724 11 1.5 681.2 1.0X -Parquet Vectorized (Pushdown) 10379 10390 13 1.5 659.9 1.0X -Native ORC Vectorized 9434 9444 10 1.7 599.8 1.1X -Native ORC Vectorized (Pushdown) 9221 9242 15 1.7 586.3 1.2X +Parquet Vectorized 11484 11825 301 1.4 730.1 1.0X +Parquet Vectorized (Pushdown) 11051 11176 114 1.4 702.6 1.0X +Native ORC Vectorized 10438 10593 152 1.5 663.6 1.1X +Native ORC Vectorized (Pushdown) 10286 10635 210 1.5 654.0 1.1X ================================================================================================ @@ -592,112 +592,112 @@ Pushdown benchmark for Timestamp ================================================================================================ OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz Select 1 timestamp stored as INT96 row (value = timestamp_seconds(7864320)): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ----------------------------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 4149 4162 13 3.8 263.8 1.0X -Parquet Vectorized (Pushdown) 4141 4151 9 3.8 263.2 1.0X -Native ORC Vectorized 3080 3091 12 5.1 195.8 1.3X -Native ORC Vectorized (Pushdown) 123 128 4 127.4 7.9 33.6X +Parquet Vectorized 4712 4885 233 3.3 299.6 1.0X +Parquet Vectorized (Pushdown) 4776 4861 52 3.3 303.6 1.0X +Native ORC Vectorized 3160 3191 32 5.0 200.9 1.5X +Native ORC Vectorized (Pushdown) 138 152 10 114.0 8.8 34.2X OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz Select 10% timestamp stored as INT96 rows (value < timestamp_seconds(1572864)): Best Time(ms) Avg Time(ms) 
Stdev(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 4902 4923 13 3.2 311.7 1.0X -Parquet Vectorized (Pushdown) 4894 4899 5 3.2 311.2 1.0X -Native ORC Vectorized 3851 3858 10 4.1 244.8 1.3X -Native ORC Vectorized (Pushdown) 1191 1194 4 13.2 75.7 4.1X +Parquet Vectorized 5432 5491 64 2.9 345.4 1.0X +Parquet Vectorized (Pushdown) 5438 5497 53 2.9 345.8 1.0X +Native ORC Vectorized 3945 4017 70 4.0 250.8 1.4X +Native ORC Vectorized (Pushdown) 1185 1223 34 13.3 75.3 4.6X OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz Select 50% timestamp stored as INT96 rows (value < timestamp_seconds(7864320)): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 8112 8137 15 1.9 515.7 1.0X -Parquet Vectorized (Pushdown) 8092 8108 18 1.9 514.4 1.0X -Native ORC Vectorized 6838 6842 4 2.3 434.7 1.2X -Native ORC Vectorized (Pushdown) 5377 5381 4 2.9 341.8 1.5X +Parquet Vectorized 8704 8899 225 1.8 553.4 1.0X +Parquet Vectorized (Pushdown) 8722 8855 110 1.8 554.5 1.0X +Native ORC Vectorized 6944 7006 52 2.3 441.5 1.3X +Native ORC Vectorized (Pushdown) 5437 5556 110 2.9 345.6 1.6X OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz Select 90% timestamp stored as INT96 rows (value < timestamp_seconds(14155776)): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
--------------------------------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 11222 11234 12 1.4 713.5 1.0X -Parquet Vectorized (Pushdown) 11234 11262 36 1.4 714.2 1.0X -Native ORC Vectorized 9837 9847 11 1.6 625.4 1.1X -Native ORC Vectorized (Pushdown) 9599 9604 5 1.6 610.3 1.2X +Parquet Vectorized 11861 12099 316 1.3 754.1 1.0X +Parquet Vectorized (Pushdown) 11869 12044 192 1.3 754.6 1.0X +Native ORC Vectorized 9796 10128 222 1.6 622.8 1.2X +Native ORC Vectorized (Pushdown) 9715 9798 124 1.6 617.6 1.2X OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz Select 1 timestamp stored as TIMESTAMP_MICROS row (value = timestamp_seconds(7864320)): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ---------------------------------------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 3663 3672 10 4.3 232.9 1.0X -Parquet Vectorized (Pushdown) 115 118 4 137.2 7.3 31.9X -Native ORC Vectorized 3079 3085 5 5.1 195.7 1.2X -Native ORC Vectorized (Pushdown) 122 125 4 128.5 7.8 29.9X +Parquet Vectorized 4193 4262 88 3.8 266.6 1.0X +Parquet Vectorized (Pushdown) 133 144 5 118.1 8.5 31.5X +Native ORC Vectorized 3211 3280 71 4.9 204.2 1.3X +Native ORC Vectorized (Pushdown) 137 149 10 114.5 8.7 30.5X OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz Select 10% timestamp stored as TIMESTAMP_MICROS rows (value < timestamp_seconds(1572864)): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
------------------------------------------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 4466 4476 13 3.5 284.0 1.0X -Parquet Vectorized (Pushdown) 1266 1270 5 12.4 80.5 3.5X -Native ORC Vectorized 3846 3851 5 4.1 244.5 1.2X -Native ORC Vectorized (Pushdown) 1190 1193 3 13.2 75.6 3.8X +Parquet Vectorized 4946 5021 69 3.2 314.5 1.0X +Parquet Vectorized (Pushdown) 1342 1370 26 11.7 85.3 3.7X +Native ORC Vectorized 3936 4022 80 4.0 250.2 1.3X +Native ORC Vectorized (Pushdown) 1196 1226 37 13.1 76.1 4.1X OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz Select 50% timestamp stored as TIMESTAMP_MICROS rows (value < timestamp_seconds(7864320)): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 7632 7648 18 2.1 485.2 1.0X -Parquet Vectorized (Pushdown) 5900 5904 4 2.7 375.1 1.3X -Native ORC Vectorized 6850 6856 7 2.3 435.5 1.1X -Native ORC Vectorized (Pushdown) 5390 5396 7 2.9 342.7 1.4X +Parquet Vectorized 8263 8372 124 1.9 525.3 1.0X +Parquet Vectorized (Pushdown) 6199 6287 63 2.5 394.1 1.3X +Native ORC Vectorized 6922 7010 96 2.3 440.1 1.2X +Native ORC Vectorized (Pushdown) 5373 5525 150 2.9 341.6 1.5X OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz Select 90% timestamp stored as TIMESTAMP_MICROS rows (value < timestamp_seconds(14155776)): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
-------------------------------------------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 10772 10787 14 1.5 684.9 1.0X -Parquet Vectorized (Pushdown) 10425 10436 8 1.5 662.8 1.0X -Native ORC Vectorized 9836 9850 11 1.6 625.3 1.1X -Native ORC Vectorized (Pushdown) 9578 9583 5 1.6 609.0 1.1X +Parquet Vectorized 11404 11658 208 1.4 725.0 1.0X +Parquet Vectorized (Pushdown) 11109 11416 213 1.4 706.3 1.0X +Native ORC Vectorized 9997 10105 126 1.6 635.6 1.1X +Native ORC Vectorized (Pushdown) 9688 9745 70 1.6 615.9 1.2X OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz Select 1 timestamp stored as TIMESTAMP_MILLIS row (value = timestamp_seconds(7864320)): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ---------------------------------------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 3702 3713 8 4.2 235.4 1.0X -Parquet Vectorized (Pushdown) 114 117 3 138.1 7.2 32.5X -Native ORC Vectorized 3081 3089 8 5.1 195.9 1.2X -Native ORC Vectorized (Pushdown) 121 125 4 129.6 7.7 30.5X +Parquet Vectorized 4231 4250 22 3.7 269.0 1.0X +Parquet Vectorized (Pushdown) 127 141 6 123.4 8.1 33.2X +Native ORC Vectorized 3160 3214 48 5.0 200.9 1.3X +Native ORC Vectorized (Pushdown) 136 148 8 115.3 8.7 31.0X OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz Select 10% timestamp stored as TIMESTAMP_MILLIS rows (value < timestamp_seconds(1572864)): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
------------------------------------------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 4489 4494 9 3.5 285.4 1.0X -Parquet Vectorized (Pushdown) 1270 1282 14 12.4 80.7 3.5X -Native ORC Vectorized 3846 3856 7 4.1 244.5 1.2X -Native ORC Vectorized (Pushdown) 1187 1190 4 13.3 75.5 3.8X +Parquet Vectorized 5097 5245 86 3.1 324.1 1.0X +Parquet Vectorized (Pushdown) 1326 1355 36 11.9 84.3 3.8X +Native ORC Vectorized 3933 4015 73 4.0 250.1 1.3X +Native ORC Vectorized (Pushdown) 1184 1220 31 13.3 75.3 4.3X OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz Select 50% timestamp stored as TIMESTAMP_MILLIS rows (value < timestamp_seconds(7864320)): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 7663 7679 17 2.1 487.2 1.0X -Parquet Vectorized (Pushdown) 5877 5891 11 2.7 373.6 1.3X -Native ORC Vectorized 6847 6850 3 2.3 435.3 1.1X -Native ORC Vectorized (Pushdown) 5373 5382 8 2.9 341.6 1.4X +Parquet Vectorized 8334 8468 95 1.9 529.9 1.0X +Parquet Vectorized (Pushdown) 6224 6382 151 2.5 395.7 1.3X +Native ORC Vectorized 6877 6965 63 2.3 437.2 1.2X +Native ORC Vectorized (Pushdown) 5513 5576 43 2.9 350.5 1.5X OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz Select 90% timestamp stored as TIMESTAMP_MILLIS rows (value < timestamp_seconds(14155776)): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
-------------------------------------------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 10752 10862 206 1.5 683.6 1.0X -Parquet Vectorized (Pushdown) 10471 10479 7 1.5 665.7 1.0X -Native ORC Vectorized 9861 9868 9 1.6 627.0 1.1X -Native ORC Vectorized (Pushdown) 9604 9614 7 1.6 610.6 1.1X +Parquet Vectorized 11367 11586 137 1.4 722.7 1.0X +Parquet Vectorized (Pushdown) 11219 11614 252 1.4 713.3 1.0X +Native ORC Vectorized 9992 10096 75 1.6 635.3 1.1X +Native ORC Vectorized (Pushdown) 9754 9909 128 1.6 620.2 1.2X ================================================================================================ @@ -705,30 +705,30 @@ Pushdown benchmark with many filters ================================================================================================ OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz Select 1 row with 1 filters: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Parquet Vectorized 151 155 4 0.0 150695208.0 1.0X -Parquet Vectorized (Pushdown) 153 157 5 0.0 152853214.0 1.0X -Native ORC Vectorized 144 154 9 0.0 143923196.0 1.0X -Native ORC Vectorized (Pushdown) 152 159 8 0.0 152359467.0 1.0X +Parquet Vectorized 148 159 9 0.0 147975732.0 1.0X +Parquet Vectorized (Pushdown) 158 167 7 0.0 158049002.0 0.9X +Native ORC Vectorized 142 154 9 0.0 142429704.0 1.0X +Native ORC Vectorized (Pushdown) 151 165 9 0.0 151259900.0 1.0X OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz Select 1 row with 250 filters: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
------------------------------------------------------------------------------------------------------------------------ -Parquet Vectorized 1582 1614 31 0.0 1581508551.0 1.0X -Parquet Vectorized (Pushdown) 1626 1664 73 0.0 1625795573.0 1.0X -Native ORC Vectorized 1563 1586 23 0.0 1563053391.0 1.0X -Native ORC Vectorized (Pushdown) 1583 1619 64 0.0 1583075187.0 1.0X +Parquet Vectorized 1625 1691 53 0.0 1624672023.0 1.0X +Parquet Vectorized (Pushdown) 1652 1729 100 0.0 1652020269.0 1.0X +Native ORC Vectorized 1556 1668 131 0.0 1556322718.0 1.0X +Native ORC Vectorized (Pushdown) 1601 1670 46 0.0 1601166707.0 1.0X OpenJDK 64-Bit Server VM 17.0.3+7-LTS on Linux 5.13.0-1031-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz Select 1 row with 500 filters: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Parquet Vectorized 7027 7227 148 0.0 7026812316.0 1.0X -Parquet Vectorized (Pushdown) 7197 7404 199 0.0 7197391646.0 1.0X -Native ORC Vectorized 7143 7258 117 0.0 7143461935.0 1.0X -Native ORC Vectorized (Pushdown) 7137 7323 228 0.0 7137286231.0 1.0X +Parquet Vectorized 6903 7162 290 0.0 6903444982.0 1.0X +Parquet Vectorized (Pushdown) 7036 7229 174 0.0 7036411944.0 1.0X +Native ORC Vectorized 7183 7386 198 0.0 7182804267.0 1.0X +Native ORC Vectorized (Pushdown) 6968 7222 232 0.0 6968043200.0 1.0X diff --git a/sql/core/benchmarks/FilterPushdownBenchmark-results.txt b/sql/core/benchmarks/FilterPushdownBenchmark-results.txt index 24e1594e56..3847f92076 100644 --- a/sql/core/benchmarks/FilterPushdownBenchmark-results.txt +++ b/sql/core/benchmarks/FilterPushdownBenchmark-results.txt @@ -3,166 +3,166 @@ Pushdown for many distinct value case ================================================================================================ OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 
5.13.0-1031-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz Select 0 string row (value IS NULL): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Parquet Vectorized 10541 10920 716 1.5 670.2 1.0X -Parquet Vectorized (Pushdown) 616 639 29 25.5 39.2 17.1X -Native ORC Vectorized 6367 7100 1513 2.5 404.8 1.7X -Native ORC Vectorized (Pushdown) 523 557 47 30.1 33.3 20.1X +Parquet Vectorized 10502 11317 1140 1.5 667.7 1.0X +Parquet Vectorized (Pushdown) 788 833 32 20.0 50.1 13.3X +Native ORC Vectorized 7708 8902 926 2.0 490.1 1.4X +Native ORC Vectorized (Pushdown) 633 700 96 24.8 40.3 16.6X OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz Select 0 string row ('7864320' < value < '7864320'): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ----------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 10524 10666 134 1.5 669.1 1.0X -Parquet Vectorized (Pushdown) 585 609 20 26.9 37.2 18.0X -Native ORC Vectorized 6429 6511 77 2.4 408.7 1.6X -Native ORC Vectorized (Pushdown) 524 551 32 30.0 33.3 20.1X +Parquet Vectorized 10574 10751 229 1.5 672.3 1.0X +Parquet Vectorized (Pushdown) 740 765 38 21.3 47.0 14.3X +Native ORC Vectorized 7867 7920 60 2.0 500.2 1.3X +Native ORC Vectorized (Pushdown) 622 673 74 25.3 39.5 17.0X OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz Select 1 string row (value = '7864320'): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
------------------------------------------------------------------------------------------------------------------------ -Parquet Vectorized 10295 10478 207 1.5 654.6 1.0X -Parquet Vectorized (Pushdown) 553 583 20 28.4 35.2 18.6X -Native ORC Vectorized 6201 6450 178 2.5 394.2 1.7X -Native ORC Vectorized (Pushdown) 502 528 39 31.4 31.9 20.5X +Parquet Vectorized 10504 10582 60 1.5 667.8 1.0X +Parquet Vectorized (Pushdown) 731 748 22 21.5 46.5 14.4X +Native ORC Vectorized 7693 7792 92 2.0 489.1 1.4X +Native ORC Vectorized (Pushdown) 603 639 40 26.1 38.4 17.4X OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz Select 1 string row (value <=> '7864320'): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 10360 10471 112 1.5 658.7 1.0X -Parquet Vectorized (Pushdown) 550 571 17 28.6 35.0 18.8X -Native ORC Vectorized 6465 6472 8 2.4 411.0 1.6X -Native ORC Vectorized (Pushdown) 520 571 62 30.2 33.1 19.9X +Parquet Vectorized 10498 10644 159 1.5 667.4 1.0X +Parquet Vectorized (Pushdown) 718 749 21 21.9 45.6 14.6X +Native ORC Vectorized 7883 7990 117 2.0 501.2 1.3X +Native ORC Vectorized (Pushdown) 598 629 35 26.3 38.0 17.6X OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz Select 1 string row ('7864320' <= value <= '7864320'): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 10401 10562 106 1.5 661.3 1.0X -Parquet Vectorized (Pushdown) 544 568 24 28.9 34.6 19.1X -Native ORC Vectorized 6475 6544 65 2.4 411.7 1.6X -Native ORC Vectorized 
(Pushdown) 508 533 31 31.0 32.3 20.5X +Parquet Vectorized 10605 10620 17 1.5 674.2 1.0X +Parquet Vectorized (Pushdown) 721 754 33 21.8 45.9 14.7X +Native ORC Vectorized 7920 8092 106 2.0 503.5 1.3X +Native ORC Vectorized (Pushdown) 589 622 49 26.7 37.5 18.0X OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz Select all string rows (value IS NOT NULL): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 17904 18027 125 0.9 1138.3 1.0X -Parquet Vectorized (Pushdown) 17931 17979 48 0.9 1140.0 1.0X -Native ORC Vectorized 12862 13722 489 1.2 817.7 1.4X -Native ORC Vectorized (Pushdown) 14091 14156 55 1.1 895.9 1.3X +Parquet Vectorized 19184 19448 194 0.8 1219.7 1.0X +Parquet Vectorized (Pushdown) 19372 19480 109 0.8 1231.7 1.0X +Native ORC Vectorized 16930 17029 85 0.9 1076.4 1.1X +Native ORC Vectorized (Pushdown) 17083 17275 148 0.9 1086.1 1.1X OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz Select 0 int row (value IS NULL): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Parquet Vectorized 9693 10784 1430 1.6 616.2 1.0X -Parquet Vectorized (Pushdown) 538 555 20 29.2 34.2 18.0X -Native ORC Vectorized 5904 6535 1244 2.7 375.4 1.6X -Native ORC Vectorized (Pushdown) 484 531 67 32.5 30.8 20.0X +Parquet Vectorized 10012 11649 NaN 1.6 636.6 1.0X +Parquet Vectorized (Pushdown) 697 743 43 22.6 44.3 14.4X +Native ORC Vectorized 7143 8211 NaN 2.2 454.1 1.4X +Native ORC Vectorized (Pushdown) 572 599 51 27.5 36.4 17.5X OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 
5.13.0-1031-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz Select 0 int row (7864320 < value < 7864320): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ---------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 9731 10023 498 1.6 618.7 1.0X -Parquet Vectorized (Pushdown) 549 584 33 28.7 34.9 17.7X -Native ORC Vectorized 5916 5945 19 2.7 376.2 1.6X -Native ORC Vectorized (Pushdown) 482 510 40 32.7 30.6 20.2X +Parquet Vectorized 9786 9824 29 1.6 622.2 1.0X +Parquet Vectorized (Pushdown) 698 719 18 22.5 44.4 14.0X +Native ORC Vectorized 7148 7207 55 2.2 454.5 1.4X +Native ORC Vectorized (Pushdown) 582 606 47 27.0 37.0 16.8X OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz Select 1 int row (value = 7864320): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Parquet Vectorized 9773 10040 468 1.6 621.4 1.0X -Parquet Vectorized (Pushdown) 546 550 6 28.8 34.7 17.9X -Native ORC Vectorized 5765 5958 110 2.7 366.5 1.7X -Native ORC Vectorized (Pushdown) 477 535 49 33.0 30.3 20.5X +Parquet Vectorized 9916 10022 108 1.6 630.4 1.0X +Parquet Vectorized (Pushdown) 720 729 7 21.9 45.8 13.8X +Native ORC Vectorized 7196 7237 33 2.2 457.5 1.4X +Native ORC Vectorized (Pushdown) 572 601 44 27.5 36.4 17.3X OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz Select 1 int row (value <=> 7864320): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
------------------------------------------------------------------------------------------------------------------------ -Parquet Vectorized 9747 10046 494 1.6 619.7 1.0X -Parquet Vectorized (Pushdown) 541 551 16 29.1 34.4 18.0X -Native ORC Vectorized 5983 6001 17 2.6 380.4 1.6X -Native ORC Vectorized (Pushdown) 475 517 46 33.1 30.2 20.5X +Parquet Vectorized 9896 10040 144 1.6 629.2 1.0X +Parquet Vectorized (Pushdown) 699 715 22 22.5 44.4 14.2X +Native ORC Vectorized 7170 7190 21 2.2 455.9 1.4X +Native ORC Vectorized (Pushdown) 565 590 37 27.8 35.9 17.5X OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz Select 1 int row (7864320 <= value <= 7864320): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------ -Parquet Vectorized 9721 9838 98 1.6 618.1 1.0X -Parquet Vectorized (Pushdown) 548 564 15 28.7 34.9 17.7X -Native ORC Vectorized 5975 5982 6 2.6 379.9 1.6X -Native ORC Vectorized (Pushdown) 479 500 40 32.9 30.4 20.3X +Parquet Vectorized 9961 10366 833 1.6 633.3 1.0X +Parquet Vectorized (Pushdown) 699 723 25 22.5 44.5 14.2X +Native ORC Vectorized 7241 7357 100 2.2 460.4 1.4X +Native ORC Vectorized (Pushdown) 576 614 54 27.3 36.6 17.3X OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz Select 1 int row (7864319 < value < 7864321): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ---------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 9267 9697 243 1.7 589.2 1.0X -Parquet Vectorized (Pushdown) 543 556 15 28.9 34.5 17.1X -Native ORC Vectorized 5821 5963 85 2.7 370.1 1.6X -Native ORC Vectorized (Pushdown) 477 
525 49 33.0 30.3 19.4X +Parquet Vectorized 9913 10057 108 1.6 630.2 1.0X +Parquet Vectorized (Pushdown) 710 734 23 22.2 45.1 14.0X +Native ORC Vectorized 7267 7383 93 2.2 462.0 1.4X +Native ORC Vectorized (Pushdown) 583 605 37 27.0 37.0 17.0X OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz Select 10% int rows (value < 1572864): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Parquet Vectorized 10522 10682 246 1.5 669.0 1.0X -Parquet Vectorized (Pushdown) 2111 2214 61 7.5 134.2 5.0X -Native ORC Vectorized 6657 6731 52 2.4 423.3 1.6X -Native ORC Vectorized (Pushdown) 1733 1813 72 9.1 110.2 6.1X +Parquet Vectorized 10884 10955 60 1.4 692.0 1.0X +Parquet Vectorized (Pushdown) 2565 2602 34 6.1 163.1 4.2X +Native ORC Vectorized 8160 8182 14 1.9 518.8 1.3X +Native ORC Vectorized (Pushdown) 2214 2246 32 7.1 140.8 4.9X OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz Select 50% int rows (value < 7864320): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Parquet Vectorized 13171 13221 75 1.2 837.4 1.0X -Parquet Vectorized (Pushdown) 8576 8601 17 1.8 545.2 1.5X -Native ORC Vectorized 8951 9351 230 1.8 569.1 1.5X -Native ORC Vectorized (Pushdown) 6567 6735 97 2.4 417.5 2.0X +Parquet Vectorized 14256 14384 134 1.1 906.4 1.0X +Parquet Vectorized (Pushdown) 9662 9761 95 1.6 614.3 1.5X +Native ORC Vectorized 11647 11695 38 1.4 740.5 1.2X +Native ORC Vectorized (Pushdown) 8361 8412 41 1.9 531.6 1.7X OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure -Intel(R) Xeon(R) 
Platinum 8272CL CPU @ 2.60GHz +Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz Select 90% int rows (value < 14155776): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Parquet Vectorized 14256 15569 737 1.1 906.4 1.0X -Parquet Vectorized (Pushdown) 14932 14996 87 1.1 949.3 1.0X -Native ORC Vectorized 12178 12304 92 1.3 774.2 1.2X -Native ORC Vectorized (Pushdown) 11609 11719 94 1.4 738.1 1.2X +Parquet Vectorized 17686 17923 412 0.9 1124.5 1.0X +Parquet Vectorized (Pushdown) 16762 16826 62 0.9 1065.7 1.1X +Native ORC Vectorized 15005 15205 201 1.0 954.0 1.2X +Native ORC Vectorized (Pushdown) 14622 14727 72 1.1 929.6 1.2X OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz Select all int rows (value IS NOT NULL): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Parquet Vectorized 16179 16414 134 1.0 1028.6 1.0X -Parquet Vectorized (Pushdown) 16469 16502 35 1.0 1047.0 1.0X -Native ORC Vectorized 12527 12717 170 1.3 796.5 1.3X -Native ORC Vectorized (Pushdown) 12259 12758 324 1.3 779.4 1.3X +Parquet Vectorized 18135 18382 153 0.9 1153.0 1.0X +Parquet Vectorized (Pushdown) 18395 18490 89 0.9 1169.5 1.0X +Native ORC Vectorized 15727 15782 38 1.0 999.9 1.2X +Native ORC Vectorized (Pushdown) 15910 15995 56 1.0 1011.6 1.1X OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz Select all int rows (value > -1): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
------------------------------------------------------------------------------------------------------------------------ -Parquet Vectorized 16485 16519 32 1.0 1048.1 1.0X -Parquet Vectorized (Pushdown) 16313 16530 155 1.0 1037.1 1.0X -Native ORC Vectorized 12357 12592 147 1.3 785.6 1.3X -Native ORC Vectorized (Pushdown) 12212 12625 237 1.3 776.4 1.3X +Parquet Vectorized 18432 18504 58 0.9 1171.9 1.0X +Parquet Vectorized (Pushdown) 18403 18595 193 0.9 1170.0 1.0X +Native ORC Vectorized 15687 15849 112 1.0 997.3 1.2X +Native ORC Vectorized (Pushdown) 15988 16041 58 1.0 1016.5 1.2X OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz Select all int rows (value != -1): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Parquet Vectorized 14628 15501 735 1.1 930.0 1.0X -Parquet Vectorized (Pushdown) 16459 16488 34 1.0 1046.5 0.9X -Native ORC Vectorized 12628 12723 60 1.2 802.9 1.2X -Native ORC Vectorized (Pushdown) 11419 12132 626 1.4 726.0 1.3X +Parquet Vectorized 18451 18492 33 0.9 1173.1 1.0X +Parquet Vectorized (Pushdown) 18511 18576 48 0.8 1176.9 1.0X +Native ORC Vectorized 15835 15890 61 1.0 1006.8 1.2X +Native ORC Vectorized (Pushdown) 16154 16205 66 1.0 1027.0 1.1X ================================================================================================ @@ -170,58 +170,58 @@ Pushdown for few distinct value case (use dictionary encoding) ================================================================================================ OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz Select 0 distinct string row (value IS NULL): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
---------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 9249 9597 435 1.7 588.0 1.0X -Parquet Vectorized (Pushdown) 471 484 26 33.4 29.9 19.6X -Native ORC Vectorized 7615 8225 1196 2.1 484.1 1.2X -Native ORC Vectorized (Pushdown) 872 934 66 18.0 55.4 10.6X +Parquet Vectorized 9541 9928 737 1.6 606.6 1.0X +Parquet Vectorized (Pushdown) 622 649 18 25.3 39.6 15.3X +Native ORC Vectorized 9400 10301 NaN 1.7 597.7 1.0X +Native ORC Vectorized (Pushdown) 1049 1120 70 15.0 66.7 9.1X OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz Select 0 distinct string row ('100' < value < '100'): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------------ -Parquet Vectorized 9078 9471 226 1.7 577.2 1.0X -Parquet Vectorized (Pushdown) 478 490 15 32.9 30.4 19.0X -Native ORC Vectorized 7763 7823 60 2.0 493.6 1.2X -Native ORC Vectorized (Pushdown) 816 892 77 19.3 51.9 11.1X +Parquet Vectorized 9712 9777 93 1.6 617.5 1.0X +Parquet Vectorized (Pushdown) 641 651 10 24.5 40.7 15.2X +Native ORC Vectorized 9698 9800 62 1.6 616.6 1.0X +Native ORC Vectorized (Pushdown) 1042 1089 51 15.1 66.3 9.3X OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz Select 1 distinct string row (value = '100'): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ---------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 8486 9028 394 1.9 539.5 1.0X -Parquet Vectorized (Pushdown) 549 563 13 28.6 34.9 15.4X -Native ORC Vectorized 7788 7844 80 2.0 495.1 1.1X -Native ORC 
Vectorized (Pushdown) 926 987 46 17.0 58.8 9.2X +Parquet Vectorized 9499 9571 41 1.7 604.0 1.0X +Parquet Vectorized (Pushdown) 689 705 25 22.8 43.8 13.8X +Native ORC Vectorized 9623 9708 65 1.6 611.8 1.0X +Native ORC Vectorized (Pushdown) 1104 1166 54 14.3 70.2 8.6X OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz Select 1 distinct string row (value <=> '100'): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------ -Parquet Vectorized 9384 10306 NaN 1.7 596.6 1.0X -Parquet Vectorized (Pushdown) 482 521 32 32.6 30.6 19.5X -Native ORC Vectorized 7479 7771 167 2.1 475.5 1.3X -Native ORC Vectorized (Pushdown) 922 946 38 17.1 58.6 10.2X +Parquet Vectorized 9591 9663 60 1.6 609.8 1.0X +Parquet Vectorized (Pushdown) 677 703 26 23.2 43.0 14.2X +Native ORC Vectorized 9687 9746 57 1.6 615.9 1.0X +Native ORC Vectorized (Pushdown) 1113 1166 59 14.1 70.8 8.6X OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz Select 1 distinct string row ('100' <= value <= '100'): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 8984 9473 301 1.8 571.2 1.0X -Parquet Vectorized (Pushdown) 492 541 29 32.0 31.3 18.3X -Native ORC Vectorized 7459 7812 198 2.1 474.2 1.2X -Native ORC Vectorized (Pushdown) 928 990 79 17.0 59.0 9.7X +Parquet Vectorized 9670 9765 132 1.6 614.8 1.0X +Parquet Vectorized (Pushdown) 690 715 15 22.8 43.9 14.0X +Native ORC Vectorized 9858 10009 105 1.6 626.7 1.0X +Native ORC Vectorized (Pushdown) 1156 1222 50 13.6 73.5 8.4X OpenJDK 64-Bit Server VM 
1.8.0_332-b09 on Linux 5.13.0-1031-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz Select all distinct string rows (value IS NOT NULL): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ----------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 17691 17789 84 0.9 1124.7 1.0X -Parquet Vectorized (Pushdown) 17775 17863 72 0.9 1130.1 1.0X -Native ORC Vectorized 15730 15964 203 1.0 1000.1 1.1X -Native ORC Vectorized (Pushdown) 16405 16521 106 1.0 1043.0 1.1X +Parquet Vectorized 20113 20305 169 0.8 1278.7 1.0X +Parquet Vectorized (Pushdown) 20303 20457 243 0.8 1290.8 1.0X +Native ORC Vectorized 20123 20316 238 0.8 1279.4 1.0X +Native ORC Vectorized (Pushdown) 20551 20724 294 0.8 1306.6 1.0X ================================================================================================ @@ -229,31 +229,31 @@ Pushdown benchmark for StringStartsWith ================================================================================================ OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz StringStartsWith filter: (value like '10%'): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 11328 12986 1138 1.4 720.2 1.0X -Parquet Vectorized (Pushdown) 1315 1387 74 12.0 83.6 8.6X -Native ORC Vectorized 6337 7318 NaN 2.5 402.9 1.8X -Native ORC Vectorized (Pushdown) 6808 6869 89 2.3 432.8 1.7X +Parquet Vectorized 10856 11523 1197 1.4 690.2 1.0X +Parquet Vectorized (Pushdown) 1647 1666 20 9.6 104.7 6.6X +Native ORC Vectorized 8114 9119 253 1.9 515.9 1.3X +Native ORC Vectorized (Pushdown) 8227 8281 40 1.9 523.0 1.3X OpenJDK 64-Bit Server VM 
1.8.0_332-b09 on Linux 5.13.0-1031-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz StringStartsWith filter: (value like '1000%'): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ----------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 10598 10792 374 1.5 673.8 1.0X -Parquet Vectorized (Pushdown) 545 555 13 28.9 34.6 19.5X -Native ORC Vectorized 6241 6513 182 2.5 396.8 1.7X -Native ORC Vectorized (Pushdown) 6337 6632 176 2.5 402.9 1.7X +Parquet Vectorized 10649 10703 88 1.5 677.0 1.0X +Parquet Vectorized (Pushdown) 703 727 23 22.4 44.7 15.1X +Native ORC Vectorized 7858 7921 47 2.0 499.6 1.4X +Native ORC Vectorized (Pushdown) 8028 8100 47 2.0 510.4 1.3X OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz StringStartsWith filter: (value like '786432%'): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 9642 10349 602 1.6 613.0 1.0X -Parquet Vectorized (Pushdown) 482 547 46 32.6 30.6 20.0X -Native ORC Vectorized 6162 6470 193 2.6 391.8 1.6X -Native ORC Vectorized (Pushdown) 6634 6655 22 2.4 421.8 1.5X +Parquet Vectorized 10646 10700 48 1.5 676.9 1.0X +Parquet Vectorized (Pushdown) 712 722 11 22.1 45.3 14.9X +Native ORC Vectorized 7884 7928 42 2.0 501.2 1.4X +Native ORC Vectorized (Pushdown) 8074 8112 26 1.9 513.3 1.3X ================================================================================================ @@ -261,31 +261,31 @@ Pushdown benchmark for StringEndsWith ================================================================================================ OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure 
-Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz StringEndsWith filter: (value like '%10'): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 9713 10936 981 1.6 617.6 1.0X -Parquet Vectorized (Pushdown) 633 682 38 24.9 40.2 15.3X -Native ORC Vectorized 7075 8071 1484 2.2 449.8 1.4X -Native ORC Vectorized (Pushdown) 8120 8160 34 1.9 516.3 1.2X +Parquet Vectorized 9594 10683 NaN 1.6 609.9 1.0X +Parquet Vectorized (Pushdown) 830 851 19 18.9 52.8 11.6X +Native ORC Vectorized 9742 10597 NaN 1.6 619.4 1.0X +Native ORC Vectorized (Pushdown) 10152 10254 117 1.5 645.4 0.9X OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz StringEndsWith filter: (value like '%1000'): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 8963 9371 232 1.8 569.8 1.0X -Parquet Vectorized (Pushdown) 539 554 18 29.2 34.3 16.6X -Native ORC Vectorized 7745 7802 45 2.0 492.4 1.2X -Native ORC Vectorized (Pushdown) 7912 8118 147 2.0 503.0 1.1X +Parquet Vectorized 9497 9558 71 1.7 603.8 1.0X +Parquet Vectorized (Pushdown) 680 696 11 23.1 43.2 14.0X +Native ORC Vectorized 9694 9809 121 1.6 616.3 1.0X +Native ORC Vectorized (Pushdown) 9951 10017 56 1.6 632.6 1.0X OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz StringEndsWith filter: (value like '%786432'): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
----------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 9440 9500 65 1.7 600.2 1.0X -Parquet Vectorized (Pushdown) 538 549 20 29.2 34.2 17.6X -Native ORC Vectorized 7002 7473 303 2.2 445.2 1.3X -Native ORC Vectorized (Pushdown) 8052 8098 52 2.0 511.9 1.2X +Parquet Vectorized 9487 9535 46 1.7 603.2 1.0X +Parquet Vectorized (Pushdown) 671 692 21 23.4 42.7 14.1X +Native ORC Vectorized 9572 9676 62 1.6 608.6 1.0X +Native ORC Vectorized (Pushdown) 9948 10068 89 1.6 632.5 1.0X ================================================================================================ @@ -293,31 +293,31 @@ Pushdown benchmark for StringContains ================================================================================================ OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz StringContains filter: (value like '%10%'): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 9782 10528 968 1.6 621.9 1.0X -Parquet Vectorized (Pushdown) 1297 1317 14 12.1 82.4 7.5X -Native ORC Vectorized 7995 8568 1153 2.0 508.3 1.2X -Native ORC Vectorized (Pushdown) 7814 8229 232 2.0 496.8 1.3X +Parquet Vectorized 9908 11133 NaN 1.6 629.9 1.0X +Parquet Vectorized (Pushdown) 1472 1500 19 10.7 93.6 6.7X +Native ORC Vectorized 9818 10701 NaN 1.6 624.2 1.0X +Native ORC Vectorized (Pushdown) 10246 10284 25 1.5 651.4 1.0X OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz StringContains filter: (value like '%1000%'): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
---------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 9408 9438 29 1.7 598.1 1.0X -Parquet Vectorized (Pushdown) 538 553 17 29.2 34.2 17.5X -Native ORC Vectorized 7779 7847 91 2.0 494.6 1.2X -Native ORC Vectorized (Pushdown) 7669 7970 183 2.1 487.6 1.2X +Parquet Vectorized 9526 9635 159 1.7 605.6 1.0X +Parquet Vectorized (Pushdown) 681 700 17 23.1 43.3 14.0X +Native ORC Vectorized 9551 9667 104 1.6 607.3 1.0X +Native ORC Vectorized (Pushdown) 10004 10078 72 1.6 636.1 1.0X OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz StringContains filter: (value like '%786432%'): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------ -Parquet Vectorized 9413 9491 137 1.7 598.4 1.0X -Parquet Vectorized (Pushdown) 530 542 17 29.7 33.7 17.7X -Native ORC Vectorized 7626 7813 106 2.1 484.9 1.2X -Native ORC Vectorized (Pushdown) 8053 8139 105 2.0 512.0 1.2X +Parquet Vectorized 9561 9636 92 1.6 607.9 1.0X +Parquet Vectorized (Pushdown) 679 698 20 23.2 43.2 14.1X +Native ORC Vectorized 9657 9792 137 1.6 614.0 1.0X +Native ORC Vectorized (Pushdown) 10021 10114 60 1.6 637.1 1.0X ================================================================================================ @@ -325,112 +325,112 @@ Pushdown benchmark for decimal ================================================================================================ OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz Select 1 decimal(9, 2) row (value = 7864320): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
---------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 4796 4822 33 3.3 304.9 1.0X -Parquet Vectorized (Pushdown) 136 142 9 115.8 8.6 35.3X -Native ORC Vectorized 4364 4515 87 3.6 277.5 1.1X -Native ORC Vectorized (Pushdown) 167 178 20 94.2 10.6 28.7X +Parquet Vectorized 3998 4021 23 3.9 254.2 1.0X +Parquet Vectorized (Pushdown) 169 174 5 93.2 10.7 23.7X +Native ORC Vectorized 5463 5514 34 2.9 347.3 0.7X +Native ORC Vectorized (Pushdown) 191 216 30 82.3 12.2 20.9X OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz Select 10% decimal(9, 2) rows (value < 1572864): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 6219 6274 48 2.5 395.4 1.0X -Parquet Vectorized (Pushdown) 2259 2493 136 7.0 143.6 2.8X -Native ORC Vectorized 5867 5925 57 2.7 373.0 1.1X -Native ORC Vectorized (Pushdown) 2285 2437 130 6.9 145.3 2.7X +Parquet Vectorized 5787 5829 39 2.7 367.9 1.0X +Parquet Vectorized (Pushdown) 2743 2766 20 5.7 174.4 2.1X +Native ORC Vectorized 7322 7392 70 2.1 465.5 0.8X +Native ORC Vectorized (Pushdown) 3078 3106 26 5.1 195.7 1.9X OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz Select 50% decimal(9, 2) rows (value < 7864320): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 10201 10778 341 1.5 648.6 1.0X -Parquet Vectorized (Pushdown) 9422 9912 362 1.7 599.0 1.1X -Native ORC Vectorized 10277 10550 170 1.5 653.4 1.0X -Native ORC 
Vectorized (Pushdown) 9784 9985 161 1.6 622.1 1.0X +Parquet Vectorized 11833 11906 62 1.3 752.3 1.0X +Parquet Vectorized (Pushdown) 11326 11409 67 1.4 720.1 1.0X +Native ORC Vectorized 13206 13338 125 1.2 839.6 0.9X +Native ORC Vectorized (Pushdown) 12495 12648 172 1.3 794.4 0.9X OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz Select 90% decimal(9, 2) rows (value < 14155776): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 11899 12161 163 1.3 756.5 1.0X -Parquet Vectorized (Pushdown) 11348 12024 409 1.4 721.5 1.0X -Native ORC Vectorized 11676 11822 93 1.3 742.4 1.0X -Native ORC Vectorized (Pushdown) 11736 11847 90 1.3 746.2 1.0X +Parquet Vectorized 13289 13388 111 1.2 844.9 1.0X +Parquet Vectorized (Pushdown) 13238 13337 64 1.2 841.6 1.0X +Native ORC Vectorized 14867 14908 27 1.1 945.2 0.9X +Native ORC Vectorized (Pushdown) 14931 15020 54 1.1 949.3 0.9X OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz Select 1 decimal(18, 2) row (value = 7864320): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ----------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 4986 4990 5 3.2 317.0 1.0X -Parquet Vectorized (Pushdown) 138 143 10 114.3 8.8 36.2X -Native ORC Vectorized 4586 4649 49 3.4 291.5 1.1X -Native ORC Vectorized (Pushdown) 163 176 27 96.3 10.4 30.5X +Parquet Vectorized 4239 4287 60 3.7 269.5 1.0X +Parquet Vectorized (Pushdown) 173 183 9 90.8 11.0 24.5X +Native ORC Vectorized 5507 5597 73 2.9 350.1 0.8X +Native ORC Vectorized (Pushdown) 199 217 30 79.0 12.7 21.3X OpenJDK 64-Bit 
Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz Select 10% decimal(18, 2) rows (value < 1572864): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 5753 5836 76 2.7 365.8 1.0X -Parquet Vectorized (Pushdown) 1358 1380 19 11.6 86.3 4.2X -Native ORC Vectorized 5117 5342 140 3.1 325.3 1.1X -Native ORC Vectorized (Pushdown) 1311 1337 36 12.0 83.3 4.4X +Parquet Vectorized 5118 5188 49 3.1 325.4 1.0X +Parquet Vectorized (Pushdown) 1446 1457 12 10.9 91.9 3.5X +Native ORC Vectorized 6354 6419 38 2.5 404.0 0.8X +Native ORC Vectorized (Pushdown) 1575 1607 35 10.0 100.1 3.2X OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz Select 50% decimal(18, 2) rows (value < 7864320): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 8633 8649 21 1.8 548.9 1.0X -Parquet Vectorized (Pushdown) 5565 5736 371 2.8 353.8 1.6X -Native ORC Vectorized 8083 8113 25 1.9 513.9 1.1X -Native ORC Vectorized (Pushdown) 5767 5968 114 2.7 366.7 1.5X +Parquet Vectorized 8554 8595 43 1.8 543.8 1.0X +Parquet Vectorized (Pushdown) 6550 6574 23 2.4 416.4 1.3X +Native ORC Vectorized 9786 9917 165 1.6 622.1 0.9X +Native ORC Vectorized (Pushdown) 7225 7317 147 2.2 459.4 1.2X OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz Select 90% decimal(18, 2) rows (value < 14155776): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
--------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 10562 11064 425 1.5 671.5 1.0X -Parquet Vectorized (Pushdown) 10224 10722 393 1.5 650.1 1.0X -Native ORC Vectorized 10843 10862 22 1.5 689.4 1.0X -Native ORC Vectorized (Pushdown) 10148 10381 177 1.5 645.2 1.0X +Parquet Vectorized 12187 12360 222 1.3 774.8 1.0X +Parquet Vectorized (Pushdown) 11563 11644 50 1.4 735.2 1.1X +Native ORC Vectorized 13166 13232 48 1.2 837.1 0.9X +Native ORC Vectorized (Pushdown) 12764 12870 85 1.2 811.5 1.0X OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz Select 1 decimal(38, 2) row (value = 7864320): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ----------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 6258 6872 366 2.5 397.8 1.0X -Parquet Vectorized (Pushdown) 147 152 8 107.0 9.3 42.6X -Native ORC Vectorized 4590 4651 50 3.4 291.8 1.4X -Native ORC Vectorized (Pushdown) 152 179 22 103.3 9.7 41.1X +Parquet Vectorized 6582 6608 33 2.4 418.4 1.0X +Parquet Vectorized (Pushdown) 185 193 10 85.1 11.8 35.6X +Native ORC Vectorized 5489 5562 56 2.9 349.0 1.2X +Native ORC Vectorized (Pushdown) 190 204 24 82.8 12.1 34.6X OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz Select 10% decimal(38, 2) rows (value < 1572864): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 7711 7916 173 2.0 490.3 1.0X -Parquet Vectorized (Pushdown) 1751 1773 18 9.0 111.3 4.4X -Native ORC Vectorized 5327 5464 81 3.0 338.7 1.4X 
-Native ORC Vectorized (Pushdown) 1481 1499 36 10.6 94.1 5.2X +Parquet Vectorized 7761 7779 17 2.0 493.4 1.0X +Parquet Vectorized (Pushdown) 1998 2026 22 7.9 127.0 3.9X +Native ORC Vectorized 6693 6737 29 2.3 425.5 1.2X +Native ORC Vectorized (Pushdown) 1840 1857 16 8.5 117.0 4.2X OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz Select 50% decimal(38, 2) rows (value < 7864320): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 11614 11665 56 1.4 738.4 1.0X -Parquet Vectorized (Pushdown) 7807 8159 237 2.0 496.3 1.5X -Native ORC Vectorized 8932 8998 129 1.8 567.9 1.3X -Native ORC Vectorized (Pushdown) 6776 6841 48 2.3 430.8 1.7X +Parquet Vectorized 12438 12467 26 1.3 790.8 1.0X +Parquet Vectorized (Pushdown) 9166 9227 85 1.7 582.7 1.4X +Native ORC Vectorized 11080 11218 121 1.4 704.5 1.1X +Native ORC Vectorized (Pushdown) 8415 8761 194 1.9 535.0 1.5X OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz Select 90% decimal(38, 2) rows (value < 14155776): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 14324 14958 451 1.1 910.7 1.0X -Parquet Vectorized (Pushdown) 14529 14614 64 1.1 923.7 1.0X -Native ORC Vectorized 12359 12471 113 1.3 785.8 1.2X -Native ORC Vectorized (Pushdown) 11961 12048 66 1.3 760.5 1.2X +Parquet Vectorized 16841 16919 56 0.9 1070.7 1.0X +Parquet Vectorized (Pushdown) 16293 16350 85 1.0 1035.9 1.0X +Native ORC Vectorized 15305 15360 45 1.0 973.0 1.1X +Native ORC Vectorized (Pushdown) 14809 14919 120 1.1 
941.5 1.1X ================================================================================================ @@ -438,112 +438,112 @@ Pushdown benchmark for InSet -> InFilters ================================================================================================ OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz InSet -> InFilters (values count: 5, distribution: 10): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 9590 11618 NaN 1.6 609.7 1.0X -Parquet Vectorized (Pushdown) 558 569 12 28.2 35.5 17.2X -Native ORC Vectorized 5954 6612 1410 2.6 378.5 1.6X -Native ORC Vectorized (Pushdown) 441 485 53 35.6 28.1 21.7X +Parquet Vectorized 10982 12626 NaN 1.4 698.2 1.0X +Parquet Vectorized (Pushdown) 710 723 12 22.2 45.1 15.5X +Native ORC Vectorized 7411 8394 NaN 2.1 471.2 1.5X +Native ORC Vectorized (Pushdown) 582 652 123 27.0 37.0 18.9X OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz InSet -> InFilters (values count: 5, distribution: 50): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 9258 9693 281 1.7 588.6 1.0X -Parquet Vectorized (Pushdown) 563 584 15 28.0 35.8 16.5X -Native ORC Vectorized 5926 5974 52 2.7 376.8 1.6X -Native ORC Vectorized (Pushdown) 486 523 47 32.4 30.9 19.0X +Parquet Vectorized 9926 10000 66 1.6 631.1 1.0X +Parquet Vectorized (Pushdown) 722 746 24 21.8 45.9 13.7X +Native ORC Vectorized 7407 7447 40 2.1 470.9 1.3X +Native ORC Vectorized (Pushdown) 570 597 41 27.6 36.3 17.4X 
OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz InSet -> InFilters (values count: 5, distribution: 90): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 9787 9944 149 1.6 622.2 1.0X -Parquet Vectorized (Pushdown) 558 564 7 28.2 35.5 17.5X -Native ORC Vectorized 5954 6015 62 2.6 378.5 1.6X -Native ORC Vectorized (Pushdown) 485 524 45 32.4 30.8 20.2X +Parquet Vectorized 9939 9999 45 1.6 631.9 1.0X +Parquet Vectorized (Pushdown) 714 730 13 22.0 45.4 13.9X +Native ORC Vectorized 7448 7475 19 2.1 473.5 1.3X +Native ORC Vectorized (Pushdown) 586 613 47 26.8 37.3 16.9X OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz InSet -> InFilters (values count: 10, distribution: 10): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 9798 10304 801 1.6 622.9 1.0X -Parquet Vectorized (Pushdown) 584 599 17 26.9 37.1 16.8X -Native ORC Vectorized 5969 5984 13 2.6 379.5 1.6X -Native ORC Vectorized (Pushdown) 501 519 36 31.4 31.8 19.6X +Parquet Vectorized 9975 10417 677 1.6 634.2 1.0X +Parquet Vectorized (Pushdown) 742 757 14 21.2 47.2 13.4X +Native ORC Vectorized 7407 7438 32 2.1 470.9 1.3X +Native ORC Vectorized (Pushdown) 593 623 43 26.5 37.7 16.8X OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz InSet -> InFilters (values count: 10, distribution: 50): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) 
Relative --------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 9452 9777 202 1.7 600.9 1.0X -Parquet Vectorized (Pushdown) 581 595 17 27.1 37.0 16.3X -Native ORC Vectorized 5968 5988 36 2.6 379.4 1.6X -Native ORC Vectorized (Pushdown) 504 522 32 31.2 32.1 18.7X +Parquet Vectorized 9976 9990 9 1.6 634.3 1.0X +Parquet Vectorized (Pushdown) 741 751 11 21.2 47.1 13.5X +Native ORC Vectorized 7470 7499 24 2.1 474.9 1.3X +Native ORC Vectorized (Pushdown) 599 630 44 26.3 38.1 16.7X OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz InSet -> InFilters (values count: 10, distribution: 90): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 9607 9792 170 1.6 610.8 1.0X -Parquet Vectorized (Pushdown) 527 573 45 29.8 33.5 18.2X -Native ORC Vectorized 5851 6087 133 2.7 372.0 1.6X -Native ORC Vectorized (Pushdown) 500 521 38 31.4 31.8 19.2X +Parquet Vectorized 9932 9998 88 1.6 631.4 1.0X +Parquet Vectorized (Pushdown) 750 773 28 21.0 47.7 13.2X +Native ORC Vectorized 7439 7472 26 2.1 473.0 1.3X +Native ORC Vectorized (Pushdown) 600 632 44 26.2 38.2 16.5X OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz InSet -> InFilters (values count: 50, distribution: 10): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 9469 9996 298 1.7 602.0 1.0X -Parquet Vectorized (Pushdown) 1480 1493 14 10.6 94.1 6.4X -Native ORC Vectorized 6265 
6278 17 2.5 398.3 1.5X -Native ORC Vectorized (Pushdown) 623 691 38 25.2 39.6 15.2X +Parquet Vectorized 10365 10383 15 1.5 659.0 1.0X +Parquet Vectorized (Pushdown) 1592 1622 25 9.9 101.2 6.5X +Native ORC Vectorized 7804 7869 42 2.0 496.2 1.3X +Native ORC Vectorized (Pushdown) 754 775 36 20.9 47.9 13.8X OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz InSet -> InFilters (values count: 50, distribution: 50): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 9566 10022 334 1.6 608.2 1.0X -Parquet Vectorized (Pushdown) 4660 5049 224 3.4 296.3 2.1X -Native ORC Vectorized 6267 6303 50 2.5 398.5 1.5X -Native ORC Vectorized (Pushdown) 656 704 42 24.0 41.7 14.6X +Parquet Vectorized 10332 10393 46 1.5 656.9 1.0X +Parquet Vectorized (Pushdown) 5252 5278 24 3.0 333.9 2.0X +Native ORC Vectorized 7806 7833 29 2.0 496.3 1.3X +Native ORC Vectorized (Pushdown) 781 824 56 20.1 49.7 13.2X OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz InSet -> InFilters (values count: 50, distribution: 90): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 9321 9914 371 1.7 592.6 1.0X -Parquet Vectorized (Pushdown) 8505 8702 118 1.8 540.7 1.1X -Native ORC Vectorized 6089 6240 85 2.6 387.1 1.5X -Native ORC Vectorized (Pushdown) 654 695 37 24.0 41.6 14.3X +Parquet Vectorized 10370 10411 50 1.5 659.3 1.0X +Parquet Vectorized (Pushdown) 8955 9064 104 1.8 569.4 1.2X +Native ORC Vectorized 7785 7830 41 2.0 495.0 1.3X +Native ORC Vectorized 
(Pushdown) 798 838 50 19.7 50.7 13.0X OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz InSet -> InFilters (values count: 100, distribution: 10): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ---------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 9772 9993 127 1.6 621.3 1.0X -Parquet Vectorized (Pushdown) 1345 1466 122 11.7 85.5 7.3X -Native ORC Vectorized 6200 6267 103 2.5 394.2 1.6X -Native ORC Vectorized (Pushdown) 744 783 33 21.1 47.3 13.1X +Parquet Vectorized 10253 10299 61 1.5 651.9 1.0X +Parquet Vectorized (Pushdown) 1626 1652 49 9.7 103.4 6.3X +Native ORC Vectorized 7725 7742 15 2.0 491.2 1.3X +Native ORC Vectorized (Pushdown) 931 970 43 16.9 59.2 11.0X OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz InSet -> InFilters (values count: 100, distribution: 50): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ---------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 9880 10074 152 1.6 628.1 1.0X -Parquet Vectorized (Pushdown) 4670 5085 251 3.4 296.9 2.1X -Native ORC Vectorized 5895 6129 131 2.7 374.8 1.7X -Native ORC Vectorized (Pushdown) 849 934 68 18.5 54.0 11.6X +Parquet Vectorized 10240 10277 40 1.5 651.0 1.0X +Parquet Vectorized (Pushdown) 5397 5467 50 2.9 343.1 1.9X +Native ORC Vectorized 7690 7785 96 2.0 488.9 1.3X +Native ORC Vectorized (Pushdown) 981 1036 67 16.0 62.4 10.4X OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz InSet -> InFilters (values count: 100, distribution: 90): Best 
Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ---------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 9556 9926 213 1.6 607.6 1.0X -Parquet Vectorized (Pushdown) 8856 8905 61 1.8 563.0 1.1X -Native ORC Vectorized 6137 6173 51 2.6 390.2 1.6X -Native ORC Vectorized (Pushdown) 836 898 49 18.8 53.2 11.4X +Parquet Vectorized 10299 10383 62 1.5 654.8 1.0X +Parquet Vectorized (Pushdown) 9189 9226 47 1.7 584.2 1.1X +Native ORC Vectorized 7669 7699 36 2.1 487.6 1.3X +Native ORC Vectorized (Pushdown) 1042 1075 28 15.1 66.2 9.9X ================================================================================================ @@ -551,40 +551,40 @@ Pushdown benchmark for tinyint ================================================================================================ OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz Select 1 tinyint row (value = CAST(63 AS tinyint)): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ---------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 5197 5225 33 3.0 330.4 1.0X -Parquet Vectorized (Pushdown) 190 195 7 82.8 12.1 27.3X -Native ORC Vectorized 2550 2585 40 6.2 162.2 2.0X -Native ORC Vectorized (Pushdown) 212 228 27 74.3 13.5 24.5X +Parquet Vectorized 4419 4480 45 3.6 280.9 1.0X +Parquet Vectorized (Pushdown) 213 230 13 73.7 13.6 20.7X +Native ORC Vectorized 3366 3421 82 4.7 214.0 1.3X +Native ORC Vectorized (Pushdown) 254 277 27 61.8 16.2 17.4X OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz Select 10% tinyint rows (value < CAST(12 AS tinyint)): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) 
Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 5806 5866 58 2.7 369.1 1.0X -Parquet Vectorized (Pushdown) 1254 1316 53 12.5 79.7 4.6X -Native ORC Vectorized 3211 3215 4 4.9 204.1 1.8X -Native ORC Vectorized (Pushdown) 1062 1071 12 14.8 67.5 5.5X +Parquet Vectorized 5268 5337 57 3.0 334.9 1.0X +Parquet Vectorized (Pushdown) 1403 1424 14 11.2 89.2 3.8X +Native ORC Vectorized 4154 4173 14 3.8 264.1 1.3X +Native ORC Vectorized (Pushdown) 1296 1325 27 12.1 82.4 4.1X OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz Select 50% tinyint rows (value < CAST(63 AS tinyint)): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 8326 8498 123 1.9 529.3 1.0X -Parquet Vectorized (Pushdown) 6037 6106 66 2.6 383.8 1.4X -Native ORC Vectorized 5724 5796 45 2.7 363.9 1.5X -Native ORC Vectorized (Pushdown) 4638 4652 19 3.4 294.9 1.8X +Parquet Vectorized 8543 8585 51 1.8 543.1 1.0X +Parquet Vectorized (Pushdown) 6342 6434 72 2.5 403.2 1.3X +Native ORC Vectorized 7181 7294 105 2.2 456.5 1.2X +Native ORC Vectorized (Pushdown) 5652 5699 33 2.8 359.4 1.5X OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz Select 90% tinyint rows (value < CAST(114 AS tinyint)): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 10622 11327 418 1.5 675.3 1.0X -Parquet Vectorized (Pushdown) 10144 10719 333 1.6 644.9 1.0X -Native ORC 
Vectorized 7425 8222 463 2.1 472.1 1.4X -Native ORC Vectorized (Pushdown) 7305 8035 409 2.2 464.5 1.5X +Parquet Vectorized 11786 11889 84 1.3 749.3 1.0X +Parquet Vectorized (Pushdown) 11436 11504 56 1.4 727.1 1.0X +Native ORC Vectorized 10368 10447 71 1.5 659.2 1.1X +Native ORC Vectorized (Pushdown) 10027 10120 97 1.6 637.5 1.2X ================================================================================================ @@ -592,112 +592,112 @@ Pushdown benchmark for Timestamp ================================================================================================ OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz Select 1 timestamp stored as INT96 row (value = timestamp_seconds(7864320)): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ----------------------------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 5508 5573 69 2.9 350.2 1.0X -Parquet Vectorized (Pushdown) 5497 5544 89 2.9 349.5 1.0X -Native ORC Vectorized 2420 2525 131 6.5 153.8 2.3X -Native ORC Vectorized (Pushdown) 137 144 15 115.1 8.7 40.3X +Parquet Vectorized 4805 5009 148 3.3 305.5 1.0X +Parquet Vectorized (Pushdown) 4793 4866 92 3.3 304.7 1.0X +Native ORC Vectorized 3563 3621 60 4.4 226.5 1.3X +Native ORC Vectorized (Pushdown) 168 182 21 93.6 10.7 28.6X OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz Select 10% timestamp stored as INT96 rows (value < timestamp_seconds(1572864)): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 6255 6330 100 2.5 397.7 
1.0X -Parquet Vectorized (Pushdown) 6170 6252 61 2.5 392.3 1.0X -Native ORC Vectorized 3365 3374 7 4.7 213.9 1.9X -Native ORC Vectorized (Pushdown) 959 976 13 16.4 61.0 6.5X +Parquet Vectorized 5749 5859 82 2.7 365.5 1.0X +Parquet Vectorized (Pushdown) 5784 5836 52 2.7 367.8 1.0X +Native ORC Vectorized 4479 4610 118 3.5 284.8 1.3X +Native ORC Vectorized (Pushdown) 1387 1425 38 11.3 88.2 4.1X OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz Select 50% timestamp stored as INT96 rows (value < timestamp_seconds(7864320)): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 9044 9134 59 1.7 575.0 1.0X -Parquet Vectorized (Pushdown) 8816 8965 146 1.8 560.5 1.0X -Native ORC Vectorized 6038 6053 14 2.6 383.9 1.5X -Native ORC Vectorized (Pushdown) 4790 4810 16 3.3 304.6 1.9X +Parquet Vectorized 9379 9600 243 1.7 596.3 1.0X +Parquet Vectorized (Pushdown) 9300 9354 61 1.7 591.3 1.0X +Native ORC Vectorized 7820 7878 47 2.0 497.2 1.2X +Native ORC Vectorized (Pushdown) 6117 6195 81 2.6 388.9 1.5X OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz Select 90% timestamp stored as INT96 rows (value < timestamp_seconds(14155776)): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 11786 11979 151 1.3 749.3 1.0X -Parquet Vectorized (Pushdown) 11463 11795 225 1.4 728.8 1.0X -Native ORC Vectorized 8459 8709 156 1.9 537.8 1.4X -Native ORC Vectorized (Pushdown) 7979 8447 425 2.0 507.3 
1.5X +Parquet Vectorized 12792 12866 104 1.2 813.3 1.0X +Parquet Vectorized (Pushdown) 12888 12933 47 1.2 819.4 1.0X +Native ORC Vectorized 11128 11199 73 1.4 707.5 1.1X +Native ORC Vectorized (Pushdown) 10847 10970 93 1.4 689.7 1.2X OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz Select 1 timestamp stored as TIMESTAMP_MICROS row (value = timestamp_seconds(7864320)): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ---------------------------------------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 4436 4800 228 3.5 282.1 1.0X -Parquet Vectorized (Pushdown) 137 143 7 114.5 8.7 32.3X -Native ORC Vectorized 2676 2688 15 5.9 170.2 1.7X -Native ORC Vectorized (Pushdown) 134 143 23 117.0 8.5 33.0X +Parquet Vectorized 4074 4106 25 3.9 259.0 1.0X +Parquet Vectorized (Pushdown) 168 180 11 93.8 10.7 24.3X +Native ORC Vectorized 3558 3648 125 4.4 226.2 1.1X +Native ORC Vectorized (Pushdown) 158 171 24 99.8 10.0 25.9X OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz Select 10% timestamp stored as TIMESTAMP_MICROS rows (value < timestamp_seconds(1572864)): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 5606 5657 60 2.8 356.5 1.0X -Parquet Vectorized (Pushdown) 1334 1349 23 11.8 84.8 4.2X -Native ORC Vectorized 3373 3408 62 4.7 214.5 1.7X -Native ORC Vectorized (Pushdown) 1076 1110 33 14.6 68.4 5.2X +Parquet Vectorized 5155 5227 87 3.1 327.7 1.0X +Parquet Vectorized (Pushdown) 1484 1541 34 10.6 94.4 3.5X +Native ORC Vectorized 4413 
4521 102 3.6 280.6 1.2X +Native ORC Vectorized (Pushdown) 1353 1383 31 11.6 86.0 3.8X OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz Select 50% timestamp stored as TIMESTAMP_MICROS rows (value < timestamp_seconds(7864320)): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 8446 8493 50 1.9 537.0 1.0X -Parquet Vectorized (Pushdown) 6001 6108 67 2.6 381.5 1.4X -Native ORC Vectorized 6034 6082 37 2.6 383.7 1.4X -Native ORC Vectorized (Pushdown) 4791 4806 13 3.3 304.6 1.8X +Parquet Vectorized 8683 8998 298 1.8 552.0 1.0X +Parquet Vectorized (Pushdown) 6702 6782 62 2.3 426.1 1.3X +Native ORC Vectorized 7854 7913 77 2.0 499.4 1.1X +Native ORC Vectorized (Pushdown) 6156 6200 50 2.6 391.4 1.4X OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz Select 90% timestamp stored as TIMESTAMP_MICROS rows (value < timestamp_seconds(14155776)): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 11219 11315 84 1.4 713.3 1.0X -Parquet Vectorized (Pushdown) 10295 10716 264 1.5 654.5 1.1X -Native ORC Vectorized 8693 8798 112 1.8 552.7 1.3X -Native ORC Vectorized (Pushdown) 7998 8362 239 2.0 508.5 1.4X +Parquet Vectorized 12197 12242 78 1.3 775.4 1.0X +Parquet Vectorized (Pushdown) 11837 11882 48 1.3 752.6 1.0X +Native ORC Vectorized 11076 11127 39 1.4 704.2 1.1X +Native ORC Vectorized (Pushdown) 10880 10982 71 1.4 691.7 1.1X OpenJDK 64-Bit Server VM 
1.8.0_332-b09 on Linux 5.13.0-1031-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz Select 1 timestamp stored as TIMESTAMP_MILLIS row (value = timestamp_seconds(7864320)): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ---------------------------------------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 4622 5010 228 3.4 293.8 1.0X -Parquet Vectorized (Pushdown) 121 136 15 129.7 7.7 38.1X -Native ORC Vectorized 2395 2602 118 6.6 152.3 1.9X -Native ORC Vectorized (Pushdown) 133 141 21 118.2 8.5 34.7X +Parquet Vectorized 4260 4291 24 3.7 270.8 1.0X +Parquet Vectorized (Pushdown) 172 182 11 91.5 10.9 24.8X +Native ORC Vectorized 3573 3596 19 4.4 227.2 1.2X +Native ORC Vectorized (Pushdown) 158 171 26 99.7 10.0 27.0X OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz Select 10% timestamp stored as TIMESTAMP_MILLIS rows (value < timestamp_seconds(1572864)): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 5694 5797 68 2.8 362.0 1.0X -Parquet Vectorized (Pushdown) 1296 1338 26 12.1 82.4 4.4X -Native ORC Vectorized 3367 3408 52 4.7 214.1 1.7X -Native ORC Vectorized (Pushdown) 960 1047 58 16.4 61.0 5.9X +Parquet Vectorized 5203 5226 17 3.0 330.8 1.0X +Parquet Vectorized (Pushdown) 1461 1501 32 10.8 92.9 3.6X +Native ORC Vectorized 4425 4467 31 3.6 281.3 1.2X +Native ORC Vectorized (Pushdown) 1383 1407 27 11.4 87.9 3.8X OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz Select 50% 
timestamp stored as TIMESTAMP_MILLIS rows (value < timestamp_seconds(7864320)): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 8593 8688 77 1.8 546.3 1.0X -Parquet Vectorized (Pushdown) 6022 6181 132 2.6 382.9 1.4X -Native ORC Vectorized 5730 6013 195 2.7 364.3 1.5X -Native ORC Vectorized (Pushdown) 4636 4813 103 3.4 294.8 1.9X +Parquet Vectorized 8798 8880 48 1.8 559.3 1.0X +Parquet Vectorized (Pushdown) 6713 6753 41 2.3 426.8 1.3X +Native ORC Vectorized 7849 7872 23 2.0 499.0 1.1X +Native ORC Vectorized (Pushdown) 6166 6212 41 2.6 392.0 1.4X OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz Select 90% timestamp stored as TIMESTAMP_MILLIS rows (value < timestamp_seconds(14155776)): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------------------------------------------------------------------------------------- -Parquet Vectorized 11111 11267 98 1.4 706.4 1.0X -Parquet Vectorized (Pushdown) 10828 10901 83 1.5 688.4 1.0X -Native ORC Vectorized 7966 8554 377 2.0 506.5 1.4X -Native ORC Vectorized (Pushdown) 8306 8453 131 1.9 528.1 1.3X +Parquet Vectorized 12306 12334 22 1.3 782.4 1.0X +Parquet Vectorized (Pushdown) 11866 12005 86 1.3 754.4 1.0X +Native ORC Vectorized 11187 11257 82 1.4 711.2 1.1X +Native ORC Vectorized (Pushdown) 10931 10987 41 1.4 695.0 1.1X ================================================================================================ @@ -705,30 +705,30 @@ Pushdown benchmark with many filters ================================================================================================ OpenJDK 64-Bit Server VM 1.8.0_332-b09 
on Linux 5.13.0-1031-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz Select 1 row with 1 filters: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Parquet Vectorized 165 181 9 0.0 165274170.0 1.0X -Parquet Vectorized (Pushdown) 182 193 22 0.0 182084552.0 0.9X -Native ORC Vectorized 154 169 10 0.0 153949658.0 1.1X -Native ORC Vectorized (Pushdown) 183 188 5 0.0 183334682.0 0.9X +Parquet Vectorized 218 224 9 0.0 217684709.0 1.0X +Parquet Vectorized (Pushdown) 219 233 21 0.0 219330065.0 1.0X +Native ORC Vectorized 205 219 13 0.0 205378141.0 1.1X +Native ORC Vectorized (Pushdown) 219 224 3 0.0 218671576.0 1.0X OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz Select 1 row with 250 filters: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Parquet Vectorized 1655 2069 688 0.0 1655292270.0 1.0X -Parquet Vectorized (Pushdown) 1910 1918 9 0.0 1909884497.0 0.9X -Native ORC Vectorized 1848 1889 41 0.0 1847853824.0 0.9X -Native ORC Vectorized (Pushdown) 1862 1868 5 0.0 1861974825.0 0.9X +Parquet Vectorized 2348 2380 30 0.0 2347543709.0 1.0X +Parquet Vectorized (Pushdown) 2441 2446 5 0.0 2441178809.0 1.0X +Native ORC Vectorized 2351 2383 39 0.0 2351361923.0 1.0X +Native ORC Vectorized (Pushdown) 2371 2384 14 0.0 2371040396.0 1.0X OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz Select 1 row with 500 filters: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
------------------------------------------------------------------------------------------------------------------------ -Parquet Vectorized 7721 8802 1054 0.0 7720771648.0 1.0X -Parquet Vectorized (Pushdown) 8444 8628 173 0.0 8443708092.0 0.9X -Native ORC Vectorized 7854 8339 283 0.0 7854189202.0 1.0X -Native ORC Vectorized (Pushdown) 7812 8325 310 0.0 7811643781.0 1.0X +Parquet Vectorized 10400 10676 276 0.0 10399826445.0 1.0X +Parquet Vectorized (Pushdown) 10666 10734 74 0.0 10666230253.0 1.0X +Native ORC Vectorized 10381 10484 86 0.0 10381183138.0 1.0X +Native ORC Vectorized (Pushdown) 10376 10498 85 0.0 10375745359.0 1.0X diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFilters.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFilters.scala index da6896f26a..f93419412e 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFilters.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFilters.scala @@ -21,11 +21,11 @@ import java.lang.{Boolean => JBoolean, Double => JDouble, Float => JFloat, Long import java.math.{BigDecimal => JBigDecimal} import java.sql.{Date, Timestamp} import java.time.{Duration, Instant, LocalDate, Period} +import java.util.HashSet import java.util.Locale import scala.collection.JavaConverters.asScalaBufferConverter -import org.apache.parquet.column.statistics.{Statistics => ParquetStatistics} import org.apache.parquet.filter2.predicate._ import org.apache.parquet.filter2.predicate.SparkFilterApi._ import org.apache.parquet.io.api.Binary @@ -444,94 +444,106 @@ class ParquetFilters( } private val makeInPredicate: - PartialFunction[ParquetSchemaType, - (Array[String], Array[Any], ParquetStatistics[_]) => FilterPredicate] = { + PartialFunction[ParquetSchemaType, (Array[String], Array[Any]) => FilterPredicate] = { + case ParquetByteType | ParquetShortType | ParquetIntegerType => 
- (n: Array[String], v: Array[Any], statistics: ParquetStatistics[_]) => - v.map(toIntValue(_).toInt).foreach(statistics.updateStats) - FilterApi.and( - FilterApi.gtEq(intColumn(n), statistics.genericGetMin().asInstanceOf[Integer]), - FilterApi.ltEq(intColumn(n), statistics.genericGetMax().asInstanceOf[Integer])) + (n: Array[String], values: Array[Any]) => + val set = new HashSet[Integer]() + for (value <- values) { + set.add(toIntValue(value)) + } + FilterApi.in(intColumn(n), set) case ParquetLongType => - (n: Array[String], v: Array[Any], statistics: ParquetStatistics[_]) => - v.map(toLongValue).foreach(statistics.updateStats(_)) - FilterApi.and( - FilterApi.gtEq(longColumn(n), statistics.genericGetMin().asInstanceOf[JLong]), - FilterApi.ltEq(longColumn(n), statistics.genericGetMax().asInstanceOf[JLong])) + (n: Array[String], values: Array[Any]) => + val set = new HashSet[JLong]() + for (value <- values) { + set.add(toLongValue(value)) + } + FilterApi.in(longColumn(n), set) case ParquetFloatType => - (n: Array[String], v: Array[Any], statistics: ParquetStatistics[_]) => - v.map(_.asInstanceOf[JFloat]).foreach(statistics.updateStats(_)) - FilterApi.and( - FilterApi.gtEq(floatColumn(n), statistics.genericGetMin().asInstanceOf[JFloat]), - FilterApi.ltEq(floatColumn(n), statistics.genericGetMax().asInstanceOf[JFloat])) + (n: Array[String], values: Array[Any]) => + val set = new HashSet[JFloat]() + for (value <- values) { + set.add(value.asInstanceOf[JFloat]) + } + FilterApi.in(floatColumn(n), set) case ParquetDoubleType => - (n: Array[String], v: Array[Any], statistics: ParquetStatistics[_]) => - v.map(_.asInstanceOf[JDouble]).foreach(statistics.updateStats(_)) - FilterApi.and( - FilterApi.gtEq(doubleColumn(n), statistics.genericGetMin().asInstanceOf[JDouble]), - FilterApi.ltEq(doubleColumn(n), statistics.genericGetMax().asInstanceOf[JDouble])) + (n: Array[String], values: Array[Any]) => + val set = new HashSet[JDouble]() + for (value <- values) { + 
set.add(value.asInstanceOf[JDouble]) + } + FilterApi.in(doubleColumn(n), set) case ParquetStringType => - (n: Array[String], v: Array[Any], statistics: ParquetStatistics[_]) => - v.map(s => Binary.fromString(s.asInstanceOf[String])).foreach(statistics.updateStats) - FilterApi.and( - FilterApi.gtEq(binaryColumn(n), statistics.genericGetMin().asInstanceOf[Binary]), - FilterApi.ltEq(binaryColumn(n), statistics.genericGetMax().asInstanceOf[Binary])) + (n: Array[String], values: Array[Any]) => + val set = new HashSet[Binary]() + for (value <- values) { + set.add(Option(value).map(s => Binary.fromString(s.asInstanceOf[String])).orNull) + } + FilterApi.in(binaryColumn(n), set) case ParquetBinaryType => - (n: Array[String], v: Array[Any], statistics: ParquetStatistics[_]) => - v.map(b => Binary.fromReusedByteArray(b.asInstanceOf[Array[Byte]])) - .foreach(statistics.updateStats) - FilterApi.and( - FilterApi.gtEq(binaryColumn(n), statistics.genericGetMin().asInstanceOf[Binary]), - FilterApi.ltEq(binaryColumn(n), statistics.genericGetMax().asInstanceOf[Binary])) + (n: Array[String], values: Array[Any]) => + val set = new HashSet[Binary]() + for (value <- values) { + set.add(Option(value) + .map(b => Binary.fromReusedByteArray(b.asInstanceOf[Array[Byte]])).orNull) + } + FilterApi.in(binaryColumn(n), set) case ParquetDateType if pushDownDate => - (n: Array[String], v: Array[Any], statistics: ParquetStatistics[_]) => - v.map(dateToDays).map(_.asInstanceOf[Integer]).foreach(statistics.updateStats(_)) - FilterApi.and( - FilterApi.gtEq(intColumn(n), statistics.genericGetMin().asInstanceOf[Integer]), - FilterApi.ltEq(intColumn(n), statistics.genericGetMax().asInstanceOf[Integer])) + (n: Array[String], values: Array[Any]) => + val set = new HashSet[Integer]() + for (value <- values) { + set.add(Option(value).map(date => dateToDays(date).asInstanceOf[Integer]).orNull) + } + FilterApi.in(intColumn(n), set) case ParquetTimestampMicrosType if pushDownTimestamp => - (n: Array[String], v: 
Array[Any], statistics: ParquetStatistics[_]) => - v.map(timestampToMicros).foreach(statistics.updateStats(_)) - FilterApi.and( - FilterApi.gtEq(longColumn(n), statistics.genericGetMin().asInstanceOf[JLong]), - FilterApi.ltEq(longColumn(n), statistics.genericGetMax().asInstanceOf[JLong])) + (n: Array[String], values: Array[Any]) => + val set = new HashSet[JLong]() + for (value <- values) { + set.add(Option(value).map(timestampToMicros).orNull) + } + FilterApi.in(longColumn(n), set) case ParquetTimestampMillisType if pushDownTimestamp => - (n: Array[String], v: Array[Any], statistics: ParquetStatistics[_]) => - v.map(timestampToMillis).foreach(statistics.updateStats(_)) - FilterApi.and( - FilterApi.gtEq(longColumn(n), statistics.genericGetMin().asInstanceOf[JLong]), - FilterApi.ltEq(longColumn(n), statistics.genericGetMax().asInstanceOf[JLong])) + (n: Array[String], values: Array[Any]) => + val set = new HashSet[JLong]() + for (value <- values) { + set.add(Option(value).map(timestampToMillis).orNull) + } + FilterApi.in(longColumn(n), set) case ParquetSchemaType(_: DecimalLogicalTypeAnnotation, INT32, _) if pushDownDecimal => - (n: Array[String], v: Array[Any], statistics: ParquetStatistics[_]) => - v.map(_.asInstanceOf[JBigDecimal]).map(decimalToInt32).foreach(statistics.updateStats(_)) - FilterApi.and( - FilterApi.gtEq(intColumn(n), statistics.genericGetMin().asInstanceOf[Integer]), - FilterApi.ltEq(intColumn(n), statistics.genericGetMax().asInstanceOf[Integer])) + (n: Array[String], values: Array[Any]) => + val set = new HashSet[Integer]() + for (value <- values) { + set.add(Option(value).map(d => decimalToInt32(d.asInstanceOf[JBigDecimal])).orNull) + } + FilterApi.in(intColumn(n), set) case ParquetSchemaType(_: DecimalLogicalTypeAnnotation, INT64, _) if pushDownDecimal => - (n: Array[String], v: Array[Any], statistics: ParquetStatistics[_]) => - v.map(_.asInstanceOf[JBigDecimal]).map(decimalToInt64).foreach(statistics.updateStats(_)) - FilterApi.and( - 
FilterApi.gtEq(longColumn(n), statistics.genericGetMin().asInstanceOf[JLong]), - FilterApi.ltEq(longColumn(n), statistics.genericGetMax().asInstanceOf[JLong])) + (n: Array[String], values: Array[Any]) => + val set = new HashSet[JLong]() + for (value <- values) { + set.add(Option(value).map(d => decimalToInt64(d.asInstanceOf[JBigDecimal])).orNull) + } + FilterApi.in(longColumn(n), set) case ParquetSchemaType(_: DecimalLogicalTypeAnnotation, FIXED_LEN_BYTE_ARRAY, length) - if pushDownDecimal => - (path: Array[String], v: Array[Any], statistics: ParquetStatistics[_]) => - v.map(d => decimalToByteArray(d.asInstanceOf[JBigDecimal], length)) - .foreach(statistics.updateStats) - FilterApi.and( - FilterApi.gtEq(binaryColumn(path), statistics.genericGetMin().asInstanceOf[Binary]), - FilterApi.ltEq(binaryColumn(path), statistics.genericGetMax().asInstanceOf[Binary])) + if pushDownDecimal => + (n: Array[String], values: Array[Any]) => + val set = new HashSet[Binary]() + for (value <- values) { + set.add(Option(value) + .map(d => decimalToByteArray(d.asInstanceOf[JBigDecimal], length)).orNull) + } + FilterApi.in(binaryColumn(n), set) } // Returns filters that can be pushed down when reading Parquet files. 
@@ -736,15 +748,12 @@ class ParquetFilters( makeEq.lift(fieldType).map(_(fieldNames, v)) }.reduceLeftOption(FilterApi.or) } else if (canPartialPushDownConjuncts) { - val primitiveType = schema.getColumnDescription(fieldNames).getPrimitiveType - val statistics: ParquetStatistics[_] = ParquetStatistics.createStats(primitiveType) if (values.contains(null)) { Seq(makeEq.lift(fieldType).map(_(fieldNames, null)), - makeInPredicate.lift(fieldType) - .map(_(fieldNames, values.filter(_ != null), statistics)) + makeInPredicate.lift(fieldType).map(_(fieldNames, values.filter(_ != null))) ).flatten.reduceLeftOption(FilterApi.or) } else { - makeInPredicate.lift(fieldType).map(_(fieldNames, values, statistics)) + makeInPredicate.lift(fieldType).map(_(fieldNames, values)) } } else { None diff --git a/sql/core/src/main/scala/org/apache/spark/sql/jdbc/DB2Dialect.scala b/sql/core/src/main/scala/org/apache/spark/sql/jdbc/DB2Dialect.scala index a3637e5726..6c7c1bfe73 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/jdbc/DB2Dialect.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/jdbc/DB2Dialect.scala @@ -32,15 +32,27 @@ private object DB2Dialect extends JdbcDialect { override def canHandle(url: String): Boolean = url.toLowerCase(Locale.ROOT).startsWith("jdbc:db2") + private val distinctUnsupportedAggregateFunctions = + Set("COVAR_POP", "COVAR_SAMP", "REGR_INTERCEPT", "REGR_R2", "REGR_SLOPE", "REGR_SXY") + // See https://www.ibm.com/docs/en/db2/11.5?topic=functions-aggregate private val supportedAggregateFunctions = Set("MAX", "MIN", "SUM", "COUNT", "AVG", - "VAR_POP", "VAR_SAMP", "STDDEV_POP", "STDDEV_SAMP", "COVAR_POP", "COVAR_SAMP") + "VAR_POP", "VAR_SAMP", "STDDEV_POP", "STDDEV_SAMP") ++ distinctUnsupportedAggregateFunctions private val supportedFunctions = supportedAggregateFunctions override def isSupportedFunction(funcName: String): Boolean = supportedFunctions.contains(funcName) class DB2SQLBuilder extends JDBCSQLBuilder { + override def visitAggregateFunction( 
+ funcName: String, isDistinct: Boolean, inputs: Array[String]): String = + if (isDistinct && distinctUnsupportedAggregateFunctions.contains(funcName)) { + throw new UnsupportedOperationException(s"${this.getClass.getSimpleName} does not " + + s"support aggregate function: $funcName with DISTINCT"); + } else { + super.visitAggregateFunction(funcName, isDistinct, inputs) + } + override def dialectFunctionName(funcName: String): String = funcName match { case "VAR_POP" => "VARIANCE" case "VAR_SAMP" => "VARIANCE_SAMP" diff --git a/sql/core/src/main/scala/org/apache/spark/sql/jdbc/MySQLDialect.scala b/sql/core/src/main/scala/org/apache/spark/sql/jdbc/MySQLDialect.scala index cc04b5c7c9..7dc76eed49 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/jdbc/MySQLDialect.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/jdbc/MySQLDialect.scala @@ -22,13 +22,14 @@ import java.util import java.util.Locale import scala.collection.mutable.ArrayBuilder +import scala.util.control.NonFatal import org.apache.spark.sql.AnalysisException import org.apache.spark.sql.catalyst.SQLConfHelper import org.apache.spark.sql.catalyst.analysis.{IndexAlreadyExistsException, NoSuchIndexException} import org.apache.spark.sql.connector.catalog.Identifier import org.apache.spark.sql.connector.catalog.index.TableIndex -import org.apache.spark.sql.connector.expressions.{FieldReference, NamedReference} +import org.apache.spark.sql.connector.expressions.{Expression, FieldReference, NamedReference} import org.apache.spark.sql.errors.QueryExecutionErrors import org.apache.spark.sql.execution.datasources.jdbc.{JDBCOptions, JdbcUtils} import org.apache.spark.sql.types.{BooleanType, DataType, FloatType, LongType, MetadataBuilder} @@ -38,14 +39,39 @@ private case object MySQLDialect extends JdbcDialect with SQLConfHelper { override def canHandle(url : String): Boolean = url.toLowerCase(Locale.ROOT).startsWith("jdbc:mysql") + private val distinctUnsupportedAggregateFunctions = + Set("VAR_POP", 
"VAR_SAMP", "STDDEV_POP", "STDDEV_SAMP") + // See https://dev.mysql.com/doc/refman/8.0/en/aggregate-functions.html - private val supportedAggregateFunctions = Set("MAX", "MIN", "SUM", "COUNT", "AVG", - "VAR_POP", "VAR_SAMP", "STDDEV_POP", "STDDEV_SAMP") + private val supportedAggregateFunctions = + Set("MAX", "MIN", "SUM", "COUNT", "AVG") ++ distinctUnsupportedAggregateFunctions private val supportedFunctions = supportedAggregateFunctions override def isSupportedFunction(funcName: String): Boolean = supportedFunctions.contains(funcName) + class MySQLSQLBuilder extends JDBCSQLBuilder { + override def visitAggregateFunction( + funcName: String, isDistinct: Boolean, inputs: Array[String]): String = + if (isDistinct && distinctUnsupportedAggregateFunctions.contains(funcName)) { + throw new UnsupportedOperationException(s"${this.getClass.getSimpleName} does not " + + s"support aggregate function: $funcName with DISTINCT"); + } else { + super.visitAggregateFunction(funcName, isDistinct, inputs) + } + } + + override def compileExpression(expr: Expression): Option[String] = { + val mysqlSQLBuilder = new MySQLSQLBuilder() + try { + Some(mysqlSQLBuilder.build(expr)) + } catch { + case NonFatal(e) => + logWarning("Error occurs while compiling V2 expression", e) + None + } + } + override def getCatalystType( sqlType: Int, typeName: String, size: Int, md: MetadataBuilder): Option[DataType] = { if (sqlType == Types.VARBINARY && typeName.equals("BIT") && size != 1) { diff --git a/sql/core/src/main/scala/org/apache/spark/sql/jdbc/OracleDialect.scala b/sql/core/src/main/scala/org/apache/spark/sql/jdbc/OracleDialect.scala index 820bff354c..79ac248d72 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/jdbc/OracleDialect.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/jdbc/OracleDialect.scala @@ -20,7 +20,10 @@ package org.apache.spark.sql.jdbc import java.sql.{Date, Timestamp, Types} import java.util.{Locale, TimeZone} +import scala.util.control.NonFatal + import 
org.apache.spark.sql.catalyst.util.DateTimeUtils +import org.apache.spark.sql.connector.expressions.Expression import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types._ @@ -33,16 +36,42 @@ private case object OracleDialect extends JdbcDialect { override def canHandle(url: String): Boolean = url.toLowerCase(Locale.ROOT).startsWith("jdbc:oracle") + private val distinctUnsupportedAggregateFunctions = + Set("VAR_POP", "VAR_SAMP", "STDDEV_POP", "STDDEV_SAMP", "COVAR_POP", "COVAR_SAMP", "CORR", + "REGR_INTERCEPT", "REGR_R2", "REGR_SLOPE", "REGR_SXY") + // scalastyle:off line.size.limit // https://docs.oracle.com/en/database/oracle/oracle-database/19/sqlrf/Aggregate-Functions.html#GUID-62BE676B-AF18-4E63-BD14-25206FEA0848 // scalastyle:on line.size.limit - private val supportedAggregateFunctions = Set("MAX", "MIN", "SUM", "COUNT", "AVG", - "VAR_POP", "VAR_SAMP", "STDDEV_POP", "STDDEV_SAMP", "COVAR_POP", "COVAR_SAMP", "CORR") + private val supportedAggregateFunctions = + Set("MAX", "MIN", "SUM", "COUNT", "AVG") ++ distinctUnsupportedAggregateFunctions private val supportedFunctions = supportedAggregateFunctions override def isSupportedFunction(funcName: String): Boolean = supportedFunctions.contains(funcName) + class OracleSQLBuilder extends JDBCSQLBuilder { + override def visitAggregateFunction( + funcName: String, isDistinct: Boolean, inputs: Array[String]): String = + if (isDistinct && distinctUnsupportedAggregateFunctions.contains(funcName)) { + throw new UnsupportedOperationException(s"${this.getClass.getSimpleName} does not " + + s"support aggregate function: $funcName with DISTINCT"); + } else { + super.visitAggregateFunction(funcName, isDistinct, inputs) + } + } + + override def compileExpression(expr: Expression): Option[String] = { + val oracleSQLBuilder = new OracleSQLBuilder() + try { + Some(oracleSQLBuilder.build(expr)) + } catch { + case NonFatal(e) => + logWarning("Error occurs while compiling V2 expression", e) + None + } + } + 
private def supportTimeZoneTypes: Boolean = { val timeZone = DateTimeUtils.getTimeZone(SQLConf.get.sessionLocalTimeZone) // TODO: support timezone types when users are not using the JVM timezone, which diff --git a/sql/core/src/main/scala/org/apache/spark/sql/jdbc/PostgresDialect.scala b/sql/core/src/main/scala/org/apache/spark/sql/jdbc/PostgresDialect.scala index cb78bc806e..878d7a7cfe 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/jdbc/PostgresDialect.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/jdbc/PostgresDialect.scala @@ -38,7 +38,8 @@ private object PostgresDialect extends JdbcDialect with SQLConfHelper { // See https://www.postgresql.org/docs/8.4/functions-aggregate.html private val supportedAggregateFunctions = Set("MAX", "MIN", "SUM", "COUNT", "AVG", - "VAR_POP", "VAR_SAMP", "STDDEV_POP", "STDDEV_SAMP", "COVAR_POP", "COVAR_SAMP", "CORR") + "VAR_POP", "VAR_SAMP", "STDDEV_POP", "STDDEV_SAMP", "COVAR_POP", "COVAR_SAMP", "CORR", + "REGR_INTERCEPT", "REGR_R2", "REGR_SLOPE", "REGR_SXY") private val supportedFunctions = supportedAggregateFunctions override def isSupportedFunction(funcName: String): Boolean = diff --git a/sql/core/src/test/resources/sql-functions/sql-expression-schema.md b/sql/core/src/test/resources/sql-functions/sql-expression-schema.md index 78a9ce7c38..2859f7f7a6 100644 --- a/sql/core/src/test/resources/sql-functions/sql-expression-schema.md +++ b/sql/core/src/test/resources/sql-functions/sql-expression-schema.md @@ -320,6 +320,7 @@ | org.apache.spark.sql.catalyst.expressions.TrySubtract | try_subtract | SELECT try_subtract(2, 1) | struct | | org.apache.spark.sql.catalyst.expressions.TryToBinary | try_to_binary | SELECT try_to_binary('abc', 'utf-8') | struct | | org.apache.spark.sql.catalyst.expressions.TryToNumber | try_to_number | SELECT try_to_number('454', '999') | struct | +| org.apache.spark.sql.catalyst.expressions.TryToTimestampExpressionBuilder | try_to_timestamp | SELECT try_to_timestamp('2016-12-31 00:12:00') 
| struct | | org.apache.spark.sql.catalyst.expressions.TypeOf | typeof | SELECT typeof(1) | struct | | org.apache.spark.sql.catalyst.expressions.UnBase64 | unbase64 | SELECT unbase64('U3BhcmsgU1FM') | struct | | org.apache.spark.sql.catalyst.expressions.UnaryMinus | negative | SELECT negative(1) | struct | diff --git a/sql/core/src/test/resources/sql-tests/inputs/ansi/try_datetime_functions.sql b/sql/core/src/test/resources/sql-tests/inputs/ansi/try_datetime_functions.sql new file mode 100644 index 0000000000..ede47f3eec --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/inputs/ansi/try_datetime_functions.sql @@ -0,0 +1 @@ +--IMPORT try_datetime_functions.sql \ No newline at end of file diff --git a/sql/core/src/test/resources/sql-tests/inputs/try_datetime_functions.sql b/sql/core/src/test/resources/sql-tests/inputs/try_datetime_functions.sql new file mode 100644 index 0000000000..7cf67dce2a --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/inputs/try_datetime_functions.sql @@ -0,0 +1,6 @@ +select try_to_timestamp(null), try_to_timestamp('2016-12-31 00:12:00'), try_to_timestamp('2016-12-31', 'yyyy-MM-dd'); +select try_to_timestamp(1); +select try_to_timestamp('2016-12-31 abc'); +select try_to_timestamp('2019-10-06 10:11:12.', 'yyyy-MM-dd HH:mm:ss.SSSSSS[zzz]'); +select try_to_timestamp("02-29", "MM-dd"); +select try_to_timestamp('22 05 2020 Friday', 'dd MM yyyy EEEEEE'); diff --git a/sql/core/src/test/resources/sql-tests/results/ansi/try_datetime_functions.sql.out b/sql/core/src/test/resources/sql-tests/results/ansi/try_datetime_functions.sql.out new file mode 100644 index 0000000000..a2326ee081 --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/results/ansi/try_datetime_functions.sql.out @@ -0,0 +1,49 @@ +-- Automatically generated by SQLQueryTestSuite +-- !query +select try_to_timestamp(null), try_to_timestamp('2016-12-31 00:12:00'), try_to_timestamp('2016-12-31', 'yyyy-MM-dd') +-- !query schema +struct +-- !query output +NULL 2016-12-31 
00:12:00 2016-12-31 00:00:00 + + +-- !query +select try_to_timestamp(1) +-- !query schema +struct +-- !query output +1969-12-31 16:00:01 + + +-- !query +select try_to_timestamp('2016-12-31 abc') +-- !query schema +struct +-- !query output +NULL + + +-- !query +select try_to_timestamp('2019-10-06 10:11:12.', 'yyyy-MM-dd HH:mm:ss.SSSSSS[zzz]') +-- !query schema +struct +-- !query output +NULL + + +-- !query +select try_to_timestamp("02-29", "MM-dd") +-- !query schema +struct +-- !query output +NULL + + +-- !query +select try_to_timestamp('22 05 2020 Friday', 'dd MM yyyy EEEEEE') +-- !query schema +struct<> +-- !query output +org.apache.spark.SparkUpgradeException +[INCONSISTENT_BEHAVIOR_CROSS_VERSION.DATETIME_PATTERN_RECOGNITION] You may get a different result due to the upgrading to Spark >= 3.0: +Fail to recognize 'dd MM yyyy EEEEEE' pattern in the DateTimeFormatter. 1) You can set "spark.sql.legacy.timeParserPolicy" to "LEGACY" to restore the behavior before Spark 3.0. 2) You can form a valid datetime pattern with the guide from https://spark.apache.org/docs/latest/sql-ref-datetime-pattern.html diff --git a/sql/core/src/test/resources/sql-tests/results/try_datetime_functions.sql.out b/sql/core/src/test/resources/sql-tests/results/try_datetime_functions.sql.out new file mode 100644 index 0000000000..a2326ee081 --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/results/try_datetime_functions.sql.out @@ -0,0 +1,49 @@ +-- Automatically generated by SQLQueryTestSuite +-- !query +select try_to_timestamp(null), try_to_timestamp('2016-12-31 00:12:00'), try_to_timestamp('2016-12-31', 'yyyy-MM-dd') +-- !query schema +struct +-- !query output +NULL 2016-12-31 00:12:00 2016-12-31 00:00:00 + + +-- !query +select try_to_timestamp(1) +-- !query schema +struct +-- !query output +1969-12-31 16:00:01 + + +-- !query +select try_to_timestamp('2016-12-31 abc') +-- !query schema +struct +-- !query output +NULL + + +-- !query +select try_to_timestamp('2019-10-06 10:11:12.', 
'yyyy-MM-dd HH:mm:ss.SSSSSS[zzz]') +-- !query schema +struct +-- !query output +NULL + + +-- !query +select try_to_timestamp("02-29", "MM-dd") +-- !query schema +struct +-- !query output +NULL + + +-- !query +select try_to_timestamp('22 05 2020 Friday', 'dd MM yyyy EEEEEE') +-- !query schema +struct<> +-- !query output +org.apache.spark.SparkUpgradeException +[INCONSISTENT_BEHAVIOR_CROSS_VERSION.DATETIME_PATTERN_RECOGNITION] You may get a different result due to the upgrading to Spark >= 3.0: +Fail to recognize 'dd MM yyyy EEEEEE' pattern in the DateTimeFormatter. 1) You can set "spark.sql.legacy.timeParserPolicy" to "LEGACY" to restore the behavior before Spark 3.0. 2) You can form a valid datetime pattern with the guide from https://spark.apache.org/docs/latest/sql-ref-datetime-pattern.html diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFilterSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFilterSuite.scala index 3df99e9d7e..55b6eaa95e 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFilterSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFilterSuite.scala @@ -22,6 +22,7 @@ import java.math.{BigDecimal => JBigDecimal} import java.nio.charset.StandardCharsets import java.sql.{Date, Timestamp} import java.time.{Duration, LocalDate, LocalDateTime, Period, ZoneId} +import java.util.HashSet import scala.reflect.ClassTag import scala.reflect.runtime.universe.TypeTag @@ -29,7 +30,7 @@ import scala.reflect.runtime.universe.TypeTag import org.apache.hadoop.fs.Path import org.apache.parquet.filter2.predicate.{FilterApi, FilterPredicate, Operators} import org.apache.parquet.filter2.predicate.FilterApi._ -import org.apache.parquet.filter2.predicate.Operators.{Column => _, Eq, Gt, GtEq, Lt, LtEq, NotEq, UserDefinedByInstance} +import org.apache.parquet.filter2.predicate.Operators.{Column 
=> _, Eq, Gt, GtEq, In => FilterIn, Lt, LtEq, NotEq, UserDefinedByInstance} import org.apache.parquet.hadoop.{ParquetFileReader, ParquetInputFormat, ParquetOutputFormat} import org.apache.parquet.hadoop.util.HadoopInputFile import org.apache.parquet.schema.MessageType @@ -201,7 +202,7 @@ abstract class ParquetFilterSuite extends QueryTest with ParquetTest with Shared withSQLConf(SQLConf.PARQUET_FILTER_PUSHDOWN_INFILTERTHRESHOLD.key -> s"$threshold") { checkFilterPredicate( In(tsAttr, Array(ts2.ts, ts3.ts, ts4.ts, "2021-05-01 00:01:02".ts).map(Literal.apply)), - if (threshold == 3) classOf[Operators.And] else classOf[Operators.Or], + if (threshold == 3) classOf[FilterIn[_]] else classOf[Operators.Or], Seq(Row(resultFun(ts2)), Row(resultFun(ts3)), Row(resultFun(ts4)))) } } @@ -362,7 +363,7 @@ abstract class ParquetFilterSuite extends QueryTest with ParquetTest with Shared withSQLConf(SQLConf.PARQUET_FILTER_PUSHDOWN_INFILTERTHRESHOLD.key -> s"$threshold") { checkFilterPredicate( In(intAttr, Array(2, 3, 4, 5, 6, 7).map(Literal.apply)), - if (threshold == 3) classOf[Operators.And] else classOf[Operators.Or], + if (threshold == 3) classOf[FilterIn[_]] else classOf[Operators.Or], Seq(Row(resultFun(2)), Row(resultFun(3)), Row(resultFun(4)))) } } @@ -406,7 +407,7 @@ abstract class ParquetFilterSuite extends QueryTest with ParquetTest with Shared withSQLConf(SQLConf.PARQUET_FILTER_PUSHDOWN_INFILTERTHRESHOLD.key -> s"$threshold") { checkFilterPredicate( In(longAttr, Array(2L, 3L, 4L, 5L, 6L, 7L).map(Literal.apply)), - if (threshold == 3) classOf[Operators.And] else classOf[Operators.Or], + if (threshold == 3) classOf[FilterIn[_]] else classOf[Operators.Or], Seq(Row(resultFun(2L)), Row(resultFun(3L)), Row(resultFun(4L)))) } } @@ -450,7 +451,7 @@ abstract class ParquetFilterSuite extends QueryTest with ParquetTest with Shared withSQLConf(SQLConf.PARQUET_FILTER_PUSHDOWN_INFILTERTHRESHOLD.key -> s"$threshold") { checkFilterPredicate( In(floatAttr, Array(2F, 3F, 4F, 5F, 6F, 
7F).map(Literal.apply)), - if (threshold == 3) classOf[Operators.And] else classOf[Operators.Or], + if (threshold == 3) classOf[FilterIn[_]] else classOf[Operators.Or], Seq(Row(resultFun(2F)), Row(resultFun(3F)), Row(resultFun(4F)))) } } @@ -494,7 +495,7 @@ abstract class ParquetFilterSuite extends QueryTest with ParquetTest with Shared withSQLConf(SQLConf.PARQUET_FILTER_PUSHDOWN_INFILTERTHRESHOLD.key -> s"$threshold") { checkFilterPredicate( In(doubleAttr, Array(2.0D, 3.0D, 4.0D, 5.0D, 6.0D, 7.0D).map(Literal.apply)), - if (threshold == 3) classOf[Operators.And] else classOf[Operators.Or], + if (threshold == 3) classOf[FilterIn[_]] else classOf[Operators.Or], Seq(Row(resultFun(2D)), Row(resultFun(3D)), Row(resultFun(4F)))) } } @@ -538,7 +539,7 @@ abstract class ParquetFilterSuite extends QueryTest with ParquetTest with Shared withSQLConf(SQLConf.PARQUET_FILTER_PUSHDOWN_INFILTERTHRESHOLD.key -> s"$threshold") { checkFilterPredicate( In(stringAttr, Array("2", "3", "4", "5", "6", "7").map(Literal.apply)), - if (threshold == 3) classOf[Operators.And] else classOf[Operators.Or], + if (threshold == 3) classOf[FilterIn[_]] else classOf[Operators.Or], Seq(Row(resultFun("2")), Row(resultFun("3")), Row(resultFun("4")))) } } @@ -587,7 +588,7 @@ abstract class ParquetFilterSuite extends QueryTest with ParquetTest with Shared withSQLConf(SQLConf.PARQUET_FILTER_PUSHDOWN_INFILTERTHRESHOLD.key -> s"$threshold") { checkFilterPredicate( In(binaryAttr, Array(2.b, 3.b, 4.b, 5.b, 6.b, 7.b).map(Literal.apply)), - if (threshold == 3) classOf[Operators.And] else classOf[Operators.Or], + if (threshold == 3) classOf[FilterIn[_]] else classOf[Operators.Or], Seq(Row(resultFun(2.b)), Row(resultFun(3.b)), Row(resultFun(4.b)))) } } @@ -664,7 +665,7 @@ abstract class ParquetFilterSuite extends QueryTest with ParquetTest with Shared checkFilterPredicate( In(dateAttr, Array("2018-03-19".date, "2018-03-20".date, "2018-03-21".date, "2018-03-22".date).map(Literal.apply)), - if (threshold == 3) 
classOf[Operators.And] else classOf[Operators.Or], + if (threshold == 3) classOf[FilterIn[_]] else classOf[Operators.Or], Seq(Row(resultFun("2018-03-19")), Row(resultFun("2018-03-20")), Row(resultFun("2018-03-21")))) } @@ -773,7 +774,7 @@ abstract class ParquetFilterSuite extends QueryTest with ParquetTest with Shared checkFilterPredicate( In(decimalAttr, Array(2, 3, 4, 5).map(Literal.apply) .map(_.cast(DecimalType(precision, 2)))), - if (threshold == 3) classOf[Operators.And] else classOf[Operators.Or], + if (threshold == 3) classOf[FilterIn[_]] else classOf[Operators.Or], Seq(Row(resultFun(2)), Row(resultFun(3)), Row(resultFun(4)))) } } @@ -1659,25 +1660,29 @@ abstract class ParquetFilterSuite extends QueryTest with ParquetTest with Shared val parquetFilters = createParquetFilters( new SparkToParquetSchemaConverter(conf).convert(StructType.fromDDL("a int"))) - assertResult(Some(and( - FilterApi.gtEq(intColumn("a"), 1: Integer), - FilterApi.ltEq(intColumn("a"), 20: Integer))) - ) { + var set = new HashSet[Integer]() + (1 to 20) foreach (n => set.add(n)) + + assertResult(Some(FilterApi.in(intColumn("a"), set))) { parquetFilters.createFilter(sources.In("a", (1 to 20).toArray)) } - assertResult(Some(and( - FilterApi.gtEq(intColumn("a"), -200: Integer), - FilterApi.ltEq(intColumn("a"), 40: Integer))) - ) { + set = new HashSet[Integer]() + set.add(-100) + set.add(10) + set.add(-200) + set.add(40) + assertResult(Some(FilterApi.in(intColumn("a"), set))) { parquetFilters.createFilter(sources.In("A", Array(-100, 10, -200, 40))) } - assertResult(Some(or( - FilterApi.eq(intColumn("a"), null: Integer), - and( - FilterApi.gtEq(intColumn("a"), 2: Integer), - FilterApi.ltEq(intColumn("a"), 7: Integer)))) + set = new HashSet[Integer]() + set.add(2) + set.add(3) + set.add(7) + set.add(6) + assertResult( + Some(or(FilterApi.eq(intColumn("a"), null: Integer), FilterApi.in(intColumn("a"), set))) ) { parquetFilters.createFilter(sources.In("a", Array(2, 3, 7, null, 6))) } @@ -1955,7 
+1960,7 @@ abstract class ParquetFilterSuite extends QueryTest with ParquetTest with Shared withSQLConf(SQLConf.PARQUET_FILTER_PUSHDOWN_INFILTERTHRESHOLD.key -> s"$threshold") { checkFilterPredicate( In(iAttr, Array(2, 3, 4, 5, 6, 7).map(monthsLit)), - if (threshold == 3) classOf[Operators.And] else classOf[Operators.Or], + if (threshold == 3) classOf[FilterIn[_]] else classOf[Operators.Or], Seq(Row(resultFun(months(2))), Row(resultFun(months(3))), Row(resultFun(months(4))))) } } @@ -2001,7 +2006,7 @@ abstract class ParquetFilterSuite extends QueryTest with ParquetTest with Shared withSQLConf(SQLConf.PARQUET_FILTER_PUSHDOWN_INFILTERTHRESHOLD.key -> s"$threshold") { checkFilterPredicate( In(iAttr, Array(2, 3, 4, 5, 6, 7).map(secsLit)), - if (threshold == 3) classOf[Operators.And] else classOf[Operators.Or], + if (threshold == 3) classOf[FilterIn[_]] else classOf[Operators.Or], Seq(Row(resultFun(secs(2))), Row(resultFun(secs(3))), Row(resultFun(secs(4))))) } } @@ -2011,33 +2016,39 @@ abstract class ParquetFilterSuite extends QueryTest with ParquetTest with Shared test("SPARK-38825: in and notIn filters") { import testImplicits._ withTempPath { file => - Seq(1, 2, 0, -1, 99, Integer.MAX_VALUE, 1000, 3, 7, Integer.MIN_VALUE, 2) - .toDF("id").coalesce(1).write.mode("overwrite") - .parquet(file.getCanonicalPath) - var df = spark.read.parquet(file.getCanonicalPath) - var in = df.filter(col("id").isin(100, 3, 11, 12, 13, Integer.MAX_VALUE, Integer.MIN_VALUE)) - var notIn = - df.filter(!col("id").isin(100, 3, 11, 12, 13, Integer.MAX_VALUE, Integer.MIN_VALUE)) - checkAnswer(in, Seq(Row(3), Row(-2147483648), Row(2147483647))) - checkAnswer(notIn, Seq(Row(1), Row(2), Row(0), Row(-1), Row(99), Row(1000), Row(7), Row(2))) - - Seq("mary", "martin", "lucy", "alex", null, "mary", "dan").toDF("name").coalesce(1) - .write.mode("overwrite").parquet(file.getCanonicalPath) - df = spark.read.parquet(file.getCanonicalPath) - in = df.filter(col("name").isin("mary", "victor", "leo", 
"alex")) - notIn = df.filter(!col("name").isin("mary", "victor", "leo", "alex")) - checkAnswer(in, Seq(Row("mary"), Row("alex"), Row("mary"))) - checkAnswer(notIn, Seq(Row("martin"), Row("lucy"), Row("dan"))) - - in = df.filter(col("name").isin("mary", "victor", "leo", "alex", null)) - notIn = df.filter(!col("name").isin("mary", "victor", "leo", "alex", null)) - checkAnswer(in, Seq(Row("mary"), Row("alex"), Row("mary"))) - checkAnswer(notIn, Seq()) - - in = df.filter(col("name").isin(null)) - notIn = df.filter(!col("name").isin(null)) - checkAnswer(in, Seq()) - checkAnswer(notIn, Seq()) + Seq(3, 20).foreach { threshold => + withSQLConf(SQLConf.PARQUET_FILTER_PUSHDOWN_INFILTERTHRESHOLD.key -> s"$threshold") { + Seq(1, 2, 0, -1, 99, Integer.MAX_VALUE, 1000, 3, 7, Integer.MIN_VALUE, 2) + .toDF("id").coalesce(1).write.mode("overwrite") + .parquet(file.getCanonicalPath) + var df = spark.read.parquet(file.getCanonicalPath) + var in = df.filter(col("id") + .isin(100, 3, 11, 12, 13, Integer.MAX_VALUE, Integer.MIN_VALUE)) + var notIn = df.filter(!col("id") + .isin(100, 3, 11, 12, 13, Integer.MAX_VALUE, Integer.MIN_VALUE)) + checkAnswer(in, Seq(Row(3), Row(-2147483648), Row(2147483647))) + checkAnswer(notIn, + Seq(Row(1), Row(2), Row(0), Row(-1), Row(99), Row(1000), Row(7), Row(2))) + + Seq("mary", "martin", "lucy", "alex", null, "mary", "dan").toDF("name").coalesce(1) + .write.mode("overwrite").parquet(file.getCanonicalPath) + df = spark.read.parquet(file.getCanonicalPath) + in = df.filter(col("name").isin("mary", "victor", "leo", "alex")) + notIn = df.filter(!col("name").isin("mary", "victor", "leo", "alex")) + checkAnswer(in, Seq(Row("mary"), Row("alex"), Row("mary"))) + checkAnswer(notIn, Seq(Row("martin"), Row("lucy"), Row("dan"))) + + in = df.filter(col("name").isin("mary", "victor", "leo", "alex", null)) + notIn = df.filter(!col("name").isin("mary", "victor", "leo", "alex", null)) + checkAnswer(in, Seq(Row("mary"), Row("alex"), Row("mary"))) + checkAnswer(notIn, 
Seq()) + + in = df.filter(col("name").isin(null)) + notIn = df.filter(!col("name").isin(null)) + checkAnswer(in, Seq()) + checkAnswer(notIn, Seq()) + } + } } } }