diff --git a/integration_tests/run_pyspark_from_build.sh b/integration_tests/run_pyspark_from_build.sh
index 63331d6cab3..3ee7f0b615c 100755
--- a/integration_tests/run_pyspark_from_build.sh
+++ b/integration_tests/run_pyspark_from_build.sh
@@ -223,11 +223,14 @@ else
         export PYSP_TEST_spark_jars="${ALL_JARS//:/,}"
     fi

+    # Time zone to run the tests in; defaults to UTC
+    TEST_TZ=${TEST_TZ:-UTC}
+
     # Set the Delta log cache size to prevent the driver from caching every Delta log indefinitely
-    export PYSP_TEST_spark_driver_extraJavaOptions="-ea -Duser.timezone=UTC -Ddelta.log.cacheSize=10 $COVERAGE_SUBMIT_FLAGS"
-    export PYSP_TEST_spark_executor_extraJavaOptions='-ea -Duser.timezone=UTC'
+    export PYSP_TEST_spark_driver_extraJavaOptions="-ea -Duser.timezone=$TEST_TZ -Ddelta.log.cacheSize=10 $COVERAGE_SUBMIT_FLAGS"
+    export PYSP_TEST_spark_executor_extraJavaOptions="-ea -Duser.timezone=$TEST_TZ"
     export PYSP_TEST_spark_ui_showConsoleProgress='false'
-    export PYSP_TEST_spark_sql_session_timeZone='UTC'
+    export PYSP_TEST_spark_sql_session_timeZone=$TEST_TZ
     export PYSP_TEST_spark_sql_shuffle_partitions='4'
     # prevent cluster shape to change
     export PYSP_TEST_spark_dynamicAllocation_enabled='false'
diff --git a/integration_tests/src/main/python/aqe_test.py b/integration_tests/src/main/python/aqe_test.py
index dd683c04fd2..60a6095d925 100755
--- a/integration_tests/src/main/python/aqe_test.py
+++ b/integration_tests/src/main/python/aqe_test.py
@@ -17,7 +17,7 @@
 from pyspark.sql.types import *
 from asserts import assert_gpu_and_cpu_are_equal_collect, assert_cpu_and_gpu_are_equal_collect_with_capture
 from data_gen import *
-from marks import ignore_order, allow_non_gpu
+from marks import ignore_order, allow_non_gpu, disable_timezone_test
 from spark_session import with_cpu_session, is_databricks113_or_later

 _adaptive_conf = { "spark.sql.adaptive.enabled": "true" }
@@ -195,6 +195,7 @@ def do_it(spark):
 @ignore_order(local=True)
 @allow_non_gpu('BroadcastNestedLoopJoinExec', 'Cast', 'DateSub', *db_113_cpu_bnlj_join_allow)
 @pytest.mark.parametrize('join', joins, ids=idfn)
+@disable_timezone_test
 def test_aqe_join_reused_exchange_inequality_condition(spark_tmp_path, join):
     data_path = spark_tmp_path + '/PARQUET_DATA'
     def prep(spark):
diff --git a/integration_tests/src/main/python/asserts.py b/integration_tests/src/main/python/asserts.py
index 27b33848c2c..6860db90532 100644
--- a/integration_tests/src/main/python/asserts.py
+++ b/integration_tests/src/main/python/asserts.py
@@ -213,6 +213,21 @@ def bring_back(spark):
             return (df.collect(), df)
         collect_type = 'COLLECT'
         return (bring_back, collect_type)
+    elif mode == "COLLECT_ERROR_WITH_DATAFRAME":
+        def bring_back(spark):
+            """
+            Return the error raised by collect() together with the DataFrame.
+            If there is no error, the error is returned as an empty string.
+            """
+            df = limit_func(spark)
+            collect_error = ""
+            try:
+                df.collect()
+            except Exception as e:
+                collect_error = str(e)
+            return (collect_error, df)
+        collect_type = 'COLLECT'
+        return (bring_back, collect_type)
     else:
         bring_back = lambda spark: limit_func(spark).toLocalIterator()
         collect_type = 'ITERATOR'
@@ -444,6 +459,30 @@ def assert_gpu_fallback_collect(func,

     assert_equal(from_cpu, from_gpu)

+def assert_gpu_fallback_and_collect_with_error(func,
+        cpu_fallback_class_name,
+        error_message,
+        conf={}):
+    (bring_back, collect_type) = _prep_func_for_compare(func, 'COLLECT_ERROR_WITH_DATAFRAME')
+    conf = _prep_incompat_conf(conf)
+
+    print('### CPU RUN ###')
+    cpu_start = time.time()
+    collect_error, cpu_df = with_cpu_session(bring_back, conf=conf)
+    assert error_message in
collect_error, f"Expected error '{error_message}' did not appear in '{collect_error}'" + cpu_end = time.time() + + print('### GPU RUN ###') + gpu_start = time.time() + collect_error, gpu_df = with_gpu_session(bring_back, conf=conf) + assert error_message in collect_error, f"Expected error '{error_message}' did not appear in '{collect_error}'" + gpu_end = time.time() + jvm = spark_jvm() + jvm.org.apache.spark.sql.rapids.ExecutionPlanCaptureCallback.assertDidFallBack(gpu_df._jdf, cpu_fallback_class_name) + print('### {}: GPU TOOK {} CPU TOOK {} ###'.format(collect_type, + gpu_end - gpu_start, cpu_end - cpu_start)) + + def assert_gpu_sql_fallback_collect(df_fun, cpu_fallback_class_name, table_name, sql, conf=None, debug=False): if conf is None: conf = {} @@ -622,6 +661,25 @@ def do_it_all(spark): return spark.sql(sql) assert_gpu_and_cpu_are_equal_collect(do_it_all, conf, is_cpu_first=is_cpu_first) +def assert_gpu_fallback_sql(df_fun, table_name, sql, fallback_class_name, conf=None): + """ + Assert that the specified SQL query produces equal results on CPU and GPU. + :param df_fun: a function that will create the dataframe + :param table_name: Name of table to be created with the dataframe + :param sql: SQL query to be run on the specified table + :param fallback_class_name: Name of the class that GPU falls back to + :param conf: Any user-specified confs. Empty by default. + :return: Assertion failure, if results from CPU and GPU do not match. + """ + if conf is None: + conf = {} + def do_it_all(spark): + df = df_fun(spark) + df.createOrReplaceTempView(table_name) + return spark.sql(sql) + assert_gpu_fallback_collect(do_it_all, fallback_class_name, conf) + + def assert_spark_exception(func, error_message): """ Assert that a specific Java exception is thrown diff --git a/integration_tests/src/main/python/cast_test.py b/integration_tests/src/main/python/cast_test.py index 16c946b2811..b72087f6a3a 100644 --- a/integration_tests/src/main/python/cast_test.py +++ b/integration_tests/src/main/python/cast_test.py @@ -14,7 +14,8 @@ import pytest -from asserts import assert_gpu_and_cpu_are_equal_collect, assert_gpu_and_cpu_are_equal_sql, assert_gpu_and_cpu_error, assert_gpu_fallback_collect, assert_spark_exception +from asserts import assert_gpu_and_cpu_are_equal_collect, assert_gpu_and_cpu_are_equal_sql, assert_gpu_and_cpu_error, assert_gpu_fallback_collect, assert_spark_exception, assert_gpu_fallback_and_collect_with_error +from conftest import is_utc, is_not_utc from data_gen import * from spark_session import is_before_spark_320, is_before_spark_330, is_spark_340_or_later, \ is_databricks113_or_later @@ -61,6 +62,7 @@ def test_cast_nested(data_gen, to_type): assert_gpu_and_cpu_are_equal_collect( lambda spark : unary_op_df(spark, data_gen).select(f.col('a').cast(to_type))) +@pytest.mark.xfail(is_not_utc(), reason="TODO sub-issue in https://github.com/NVIDIA/spark-rapids/issues/9653 to support non-UTC tz for Cast from StringType to DateType") def test_cast_string_date_valid_format(): # In Spark 3.2.0+ the valid format changed, and we cannot support all of the format. # This provides values that are valid in all of those formats. 
@@ -68,6 +70,17 @@ def test_cast_string_date_valid_format(): lambda spark : unary_op_df(spark, StringGen('[0-9]{1,4}-[0-9]{1,2}-[0-9]{1,2}')).select(f.col('a').cast(DateType())), conf = {'spark.rapids.sql.hasExtendedYearValues': 'false'}) +@allow_non_gpu('ProjectExec') +@pytest.mark.skipif(is_utc(), reason="TODO sub-issue in https://github.com/NVIDIA/spark-rapids/issues/9653 to support non-UTC tz for Cast from StringType to DateType") +def test_cast_string_date_valid_format_for_non_utc(): + # In Spark 3.2.0+ the valid format changed, and we cannot support all of the format. + # This provides values that are valid in all of those formats. + assert_gpu_fallback_collect( + lambda spark : unary_op_df(spark, StringGen('[0-9]{1,4}-[0-9]{1,2}-[0-9]{1,2}')).select(f.col('a').cast(DateType())), + "Cast", + conf = {'spark.rapids.sql.hasExtendedYearValues': 'false'}) + + invalid_values_string_to_date = ['200', ' 1970A', '1970 A', '1970T', # not conform to "yyyy" after trim '1970 T', ' 1970-01T', '1970-01 A', # not conform to "yyyy-[M]M" after trim # not conform to 'yyyy-[M]M-[d]d', "yyyy-[M]M-[d]d *" or "yyyy-[M]M-[d]d T*" after trim @@ -89,6 +102,7 @@ def test_cast_string_date_valid_format(): # Spark 320+ and databricks support Ansi mode when casting string to date # This means an exception will be thrown when casting invalid string to date on Spark 320+ or databricks # test Spark versions < 3.2.0 and non databricks, ANSI mode +@pytest.mark.xfail(is_not_utc(), reason="TODO sub-issue in https://github.com/NVIDIA/spark-rapids/issues/9653 to support non-UTC tz for Cast from StringType to DateType") @pytest.mark.skipif(not is_before_spark_320(), reason="ansi cast(string as date) throws exception only in 3.2.0+ or db") def test_cast_string_date_invalid_ansi_before_320(): data_rows = [(v,) for v in values_string_to_data] @@ -97,8 +111,22 @@ def test_cast_string_date_invalid_ansi_before_320(): conf={'spark.rapids.sql.hasExtendedYearValues': 'false', 'spark.sql.ansi.enabled': 'true'}, ) +# Spark 320+ and databricks support Ansi mode when casting string to date +# This means an exception will be thrown when casting invalid string to date on Spark 320+ or databricks +# test Spark versions < 3.2.0 and non databricks, ANSI mode +@allow_non_gpu("ProjectExec") +@pytest.mark.skipif(is_utc(), reason="TODO sub-issue in https://github.com/NVIDIA/spark-rapids/issues/9653 to support non-UTC tz for Cast from StringType to DateType") +@pytest.mark.skipif(not is_before_spark_320(), reason="ansi cast(string as date) throws exception only in 3.2.0+ or db") +def test_cast_string_date_invalid_ansi_before_320_for_non_utc(): + data_rows = [(v,) for v in values_string_to_data] + assert_gpu_and_cpu_are_equal_collect( + lambda spark: spark.createDataFrame(data_rows, "a string").select(f.col('a').cast(DateType())), + conf={'spark.rapids.sql.hasExtendedYearValues': 'false', + 'spark.sql.ansi.enabled': 'true'}, ) + # test Spark versions >= 320 and databricks, ANSI mode, valid values @pytest.mark.skipif(is_before_spark_320(), reason="Spark versions(< 320) not support Ansi mode when casting string to date") +@pytest.mark.xfail(is_not_utc(), reason="TODO sub-issue in https://github.com/NVIDIA/spark-rapids/issues/9653 to support non-UTC tz for Cast from StringType to DateType") def test_cast_string_date_valid_ansi(): data_rows = [(v,) for v in valid_values_string_to_date] assert_gpu_and_cpu_are_equal_collect( @@ -106,6 +134,20 @@ def test_cast_string_date_valid_ansi(): conf={'spark.rapids.sql.hasExtendedYearValues': 'false', 
'spark.sql.ansi.enabled': 'true'}) + +@allow_non_gpu('ProjectExec') +@pytest.mark.skipif(is_before_spark_320(), reason="Spark versions(< 320) not support Ansi mode when casting string to date") +@pytest.mark.skipif(is_utc(), reason="TODO sub-issue in https://github.com/NVIDIA/spark-rapids/issues/9653 to support non-UTC tz for Cast from StringType to DateType") +def test_cast_string_date_valid_ansi_for_non_utc(): + data_rows = [(v,) for v in valid_values_string_to_date] + assert_gpu_fallback_collect( + lambda spark: spark.createDataFrame(data_rows, "a string").select(f.col('a').cast(DateType())), + 'Cast', + conf={'spark.rapids.sql.hasExtendedYearValues': 'false', + 'spark.sql.ansi.enabled': 'true'}) + + +@pytest.mark.xfail(is_not_utc(), reason="TODO sub-issue in https://github.com/NVIDIA/spark-rapids/issues/9653 to support non-UTC tz for Cast from StringType to DateType") # test Spark versions >= 320, ANSI mode @pytest.mark.skipif(is_before_spark_320(), reason="ansi cast(string as date) throws exception only in 3.2.0+") @pytest.mark.parametrize('invalid', invalid_values_string_to_date) @@ -117,6 +159,21 @@ def test_cast_string_date_invalid_ansi(invalid): error_message="DateTimeException") +@allow_non_gpu('ProjectExec') +@pytest.mark.skipif(is_utc(), reason="TODO sub-issue in https://github.com/NVIDIA/spark-rapids/issues/9653 to support non-UTC tz for Cast from StringType to DateType") +# test Spark versions >= 320, ANSI mode +@pytest.mark.skipif(is_before_spark_320(), reason="ansi cast(string as date) throws exception only in 3.2.0+") +@pytest.mark.parametrize('invalid', invalid_values_string_to_date) +def test_cast_string_date_invalid_ansi_for_non_utc(invalid): + assert_gpu_fallback_and_collect_with_error( + lambda spark: spark.createDataFrame([(invalid,)], "a string").select(f.col('a').cast(DateType())), + 'Cast', + error_message="DateTimeException", + conf={'spark.rapids.sql.hasExtendedYearValues': 'false', + 'spark.sql.ansi.enabled': 'true'} + ) + + # test try_cast in Spark versions >= 320 and < 340 @pytest.mark.skipif(is_before_spark_320() or is_spark_340_or_later() or is_databricks113_or_later(), reason="try_cast only in Spark 3.2+") @allow_non_gpu('ProjectExec', 'TryCast') @@ -139,6 +196,7 @@ def test_try_cast_fallback_340(invalid): conf={'spark.rapids.sql.hasExtendedYearValues': False, 'spark.sql.ansi.enabled': True}) +@pytest.mark.xfail(is_not_utc(), reason="TODO sub-issue in https://github.com/NVIDIA/spark-rapids/issues/9653 to support non-UTC tz for Cast from StringType to DateType") # test all Spark versions, non ANSI mode, invalid value will be converted to NULL def test_cast_string_date_non_ansi(): data_rows = [(v,) for v in values_string_to_data] @@ -146,6 +204,19 @@ def test_cast_string_date_non_ansi(): lambda spark: spark.createDataFrame(data_rows, "a string").select(f.col('a').cast(DateType())), conf={'spark.rapids.sql.hasExtendedYearValues': 'false'}) + +@allow_non_gpu('ProjectExec') +@pytest.mark.skipif(is_utc(), reason="TODO sub-issue in https://github.com/NVIDIA/spark-rapids/issues/9653 to support non-UTC tz for Cast from StringType to DateType") +# test all Spark versions, non ANSI mode, invalid value will be converted to NULL +def test_cast_string_date_non_ansi_for_non_utc(): + data_rows = [(v,) for v in values_string_to_data] + assert_gpu_fallback_collect( + lambda spark: spark.createDataFrame(data_rows, "a string").select(f.col('a').cast(DateType())), + 'Cast', + conf={'spark.rapids.sql.hasExtendedYearValues': 'false'}) + + +@pytest.mark.xfail(is_not_utc(), 
reason="TODO sub-issue in https://github.com/NVIDIA/spark-rapids/issues/9653 to support non-UTC tz for Cast from StringType to TimeStampType") @pytest.mark.parametrize('data_gen', [StringGen('[0-9]{1,4}-[0-9]{1,2}-[0-9]{1,2}'), StringGen('[0-9]{1,4}-[0-3][0-9]-[0-5][0-9][ |T][0-3][0-9]:[0-6][0-9]:[0-6][0-9]'), StringGen('[0-9]{1,4}-[0-3][0-9]-[0-5][0-9][ |T][0-3][0-9]:[0-6][0-9]:[0-6][0-9].[0-9]{0,6}Z?')], @@ -158,6 +229,22 @@ def test_cast_string_ts_valid_format(data_gen): conf = {'spark.rapids.sql.hasExtendedYearValues': 'false', 'spark.rapids.sql.castStringToTimestamp.enabled': 'true'}) +@allow_non_gpu('ProjectExec') +@pytest.mark.skipif(is_utc(), reason="TODO sub-issue in https://github.com/NVIDIA/spark-rapids/issues/9653 to support non-UTC tz for Cast from StringType to TimeStampType") +@pytest.mark.parametrize('data_gen', [StringGen('[0-9]{1,4}-[0-9]{1,2}-[0-9]{1,2}'), + StringGen('[0-9]{1,4}-[0-3][0-9]-[0-5][0-9][ |T][0-3][0-9]:[0-6][0-9]:[0-6][0-9]'), + StringGen('[0-9]{1,4}-[0-3][0-9]-[0-5][0-9][ |T][0-3][0-9]:[0-6][0-9]:[0-6][0-9].[0-9]{0,6}Z?')], + ids=idfn) +def test_cast_string_ts_valid_format_for_non_utc(data_gen): + # In Spark 3.2.0+ the valid format changed, and we cannot support all of the format. + # This provides values that are valid in all of those formats. + assert_gpu_fallback_collect( + lambda spark : unary_op_df(spark, data_gen).select(f.col('a').cast(TimestampType())), + 'Cast', + conf = {'spark.rapids.sql.hasExtendedYearValues': 'false', + 'spark.rapids.sql.castStringToTimestamp.enabled': 'true'}) + + @allow_non_gpu('ProjectExec', 'Cast', 'Alias') @pytest.mark.skipif(is_before_spark_320(), reason="Only in Spark 3.2.0+ do we have issues with extended years") def test_cast_string_date_fallback(): @@ -294,14 +381,37 @@ def _assert_cast_to_string_equal (data_gen, conf): conf ) +# split all_array_gens_for_cast_to_string +# remove below split and merge tests: "TODO sub-issue in https://github.com/NVIDIA/spark-rapids/issues/9653 to support non-UTC tz for Cast from Date/Timestamp to String" +gens_for_non_utc_strs = [ + "Array(Date)", "Array(Timestamp)", "Array(Map(Byte(not_null),Date))", "Array(Struct(['child0', Byte],['child1', String],['child2', Date]))"] +gens_for_utc, gens_for_non_utc = split_list(all_array_gens_for_cast_to_string, gens_for_non_utc_strs) -@pytest.mark.parametrize('data_gen', all_array_gens_for_cast_to_string, ids=idfn) +@pytest.mark.parametrize('data_gen', gens_for_utc, ids=idfn) @pytest.mark.parametrize('legacy', ['true', 'false']) def test_cast_array_to_string(data_gen, legacy): _assert_cast_to_string_equal( - data_gen, + data_gen, + {"spark.sql.legacy.castComplexTypesToString.enabled": legacy}) + + +@pytest.mark.xfail(is_not_utc(), reason="TODO sub-issue in https://github.com/NVIDIA/spark-rapids/issues/9653 to support non-UTC tz for Cast from Date/Timestamp to String") +@pytest.mark.parametrize('data_gen', gens_for_non_utc, ids=idfn) +@pytest.mark.parametrize('legacy', ['true', 'false']) +def test_cast_array_to_string_2(data_gen, legacy): + assert_gpu_and_cpu_are_equal_collect( + lambda spark: unary_op_df(spark, data_gen).select(f.col('a').cast("STRING")), {"spark.sql.legacy.castComplexTypesToString.enabled": legacy}) +@allow_non_gpu('ProjectExec') +@pytest.mark.skipif(is_utc(), reason="TODO sub-issue in https://github.com/NVIDIA/spark-rapids/issues/9653 to support non-UTC tz for from Cast from Date/Timestamp to String") +@pytest.mark.parametrize('data_gen', gens_for_non_utc, ids=idfn) +@pytest.mark.parametrize('legacy', ['true', 'false']) +def 
test_cast_array_to_string_2_for_non_utc(data_gen, legacy): + assert_gpu_fallback_collect( + lambda spark: unary_op_df(spark, data_gen).select(f.col('a').cast("STRING")), + 'Cast', + {"spark.sql.legacy.castComplexTypesToString.enabled": legacy}) @pytest.mark.parametrize('data_gen', [ArrayGen(sub) for sub in not_matched_struct_array_gens_for_cast_to_string], ids=idfn) @pytest.mark.parametrize('legacy', ['true', 'false']) @@ -313,14 +423,35 @@ def test_cast_array_with_unmatched_element_to_string(data_gen, legacy): "spark.sql.legacy.castComplexTypesToString.enabled": legacy} ) - -@pytest.mark.parametrize('data_gen', basic_map_gens_for_cast_to_string, ids=idfn) +# split basic_map_gens_for_cast_to_string +# remove below split and merge tests: "TODO sub-issue in https://github.com/NVIDIA/spark-rapids/issues/9653 to support non-UTC tz for Cast from Date/Timestamp to String" +split_items = ['Map(Date(not_null),Date)', 'Map(Timestamp(not_null),Timestamp)'] +gens_for_utc, gens_for_non_utc = split_list(basic_map_gens_for_cast_to_string, split_items) +@pytest.mark.parametrize('data_gen', gens_for_utc, ids=idfn) @pytest.mark.parametrize('legacy', ['true', 'false']) def test_cast_map_to_string(data_gen, legacy): + _assert_cast_to_string_equal( + data_gen, + {"spark.sql.legacy.castComplexTypesToString.enabled": legacy}) + +@pytest.mark.xfail(is_not_utc(), reason="TODO sub-issue in https://github.com/NVIDIA/spark-rapids/issues/9653 to support non-UTC tz for Cast from Date/Timestamp to String") +@pytest.mark.parametrize('data_gen', gens_for_non_utc, ids=idfn) +@pytest.mark.parametrize('legacy', ['true', 'false']) +def test_cast_map_to_string_2(data_gen, legacy): _assert_cast_to_string_equal( data_gen, {"spark.sql.legacy.castComplexTypesToString.enabled": legacy}) +@allow_non_gpu('ProjectExec') +@pytest.mark.skipif(is_utc(), reason="TODO sub-issue in https://github.com/NVIDIA/spark-rapids/issues/9653 to support non-UTC tz for Cast from Date/Timestamp to String") +@pytest.mark.parametrize('data_gen', gens_for_non_utc, ids=idfn) +@pytest.mark.parametrize('legacy', ['true', 'false']) +def test_cast_map_to_string_2_for_non_utc(data_gen, legacy): + assert_gpu_fallback_collect( + lambda spark: unary_op_df(spark, data_gen).select(f.col('a').cast("STRING")), + 'Cast', + {"spark.sql.legacy.castComplexTypesToString.enabled": legacy}) + @pytest.mark.parametrize('data_gen', not_matched_map_gens_for_cast_to_string, ids=idfn) @pytest.mark.parametrize('legacy', ['true', 'false']) @@ -333,6 +464,7 @@ def test_cast_map_with_unmatched_element_to_string(data_gen, legacy): ) +@pytest.mark.xfail(is_not_utc(), reason="TODO sub-issue in https://github.com/NVIDIA/spark-rapids/issues/9653 to support non-UTC tz for Cast from Date/Timestamp to String") @pytest.mark.parametrize('data_gen', [StructGen([[str(i), gen] for i, gen in enumerate(basic_array_struct_gens_for_cast_to_string)] + [["map", MapGen(ByteGen(nullable=False), null_gen)]])], ids=idfn) @pytest.mark.parametrize('legacy', ['true', 'false']) def test_cast_struct_to_string(data_gen, legacy): @@ -341,6 +473,17 @@ def test_cast_struct_to_string(data_gen, legacy): {"spark.sql.legacy.castComplexTypesToString.enabled": legacy} ) +@allow_non_gpu('ProjectExec') +@pytest.mark.skipif(is_utc(), reason="TODO sub-issue in https://github.com/NVIDIA/spark-rapids/issues/9653 to support non-UTC tz for Cast from Date/Timestamp to String") +@pytest.mark.parametrize('data_gen', [StructGen([[str(i), gen] for i, gen in enumerate(basic_array_struct_gens_for_cast_to_string)] + [["map", 
MapGen(ByteGen(nullable=False), null_gen)]])], ids=idfn) +@pytest.mark.parametrize('legacy', ['true', 'false']) +def test_cast_struct_to_string_for_non_utc(data_gen, legacy): + assert_gpu_fallback_collect( + lambda spark: unary_op_df(spark, data_gen).select(f.col('a').cast("STRING")), + 'Cast', + {"spark.sql.legacy.castComplexTypesToString.enabled": legacy} + ) + # https://github.com/NVIDIA/spark-rapids/issues/2309 @pytest.mark.parametrize('cast_conf', ['LEGACY', 'SPARK311+']) def test_one_nested_null_field_legacy_cast(cast_conf): @@ -506,11 +649,21 @@ def test_cast_timestamp_to_numeric_non_ansi(): .selectExpr("cast(a as byte)", "cast(a as short)", "cast(a as int)", "cast(a as long)", "cast(a as float)", "cast(a as double)")) +@pytest.mark.xfail(is_not_utc(), reason="TODO sub-issue in https://github.com/NVIDIA/spark-rapids/issues/9653 to support non-UTC tz for Cast from Timestamp to String") def test_cast_timestamp_to_string(): assert_gpu_and_cpu_are_equal_collect( lambda spark: unary_op_df(spark, timestamp_gen) .selectExpr("cast(a as string)")) + +@allow_non_gpu('ProjectExec') +@pytest.mark.skipif(is_utc(), reason="TODO sub-issue in https://github.com/NVIDIA/spark-rapids/issues/9653 to support non-UTC tz for Cast from Timestamp to String") +def test_cast_timestamp_to_string_non_utc(): + assert_gpu_fallback_collect( + lambda spark: unary_op_df(spark, timestamp_gen) + .selectExpr("cast(a as string)"), + 'Cast') + @pytest.mark.skipif(is_before_spark_330(), reason='DayTimeInterval is not supported before Pyspark 3.3.0') def test_cast_day_time_interval_to_string(): _assert_cast_to_string_equal(DayTimeIntervalGen(start_field='day', end_field='day', special_cases=[MIN_DAY_TIME_INTERVAL, MAX_DAY_TIME_INTERVAL, timedelta(seconds=0)]), {}) diff --git a/integration_tests/src/main/python/cmp_test.py b/integration_tests/src/main/python/cmp_test.py index 91fde8afeea..77de20f94dd 100644 --- a/integration_tests/src/main/python/cmp_test.py +++ b/integration_tests/src/main/python/cmp_test.py @@ -14,8 +14,10 @@ import pytest -from asserts import assert_gpu_and_cpu_are_equal_collect +from asserts import assert_gpu_and_cpu_are_equal_collect, assert_gpu_fallback_collect +from conftest import is_utc, is_not_utc from data_gen import * +from marks import allow_non_gpu from spark_session import with_cpu_session, is_before_spark_330 from pyspark.sql.types import * import pyspark.sql.functions as f @@ -290,6 +292,7 @@ def test_filter_with_project(data_gen): # no columns to actually filter. We are making it happen here with a sub-query # and some constants that then make it so all we need is the number of rows # of input. +@pytest.mark.xfail(is_not_utc(), reason="TODO sub-issue in https://github.com/NVIDIA/spark-rapids/issues/9653 to support non-UTC tz for DateAddInterval") @pytest.mark.parametrize('op', ['>', '<']) def test_empty_filter(op, spark_tmp_path): @@ -307,6 +310,26 @@ def do_it(spark): return spark.sql(f"select * from empty_filter_test2 where test {op} current_date") assert_gpu_and_cpu_are_equal_collect(do_it) + +@allow_non_gpu('ProjectExec', 'FilterExec') +@pytest.mark.skipif(is_utc(), reason="TODO sub-issue in https://github.com/NVIDIA/spark-rapids/issues/9653 to support non-UTC tz for DateAddInterval") +@pytest.mark.parametrize('op', ['>', '<']) +def test_empty_filter_for_non_utc(op, spark_tmp_path): + + def do_it(spark): + df = spark.createDataFrame([(14, "Tom"), (23, "Alice"), (16, "Bob")], ["age", "name"]) + # we repartition the data to 1 because for some reason Spark can write 4 files for 3 rows. 
+        # In this case that causes a race condition with the last aggregation which can result
+        # in a null being returned. For some reason this happens a lot on the GPU in local mode
+        # and not on the CPU in local mode.
+        df.repartition(1).write.mode("overwrite").parquet(spark_tmp_path)
+        df = spark.read.parquet(spark_tmp_path)
+        curDate = df.withColumn("current_date", f.current_date())
+        curDate.createOrReplaceTempView("empty_filter_test_curDate")
+        spark.sql("select current_date, ((select last(current_date) from empty_filter_test_curDate) + interval 1 day) as test from empty_filter_test_curDate").createOrReplaceTempView("empty_filter_test2")
+        return spark.sql(f"select * from empty_filter_test2 where test {op} current_date")
+    assert_gpu_fallback_collect(do_it, 'DateAddInterval')
+
 def test_nondeterministic_filter():
     assert_gpu_and_cpu_are_equal_collect(
         lambda spark : unary_op_df(spark, LongGen(), 1).filter(f.rand(0) > 0.5))
diff --git a/integration_tests/src/main/python/collection_ops_test.py b/integration_tests/src/main/python/collection_ops_test.py
index 5751323ecee..482fb78d927 100644
--- a/integration_tests/src/main/python/collection_ops_test.py
+++ b/integration_tests/src/main/python/collection_ops_test.py
@@ -14,8 +14,10 @@
 import pytest

-from asserts import assert_gpu_and_cpu_are_equal_collect, assert_gpu_and_cpu_error
+from asserts import assert_gpu_and_cpu_are_equal_collect, assert_gpu_fallback_collect, assert_gpu_fallback_and_collect_with_error
+from conftest import is_utc, is_not_utc
 from data_gen import *
+from marks import allow_non_gpu
 from pyspark.sql.types import *
 from string_test import mk_str_gen
 import pyspark.sql.functions as f

@@ -248,6 +250,7 @@ def test_sort_array_normalize_nans():
 sequence_normal_no_step_integral_gens = [(gens[0], gens[1]) for gens in
                                          sequence_normal_integral_gens]

+@pytest.mark.xfail(is_not_utc(), reason="TODO sub-issue in https://github.com/NVIDIA/spark-rapids/issues/9653 to support non-UTC tz for Sequence")
 @pytest.mark.parametrize('start_gen,stop_gen', sequence_normal_no_step_integral_gens, ids=idfn)
 def test_sequence_without_step(start_gen, stop_gen):
     assert_gpu_and_cpu_are_equal_collect(
@@ -256,7 +259,20 @@ def test_sequence_without_step(start_gen, stop_gen):
             "sequence(a, 20)",
             "sequence(20, b)"))

+@allow_non_gpu('ProjectExec')
+@pytest.mark.skipif(is_utc(), reason="TODO sub-issue in https://github.com/NVIDIA/spark-rapids/issues/9653 to support non-UTC tz for Sequence")
+@pytest.mark.parametrize('start_gen,stop_gen', sequence_normal_no_step_integral_gens, ids=idfn)
+def test_sequence_without_step_for_non_utc(start_gen, stop_gen):
+    assert_gpu_fallback_collect(
+        lambda spark: two_col_df(spark, start_gen, stop_gen).selectExpr(
+            "sequence(a, b)",
+            "sequence(a, 20)",
+            "sequence(20, b)"),
+        'Sequence')
+
+
+@pytest.mark.xfail(is_not_utc(), reason="TODO sub-issue in https://github.com/NVIDIA/spark-rapids/issues/9653 to support non-UTC tz for Sequence")
 @pytest.mark.parametrize('start_gen,stop_gen,step_gen', sequence_normal_integral_gens, ids=idfn)
 def test_sequence_with_step(start_gen, stop_gen, step_gen):
     # Get a step scalar from the 'step_gen' which follows the rules.
step_gen.start(random.Random(0)) @@ -271,6 +290,24 @@ def test_sequence_with_step(start_gen, stop_gen, step_gen): "sequence(20, 20, c)", "sequence(20, b, {})".format(step_lit))) +@allow_non_gpu('ProjectExec') +@pytest.mark.skipif(is_utc(), reason="TODO sub-issue in https://github.com/NVIDIA/spark-rapids/issues/9653 to support non-UTC tz for Sequence") +@pytest.mark.parametrize('start_gen,stop_gen,step_gen', sequence_normal_integral_gens, ids=idfn) +def test_sequence_with_step_for_non_utc(start_gen, stop_gen, step_gen): + # Get a step scalar from the 'step_gen' which follows the rules. + step_gen.start(random.Random(0)) + step_lit = step_gen.gen() + assert_gpu_fallback_collect( + lambda spark: three_col_df(spark, start_gen, stop_gen, step_gen).selectExpr( + "sequence(a, b, c)", + "sequence(a, b, {})".format(step_lit), + "sequence(a, 20, c)", + "sequence(a, 20, {})".format(step_lit), + "sequence(20, b, c)", + "sequence(20, 20, c)", + "sequence(20, b, {})".format(step_lit)), + 'Sequence') + # Illegal sequence boundaries: # step > 0, but start > stop # step < 0, but start < stop @@ -299,6 +336,7 @@ def test_sequence_with_step(start_gen, stop_gen, step_gen): IntegerGen(min_val=0, max_val=0, special_cases=[])) ] +@pytest.mark.xfail(is_not_utc(), reason="TODO sub-issue in https://github.com/NVIDIA/spark-rapids/issues/9653 to support non-UTC tz for Sequence") @pytest.mark.parametrize('start_gen,stop_gen,step_gen', sequence_illegal_boundaries_integral_gens, ids=idfn) def test_sequence_illegal_boundaries(start_gen, stop_gen, step_gen): assert_gpu_and_cpu_error( @@ -306,6 +344,18 @@ def test_sequence_illegal_boundaries(start_gen, stop_gen, step_gen): "sequence(a, b, c)").collect(), conf = {}, error_message = "Illegal sequence boundaries") +@pytest.mark.skipif(is_utc(), reason="TODO sub-issue in https://github.com/NVIDIA/spark-rapids/issues/9653 to support non-UTC tz for Sequence") +@allow_non_gpu('ProjectExec') +@pytest.mark.parametrize('start_gen,stop_gen,step_gen', sequence_illegal_boundaries_integral_gens, ids=idfn) +def test_sequence_illegal_boundaries_for_non_utc(start_gen, stop_gen, step_gen): + assert_gpu_fallback_and_collect_with_error( + lambda spark:three_col_df(spark, start_gen, stop_gen, step_gen).selectExpr( + "sequence(a, b, c)"), + 'Sequence', + error_message = "Illegal sequence boundaries", + conf = {}) + + # Exceed the max length of a sequence # "Too long sequence: xxxxxxxxxx. Should be <= 2147483632" sequence_too_long_length_gens = [ @@ -313,6 +363,7 @@ def test_sequence_illegal_boundaries(start_gen, stop_gen, step_gen): LongGen(min_val=2147483635, max_val=2147483635, special_cases=[None]) ] +@pytest.mark.xfail(is_not_utc(), reason="TODO sub-issue in https://github.com/NVIDIA/spark-rapids/issues/9653 to support non-UTC tz for Sequence") @pytest.mark.parametrize('stop_gen', sequence_too_long_length_gens, ids=idfn) def test_sequence_too_long_sequence(stop_gen): assert_gpu_and_cpu_error( @@ -321,6 +372,18 @@ def test_sequence_too_long_sequence(stop_gen): "sequence(0, a)").collect(), conf = {}, error_message = "Too long sequence") +@allow_non_gpu('ProjectExec') +@pytest.mark.skipif(is_utc(), reason="TODO sub-issue in https://github.com/NVIDIA/spark-rapids/issues/9653 to support non-UTC tz for Sequence") +@pytest.mark.parametrize('stop_gen', sequence_too_long_length_gens, ids=idfn) +def test_sequence_too_long_sequence_for_non_utc(stop_gen): + assert_gpu_fallback_and_collect_with_error( + # To avoid OOM, reduce the row number to 1, it is enough to verify this case. 
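+        # assert_gpu_fallback_and_collect_with_error verifies that both the CPU and GPU runs raise
+        # the expected "Too long sequence" error and that 'Sequence' falls back to the CPU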
+ lambda spark:unary_op_df(spark, stop_gen, 1).selectExpr( + "sequence(0, a)"), + 'Sequence', + error_message = "Too long sequence", + conf = {}) + def get_sequence_cases_mixed_df(spark, length=2048): # Generate the sequence data following the 3 rules mixed in a single dataset. # (step > num.zero && start <= stop) || @@ -354,8 +417,18 @@ def get_sequence_data(gen, len): SparkContext.getOrCreate().parallelize(get_sequence_data(data_gen, length)), mixed_schema) +@pytest.mark.xfail(is_not_utc(), reason="TODO sub-issue in https://github.com/NVIDIA/spark-rapids/issues/9653 to support non-UTC tz for Sequence") # test for 3 cases mixed in a single dataset def test_sequence_with_step_mixed_cases(): assert_gpu_and_cpu_are_equal_collect( lambda spark: get_sequence_cases_mixed_df(spark) .selectExpr("sequence(a, b, c)")) + + +@allow_non_gpu('ProjectExec') +@pytest.mark.skipif(is_utc(), reason="TODO sub-issue in https://github.com/NVIDIA/spark-rapids/issues/9653 to support non-UTC tz for Sequence") +def test_sequence_with_step_mixed_cases_for_non_utc(): + assert_gpu_fallback_collect( + lambda spark: get_sequence_cases_mixed_df(spark) + .selectExpr("sequence(a, b, c)"), + 'Sequence') \ No newline at end of file diff --git a/integration_tests/src/main/python/conditionals_test.py b/integration_tests/src/main/python/conditionals_test.py index 006c500c5b6..9b2889c5793 100644 --- a/integration_tests/src/main/python/conditionals_test.py +++ b/integration_tests/src/main/python/conditionals_test.py @@ -16,6 +16,8 @@ from asserts import assert_gpu_and_cpu_are_equal_collect from data_gen import * +from conftest import is_not_utc, is_utc +from marks import allow_non_gpu from spark_session import is_before_spark_320, is_jvm_charset_utf8 from pyspark.sql.types import * import pyspark.sql.functions as f @@ -229,6 +231,7 @@ def test_conditional_with_side_effects_case_when(data_gen): ELSE -1 END'), conf = test_conf) +@pytest.mark.xfail(is_not_utc(), reason="TODO sub-issue in https://github.com/NVIDIA/spark-rapids/issues/9653 to support non-UTC") @pytest.mark.parametrize('data_gen', [mk_str_gen('[a-z]{0,3}')], ids=idfn) def test_conditional_with_side_effects_sequence(data_gen): assert_gpu_and_cpu_are_equal_collect( @@ -238,6 +241,18 @@ def test_conditional_with_side_effects_sequence(data_gen): ELSE null END'), conf = ansi_enabled_conf) +@allow_non_gpu('ProjectExec') +@pytest.mark.xfail(is_utc(), reason="TODO sub-issue in https://github.com/NVIDIA/spark-rapids/issues/9653 to support non-UTC") +@pytest.mark.parametrize('data_gen', [mk_str_gen('[a-z]{0,3}')], ids=idfn) +def test_conditional_with_side_effects_sequence_for_non_utc(data_gen): + assert_gpu_and_cpu_are_equal_collect( + lambda spark : unary_op_df(spark, data_gen).selectExpr( + 'CASE \ + WHEN length(a) > 0 THEN sequence(1, length(a), 1) \ + ELSE null END'), + conf = ansi_enabled_conf) + +@pytest.mark.xfail(is_not_utc(), reason="TODO sub-issue in https://github.com/NVIDIA/spark-rapids/issues/9653 to support non-UTC") @pytest.mark.skipif(is_before_spark_320(), reason='Earlier versions of Spark cannot cast sequence to string') @pytest.mark.parametrize('data_gen', [mk_str_gen('[a-z]{0,3}')], ids=idfn) def test_conditional_with_side_effects_sequence_cast(data_gen): @@ -248,6 +263,18 @@ def test_conditional_with_side_effects_sequence_cast(data_gen): ELSE null END'), conf = ansi_enabled_conf) +@allow_non_gpu('ProjectExec') +@pytest.mark.xfail(is_utc(), reason="TODO sub-issue in https://github.com/NVIDIA/spark-rapids/issues/9653 to support non-UTC") 
+@pytest.mark.skipif(is_before_spark_320(), reason='Earlier versions of Spark cannot cast sequence to string') +@pytest.mark.parametrize('data_gen', [mk_str_gen('[a-z]{0,3}')], ids=idfn) +def test_conditional_with_side_effects_sequence_cast_for_non_utc(data_gen): + assert_gpu_and_cpu_are_equal_collect( + lambda spark : unary_op_df(spark, data_gen).selectExpr( + 'CASE \ + WHEN length(a) > 0 THEN CAST(sequence(1, length(a), 1) AS STRING) \ + ELSE null END'), + conf = ansi_enabled_conf) + @pytest.mark.parametrize('data_gen', [ArrayGen(mk_str_gen('[a-z]{0,3}'))], ids=idfn) @pytest.mark.parametrize('ansi_enabled', ['true', 'false']) def test_conditional_with_side_effects_element_at(data_gen, ansi_enabled): diff --git a/integration_tests/src/main/python/conftest.py b/integration_tests/src/main/python/conftest.py index 2ce21505686..fa30f0faa39 100644 --- a/integration_tests/src/main/python/conftest.py +++ b/integration_tests/src/main/python/conftest.py @@ -77,6 +77,15 @@ def is_emr_runtime(): def is_dataproc_runtime(): return runtime_env() == "dataproc" +def get_test_tz(): + return os.environ.get('PYSP_TEST_spark_sql_session_timeZone', 'UTC') + +def is_utc(): + return get_test_tz() == "UTC" + +def is_not_utc(): + return not is_utc() + _is_nightly_run = False _is_precommit_run = False @@ -226,6 +235,9 @@ def pytest_runtest_setup(item): if not item.config.getoption('pyarrow_test'): pytest.skip('tests for pyarrow not configured to run') + if item.get_closest_marker('disable_timezone_test'): + pytest.skip('Skip because this case is not ready for non UTC time zone') + def pytest_configure(config): global _runtime_env _runtime_env = config.getoption('runtime_env') @@ -415,3 +427,11 @@ def enable_fuzz_test(request): if not enable_fuzz_test: # fuzz tests are not required for any test runs pytest.skip("fuzz_test not configured to run") + +# Whether add a non UTC timezone test for all the existing test cases +# By default, test non UTC timezone +_enable_timezone_test = True + +def disable_timezone_test(): + global _enable_timezone_test + return _enable_timezone_test is False diff --git a/integration_tests/src/main/python/csv_test.py b/integration_tests/src/main/python/csv_test.py index 19ad8d29151..78b577cbcaf 100644 --- a/integration_tests/src/main/python/csv_test.py +++ b/integration_tests/src/main/python/csv_test.py @@ -16,7 +16,7 @@ from asserts import assert_gpu_and_cpu_are_equal_collect, assert_gpu_and_cpu_error, assert_gpu_and_cpu_row_counts_equal, assert_gpu_fallback_write, \ assert_cpu_and_gpu_are_equal_collect_with_capture, assert_gpu_fallback_collect -from conftest import get_non_gpu_allowed +from conftest import get_non_gpu_allowed, is_not_utc, is_utc from datetime import datetime, timezone from data_gen import * from marks import * @@ -197,8 +197,6 @@ def read_impl(spark): ('Performance_2007Q3.txt_0', _perf_schema, {'sep': '|'}), ('ts.csv', _date_schema, {}), ('date.csv', _date_schema, {}), - ('ts.csv', _ts_schema, {}), - ('str.csv', _ts_schema, {}), ('str.csv', _bad_str_schema, {'header': 'true'}), ('str.csv', _good_str_schema, {'header': 'true'}), ('no-comments.csv', _three_str_schema, {}), @@ -257,6 +255,35 @@ def test_basic_csv_read(std_input_path, name, schema, options, read_func, v1_ena assert_gpu_and_cpu_are_equal_collect(read_func(std_input_path + '/' + name, schema, spark_tmp_table_factory, options), conf=updated_conf) +@pytest.mark.xfail(is_not_utc(), reason="TODO sub-issue in https://github.com/NVIDIA/spark-rapids/issues/9653 to support non-UTC") +@approximate_float 
+@pytest.mark.parametrize('name,schema,options', [('ts.csv', _ts_schema, {}), ('str.csv', _ts_schema, {}),], ids=idfn) +@pytest.mark.parametrize('read_func', [read_csv_df, read_csv_sql]) +@pytest.mark.parametrize('v1_enabled_list', ["", "csv"]) +@pytest.mark.parametrize('ansi_enabled', ["true", "false"]) +def test_basic_csv_read_with_tz(std_input_path, name, schema, options, read_func, v1_enabled_list, ansi_enabled, spark_tmp_table_factory): + updated_conf=copy_and_update(_enable_all_types_conf, { + 'spark.sql.sources.useV1SourceList': v1_enabled_list, + 'spark.sql.ansi.enabled': ansi_enabled + }) + assert_gpu_and_cpu_are_equal_collect(read_func(std_input_path + '/' + name, schema, spark_tmp_table_factory, options), + conf=updated_conf) + +@allow_non_gpu('FileSourceScanExec', 'BatchScanExec') +@pytest.mark.xfail(is_utc(), reason="TODO sub-issue in https://github.com/NVIDIA/spark-rapids/issues/9653 to support non-UTC") +@approximate_float +@pytest.mark.parametrize('name,schema,options', [('ts.csv', _ts_schema, {}), ('str.csv', _ts_schema, {}),], ids=idfn) +@pytest.mark.parametrize('read_func', [read_csv_df, read_csv_sql]) +@pytest.mark.parametrize('v1_enabled_list', ["", "csv"]) +@pytest.mark.parametrize('ansi_enabled', ["true", "false"]) +def test_basic_csv_read_with_tz_for_non_utc(std_input_path, name, schema, options, read_func, v1_enabled_list, ansi_enabled, spark_tmp_table_factory): + updated_conf=copy_and_update(_enable_all_types_conf, { + 'spark.sql.sources.useV1SourceList': v1_enabled_list, + 'spark.sql.ansi.enabled': ansi_enabled + }) + assert_gpu_and_cpu_are_equal_collect(read_func(std_input_path + '/' + name, schema, spark_tmp_table_factory, options), + conf=updated_conf) + @pytest.mark.parametrize('name,schema,options', [ pytest.param('small_float_values.csv', _float_schema, {'header': 'true'}), pytest.param('small_float_values.csv', _double_schema, {'header': 'true'}), @@ -272,7 +299,7 @@ def test_csv_read_small_floats(std_input_path, name, schema, options, read_func, assert_gpu_and_cpu_are_equal_collect(read_func(std_input_path + '/' + name, schema, spark_tmp_table_factory, options), conf=updated_conf) -csv_supported_gens = [ +csv_supported_gens_without_ts = [ # Spark does not escape '\r' or '\n' even though it uses it to mark end of record # This would require multiLine reads to work correctly so we avoid these chars StringGen('(\\w| |\t|\ud720){0,10}', nullable=False), @@ -283,11 +310,12 @@ def test_csv_read_small_floats(std_input_path, name, schema, options, read_func, DoubleGen(no_nans=False), pytest.param(double_gen), pytest.param(FloatGen(no_nans=False)), - pytest.param(float_gen), - TimestampGen()] + pytest.param(float_gen)] + +csv_supported_gens = csv_supported_gens_without_ts + [TimestampGen()] @approximate_float -@pytest.mark.parametrize('data_gen', csv_supported_gens, ids=idfn) +@pytest.mark.parametrize('data_gen', csv_supported_gens_without_ts, ids=idfn) @pytest.mark.parametrize('v1_enabled_list', ["", "csv"]) def test_round_trip(spark_tmp_path, data_gen, v1_enabled_list): gen = StructGen([('a', data_gen)], nullable=False) @@ -300,6 +328,37 @@ def test_round_trip(spark_tmp_path, data_gen, v1_enabled_list): lambda spark : spark.read.schema(schema).csv(data_path), conf=updated_conf) +@pytest.mark.xfail(is_not_utc(), reason="TODO sub-issue in https://github.com/NVIDIA/spark-rapids/issues/9653 to support non-UTC") +@approximate_float +@pytest.mark.parametrize('data_gen', [TimestampGen()], ids=idfn) +@pytest.mark.parametrize('v1_enabled_list', ["", "csv"]) +def 
test_round_trip_with_ts(spark_tmp_path, data_gen, v1_enabled_list): + gen = StructGen([('a', data_gen)], nullable=False) + data_path = spark_tmp_path + '/CSV_DATA' + schema = gen.data_type + updated_conf = copy_and_update(_enable_all_types_conf, {'spark.sql.sources.useV1SourceList': v1_enabled_list}) + with_cpu_session( + lambda spark : gen_df(spark, gen).write.csv(data_path)) + assert_gpu_and_cpu_are_equal_collect( + lambda spark : spark.read.schema(schema).csv(data_path), + conf=updated_conf) + +@allow_non_gpu('FileSourceScanExec', 'BatchScanExec') +@pytest.mark.xfail(is_utc(), reason="TODO sub-issue in https://github.com/NVIDIA/spark-rapids/issues/9653 to support non-UTC") +@approximate_float +@pytest.mark.parametrize('data_gen', [TimestampGen()], ids=idfn) +@pytest.mark.parametrize('v1_enabled_list', ["", "csv"]) +def test_round_trip_with_ts_for_non_utc(spark_tmp_path, data_gen, v1_enabled_list): + gen = StructGen([('a', data_gen)], nullable=False) + data_path = spark_tmp_path + '/CSV_DATA' + schema = gen.data_type + updated_conf = copy_and_update(_enable_all_types_conf, {'spark.sql.sources.useV1SourceList': v1_enabled_list}) + with_cpu_session( + lambda spark : gen_df(spark, gen).write.csv(data_path)) + assert_gpu_and_cpu_are_equal_collect( + lambda spark : spark.read.schema(schema).csv(data_path), + conf=updated_conf) + @allow_non_gpu('org.apache.spark.sql.execution.LeafExecNode') @pytest.mark.parametrize('read_func', [read_csv_df, read_csv_sql]) @pytest.mark.parametrize('disable_conf', ['spark.rapids.sql.format.csv.enabled', 'spark.rapids.sql.format.csv.read.enabled']) @@ -402,6 +461,7 @@ def test_read_valid_and_invalid_dates(std_input_path, filename, v1_enabled_list, "'T'HH:mm[:ss]", "'T'HH:mm"] +@pytest.mark.xfail(is_not_utc(), reason="TODO sub-issue in https://github.com/NVIDIA/spark-rapids/issues/9653 to support non-UTC") @pytest.mark.parametrize('ts_part', csv_supported_ts_parts) @pytest.mark.parametrize('date_format', csv_supported_date_formats) @pytest.mark.parametrize('v1_enabled_list', ["", "csv"]) @@ -424,6 +484,30 @@ def test_ts_formats_round_trip(spark_tmp_path, date_format, ts_part, v1_enabled_ .csv(data_path), conf=updated_conf) +@allow_non_gpu('FileSourceScanExec', 'BatchScanExec') +@pytest.mark.xfail(is_utc(), reason="TODO sub-issue in https://github.com/NVIDIA/spark-rapids/issues/9653 to support non-UTC") +@pytest.mark.parametrize('ts_part', csv_supported_ts_parts) +@pytest.mark.parametrize('date_format', csv_supported_date_formats) +@pytest.mark.parametrize('v1_enabled_list', ["", "csv"]) +def test_ts_formats_round_trip_for_non_utc(spark_tmp_path, date_format, ts_part, v1_enabled_list): + full_format = date_format + ts_part + data_gen = TimestampGen() + gen = StructGen([('a', data_gen)], nullable=False) + data_path = spark_tmp_path + '/CSV_DATA' + schema = gen.data_type + with_cpu_session( + lambda spark : gen_df(spark, gen).write \ + .option('timestampFormat', full_format) \ + .csv(data_path)) + updated_conf = copy_and_update(_enable_all_types_conf, + {'spark.sql.sources.useV1SourceList': v1_enabled_list}) + assert_gpu_and_cpu_are_equal_collect( + lambda spark : spark.read \ + .schema(schema) \ + .option('timestampFormat', full_format) \ + .csv(data_path), + conf=updated_conf) + @pytest.mark.parametrize('v1_enabled_list', ["", "csv"]) def test_input_meta(spark_tmp_path, v1_enabled_list): gen = StructGen([('a', long_gen), ('b', long_gen)], nullable=False) @@ -617,11 +701,15 @@ def do_read(spark): non_exist_classes = cpu_scan_class, conf = conf) + 
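+# The next two tests split the preferDate coverage by session time zone: the original test is expected
+# to fail for non-UTC time zones (tracked in https://github.com/NVIDIA/spark-rapids/issues/9653), and
+# the *_for_non_utc variant below checks that the CSV scan stays on the CPU (FileSourceScanExec).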
+
+@pytest.mark.xfail(is_not_utc(), reason="TODO sub-issue in https://github.com/NVIDIA/spark-rapids/issues/9653 to support non-UTC")
 @allow_non_gpu('FileSourceScanExec', 'CollectLimitExec', 'DeserializeToObjectExec')
 @pytest.mark.skipif(is_before_spark_340(), reason='`preferDate` is only supported in Spark 340+')
 def test_csv_prefer_date_with_infer_schema(spark_tmp_path):
-    # start date ""0001-01-02" required due to: https://github.com/NVIDIA/spark-rapids/issues/5606
-    data_gens = [byte_gen, short_gen, int_gen, long_gen, boolean_gen, timestamp_gen, DateGen(start=date(1, 1, 2))]
+    # start date "0002-01-02" is required due to: https://github.com/NVIDIA/spark-rapids/issues/5606
+    # "0001-01-02" cannot be used as the start date because in some time zones it may be converted
+    # to year 0000, which is not supported by PySpark
+    data_gens = [byte_gen, short_gen, int_gen, long_gen, boolean_gen, timestamp_gen, DateGen(start=date(2, 1, 2))]
     gen_list = [('_c' + str(i), gen) for i, gen in enumerate(data_gens)]
     data_path = spark_tmp_path + '/CSV_DATA'

@@ -636,6 +724,28 @@ def test_csv_prefer_date_with_infer_schema(spark_tmp_path):
         exist_classes = 'GpuFileSourceScanExec',
         non_exist_classes = 'FileSourceScanExec')

+@pytest.mark.xfail(is_utc(), reason="TODO sub-issue in https://github.com/NVIDIA/spark-rapids/issues/9653 to support non-UTC")
+@allow_non_gpu('FileSourceScanExec', 'CollectLimitExec', 'DeserializeToObjectExec')
+@pytest.mark.skipif(is_before_spark_340(), reason='`preferDate` is only supported in Spark 340+')
+def test_csv_prefer_date_with_infer_schema_for_non_utc(spark_tmp_path):
+    # start date "0002-01-02" is required due to: https://github.com/NVIDIA/spark-rapids/issues/5606
+    # "0001-01-02" cannot be used as the start date because in some time zones it may be converted
+    # to year 0000, which is not supported by PySpark
+    data_gens = [byte_gen, short_gen, int_gen, long_gen, boolean_gen, timestamp_gen, DateGen(start=date(2, 1, 2))]
+    gen_list = [('_c' + str(i), gen) for i, gen in enumerate(data_gens)]
+    data_path = spark_tmp_path + '/CSV_DATA'
+
+    with_cpu_session(lambda spark: gen_df(spark, gen_list).write.csv(data_path))
+
+    assert_cpu_and_gpu_are_equal_collect_with_capture(
+        lambda spark: spark.read.option("inferSchema", "true").csv(data_path),
+        exist_classes = 'FileSourceScanExec',
+        non_exist_classes = 'GpuFileSourceScanExec')
+    assert_cpu_and_gpu_are_equal_collect_with_capture(
+        lambda spark: spark.read.option("inferSchema", "true").option("preferDate", "false").csv(data_path),
+        exist_classes = 'FileSourceScanExec',
+        non_exist_classes = 'GpuFileSourceScanExec')
+
 @allow_non_gpu('FileSourceScanExec')
 @pytest.mark.skipif(is_before_spark_340(), reason='enableDateTimeParsingFallback is supported from Spark3.4.0')
 @pytest.mark.parametrize('filename,schema',[("date.csv", _date_schema), ("date.csv", _ts_schema),
diff --git a/integration_tests/src/main/python/data_gen.py b/integration_tests/src/main/python/data_gen.py
index fdfc595bf4e..4d65b5d38cf 100644
--- a/integration_tests/src/main/python/data_gen.py
+++ b/integration_tests/src/main/python/data_gen.py
@@ -21,10 +21,9 @@
 from pyspark.sql.types import *
 import pyspark.sql.functions as f
 import random
-from spark_session import is_tz_utc, is_before_spark_340, with_cpu_session
+from spark_session import is_before_spark_340, with_cpu_session
 import sre_yield
 import struct
-from conftest import skip_unless_precommit_tests
 import time
 import os
 from functools import lru_cache
@@ -749,10 +748,6 @@ def gen_bytes():
             return bytes([ rand.randint(0, 255) for _
in range(length) ])
         self._start(rand, gen_bytes)

-def skip_if_not_utc():
-    if (not is_tz_utc()):
-        skip_unless_precommit_tests('The java system time zone is not set to UTC')
-
 # Note: Current(2023/06/06) maxmium IT data size is 7282688 bytes, so LRU cache with maxsize 128
 # will lead to 7282688 * 128 = 932 MB additional memory usage in edge case, which is acceptable.
 @lru_cache(maxsize=128, typed=True)
@@ -771,10 +766,6 @@ def gen_df(spark, data_gen, length=2048, seed=0, num_slices=None):
     # we cannot create a data frame from a nullable struct
     assert not data_gen.nullable

-    # Before we get too far we need to verify that we can run with timestamps
-    if src.contains_ts():
-        skip_if_not_utc()
-
     data = gen_df_help(src, length, seed)

     # We use `numSlices` to create an RDD with the specific number of partitions,
@@ -822,10 +813,6 @@ def _gen_scalars_common(data_gen, count, seed=0):
     else:
         src = data_gen

-    # Before we get too far we need to verify that we can run with timestamps
-    if src.contains_ts():
-        skip_if_not_utc()
-
     rand = random.Random(seed)
     src.start(rand)
     return src
@@ -1173,3 +1160,49 @@ def get_25_partitions_df(spark):
         StructField("c3", IntegerType())])
     data = [[i, j, k] for i in range(0, 5) for j in range(0, 5) for k in range(0, 100)]
     return spark.createDataFrame(data, schema)
+
+# If the time zone is non-UTC and the rebase mode is LEGACY, writing to Parquet will fail because the
+# GPU does not support it yet. On Databricks the default datetime rebase mode is LEGACY, unlike
+# regular Spark, so some of the cases will fail if the time zone is non-UTC on DB.
+# The following configs are for DB and ensure the rebase mode is not LEGACY on DB.
+writer_confs_for_DB = {
+    'spark.sql.parquet.datetimeRebaseModeInWrite': 'CORRECTED',
+    'spark.sql.parquet.datetimeRebaseModeInRead': 'CORRECTED',
+    'spark.sql.parquet.int96RebaseModeInWrite' : 'CORRECTED',
+    'spark.sql.parquet.int96RebaseModeInRead' : 'CORRECTED'
+}
+
+
+def split_list(input_list, split_item_strings):
+    """
+    Split a gen list into 2 sub-lists:
+    the first does not contain `split_item_strings`, the second does
+    """
+    include = []
+    rest = []
+    for item in input_list:
+        find = False
+        for split_item_str in split_item_strings:
+            # convert to string, then compare
+            if str(split_item_str) == str(item):
+                find = True
+                break
+        if find:
+            include.append(item)
+        else:
+            rest.append(item)
+    return (rest, include)
+
+def split_timestamp(input_list):
+    """
+    Split a gen list into 2 sub-lists:
+    the first does not contain timestamps, the second does
+    """
+    ts_gens = []
+    other_gens = []
+    for item in input_list:
+        if 'Timestamp' in str(item):
+            ts_gens.append(item)
+        else:
+            other_gens.append(item)
+    return (other_gens, ts_gens)
\ No newline at end of file
diff --git a/integration_tests/src/main/python/date_time_test.py b/integration_tests/src/main/python/date_time_test.py
index 0e58be01c44..cb464762c9b 100644
--- a/integration_tests/src/main/python/date_time_test.py
+++ b/integration_tests/src/main/python/date_time_test.py
@@ -13,7 +13,8 @@
 # limitations under the License.
import pytest -from asserts import assert_gpu_and_cpu_are_equal_collect, assert_gpu_fallback_collect, assert_gpu_and_cpu_error +from asserts import assert_gpu_and_cpu_are_equal_collect, assert_gpu_and_cpu_are_equal_sql, assert_gpu_fallback_collect, assert_gpu_and_cpu_error, assert_spark_exception, with_gpu_session +from conftest import is_utc, is_not_utc from data_gen import * from datetime import date, datetime, timezone from marks import ignore_order, incompat, allow_non_gpu @@ -24,7 +25,9 @@ # We only support literal intervals for TimeSub vals = [(-584, 1563), (1943, 1101), (2693, 2167), (2729, 0), (44, 1534), (2635, 3319), (1885, -2828), (0, 2463), (932, 2286), (0, 0)] + @pytest.mark.parametrize('data_gen', vals, ids=idfn) +@pytest.mark.xfail(is_not_utc(), reason="TODO sub-issue in https://github.com/NVIDIA/spark-rapids/issues/9653 to support non UTC TZ") def test_timesub(data_gen): days, seconds = data_gen assert_gpu_and_cpu_are_equal_collect( @@ -32,7 +35,18 @@ def test_timesub(data_gen): lambda spark: unary_op_df(spark, TimestampGen(start=datetime(15, 1, 1, tzinfo=timezone.utc)), seed=1) .selectExpr("a - (interval {} days {} seconds)".format(days, seconds))) +@pytest.mark.skipif(is_utc(), reason="TODO sub-issue in https://github.com/NVIDIA/spark-rapids/issues/9653 to support non UTC TZ") +@allow_non_gpu('ProjectExec') +@pytest.mark.parametrize('data_gen', vals, ids=idfn) +def test_timesub_for_non_utc(data_gen): + days, seconds = data_gen + assert_gpu_fallback_collect( + lambda spark: unary_op_df(spark, TimestampGen(start=datetime(15, 1, 1, tzinfo=timezone.utc)), seed=1) + .selectExpr("a - (interval {} days {} seconds)".format(days, seconds)), + 'TimeAdd') + @pytest.mark.parametrize('data_gen', vals, ids=idfn) +@pytest.mark.xfail(is_not_utc(), reason="TODO sub-issue in https://github.com/NVIDIA/spark-rapids/issues/9653 to support non UTC TZ") def test_timeadd(data_gen): days, seconds = data_gen assert_gpu_and_cpu_are_equal_collect( @@ -40,8 +54,19 @@ def test_timeadd(data_gen): # and beyond year 10000 while doing TimeAdd lambda spark: unary_op_df(spark, TimestampGen(start=datetime(5, 1, 1, tzinfo=timezone.utc), end=datetime(15, 1, 1, tzinfo=timezone.utc)), seed=1) .selectExpr("a + (interval {} days {} seconds)".format(days, seconds))) + +@pytest.mark.skipif(is_utc(), reason="TODO sub-issue in https://github.com/NVIDIA/spark-rapids/issues/9653 to support non UTC TZ") +@allow_non_gpu('ProjectExec') +@pytest.mark.parametrize('data_gen', vals, ids=idfn) +def test_timeadd_for_non_utc(data_gen): + days, seconds = data_gen + assert_gpu_fallback_collect( + lambda spark: unary_op_df(spark, TimestampGen(start=datetime(5, 1, 1, tzinfo=timezone.utc), end=datetime(15, 1, 1, tzinfo=timezone.utc)), seed=1) + .selectExpr("a + (interval {} days {} seconds)".format(days, seconds)), + 'TimeAdd') @pytest.mark.skipif(is_before_spark_330(), reason='DayTimeInterval is not supported before Pyspark 3.3.0') +@pytest.mark.xfail(is_not_utc(), reason="TODO sub-issue in https://github.com/NVIDIA/spark-rapids/issues/9653 to support non UTC TZ") def test_timeadd_daytime_column(): gen_list = [ # timestamp column max year is 1000 @@ -51,6 +76,19 @@ def test_timeadd_daytime_column(): assert_gpu_and_cpu_are_equal_collect( lambda spark: gen_df(spark, gen_list).selectExpr("t + d", "t + INTERVAL '1 02:03:04' DAY TO SECOND")) +@pytest.mark.skipif(is_utc(), reason="TODO sub-issue in https://github.com/NVIDIA/spark-rapids/issues/9653 to support non UTC TZ") +@pytest.mark.skipif(is_before_spark_330(), reason='DayTimeInterval 
is not supported before Pyspark 3.3.0') +@allow_non_gpu('ProjectExec') +def test_timeadd_daytime_column_for_non_utc(): + gen_list = [ + # timestamp column max year is 1000 + ('t', TimestampGen(end=datetime(1000, 1, 1, tzinfo=timezone.utc))), + # max days is 8000 year, so added result will not be out of range + ('d', DayTimeIntervalGen(min_value=timedelta(days=0), max_value=timedelta(days=8000 * 365)))] + assert_gpu_fallback_collect( + lambda spark: gen_df(spark, gen_list).selectExpr("t + d", "t + INTERVAL '1 02:03:04' DAY TO SECOND"), + 'TimeAdd') + @pytest.mark.skipif(is_before_spark_350(), reason='DayTimeInterval overflow check for seconds is not supported before Spark 3.5.0') def test_interval_seconds_overflow_exception(): assert_gpu_and_cpu_error( @@ -58,8 +96,8 @@ def test_interval_seconds_overflow_exception(): conf={}, error_message="IllegalArgumentException") -@pytest.mark.parametrize('data_gen', vals, ids=idfn) -def test_timeadd_from_subquery(data_gen): +@pytest.mark.xfail(is_not_utc(), reason="TODO sub-issue in https://github.com/NVIDIA/spark-rapids/issues/9653 to support non UTC TZ") +def test_timeadd_from_subquery(): def fun(spark): df = unary_op_df(spark, TimestampGen(start=datetime(5, 1, 1, tzinfo=timezone.utc), end=datetime(15, 1, 1, tzinfo=timezone.utc)), seed=1) @@ -69,8 +107,20 @@ def fun(spark): assert_gpu_and_cpu_are_equal_collect(fun) -@pytest.mark.parametrize('data_gen', vals, ids=idfn) -def test_timesub_from_subquery(data_gen): +@pytest.mark.skipif(is_utc(), reason="TODO sub-issue in https://github.com/NVIDIA/spark-rapids/issues/9653 to support non UTC TZ") +@allow_non_gpu('ProjectExec', 'FilterExec') +def test_timeadd_from_subquery_for_non_utc(): + + def fun(spark): + df = unary_op_df(spark, TimestampGen(start=datetime(5, 1, 1, tzinfo=timezone.utc), end=datetime(15, 1, 1, tzinfo=timezone.utc)), seed=1) + df.createOrReplaceTempView("testTime") + spark.sql("select a, ((select max(a) from testTime) + interval 1 day) as datePlus from testTime").createOrReplaceTempView("testTime2") + return spark.sql("select * from testTime2 where datePlus > current_timestamp") + + assert_gpu_fallback_collect(fun, 'TimeAdd') + +@pytest.mark.xfail(is_not_utc(), reason="TODO sub-issue in https://github.com/NVIDIA/spark-rapids/issues/9653 to support non UTC TZ") +def test_timesub_from_subquery(): def fun(spark): df = unary_op_df(spark, TimestampGen(start=datetime(5, 1, 1, tzinfo=timezone.utc), end=datetime(15, 1, 1, tzinfo=timezone.utc)), seed=1) @@ -80,11 +130,24 @@ def fun(spark): assert_gpu_and_cpu_are_equal_collect(fun) +@pytest.mark.skipif(is_utc(), reason="TODO sub-issue in https://github.com/NVIDIA/spark-rapids/issues/9653 to support non UTC TZ") +@allow_non_gpu('ProjectExec', 'FilterExec') +def test_timesub_from_subquery_for_non_utc(): + + def fun(spark): + df = unary_op_df(spark, TimestampGen(start=datetime(5, 1, 1, tzinfo=timezone.utc), end=datetime(15, 1, 1, tzinfo=timezone.utc)), seed=1) + df.createOrReplaceTempView("testTime") + spark.sql("select a, ((select min(a) from testTime) - interval 1 day) as dateMinus from testTime").createOrReplaceTempView("testTime2") + return spark.sql("select * from testTime2 where dateMinus < current_timestamp") + + assert_gpu_fallback_collect(fun, 'TimeAdd') + # Should specify `spark.sql.legacy.interval.enabled` to test `DateAddInterval` after Spark 3.2.0, # refer to https://issues.apache.org/jira/browse/SPARK-34896 # [SPARK-34896][SQL] Return day-time interval from dates subtraction # 1. 
Add the SQL config `spark.sql.legacy.interval.enabled` which will control when Spark SQL should use `CalendarIntervalType` instead of ANSI intervals. @pytest.mark.parametrize('data_gen', vals, ids=idfn) +@pytest.mark.xfail(is_not_utc(), reason="TODO sub-issue in https://github.com/NVIDIA/spark-rapids/issues/9653 to support non-UTC tz for DateAddInterval") def test_dateaddinterval(data_gen): days, seconds = data_gen assert_gpu_and_cpu_are_equal_collect( @@ -93,8 +156,20 @@ def test_dateaddinterval(data_gen): 'a - (interval {} days {} seconds)'.format(days, seconds)), legacy_interval_enabled_conf) +@pytest.mark.skipif(is_utc(), reason="TODO sub-issue in https://github.com/NVIDIA/spark-rapids/issues/9653 to support non-UTC tz for DateAddInterval") +@allow_non_gpu('ProjectExec', 'RDDScanExec') +@pytest.mark.parametrize('data_gen', vals, ids=idfn) +def test_dateaddinterval_for_non_utc(data_gen): + days, seconds = data_gen + assert_gpu_fallback_collect( + lambda spark : unary_op_df(spark, DateGen(start=date(200, 1, 1), end=date(800, 1, 1)), seed=1) + .selectExpr('a + (interval {} days {} seconds)'.format(days, seconds), + 'a - (interval {} days {} seconds)'.format(days, seconds)), + 'DateAddInterval') + # test add days(not specify hours, minutes, seconds, milliseconds, microseconds) in ANSI mode. @pytest.mark.parametrize('data_gen', vals, ids=idfn) +@pytest.mark.xfail(is_not_utc(), reason="TODO sub-issue in https://github.com/NVIDIA/spark-rapids/issues/9653 to support non UTC TZ") def test_dateaddinterval_ansi(data_gen): days, _ = data_gen # only specify the `days` @@ -103,6 +178,17 @@ def test_dateaddinterval_ansi(data_gen): .selectExpr('a + (interval {} days)'.format(days)), conf=copy_and_update(ansi_enabled_conf, legacy_interval_enabled_conf)) +@pytest.mark.skipif(is_utc(), reason="TODO sub-issue in https://github.com/NVIDIA/spark-rapids/issues/9653 to support non UTC TZ") +@allow_non_gpu('ProjectExec') +@pytest.mark.parametrize('data_gen', vals, ids=idfn) +def test_dateaddinterval_ansi_for_non_utc(data_gen): + days, _ = data_gen + # only specify the `days` + assert_gpu_fallback_collect( + lambda spark : unary_op_df(spark, DateGen(start=date(200, 1, 1), end=date(800, 1, 1)), seed=1) + .selectExpr('a + (interval {} days)'.format(days)), + 'RDDScanExec') + # Throws if add hours, minutes or seconds, milliseconds, microseconds to a date in ANSI mode def test_dateaddinterval_ansi_exception(): assert_gpu_and_cpu_error( @@ -122,17 +208,41 @@ def test_datediff(data_gen): 'datediff(a, date(null))', 'datediff(a, \'2016-03-02\')')) +@pytest.mark.xfail(is_not_utc(), reason="TODO sub-issue in https://github.com/NVIDIA/spark-rapids/issues/9653 to support non UTC TZ") def test_hour(): assert_gpu_and_cpu_are_equal_collect( lambda spark : unary_op_df(spark, timestamp_gen).selectExpr('hour(a)')) +@pytest.mark.skipif(is_utc(), reason="TODO sub-issue in https://github.com/NVIDIA/spark-rapids/issues/9653 to support non UTC TZ") +@allow_non_gpu('ProjectExec') +def test_hour_for_non_utc(): + assert_gpu_fallback_collect( + lambda spark : unary_op_df(spark, timestamp_gen).selectExpr('hour(a)'), + 'Hour') + +@pytest.mark.xfail(is_not_utc(), reason="TODO sub-issue in https://github.com/NVIDIA/spark-rapids/issues/9653 to support non UTC TZ") def test_minute(): assert_gpu_and_cpu_are_equal_collect( lambda spark : unary_op_df(spark, timestamp_gen).selectExpr('minute(a)')) +@pytest.mark.xfail(is_utc(), reason="TODO sub-issue in https://github.com/NVIDIA/spark-rapids/issues/9653 to support non UTC TZ") 
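+# In non-UTC runs this case checks that Minute falls back to the CPU (only ProjectExec may leave
+# the GPU); under UTC the fallback does not occur, which is why the case is guarded with
+# xfail(is_utc()).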
+@allow_non_gpu('ProjectExec') +def test_minute_for_non_utc(): + assert_gpu_fallback_collect( + lambda spark : unary_op_df(spark, timestamp_gen).selectExpr('minute(a)'), + 'Minute') + +@pytest.mark.xfail(is_not_utc(), reason="TODO sub-issue in https://github.com/NVIDIA/spark-rapids/issues/9653 to support non UTC TZ") def test_second(): assert_gpu_and_cpu_are_equal_collect( lambda spark : unary_op_df(spark, timestamp_gen).selectExpr('second(a)')) + +@pytest.mark.skipif(is_utc(), reason="TODO sub-issue in https://github.com/NVIDIA/spark-rapids/issues/9653 to support non UTC TZ") +@allow_non_gpu('ProjectExec') +def test_second_for_non_utc(): + assert_gpu_fallback_collect( + lambda spark : unary_op_df(spark, timestamp_gen).selectExpr('second(a)'), + 'Second') def test_quarter(): assert_gpu_and_cpu_are_equal_collect( @@ -186,8 +296,10 @@ def test_datesub(data_gen): # than -106032829 for date('0001-01-01') so we have to cap the days values to the lower upper and # lower ranges. to_unix_timestamp_days_gen=[ByteGen(), ShortGen(), IntegerGen(min_val=-106032829, max_val=103819094, special_cases=[-106032829, 103819094,0,1,-1])] + @pytest.mark.parametrize('data_gen', to_unix_timestamp_days_gen, ids=idfn) @incompat +@pytest.mark.xfail(is_not_utc(), reason="TODO sub-issue in https://github.com/NVIDIA/spark-rapids/issues/9653 to support non UTC TZ") def test_dateadd_with_date_overflow(data_gen): string_type = to_cast_string(data_gen.data_type) assert_gpu_and_cpu_are_equal_collect( @@ -198,9 +310,24 @@ def test_dateadd_with_date_overflow(data_gen): 'unix_timestamp(date_add(a, cast(null as {})))'.format(string_type), 'unix_timestamp(date_add(a, cast(24 as {})))'.format(string_type))) +@pytest.mark.skipif(is_utc(), reason="TODO sub-issue in https://github.com/NVIDIA/spark-rapids/issues/9653 to support non UTC TZ") +@allow_non_gpu('ProjectExec') +@pytest.mark.parametrize('data_gen', to_unix_timestamp_days_gen, ids=idfn) +def test_dateadd_with_date_overflow_for_non_utc(data_gen): + string_type = to_cast_string(data_gen.data_type) + assert_gpu_fallback_collect( + lambda spark : two_col_df(spark, DateGen(), + data_gen).selectExpr('unix_timestamp(date_add(a, b))', + 'unix_timestamp(date_add( date(\'2016-03-02\'), b))', + 'unix_timestamp(date_add(date(null), b))', + 'unix_timestamp(date_add(a, cast(null as {})))'.format(string_type), + 'unix_timestamp(date_add(a, cast(24 as {})))'.format(string_type)), + 'DateAdd') + to_unix_timestamp_days_gen=[ByteGen(), ShortGen(), IntegerGen(max_val=106032829, min_val=-103819094, special_cases=[106032829, -103819094,0,1,-1])] @pytest.mark.parametrize('data_gen', to_unix_timestamp_days_gen, ids=idfn) @incompat +@pytest.mark.xfail(is_not_utc(), reason="TODO sub-issue in https://github.com/NVIDIA/spark-rapids/issues/9653 to support non UTC TZ") def test_datesub_with_date_overflow(data_gen): string_type = to_cast_string(data_gen.data_type) assert_gpu_and_cpu_are_equal_collect( @@ -211,6 +338,20 @@ def test_datesub_with_date_overflow(data_gen): 'unix_timestamp(date_sub(a, cast(null as {})))'.format(string_type), 'unix_timestamp(date_sub(a, cast(24 as {})))'.format(string_type))) +@pytest.mark.skipif(is_utc(), reason="TODO sub-issue in https://github.com/NVIDIA/spark-rapids/issues/9653 to support non UTC TZ") +@allow_non_gpu('ProjectExec') +@pytest.mark.parametrize('data_gen', to_unix_timestamp_days_gen, ids=idfn) +def test_datesub_with_date_overflow_for_non_utc(data_gen): + string_type = to_cast_string(data_gen.data_type) + assert_gpu_fallback_collect( + lambda spark : 
two_col_df(spark, DateGen(), + data_gen).selectExpr('unix_timestamp(date_sub(a, b))', + 'unix_timestamp(date_sub( date(\'2016-03-02\'), b))', + 'unix_timestamp(date_sub(date(null), b))', + 'unix_timestamp(date_sub(a, cast(null as {})))'.format(string_type), + 'unix_timestamp(date_sub(a, cast(24 as {})))'.format(string_type)), + 'UnixTimestamp') + @pytest.mark.parametrize('data_gen', date_gens, ids=idfn) def test_year(data_gen): assert_gpu_and_cpu_are_equal_collect( @@ -232,10 +373,18 @@ def test_dayofyear(data_gen): lambda spark : unary_op_df(spark, data_gen).select(f.dayofyear(f.col('a')))) @pytest.mark.parametrize('data_gen', date_n_time_gens, ids=idfn) +@pytest.mark.xfail(is_not_utc(), reason="TODO sub-issue in https://github.com/NVIDIA/spark-rapids/issues/9653 to support non UTC TZ") def test_unix_timestamp(data_gen): assert_gpu_and_cpu_are_equal_collect( lambda spark : unary_op_df(spark, data_gen).select(f.unix_timestamp(f.col('a')))) +@pytest.mark.skipif(is_utc(), reason="TODO sub-issue in https://github.com/NVIDIA/spark-rapids/issues/9653 to support non UTC TZ") +@allow_non_gpu('ProjectExec') +@pytest.mark.parametrize('data_gen', date_n_time_gens, ids=idfn) +def test_unix_timestamp_for_non_utc(data_gen): + assert_gpu_fallback_collect( + lambda spark : unary_op_df(spark, data_gen).select(f.unix_timestamp(f.col('a'))), + 'UnixTimestamp') @allow_non_gpu('ProjectExec') @pytest.mark.parametrize('data_gen', date_n_time_gens, ids=idfn) @@ -248,11 +397,19 @@ def test_unsupported_fallback_unix_timestamp(data_gen): @pytest.mark.parametrize('ansi_enabled', [True, False], ids=['ANSI_ON', 'ANSI_OFF']) @pytest.mark.parametrize('data_gen', date_n_time_gens, ids=idfn) +@pytest.mark.xfail(is_not_utc(), reason="TODO sub-issue in https://github.com/NVIDIA/spark-rapids/issues/9653 to support non UTC TZ") def test_to_unix_timestamp(data_gen, ansi_enabled): assert_gpu_and_cpu_are_equal_collect( lambda spark : unary_op_df(spark, data_gen).selectExpr("to_unix_timestamp(a)"), {'spark.sql.ansi.enabled': ansi_enabled}) +@pytest.mark.skipif(is_utc(), reason="TODO sub-issue in https://github.com/NVIDIA/spark-rapids/issues/9653 to support non UTC TZ") +@allow_non_gpu('ProjectExec') +@pytest.mark.parametrize('data_gen', date_n_time_gens, ids=idfn) +def test_to_unix_timestamp_for_non_utc(data_gen): + assert_gpu_fallback_collect( + lambda spark : unary_op_df(spark, data_gen).selectExpr("to_unix_timestamp(a)"), + 'ToUnixTimestamp') @allow_non_gpu('ProjectExec') @pytest.mark.parametrize('data_gen', date_n_time_gens, ids=idfn) @@ -262,7 +419,6 @@ def test_unsupported_fallback_to_unix_timestamp(data_gen): "to_unix_timestamp(a, b)"), "ToUnixTimestamp") - @pytest.mark.parametrize('time_zone', ["UTC", "UTC+0", "UTC-0", "GMT", "GMT+0", "GMT-0"], ids=idfn) @pytest.mark.parametrize('data_gen', [timestamp_gen], ids=idfn) def test_from_utc_timestamp(data_gen, time_zone): @@ -329,7 +485,8 @@ def fun(spark): @pytest.mark.parametrize('parser_policy', ["CORRECTED", "EXCEPTION"], ids=idfn) # first get expected string via `date_format` -def test_string_to_timestamp_functions_ansi_valid(parser_policy): +@pytest.mark.xfail(is_not_utc(), reason="TODO sub-issue in https://github.com/NVIDIA/spark-rapids/issues/9653 to support non UTC TZ") +def test_string_to_timestamp_functions_ansi_valid_for_utc(parser_policy): expr_format = "{operator}(date_format(a, '{fmt}'), '{fmt}')" formats = ['yyyy-MM-dd', 'yyyy/MM/dd', 'yyyy-MM', 'yyyy/MM', 'dd/MM/yyyy', 'yyyy-MM-dd HH:mm:ss', 'MM-dd', 'MM/dd', 'dd-MM', 'dd/MM', 'MM/yyyy', 'MM-yyyy', 
'MM/dd/yyyy', 'MM-dd-yyyy'] @@ -344,30 +501,84 @@ def fun(spark): assert_gpu_and_cpu_are_equal_collect(fun, conf=copy_and_update(parser_policy_dic, ansi_enabled_conf)) +@pytest.mark.skipif(is_utc(), reason="TODO sub-issue in https://github.com/NVIDIA/spark-rapids/issues/9653 to support non UTC TZ") +@allow_non_gpu('ProjectExec') +@pytest.mark.parametrize('parser_policy', ["CORRECTED", "EXCEPTION"], ids=idfn) +def test_string_to_timestamp_functions_ansi_valid_for_non_utc(parser_policy): + expr_format = "{operator}(date_format(a, '{fmt}'), '{fmt}')" + formats = ['yyyy-MM-dd', 'yyyy/MM/dd', 'yyyy-MM', 'yyyy/MM', 'dd/MM/yyyy', 'yyyy-MM-dd HH:mm:ss', + 'MM-dd', 'MM/dd', 'dd-MM', 'dd/MM', 'MM/yyyy', 'MM-yyyy', 'MM/dd/yyyy', 'MM-dd-yyyy'] + operators = ["to_unix_timestamp", "unix_timestamp", "to_timestamp", "to_date"] + format_operator_pairs = [(fmt, operator) for fmt in formats for operator in operators] + expr_list = [expr_format.format(operator=operator, fmt=fmt) for (fmt, operator) in format_operator_pairs] + parser_policy_dic = {"spark.sql.legacy.timeParserPolicy": "{}".format(parser_policy)} + + def fun(spark): + df = spark.createDataFrame([(datetime(1970, 8, 12, tzinfo=timezone.utc),)], "a timestamp") + return df.selectExpr(expr_list) + + assert_gpu_fallback_collect(fun, 'UnixTimestamp') + @pytest.mark.parametrize('ansi_enabled', [True, False], ids=['ANSI_ON', 'ANSI_OFF']) @pytest.mark.parametrize('data_gen', date_n_time_gens, ids=idfn) -def test_unix_timestamp_improved(data_gen, ansi_enabled): +@pytest.mark.xfail(is_not_utc(), reason="TODO sub-issue in https://github.com/NVIDIA/spark-rapids/issues/9653 to support non UTC TZ") +def test_unix_timestamp_improved_for_utc(data_gen, ansi_enabled): conf = {"spark.rapids.sql.improvedTimeOps.enabled": "true", "spark.sql.legacy.timeParserPolicy": "CORRECTED"} assert_gpu_and_cpu_are_equal_collect( lambda spark : unary_op_df(spark, data_gen).select(f.unix_timestamp(f.col('a'))), copy_and_update({'spark.sql.ansi.enabled': ansi_enabled}, conf)) +@pytest.mark.skipif(is_utc(), reason="TODO sub-issue in https://github.com/NVIDIA/spark-rapids/issues/9653 to support non UTC TZ") +@allow_non_gpu('ProjectExec') +@pytest.mark.parametrize('ansi_enabled', [True, False], ids=['ANSI_ON', 'ANSI_OFF']) +@pytest.mark.parametrize('data_gen', date_n_time_gens, ids=idfn) +def test_unix_timestamp_improved_for_non_utc(data_gen, ansi_enabled): + conf = {"spark.rapids.sql.improvedTimeOps.enabled": "true", + "spark.sql.legacy.timeParserPolicy": "CORRECTED"} + assert_gpu_fallback_collect( + lambda spark : unary_op_df(spark, data_gen).select(f.unix_timestamp(f.col('a'))), + 'UnixTimestamp', + copy_and_update({'spark.sql.ansi.enabled': ansi_enabled}, conf)) + @pytest.mark.parametrize('ansi_enabled', [True, False], ids=['ANSI_ON', 'ANSI_OFF']) @pytest.mark.parametrize('data_gen', date_n_time_gens, ids=idfn) -def test_unix_timestamp(data_gen, ansi_enabled): +@pytest.mark.xfail(is_not_utc(), reason="TODO sub-issue in https://github.com/NVIDIA/spark-rapids/issues/9653 to support non UTC TZ") +def test_unix_timestamp_for_utc(data_gen, ansi_enabled): assert_gpu_and_cpu_are_equal_collect( lambda spark : unary_op_df(spark, data_gen).select(f.unix_timestamp(f.col("a"))), {'spark.sql.ansi.enabled': ansi_enabled}) +@pytest.mark.skipif(is_utc(), reason="TODO sub-issue in https://github.com/NVIDIA/spark-rapids/issues/9653 to support non UTC TZ") +@allow_non_gpu('ProjectExec') +@pytest.mark.parametrize('ansi_enabled', [True, False], ids=['ANSI_ON', 'ANSI_OFF']) 
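+# UnixTimestamp is a time-zone-aware expression, so with a non-UTC session time zone the plan is
+# expected to fall back; the assertion below checks that UnixTimestamp runs on the CPU for both
+# ANSI settings.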
+@pytest.mark.parametrize('data_gen', date_n_time_gens, ids=idfn) +def test_unix_timestamp_for_non_utc(data_gen, ansi_enabled): + assert_gpu_fallback_collect( + lambda spark : unary_op_df(spark, data_gen).select(f.unix_timestamp(f.col("a"))), + 'UnixTimestamp', + {'spark.sql.ansi.enabled': ansi_enabled}) + @pytest.mark.parametrize('ansi_enabled', [True, False], ids=['ANSI_ON', 'ANSI_OFF']) @pytest.mark.parametrize('data_gen', date_n_time_gens, ids=idfn) -def test_to_unix_timestamp_improved(data_gen, ansi_enabled): +@pytest.mark.xfail(is_not_utc(), reason="TODO sub-issue in https://github.com/NVIDIA/spark-rapids/issues/9653 to support non UTC TZ") +def test_to_unix_timestamp_improved_for_utc(data_gen, ansi_enabled): conf = {"spark.rapids.sql.improvedTimeOps.enabled": "true"} assert_gpu_and_cpu_are_equal_collect( lambda spark : unary_op_df(spark, data_gen).selectExpr("to_unix_timestamp(a)"), copy_and_update({'spark.sql.ansi.enabled': ansi_enabled}, conf)) +@pytest.mark.skipif(is_utc(), reason="TODO sub-issue in https://github.com/NVIDIA/spark-rapids/issues/9653 to support non UTC TZ") +@allow_non_gpu('ProjectExec') +@pytest.mark.parametrize('ansi_enabled', [True, False], ids=['ANSI_ON', 'ANSI_OFF']) +@pytest.mark.parametrize('data_gen', date_n_time_gens, ids=idfn) +def test_to_unix_timestamp_improved_for_non_utc(data_gen, ansi_enabled): + conf = {"spark.rapids.sql.improvedTimeOps.enabled": "true"} + assert_gpu_fallback_collect( + lambda spark : unary_op_df(spark, data_gen).selectExpr("to_unix_timestamp(a)"), + 'ToUnixTimestamp', + copy_and_update({'spark.sql.ansi.enabled': ansi_enabled}, conf)) + str_date_and_format_gen = [pytest.param(StringGen('[0-9]{4}/[01][0-9]'),'yyyy/MM', marks=pytest.mark.xfail(reason="cudf does no checks")), (StringGen('[0-9]{4}/[01][12]/[0-2][1-8]'),'yyyy/MM/dd'), (StringGen('[01][12]/[0-2][1-8]'), 'MM/dd'), @@ -380,11 +591,22 @@ def invalid_date_string_df(spark): @pytest.mark.parametrize('ansi_enabled', [True, False], ids=['ANSI_ON', 'ANSI_OFF']) @pytest.mark.parametrize('data_gen,date_form', str_date_and_format_gen, ids=idfn) -def test_string_to_unix_timestamp(data_gen, date_form, ansi_enabled): +@pytest.mark.xfail(is_not_utc(), reason="TODO sub-issue in https://github.com/NVIDIA/spark-rapids/issues/9653 to support non UTC TZ") +def test_string_to_unix_timestamp_for_utc(data_gen, date_form, ansi_enabled): assert_gpu_and_cpu_are_equal_collect( lambda spark : unary_op_df(spark, data_gen, seed=1).selectExpr("to_unix_timestamp(a, '{}')".format(date_form)), {'spark.sql.ansi.enabled': ansi_enabled}) +@pytest.mark.skipif(is_utc(), reason="TODO sub-issue in https://github.com/NVIDIA/spark-rapids/issues/9653 to support non UTC TZ") +@allow_non_gpu('ProjectExec') +@pytest.mark.parametrize('ansi_enabled', [True, False], ids=['ANSI_ON', 'ANSI_OFF']) +@pytest.mark.parametrize('data_gen,date_form', str_date_and_format_gen, ids=idfn) +def test_string_to_unix_timestamp_for_non_utc(data_gen, date_form, ansi_enabled): + assert_gpu_fallback_collect( + lambda spark : unary_op_df(spark, data_gen, seed=1).selectExpr("to_unix_timestamp(a, '{}')".format(date_form)), + 'ToUnixTimestamp', + {'spark.sql.ansi.enabled': ansi_enabled}) + def test_string_to_unix_timestamp_ansi_exception(): assert_gpu_and_cpu_error( lambda spark : invalid_date_string_df(spark).selectExpr("to_unix_timestamp(a, '{}')".format('yyyy/MM/dd')).collect(), @@ -393,11 +615,22 @@ def test_string_to_unix_timestamp_ansi_exception(): @pytest.mark.parametrize('ansi_enabled', [True, False], ids=['ANSI_ON', 'ANSI_OFF']) 
@pytest.mark.parametrize('data_gen,date_form', str_date_and_format_gen, ids=idfn) -def test_string_unix_timestamp(data_gen, date_form, ansi_enabled): +@pytest.mark.xfail(is_not_utc(), reason="TODO sub-issue in https://github.com/NVIDIA/spark-rapids/issues/9653 to support non UTC TZ") +def test_string_unix_timestamp_for_utc(data_gen, date_form, ansi_enabled): assert_gpu_and_cpu_are_equal_collect( lambda spark : unary_op_df(spark, data_gen, seed=1).select(f.unix_timestamp(f.col('a'), date_form)), {'spark.sql.ansi.enabled': ansi_enabled}) +@pytest.mark.skipif(is_utc(), reason="TODO sub-issue in https://github.com/NVIDIA/spark-rapids/issues/9653 to support non UTC TZ") +@allow_non_gpu('ProjectExec') +@pytest.mark.parametrize('ansi_enabled', [True, False], ids=['ANSI_ON', 'ANSI_OFF']) +@pytest.mark.parametrize('data_gen,date_form', str_date_and_format_gen, ids=idfn) +def test_string_unix_timestamp_for_non_utc(data_gen, date_form, ansi_enabled): + assert_gpu_fallback_collect( + lambda spark : unary_op_df(spark, data_gen, seed=1).select(f.unix_timestamp(f.col('a'), date_form)), + 'UnixTimestamp', + {'spark.sql.ansi.enabled': ansi_enabled}) + def test_string_unix_timestamp_ansi_exception(): assert_gpu_and_cpu_error( lambda spark : invalid_date_string_df(spark).select(f.unix_timestamp(f.col('a'), 'yyyy/MM/dd')).collect(), @@ -406,17 +639,36 @@ def test_string_unix_timestamp_ansi_exception(): @pytest.mark.parametrize('data_gen', [StringGen('200[0-9]-0[1-9]-[0-2][1-8]')], ids=idfn) @pytest.mark.parametrize('ansi_enabled', [True, False], ids=['ANSI_ON', 'ANSI_OFF']) -def test_gettimestamp(data_gen, ansi_enabled): +@pytest.mark.xfail(is_not_utc(), reason="TODO sub-issue in https://github.com/NVIDIA/spark-rapids/issues/9653 to support non UTC TZ") +def test_gettimestamp_for_utc(data_gen, ansi_enabled): assert_gpu_and_cpu_are_equal_collect( lambda spark : unary_op_df(spark, data_gen).select(f.to_date(f.col("a"), "yyyy-MM-dd")), {'spark.sql.ansi.enabled': ansi_enabled}) +@pytest.mark.skipif(is_utc(), reason="TODO sub-issue in https://github.com/NVIDIA/spark-rapids/issues/9653 to support non UTC TZ") +@allow_non_gpu('ProjectExec') +@pytest.mark.parametrize('data_gen', [StringGen('200[0-9]-0[1-9]-[0-2][1-8]')], ids=idfn) +@pytest.mark.parametrize('ansi_enabled', [True, False], ids=['ANSI_ON', 'ANSI_OFF']) +def test_gettimestamp_for_non_utc(data_gen, ansi_enabled): + assert_gpu_fallback_collect( + lambda spark : unary_op_df(spark, data_gen).select(f.to_date(f.col("a"), "yyyy-MM-dd")), + 'RDDScanExec', + {'spark.sql.ansi.enabled': ansi_enabled}) @pytest.mark.parametrize('data_gen', [StringGen('0[1-9]200[0-9]')], ids=idfn) -def test_gettimestamp_format_MMyyyy(data_gen): +@pytest.mark.xfail(is_not_utc(), reason="TODO sub-issue in https://github.com/NVIDIA/spark-rapids/issues/9653 to support non UTC TZ") +def test_gettimestamp_format_MMyyyy_for_utc(data_gen): assert_gpu_and_cpu_are_equal_collect( lambda spark: unary_op_df(spark, data_gen).select(f.to_date(f.col("a"), "MMyyyy"))) +@pytest.mark.skipif(is_utc(), reason="TODO sub-issue in https://github.com/NVIDIA/spark-rapids/issues/9653 to support non UTC TZ") +@allow_non_gpu('ProjectExec') +@pytest.mark.parametrize('data_gen', [StringGen('0[1-9]200[0-9]')], ids=idfn) +def test_gettimestamp_format_MMyyyy_for_non_utc(data_gen): + assert_gpu_fallback_collect( + lambda spark: unary_op_df(spark, data_gen).select(f.to_date(f.col("a"), "MMyyyy")), + 'RDDScanExec') + def test_gettimestamp_ansi_exception(): assert_gpu_and_cpu_error( lambda spark : 
invalid_date_string_df(spark).select(f.to_date(f.col("a"), "yyyy-MM-dd")).collect(), @@ -427,10 +679,20 @@ def test_gettimestamp_ansi_exception(): 'MM-dd', 'MM/dd', 'dd-MM', 'dd/MM'] @pytest.mark.parametrize('date_format', supported_date_formats, ids=idfn) @pytest.mark.parametrize('data_gen', date_n_time_gens, ids=idfn) -def test_date_format(data_gen, date_format): +@pytest.mark.xfail(is_not_utc(), reason="TODO sub-issue in https://github.com/NVIDIA/spark-rapids/issues/9653 to support non UTC TZ") +def test_date_format_for_utc(data_gen, date_format): assert_gpu_and_cpu_are_equal_collect( lambda spark : unary_op_df(spark, data_gen).selectExpr("date_format(a, '{}')".format(date_format))) +@pytest.mark.skipif(is_utc(), reason="TODO sub-issue in https://github.com/NVIDIA/spark-rapids/issues/9653 to support non UTC TZ") +@allow_non_gpu('ProjectExec') +@pytest.mark.parametrize('date_format', supported_date_formats, ids=idfn) +@pytest.mark.parametrize('data_gen', date_n_time_gens, ids=idfn) +def test_date_format_for_non_utc(data_gen, date_format): + assert_gpu_fallback_collect( + lambda spark : unary_op_df(spark, data_gen).selectExpr("date_format(a, '{}')".format(date_format)), + 'DateFormatClass') + unsupported_date_formats = ['F'] @pytest.mark.parametrize('date_format', unsupported_date_formats, ids=idfn) @pytest.mark.parametrize('data_gen', date_n_time_gens, ids=idfn) @@ -461,18 +723,30 @@ def test_date_format_maybe(data_gen, date_format): @pytest.mark.parametrize('date_format', maybe_supported_date_formats, ids=idfn) @pytest.mark.parametrize('data_gen', date_n_time_gens, ids=idfn) -def test_date_format_maybe_incompat(data_gen, date_format): +@pytest.mark.xfail(is_not_utc(), reason="TODO sub-issue in https://github.com/NVIDIA/spark-rapids/issues/9653 to support non UTC TZ") +def test_date_format_maybe_incompat_for_utc(data_gen, date_format): conf = {"spark.rapids.sql.incompatibleDateFormats.enabled": "true"} assert_gpu_and_cpu_are_equal_collect( lambda spark : unary_op_df(spark, data_gen).selectExpr("date_format(a, '{}')".format(date_format)), conf) +@pytest.mark.skipif(is_utc(), reason="TODO sub-issue in https://github.com/NVIDIA/spark-rapids/issues/9653 to support non UTC TZ") +@allow_non_gpu('ProjectExec') +@pytest.mark.parametrize('date_format', maybe_supported_date_formats, ids=idfn) +@pytest.mark.parametrize('data_gen', date_n_time_gens, ids=idfn) +def test_date_format_maybe_incompat_for_non_utc(data_gen, date_format): + conf = {"spark.rapids.sql.incompatibleDateFormats.enabled": "true"} + assert_gpu_fallback_collect( + lambda spark : unary_op_df(spark, data_gen).selectExpr("date_format(a, '{}')".format(date_format)), + 'DateFormatClass', conf) + # Reproduce conditions for https://github.com/NVIDIA/spark-rapids/issues/5670 # where we had a failure due to GpuCast canonicalization with timezone. # In this case it was doing filter after project, the way I get that to happen is by adding in the # input_file_name(), otherwise filter happens before project. 
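+# Two variants of this repro follow: under UTC the CPU and GPU results must match, while under a
+# non-UTC time zone the date parsing (GetTimestamp) is expected to fall back to the CPU.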
@allow_non_gpu('CollectLimitExec,FileSourceScanExec,DeserializeToObjectExec') @ignore_order() -def test_date_format_mmyyyy_cast_canonicalization(spark_tmp_path): +@pytest.mark.xfail(is_not_utc(), reason="TODO sub-issue in https://github.com/NVIDIA/spark-rapids/issues/9653 to support non UTC TZ") +def test_date_format_mmyyyy_cast_canonicalization_for_utc(spark_tmp_path): data_path = spark_tmp_path + '/CSV_DATA' gen = StringGen(pattern='[0][0-9][1][8-9][1-9][1-9]', nullable=False) schema = gen.data_type @@ -487,6 +761,24 @@ def do_join_cast(spark): return left.join(right, left.monthly_reporting_period == right.r_monthly_reporting_period, how='inner') assert_gpu_and_cpu_are_equal_collect(do_join_cast) +@pytest.mark.skipif(is_utc(), reason="TODO sub-issue in https://github.com/NVIDIA/spark-rapids/issues/9653 to support non UTC TZ") +@allow_non_gpu('CollectLimitExec,FileSourceScanExec,DeserializeToObjectExec', 'ProjectExec') +@ignore_order() +def test_date_format_mmyyyy_cast_canonicalization_for_non_utc(spark_tmp_path): + data_path = spark_tmp_path + '/CSV_DATA' + gen = StringGen(pattern='[0][0-9][1][8-9][1-9][1-9]', nullable=False) + schema = gen.data_type + with_cpu_session(lambda spark : gen_df(spark, gen, length=100).write.csv(data_path)) + def do_join_cast(spark): + left = spark.read.csv(data_path)\ + .selectExpr("date_format(to_date(_c0, 'MMyyyy'), 'MM/dd/yyyy') as monthly_reporting_period", "substring_index(substring_index(input_file_name(),'/',-1),'.',1) as filename") + right = spark.read.csv(data_path).withColumnRenamed("_c0", "r_c0")\ + .selectExpr("date_format(to_date(r_c0, 'MMyyyy'), 'MM/dd/yyyy') as monthly_reporting_period", "substring_index(substring_index(input_file_name(),'/',-1),'.',1) as filename")\ + .withColumnRenamed("monthly_reporting_period", "r_monthly_reporting_period")\ + .withColumnRenamed("filename", "r_filename") + return left.join(right, left.monthly_reporting_period == right.r_monthly_reporting_period, how='inner') + assert_gpu_fallback_collect(do_join_cast, 'GetTimestamp') + @allow_non_gpu('ProjectExec') @pytest.mark.parametrize('data_gen', date_n_time_gens, ids=idfn) @@ -558,3 +850,91 @@ def test_timestamp_millis_long_overflow(): def test_timestamp_micros(data_gen): assert_gpu_and_cpu_are_equal_collect( lambda spark : unary_op_df(spark, data_gen).selectExpr("timestamp_micros(a)")) + + +# used by timezone test cases +def get_timezone_df(spark): + schema = StructType([ + StructField("ts_str_col", StringType()), + StructField("long_col", LongType()), + StructField("ts_col", TimestampType()), + StructField("date_col", DateType()), + StructField("date_str_col", StringType()), + ]) + data = [ + ('1970-01-01 00:00:00', 0, datetime(1971, 1, 1), date(1971, 1, 1), '1971-01-01'), + ('1970-01-01 00:00:00', 0, datetime(1971, 1, 1), date(1971, 1, 1), '1971-01-01'), + ] + return spark.createDataFrame(SparkContext.getOrCreate().parallelize(data),schema) + +# used by timezone test cases, specify all the sqls that will be impacted by non-utc timezone +time_zone_sql_conf_pairs = [ + ("select minute(ts_col) from tab", {}), + ("select second(ts_col) from tab", {}), + ("select hour(ts_col) from tab", {}), + ("select date_col + (interval 10 days 3 seconds) from tab", {}), + ("select date_format(ts_col, 'yyyy-MM-dd HH:mm:ss') from tab", {}), + ("select unix_timestamp(ts_col) from tab", {"spark.rapids.sql.improvedTimeOps.enabled": "true"}), + ("select to_unix_timestamp(ts_str_col) from tab", {"spark.rapids.sql.improvedTimeOps.enabled": "false"}), + ("select to_unix_timestamp(ts_col) 
from tab", {"spark.rapids.sql.improvedTimeOps.enabled": "true"}), + ("select to_date(date_str_col, 'yyyy-MM-dd') from tab", {}), # test GpuGetTimestamp + ("select to_date(date_str_col) from tab", {}), + ("select from_unixtime(long_col, 'yyyy-MM-dd HH:mm:ss') from tab", {}), + ("select cast(ts_col as string) from tab", {}), # cast + ("select cast(ts_col as date) from tab", {}), # cast + ("select cast(date_col as TIMESTAMP) from tab", {}), # cast + ("select to_timestamp(ts_str_col) from tab", {"spark.rapids.sql.improvedTimeOps.enabled": "false"}), + ("select to_timestamp(ts_str_col) from tab", {"spark.rapids.sql.improvedTimeOps.enabled": "true"}), + ] + +timezone_conf = {"spark.sql.session.timeZone": "UTC", + "spark.rapids.sql.hasExtendedYearValues": "false", + "spark.rapids.sql.castStringToTimestamp.enabled": "true",} + +@allow_non_gpu("ProjectExec") +@pytest.mark.parametrize('sql, extra_conf', time_zone_sql_conf_pairs) +def test_timezone_for_operators_with_non_utc(sql, extra_conf): + all_conf = copy_and_update(timezone_conf, extra_conf) + def gen_sql_df(spark): + df = get_timezone_df(spark) + df.createOrReplaceTempView("tab") + return spark.sql(sql) + assert_gpu_fallback_collect(gen_sql_df, "ProjectExec", all_conf) + +@pytest.mark.xfail(is_not_utc(), reason="TODO sub-issue in https://github.com/NVIDIA/spark-rapids/issues/9653 to support non UTC TZ") +@pytest.mark.parametrize('sql, conf', time_zone_sql_conf_pairs) +def test_timezone_for_operators_with_utc(sql, conf): + conf = copy_and_update(timezone_conf, conf) + def gen_sql_df(spark): + df = get_timezone_df(spark) + df.createOrReplaceTempView("tab") + return spark.sql(sql) + assert_gpu_and_cpu_are_equal_collect(gen_sql_df, conf) + +@pytest.mark.skipif(is_utc(), reason="TODO sub-issue in https://github.com/NVIDIA/spark-rapids/issues/9653 to support non UTC TZ") +@allow_non_gpu("ProjectExec") +def test_timezone_for_operator_from_utc_timestamp_with_non_utc(): + # timezone is non-utc, should fallback to CPU + def gen_sql_df(spark): + df = get_timezone_df(spark) + df.createOrReplaceTempView("tab") + return spark.sql("select from_utc_timestamp(ts_col, 'Asia/Shanghai') from tab") + assert_gpu_fallback_collect(gen_sql_df, "ProjectExec") + + +def test_timezone_for_operator_from_utc_timestamp_with_utc(): + # timezone is utc, should be supported by GPU + def gen_sql_df(spark): + df = get_timezone_df(spark) + df.createOrReplaceTempView("tab") + return spark.sql("select from_utc_timestamp(ts_col, '+00:00') from tab").collect() + with_gpu_session(gen_sql_df) + + +def test_timezone_from_utc_timestamp(): + conf = {} + def gen_sql_df(spark): + df = get_timezone_df(spark) + df.createOrReplaceTempView("tab") + return spark.sql("select from_utc_timestamp(ts_col, 'Iran') from tab") + assert_gpu_and_cpu_are_equal_collect(gen_sql_df, conf) \ No newline at end of file diff --git a/integration_tests/src/main/python/delta_lake_auto_compact_test.py b/integration_tests/src/main/python/delta_lake_auto_compact_test.py index 17c03a105c9..25fede5afed 100644 --- a/integration_tests/src/main/python/delta_lake_auto_compact_test.py +++ b/integration_tests/src/main/python/delta_lake_auto_compact_test.py @@ -14,7 +14,7 @@ import pytest from asserts import assert_gpu_and_cpu_writes_are_equal_collect, with_cpu_session, with_gpu_session -from data_gen import copy_and_update +from data_gen import * from delta_lake_utils import delta_meta_allow from marks import allow_non_gpu, delta_lake from pyspark.sql.functions import * @@ -65,7 +65,7 @@ def read_data(spark, table_path): 
write_func=write_to_delta(is_partitioned=False), read_func=read_data, base_path=data_path, - conf=_conf) + conf=copy_and_update(_conf, writer_confs_for_DB)) def read_metadata(spark, table_path): input_table = DeltaTable.forPath(spark, table_path) @@ -84,7 +84,7 @@ def read_metadata(spark, table_path): write_func=lambda spark, table_path: None, # Already written. read_func=read_metadata, base_path=data_path, - conf=conf_enable_auto_compact) + conf=copy_and_update(conf_enable_auto_compact, writer_confs_for_DB)) @delta_lake @@ -113,7 +113,7 @@ def read_data(spark, table_path): write_func=write_to_delta(is_partitioned=True), read_func=read_data, base_path=data_path, - conf=_conf) + conf=copy_and_update(_conf, writer_confs_for_DB)) def read_metadata(spark, table_path): """ @@ -136,7 +136,7 @@ def read_metadata(spark, table_path): write_func=lambda spark, table_path: None, # Already written. read_func=read_metadata, base_path=data_path, - conf=conf_enable_auto_compact) + conf=copy_and_update(conf_enable_auto_compact, writer_confs_for_DB)) @delta_lake @@ -158,7 +158,7 @@ def test_auto_compact_disabled(spark_tmp_path, auto_compact_conf): writer = write_to_delta(num_writes=10) with_gpu_session(func=lambda spark: writer(spark, data_path), - conf=disable_auto_compaction) + conf=copy_and_update(disable_auto_compaction, writer_confs_for_DB)) # 10 writes should correspond to 10 commits. # (i.e. there should be no OPTIMIZE commits.) @@ -191,11 +191,12 @@ def test_auto_compact_min_num_files(spark_tmp_path): 'spark.databricks.delta.autoCompact.minNumFiles': 5 # Num files before compaction. } + conf = copy_and_update(enable_auto_compaction_on_5, writer_confs_for_DB) # Minimum number of input files == 5. # If 4 files are written, there should be no OPTIMIZE. writer = write_to_delta(num_writes=4) with_gpu_session(func=lambda spark: writer(spark, data_path), - conf=enable_auto_compaction_on_5) + conf=conf) def verify_table_history_before_limit(spark): input_table = DeltaTable.forPath(spark, data_path) @@ -210,7 +211,7 @@ def verify_table_history_before_limit(spark): # On the 5th file write, auto-OPTIMIZE should kick in. 
with_gpu_session(func=lambda spark: write_to_delta(num_writes=1)(spark, data_path), - conf=enable_auto_compaction_on_5) + conf=conf) def verify_table_history_after_limit(spark): input_table = DeltaTable.forPath(spark, data_path) diff --git a/integration_tests/src/main/python/delta_lake_delete_test.py b/integration_tests/src/main/python/delta_lake_delete_test.py index 413479b3a12..11ef698b1eb 100644 --- a/integration_tests/src/main/python/delta_lake_delete_test.py +++ b/integration_tests/src/main/python/delta_lake_delete_test.py @@ -37,7 +37,7 @@ def do_delete(spark, path): def assert_delta_sql_delete_collect(spark_tmp_path, use_cdf, dest_table_func, delete_sql, partition_columns=None, - conf=delta_delete_enabled_conf, + conf=copy_and_update(delta_delete_enabled_conf, writer_confs_for_DB), skip_sql_result_check=False): def read_data(spark, path): read_func = read_delta_path_with_cdf if use_cdf else read_delta_path @@ -187,5 +187,5 @@ def do_delete(spark, path): dest_table.delete("b > 'c'") read_func = read_delta_path_with_cdf if use_cdf else read_delta_path assert_gpu_and_cpu_writes_are_equal_collect(do_delete, read_func, data_path, - conf=delta_delete_enabled_conf) + conf=copy_and_update(delta_delete_enabled_conf, writer_confs_for_DB)) with_cpu_session(lambda spark: assert_gpu_and_cpu_delta_logs_equivalent(spark, data_path)) diff --git a/integration_tests/src/main/python/delta_lake_merge_test.py b/integration_tests/src/main/python/delta_lake_merge_test.py index 1d43259434b..00f46032e27 100644 --- a/integration_tests/src/main/python/delta_lake_merge_test.py +++ b/integration_tests/src/main/python/delta_lake_merge_test.py @@ -52,7 +52,7 @@ def do_merge(spark, path): def assert_delta_sql_merge_collect(spark_tmp_path, spark_tmp_table_factory, use_cdf, src_table_func, dest_table_func, merge_sql, compare_logs, partition_columns=None, - conf=delta_merge_enabled_conf): + conf=copy_and_update(delta_merge_enabled_conf, writer_confs_for_DB)): def read_data(spark, path): read_func = read_delta_path_with_cdf if use_cdf else read_delta_path df = read_func(spark, path) @@ -142,7 +142,7 @@ def test_delta_merge_partial_fallback_via_conf(spark_tmp_path, spark_tmp_table_f " WHEN MATCHED THEN UPDATE SET d.a = s.a + 4 WHEN NOT MATCHED THEN INSERT *" # Non-deterministic input for each task means we can only reliably compare record counts when using only one task compare_logs = num_slices == 1 - conf = copy_and_update(delta_merge_enabled_conf, { disable_conf: "false" }) + conf = copy_and_update(delta_merge_enabled_conf, { disable_conf: "false" }, writer_confs_for_DB) assert_delta_sql_merge_collect(spark_tmp_path, spark_tmp_table_factory, use_cdf, src_table_func, dest_table_func, merge_sql, compare_logs, partition_columns, conf=conf) @@ -295,7 +295,7 @@ def do_merge(spark, path): .whenNotMatchedInsertAll() \ .execute() read_func = read_delta_path_with_cdf if use_cdf else read_delta_path - assert_gpu_and_cpu_writes_are_equal_collect(do_merge, read_func, data_path, conf=delta_merge_enabled_conf) + assert_gpu_and_cpu_writes_are_equal_collect(do_merge, read_func, data_path, conf=copy_and_update(delta_merge_enabled_conf, writer_confs_for_DB)) # Non-deterministic input for each task means we can only reliably compare record counts when using only one task if num_slices == 1: with_cpu_session(lambda spark: assert_gpu_and_cpu_delta_logs_equivalent(spark, data_path)) diff --git a/integration_tests/src/main/python/delta_lake_update_test.py b/integration_tests/src/main/python/delta_lake_update_test.py index 
0fc65658332..03b481ab6e2 100644 --- a/integration_tests/src/main/python/delta_lake_update_test.py +++ b/integration_tests/src/main/python/delta_lake_update_test.py @@ -38,7 +38,7 @@ def do_update(spark, path): def assert_delta_sql_update_collect(spark_tmp_path, use_cdf, dest_table_func, update_sql, partition_columns=None, enable_deletion_vectors=False, - conf=delta_update_enabled_conf): + conf=copy_and_update(delta_update_enabled_conf, writer_confs_for_DB)): def read_data(spark, path): read_func = read_delta_path_with_cdf if use_cdf else read_delta_path df = read_func(spark, path) @@ -175,7 +175,7 @@ def do_update(spark, path): dest_table.update(condition="b > 'c'", set={"c": f.col("b"), "a": f.lit(1)}) read_func = read_delta_path_with_cdf if use_cdf else read_delta_path assert_gpu_and_cpu_writes_are_equal_collect(do_update, read_func, data_path, - conf=delta_update_enabled_conf) + conf=copy_and_update(delta_update_enabled_conf, writer_confs_for_DB)) # Databricks not guaranteed to write the same number of files due to optimized write when # using partitions if not is_databricks_runtime() or not partition_columns: diff --git a/integration_tests/src/main/python/delta_lake_write_test.py b/integration_tests/src/main/python/delta_lake_write_test.py index 3997ae3eba3..57dd2197ae2 100644 --- a/integration_tests/src/main/python/delta_lake_write_test.py +++ b/integration_tests/src/main/python/delta_lake_write_test.py @@ -516,16 +516,17 @@ def test_delta_write_legacy_format_fallback(spark_tmp_path): def test_delta_write_append_only(spark_tmp_path): data_path = spark_tmp_path + "/DELTA_DATA" gen = int_gen + conf = copy_and_update(delta_writes_enabled_conf, writer_confs_for_DB) # setup initial table with_gpu_session(lambda spark: unary_op_df(spark, gen).coalesce(1).write.format("delta") .option("delta.appendOnly", "true") .save(data_path), - conf=delta_writes_enabled_conf) + conf=conf) # verify overwrite fails assert_spark_exception( lambda: with_gpu_session( lambda spark: unary_op_df(spark, gen).write.format("delta").mode("overwrite").save(data_path), - conf=delta_writes_enabled_conf), + conf=conf), "This table is configured to only allow appends") @allow_non_gpu(*delta_meta_allow) @@ -541,16 +542,16 @@ def setup_table(spark): spark.sql("CREATE TABLE delta.`{}` (a string NOT NULL) USING DELTA".format(data_path)) with_cpu_session(setup_table) - + conf = copy_and_update(delta_writes_enabled_conf, writer_confs_for_DB) # verify write of non-null values does not throw with_gpu_session(lambda spark: unary_op_df(spark, not_null_gen).write.format("delta").mode("append").save(data_path), - conf=delta_writes_enabled_conf) + conf=conf) # verify write of null value throws assert_spark_exception( lambda: with_gpu_session( lambda spark: unary_op_df(spark, null_gen).write.format("delta").mode("append").save(data_path), - conf=delta_writes_enabled_conf), + conf=conf), "NOT NULL constraint violated for column: a") @allow_non_gpu(*delta_meta_allow) @@ -570,8 +571,9 @@ def setup_table(spark): def gen_good_data(spark): return spark.range(1024).withColumn("x", f.col("id") + 1) + conf = copy_and_update(delta_writes_enabled_conf, writer_confs_for_DB) with_gpu_session(lambda spark: gen_good_data(spark).write.format("delta").mode("append").save(data_path), - conf=delta_writes_enabled_conf) + conf=conf) # verify write of values that violate the constraint throws def gen_bad_data(spark): @@ -580,7 +582,7 @@ def gen_bad_data(spark): assert_spark_exception( lambda: with_gpu_session( lambda spark: 
gen_bad_data(spark).write.format("delta").mode("append").save(data_path), - conf=delta_writes_enabled_conf), + conf=conf), "CHECK constraint customcheck (id < x) violated") @allow_non_gpu(*delta_meta_allow) @@ -594,7 +596,7 @@ def setup_table(spark): spark.sql("ALTER TABLE delta.`{}` ADD CONSTRAINT mycheck CHECK (id + x < 1000)".format(data_path)) with_cpu_session(setup_table) # create a conf that will force constraint check to fallback to CPU - add_disable_conf = copy_and_update(delta_writes_enabled_conf, {"spark.rapids.sql.expression.Add": "false"}) + add_disable_conf = copy_and_update(delta_writes_enabled_conf, {"spark.rapids.sql.expression.Add": "false"}, writer_confs_for_DB) # verify write of dataframe that passes constraint check does not fail def gen_good_data(spark): return spark.range(100).withColumn("x", f.col("id") + 1) @@ -617,7 +619,7 @@ def gen_bad_data(spark): @pytest.mark.skipif(is_before_spark_320(), reason="Delta Lake writes are not supported before Spark 3.2.x") def test_delta_write_stat_column_limits(num_cols, spark_tmp_path): data_path = spark_tmp_path + "/DELTA_DATA" - confs = copy_and_update(delta_writes_enabled_conf, {"spark.databricks.io.skipping.stringPrefixLength": 8}) + confs = copy_and_update(delta_writes_enabled_conf, {"spark.databricks.io.skipping.stringPrefixLength": 8}, writer_confs_for_DB) strgen = StringGen() \ .with_special_case((chr(sys.maxunicode) * 7) + "abc") \ .with_special_case((chr(sys.maxunicode) * 8) + "abc") \ @@ -654,11 +656,12 @@ def write_data(spark, path): df.write.format("delta").mode("append").save(path) data_path = spark_tmp_path + "/DELTA_DATA" + conf = copy_and_update(delta_writes_enabled_conf, writer_confs_for_DB) assert_gpu_and_cpu_writes_are_equal_collect( write_data, lambda spark, path: spark.read.format("delta").load(path), data_path, - conf=delta_writes_enabled_conf) + conf=conf) with_cpu_session(lambda spark: assert_gpu_and_cpu_delta_logs_equivalent(spark, data_path)) @allow_non_gpu("CreateTableExec", *delta_meta_allow) @@ -671,11 +674,12 @@ def test_delta_write_identity_columns(spark_tmp_path): def create_data(spark, path): spark.sql("CREATE TABLE delta.`{}` (x BIGINT, id BIGINT GENERATED ALWAYS AS IDENTITY) USING DELTA".format(path)) spark.range(2048).selectExpr("id * id AS x").write.format("delta").mode("append").save(path) + conf = copy_and_update(delta_writes_enabled_conf, writer_confs_for_DB) assert_gpu_and_cpu_writes_are_equal_collect( create_data, lambda spark, path: spark.read.format("delta").load(path), data_path, - conf=delta_writes_enabled_conf) + conf=conf) with_cpu_session(lambda spark: assert_gpu_and_cpu_delta_logs_equivalent(spark, data_path)) def append_data(spark, path): spark.range(2048).selectExpr("id + 10 as x").write.format("delta").mode("append").save(path) @@ -683,7 +687,7 @@ def append_data(spark, path): append_data, lambda spark, path: spark.read.format("delta").load(path), data_path, - conf=delta_writes_enabled_conf) + conf=conf) with_cpu_session(lambda spark: assert_gpu_and_cpu_delta_logs_equivalent(spark, data_path)) @@ -704,11 +708,12 @@ def create_data(spark, path): "id5 BIGINT GENERATED ALWAYS AS IDENTITY ( START WITH 12 INCREMENT BY -3 )" ") USING DELTA") spark.range(2048).selectExpr("id * id AS x").write.format("delta").mode("append").save(path) + conf = copy_and_update(delta_writes_enabled_conf, writer_confs_for_DB) assert_gpu_and_cpu_writes_are_equal_collect( create_data, lambda spark, path: spark.read.format("delta").load(path), data_path, - conf=delta_writes_enabled_conf) + conf=conf) 
with_cpu_session(lambda spark: assert_gpu_and_cpu_delta_logs_equivalent(spark, data_path)) def append_data(spark, path): spark.range(2048).selectExpr("id + 10 as x").write.format("delta").mode("append").save(path) @@ -716,7 +721,7 @@ def append_data(spark, path): append_data, lambda spark, path: spark.read.format("delta").load(path), data_path, - conf=delta_writes_enabled_conf) + conf=conf) with_cpu_session(lambda spark: assert_gpu_and_cpu_delta_logs_equivalent(spark, data_path)) @allow_non_gpu(*delta_meta_allow, "ExecutedCommandExec") @@ -782,7 +787,7 @@ def test_delta_write_auto_optimize_sql_conf_fallback(confkey, spark_tmp_path): @pytest.mark.skipif(is_before_spark_320(), reason="Delta Lake writes are not supported before Spark 3.2.x") def test_delta_write_aqe_join(spark_tmp_path): data_path = spark_tmp_path + "/DELTA_DATA" - confs=copy_and_update(delta_writes_enabled_conf, {"spark.sql.adaptive.enabled": "true"}) + confs=copy_and_update(delta_writes_enabled_conf, {"spark.sql.adaptive.enabled": "true"}, writer_confs_for_DB) def do_join(spark, path): df = unary_op_df(spark, int_gen) df.join(df, ["a"], "inner").write.format("delta").save(path) @@ -805,10 +810,12 @@ def do_join(spark, path): def test_delta_write_optimized_aqe(spark_tmp_path, enable_conf_key, aqe_enabled): num_chunks = 20 def do_write(data_path, is_optimize_write): - confs=copy_and_update(delta_writes_enabled_conf, { - enable_conf_key : str(is_optimize_write), - "spark.sql.adaptive.enabled" : str(aqe_enabled) - }) + confs=copy_and_update(delta_writes_enabled_conf, + { + enable_conf_key : str(is_optimize_write), + "spark.sql.adaptive.enabled" : str(aqe_enabled) + }, + writer_confs_for_DB) assert_gpu_and_cpu_writes_are_equal_collect( lambda spark, path: unary_op_df(spark, int_gen)\ .repartition(num_chunks).write.format("delta").save(path), @@ -909,16 +916,16 @@ def do_write(confs): lambda spark, path: spark.read.format("delta").load(path), data_path, conf=confs) - confs=copy_and_update(delta_writes_enabled_conf, { - "spark.databricks.delta.optimizeWrite.enabled" : "true" - }) + confs=copy_and_update(delta_writes_enabled_conf, + {"spark.databricks.delta.optimizeWrite.enabled" : "true"}, + writer_confs_for_DB) do_write(confs) opmetrics = get_last_operation_metrics(gpu_data_path) assert int(opmetrics["numFiles"]) == 1 # Verify SQL conf takes precedence over table setting - confs=copy_and_update(delta_writes_enabled_conf, { - "spark.databricks.delta.optimizeWrite.enabled" : "false" - }) + confs=copy_and_update(delta_writes_enabled_conf, + {"spark.databricks.delta.optimizeWrite.enabled" : "false"}, + writer_confs_for_DB) do_write(confs) opmetrics = get_last_operation_metrics(gpu_data_path) assert int(opmetrics["numFiles"]) == num_chunks @@ -927,9 +934,9 @@ def do_prop_update(spark): spark.sql("ALTER TABLE delta.`{}`".format(gpu_data_path) + " SET TBLPROPERTIES (delta.autoOptimize.optimizeWrite = true)") with_cpu_session(do_prop_update) - confs=copy_and_update(delta_writes_enabled_conf, { - "spark.databricks.delta.properties.defaults.autoOptimize.optimizeWrite" : "false" - }) + confs=copy_and_update(delta_writes_enabled_conf, + {"spark.databricks.delta.properties.defaults.autoOptimize.optimizeWrite" : "false"}, + writer_confs_for_DB) do_write(confs) opmetrics = get_last_operation_metrics(gpu_data_path) assert int(opmetrics["numFiles"]) == 1 @@ -951,16 +958,16 @@ def do_write(confs): lambda spark, path: spark.read.format("delta").load(path), data_path, conf=confs) - confs=copy_and_update(delta_writes_enabled_conf, { - 
"spark.databricks.delta.optimizeWrite.enabled" : "false" - }) + confs=copy_and_update(delta_writes_enabled_conf, + {"spark.databricks.delta.optimizeWrite.enabled" : "false"}, + writer_confs_for_DB) do_write(confs) opmetrics = get_last_operation_metrics(gpu_data_path) assert int(opmetrics["numFiles"]) == 2 * num_chunks # Verify SQL conf takes precedence over table setting - confs=copy_and_update(delta_writes_enabled_conf, { - "spark.databricks.delta.optimizeWrite.enabled" : "true" - }) + confs=copy_and_update(delta_writes_enabled_conf, + {"spark.databricks.delta.optimizeWrite.enabled" : "true"}, + writer_confs_for_DB) do_write(confs) opmetrics = get_last_operation_metrics(gpu_data_path) assert int(opmetrics["numFiles"]) == 2 diff --git a/integration_tests/src/main/python/explain_test.py b/integration_tests/src/main/python/explain_test.py index b84754a3d3f..193c1632db3 100644 --- a/integration_tests/src/main/python/explain_test.py +++ b/integration_tests/src/main/python/explain_test.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021-2022, NVIDIA CORPORATION. +# Copyright (c) 2021-2023, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -15,6 +15,7 @@ import pytest from data_gen import * +from conftest import is_not_utc, is_utc from marks import * from pyspark.sql.functions import * from pyspark.sql.types import * @@ -49,6 +50,7 @@ def do_join_explain(spark): with_cpu_session(do_join_explain) +@pytest.mark.xfail(is_not_utc(), reason="TODO sub-issue in https://github.com/NVIDIA/spark-rapids/issues/9653 to support non-UTC") def test_explain_set_config(): conf = {'spark.rapids.sql.hasExtendedYearValues': 'false', 'spark.rapids.sql.castStringToTimestamp.enabled': 'true'} @@ -68,6 +70,27 @@ def do_explain(spark): with_cpu_session(do_explain) +@pytest.mark.xfail(is_utc(), reason="TODO sub-issue in https://github.com/NVIDIA/spark-rapids/issues/9653 to support non-UTC") +@allow_non_gpu("ProjectExec") +def test_explain_set_config_for_non_utc(): + conf = {'spark.rapids.sql.hasExtendedYearValues': 'false', + 'spark.rapids.sql.castStringToTimestamp.enabled': 'true'} + + def do_explain(spark): + df = unary_op_df(spark, StringGen('[0-9]{1,4}-[0-9]{1,2}-[0-9]{1,2}')).select(f.col('a').cast(TimestampType())) + # a bit brittle if these get turned on by default + spark.conf.set('spark.rapids.sql.hasExtendedYearValues', 'false') + spark.conf.set('spark.rapids.sql.castStringToTimestamp.enabled', 'true') + explain_str = spark.sparkContext._jvm.com.nvidia.spark.rapids.ExplainPlan.explainPotentialGpuPlan(df._jdf, "ALL") + print(explain_str) + assert "timestamp) cannot run on GPU" in explain_str + spark.conf.set('spark.rapids.sql.castStringToTimestamp.enabled', 'false') + explain_str_cast_off = spark.sparkContext._jvm.com.nvidia.spark.rapids.ExplainPlan.explainPotentialGpuPlan(df._jdf, "ALL") + print(explain_str_cast_off) + assert "timestamp) cannot run on GPU" in explain_str_cast_off + + with_cpu_session(do_explain) + def test_explain_udf(): slen = udf(lambda s: len(s), IntegerType()) diff --git a/integration_tests/src/main/python/hive_write_test.py b/integration_tests/src/main/python/hive_write_test.py index cf79b996514..7c4555867aa 100644 --- a/integration_tests/src/main/python/hive_write_test.py +++ b/integration_tests/src/main/python/hive_write_test.py @@ -176,9 +176,9 @@ def do_test(spark): (from_cpu, cpu_df), (from_gpu, gpu_df) = run_with_cpu_and_gpu( do_test, 'COLLECT_WITH_DATAFRAME', - conf={ + 
conf=copy_and_update(writer_confs_for_DB, { 'spark.sql.ansi.enabled': 'true', - 'spark.sql.storeAssignmentPolicy': 'ANSI'}) + 'spark.sql.storeAssignmentPolicy': 'ANSI'})) jvm = spark_jvm() jvm.org.apache.spark.sql.rapids.ExecutionPlanCaptureCallback.assertContainsAnsiCast(cpu_df._jdf) diff --git a/integration_tests/src/main/python/json_test.py b/integration_tests/src/main/python/json_test.py index 9efd6bcca4b..d4c243aa2ea 100644 --- a/integration_tests/src/main/python/json_test.py +++ b/integration_tests/src/main/python/json_test.py @@ -19,7 +19,7 @@ assert_gpu_fallback_collect, assert_cpu_and_gpu_are_equal_collect_with_capture from data_gen import * from datetime import timezone -from conftest import is_databricks_runtime +from conftest import is_databricks_runtime, is_not_utc, is_utc from marks import approximate_float, allow_non_gpu, ignore_order from spark_session import with_cpu_session, with_gpu_session, is_before_spark_330, is_before_spark_340, \ is_before_spark_341 @@ -182,6 +182,8 @@ def test_json_date_formats_round_trip(spark_tmp_path, date_format, v1_enabled_li "'T'HH:mm[:ss]", "'T'HH:mm"] + +@pytest.mark.xfail(is_not_utc(), reason="TODO sub-issue in https://github.com/NVIDIA/spark-rapids/issues/9653 to support non-UTC") @pytest.mark.parametrize('ts_part', json_supported_ts_parts) @pytest.mark.parametrize('date_format', json_supported_date_formats) @pytest.mark.parametrize('v1_enabled_list', ["", "json"]) @@ -203,23 +205,66 @@ def test_json_ts_formats_round_trip(spark_tmp_path, date_format, ts_part, v1_ena .json(data_path), conf=updated_conf) -@allow_non_gpu('FileSourceScanExec', 'ProjectExec') +@allow_non_gpu('FileSourceScanExec', 'BatchScanExec') +@pytest.mark.skipif(is_utc(), reason="TODO sub-issue in https://github.com/NVIDIA/spark-rapids/issues/9653 to support non-UTC") +@pytest.mark.parametrize('ts_part', json_supported_ts_parts) +@pytest.mark.parametrize('date_format', json_supported_date_formats) +@pytest.mark.parametrize('v1_enabled_list', ["", "json"]) +def test_json_ts_formats_round_trip_for_non_utc(spark_tmp_path, date_format, ts_part, + v1_enabled_list): + full_format = date_format + ts_part + data_gen = TimestampGen() + gen = StructGen([('a', data_gen)], nullable=False) + data_path = spark_tmp_path + '/JSON_DATA' + schema = gen.data_type + with_cpu_session( + lambda spark: gen_df(spark, gen).write \ + .option('timestampFormat', full_format).json(data_path)) + updated_conf = copy_and_update(_enable_all_types_conf, + {'spark.sql.sources.useV1SourceList': v1_enabled_list}) + cpu_fallback_class_name = "FileSourceScanExec" if v1_enabled_list == "json" else "BatchScanExec" + assert_gpu_fallback_collect( + lambda spark: spark.read.schema(schema)\ + .option('timestampFormat', full_format).json(data_path), + cpu_fallback_class_name, + conf=updated_conf) + + +@allow_non_gpu('FileSourceScanExec', 'ProjectExec', 'BatchScanExec') +@pytest.mark.skipif(is_before_spark_341(), reason='`TIMESTAMP_NTZ` is only supported in PySpark 341+') +@pytest.mark.parametrize('ts_part', json_supported_ts_parts) +@pytest.mark.parametrize('date_format', json_supported_date_formats) +@pytest.mark.parametrize('v1_enabled_list', ["", "json"]) +def test_json_ts_formats_round_trip_ntz_fallback(spark_tmp_path, date_format, ts_part, + v1_enabled_list): + json_ts_formats_round_trip_ntz(spark_tmp_path, date_format, ts_part, 'TIMESTAMP_NTZ', + v1_enabled_list, False) + + +@pytest.mark.xfail(is_not_utc(), reason="TODO sub-issue in https://github.com/NVIDIA/spark-rapids/issues/9653 to support non-UTC") 
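+# TIMESTAMP_LTZ values are expected to be read on the GPU only when the session time zone is UTC;
+# the non-UTC counterpart below drives the same helper with force_fallback_test=True to assert
+# that the scan falls back to the CPU.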
@pytest.mark.skipif(is_before_spark_341(), reason='`TIMESTAMP_NTZ` is only supported in PySpark 341+') @pytest.mark.parametrize('ts_part', json_supported_ts_parts) @pytest.mark.parametrize('date_format', json_supported_date_formats) -@pytest.mark.parametrize("timestamp_type", ["TIMESTAMP_LTZ", "TIMESTAMP_NTZ"]) -def test_json_ts_formats_round_trip_ntz_v1(spark_tmp_path, date_format, ts_part, timestamp_type): - json_ts_formats_round_trip_ntz(spark_tmp_path, date_format, ts_part, timestamp_type, 'json', 'FileSourceScanExec') +@pytest.mark.parametrize('v1_enabled_list', ["", "json"]) +def test_json_ts_formats_round_trip_ltz(spark_tmp_path, date_format, ts_part, v1_enabled_list): + json_ts_formats_round_trip_ntz(spark_tmp_path, date_format, ts_part, 'TIMESTAMP_LTZ', + v1_enabled_list, False) -@allow_non_gpu('BatchScanExec', 'ProjectExec') + +@pytest.mark.skipif(is_utc(), reason="TODO sub-issue in https://github.com/NVIDIA/spark-rapids/issues/9653 to support non-UTC") +@allow_non_gpu('BatchScanExec', 'ProjectExec', 'FileSourceScanExec') @pytest.mark.skipif(is_before_spark_341(), reason='`TIMESTAMP_NTZ` is only supported in PySpark 341+') @pytest.mark.parametrize('ts_part', json_supported_ts_parts) @pytest.mark.parametrize('date_format', json_supported_date_formats) -@pytest.mark.parametrize("timestamp_type", ["TIMESTAMP_LTZ", "TIMESTAMP_NTZ"]) -def test_json_ts_formats_round_trip_ntz_v2(spark_tmp_path, date_format, ts_part, timestamp_type): - json_ts_formats_round_trip_ntz(spark_tmp_path, date_format, ts_part, timestamp_type, '', 'BatchScanExec') +@pytest.mark.parametrize('v1_list', ["", "json"]) +def test_json_ts_formats_round_trip_ltz_for_non_utc(spark_tmp_path, date_format, ts_part, v1_list): + json_ts_formats_round_trip_ntz(spark_tmp_path, date_format, ts_part, 'TIMESTAMP_LTZ', + v1_list, True) + -def json_ts_formats_round_trip_ntz(spark_tmp_path, date_format, ts_part, timestamp_type, v1_enabled_list, cpu_scan_class): +def json_ts_formats_round_trip_ntz(spark_tmp_path, date_format, ts_part, timestamp_type, + v1_enabled_list, force_fallback_test): + cpu_scan_class = "FileSourceScanExec" if v1_enabled_list == "json" else "BatchScanExec" full_format = date_format + ts_part data_gen = TimestampGen(tzinfo=None if timestamp_type == "TIMESTAMP_NTZ" else timezone.utc) gen = StructGen([('a', data_gen)], nullable=False) @@ -242,7 +287,7 @@ def do_read(spark): .json(data_path) - if timestamp_type == "TIMESTAMP_LTZ": + if timestamp_type == "TIMESTAMP_LTZ" and not force_fallback_test: assert_cpu_and_gpu_are_equal_collect_with_capture( lambda spark : do_read(spark), exist_classes = 'Gpu' + cpu_scan_class, @@ -250,6 +295,7 @@ def do_read(spark): conf=updated_conf) else: # we fall back to CPU due to "unsupported data types in output: TimestampNTZType" + # or unsupported timezone assert_gpu_fallback_collect( lambda spark : do_read(spark), cpu_fallback_class_name = cpu_scan_class, @@ -383,6 +429,8 @@ def test_json_read_invalid_dates(std_input_path, filename, schema, read_func, an else: assert_gpu_and_cpu_are_equal_collect(f, conf=updated_conf) + +@pytest.mark.xfail(is_not_utc(), reason="TODO sub-issue in https://github.com/NVIDIA/spark-rapids/issues/9653 to support non-UTC") @approximate_float @pytest.mark.parametrize('filename', [ 'timestamps.json', @@ -403,6 +451,25 @@ def test_json_read_valid_timestamps(std_input_path, filename, schema, read_func, f = read_func(std_input_path + '/' + filename, schema, spark_tmp_table_factory, {}) assert_gpu_and_cpu_are_equal_collect(f, conf=updated_conf) + 
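+# Non-UTC counterpart of the timestamps.json read above: the JSON scan is expected to stay on the
+# CPU, so the test asserts a fallback on FileSourceScanExec instead of comparing results.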
+@pytest.mark.skipif(is_utc(), reason="TODO sub-issue in https://github.com/NVIDIA/spark-rapids/issues/9653 to support non-UTC") +@approximate_float +@allow_non_gpu('FileSourceScanExec') +@pytest.mark.parametrize('read_func', [read_json_df, read_json_sql]) +@pytest.mark.parametrize('ansi_enabled', ["true", "false"]) +@pytest.mark.parametrize('time_parser_policy', ['LEGACY', 'CORRECTED', 'EXCEPTION']) +def test_json_read_valid_timestamps_for_non_utc(std_input_path, read_func, ansi_enabled, + time_parser_policy, spark_tmp_table_factory): + updated_conf = copy_and_update(_enable_all_types_conf, + {'spark.sql.ansi.enabled': ansi_enabled, + 'spark.sql.legacy.timeParserPolicy': time_parser_policy}) + test_f = read_func(std_input_path + '/timestamps.json', _timestamp_schema, + spark_tmp_table_factory) + assert_gpu_fallback_collect(test_f, + cpu_fallback_class_name='FileSourceScanExec', + conf=updated_conf) + + @pytest.mark.parametrize('schema', [_string_schema]) @pytest.mark.parametrize('read_func', [read_json_df, read_json_sql]) @pytest.mark.parametrize('allow_unquoted_chars', ["true"]) @@ -452,6 +519,8 @@ def test_json_read_count(spark_tmp_path, v1_enabled_list): lambda spark : spark.read.schema(schema).json(data_path), conf=updated_conf) + +@pytest.mark.xfail(is_not_utc(), reason="TODO sub-issue in https://github.com/NVIDIA/spark-rapids/issues/9653 to support non-UTC") def test_from_json_map(): # The test here is working around some inconsistencies in how the keys are parsed for maps # on the GPU the keys are dense, but on the CPU they are sparse @@ -461,6 +530,19 @@ def test_from_json_map(): .select(f.from_json(f.col('a'), 'MAP')), conf={"spark.rapids.sql.expression.JsonToStructs": True}) + +@pytest.mark.skipif(is_utc(), reason="TODO sub-issue in https://github.com/NVIDIA/spark-rapids/issues/9653 to support non-UTC") +@allow_non_gpu('ProjectExec') +def test_from_json_map_for_non_utc(): + # The test here is working around some inconsistencies in how the keys are parsed for maps + # on the GPU the keys are dense, but on the CPU they are sparse + json_string_gen = StringGen(r'{"a": "[0-9]{0,5}"(, "b": "[A-Z]{0,5}")?}') + assert_gpu_and_cpu_are_equal_collect( + lambda spark: unary_op_df(spark, json_string_gen) + .select(f.from_json(f.col('a'), 'MAP')), + conf={"spark.rapids.sql.expression.JsonToStructs": True}) + + @allow_non_gpu('ProjectExec', 'JsonToStructs') def test_from_json_map_fallback(): # The test here is working around some inconsistencies in how the keys are parsed for maps @@ -547,7 +629,7 @@ def test_read_case_col_name(spark_tmp_path, v1_enabled_list, col_name): all_confs = {'spark.sql.sources.useV1SourceList': v1_enabled_list, 'spark.rapids.sql.format.json.read.enabled': True, 'spark.rapids.sql.format.json.enabled': True} - gen_list =[('k0', LongGen(nullable=False, min_val=0, max_val=0)), + gen_list =[('k0', LongGen(nullable=False, min_val=0, max_val=0)), ('k1', LongGen(nullable=False, min_val=1, max_val=1)), ('k2', LongGen(nullable=False, min_val=2, max_val=2)), ('k3', LongGen(nullable=False, min_val=3, max_val=3)), @@ -555,7 +637,7 @@ def test_read_case_col_name(spark_tmp_path, v1_enabled_list, col_name): ('v1', LongGen()), ('v2', LongGen()), ('v3', LongGen())] - + gen = StructGen(gen_list, nullable=False) data_path = spark_tmp_path + '/JSON_DATA' with_cpu_session( @@ -566,6 +648,7 @@ def test_read_case_col_name(spark_tmp_path, v1_enabled_list, col_name): conf=all_confs) +@pytest.mark.xfail(is_not_utc(), reason="TODO sub-issue in https://github.com/NVIDIA/spark-rapids/issues/9653 
to support non-UTC") @pytest.mark.parametrize('data_gen', [byte_gen, boolean_gen, short_gen, @@ -609,4 +692,31 @@ def struct_to_json(spark): assert_gpu_and_cpu_are_equal_collect( lambda spark : struct_to_json(spark), - conf=conf) \ No newline at end of file + conf=conf) + + +@pytest.mark.skipif(is_utc(), reason="TODO sub-issue in https://github.com/NVIDIA/spark-rapids/issues/9653 to support non-UTC") +@pytest.mark.parametrize('data_gen', [byte_gen, boolean_gen, short_gen, int_gen, long_gen, + float_gen, double_gen, date_gen, timestamp_gen, + string_gen], ids=idfn) +@pytest.mark.parametrize('ignore_null_fields', ['true', 'false']) +@pytest.mark.parametrize('pretty', ['true', 'false']) +@allow_non_gpu('ProjectExec') +def test_structs_to_json_for_non_utc(data_gen, ignore_null_fields, pretty): + struct_gen = StructGen([ + ('a', data_gen), + ("b", StructGen([('child', data_gen)], nullable=True)), + ("c", ArrayGen(StructGen([('child', data_gen)], nullable=True))), + ("d", MapGen(LongGen(nullable=False), data_gen)), + ("d", MapGen(StringGen('[A-Za-z0-9]{0,10}', nullable=False), data_gen)), + ("e", ArrayGen(MapGen(LongGen(nullable=False), data_gen), nullable=True)), + ], nullable=False) + gen = StructGen([('my_struct', struct_gen)], nullable=False) + options = {'ignoreNullFields': ignore_null_fields, 'pretty': pretty} + def struct_to_json(spark): + df = gen_df(spark, gen) + return df.withColumn("my_json", f.to_json("my_struct", options)).drop("my_struct") + + conf = copy_and_update(_enable_all_types_conf, + {'spark.rapids.sql.expression.StructsToJson': True}) + assert_gpu_and_cpu_are_equal_collect(lambda spark: struct_to_json(spark), conf=conf) diff --git a/integration_tests/src/main/python/marks.py b/integration_tests/src/main/python/marks.py index f6d4d5652f0..7facba97b92 100644 --- a/integration_tests/src/main/python/marks.py +++ b/integration_tests/src/main/python/marks.py @@ -32,3 +32,4 @@ delta_lake = pytest.mark.delta_lake large_data_test = pytest.mark.large_data_test pyarrow_test = pytest.mark.pyarrow_test +disable_timezone_test = pytest.mark.disable_timezone_test diff --git a/integration_tests/src/main/python/mortgage_test.py b/integration_tests/src/main/python/mortgage_test.py index aed9aa63c85..2c313597dbc 100644 --- a/integration_tests/src/main/python/mortgage_test.py +++ b/integration_tests/src/main/python/mortgage_test.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2023, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
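Note: marks.py above only registers the new disable_timezone_test marker; the logic that actually
skips marked tests (such as test_mortgage below) when the run uses a non-UTC time zone lives in
conftest.py, which is not part of this excerpt. A minimal sketch of the assumed wiring, not the
patch's actual implementation:

    # conftest.py (hypothetical sketch): deselect tests marked @disable_timezone_test when the
    # test run is configured with a non-UTC time zone; is_not_utc() is assumed to be defined here.
    import pytest

    def pytest_collection_modifyitems(config, items):
        for item in items:
            if item.get_closest_marker('disable_timezone_test') and is_not_utc():
                item.add_marker(pytest.mark.skip(
                    reason='disable_timezone_test: skipped for non-UTC time zone runs'))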
@@ -15,13 +15,14 @@ import pytest from asserts import assert_gpu_and_cpu_are_equal_iterator -from marks import approximate_float, incompat, ignore_order, allow_non_gpu, limit +from marks import approximate_float, incompat, ignore_order, allow_non_gpu, limit, disable_timezone_test @incompat @approximate_float @limit @ignore_order @allow_non_gpu(any=True) +@disable_timezone_test def test_mortgage(mortgage): assert_gpu_and_cpu_are_equal_iterator( lambda spark : mortgage.do_test_query(spark)) diff --git a/integration_tests/src/main/python/orc_test.py b/integration_tests/src/main/python/orc_test.py index b66903955bd..003c9fafe69 100644 --- a/integration_tests/src/main/python/orc_test.py +++ b/integration_tests/src/main/python/orc_test.py @@ -16,6 +16,7 @@ from asserts import assert_cpu_and_gpu_are_equal_sql_with_capture, assert_gpu_and_cpu_are_equal_collect, assert_gpu_and_cpu_row_counts_equal, assert_gpu_fallback_collect, \ assert_cpu_and_gpu_are_equal_collect_with_capture, assert_gpu_and_cpu_writes_are_equal_collect, assert_gpu_and_cpu_are_equal_sql +from conftest import is_utc, is_not_utc from data_gen import * from marks import * from pyspark.sql.types import * @@ -62,6 +63,7 @@ def get_orc_timestamp_gen(nullable=True): reader_opt_confs_for_count = reader_opt_confs_common + [multithreaded_orc_file_reader_combine_unordered_conf] +@pytest.mark.xfail(is_not_utc(), reason="TODO sub-issue in https://github.com/NVIDIA/spark-rapids/issues/9653 to support ORC file with timestamp") @pytest.mark.parametrize('name', ['timestamp-date-test.orc']) @pytest.mark.parametrize('read_func', [read_orc_df, read_orc_sql]) @pytest.mark.parametrize('v1_enabled_list', ["", "orc"]) @@ -75,6 +77,29 @@ def test_basic_read(std_input_path, name, read_func, v1_enabled_list, orc_impl, read_func(std_input_path + '/' + name), conf=all_confs) + +@pytest.mark.skipif(is_utc(), reason="TODO sub-issue in https://github.com/NVIDIA/spark-rapids/issues/9653 to support ORC file with timestamp") +@allow_non_gpu('FileSourceScanExec', 'BatchScanExec', 'ColumnarToRowExec') +@pytest.mark.parametrize('name', ['timestamp-date-test.orc']) +@pytest.mark.parametrize('read_func', [read_orc_df, read_orc_sql]) +@pytest.mark.parametrize('v1_enabled_list', ["", "orc"]) +@pytest.mark.parametrize('orc_impl', ["native", "hive"]) +@pytest.mark.parametrize('reader_confs', reader_opt_confs, ids=idfn) +def test_basic_read_for_non_utc(std_input_path, name, read_func, v1_enabled_list, orc_impl, reader_confs): + all_confs = copy_and_update(reader_confs, { + 'spark.sql.sources.useV1SourceList': v1_enabled_list, + 'spark.sql.orc.impl': orc_impl}) + if v1_enabled_list == '' and 'read_orc_df' in str(read_func) and orc_impl != "hive": + assert_gpu_fallback_collect( + read_func(std_input_path + '/' + name), + 'BatchScanExec', + conf=all_confs) + else: + assert_gpu_fallback_collect( + read_func(std_input_path + '/' + name), + 'FileSourceScanExec', + conf=all_confs) + # ORC does not support negative scale for decimal. So here is "decimal_gens_no_neg". # Otherwise it will get the below exception. # ... 
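Note: the *_for_non_utc ORC tests in this file (test_basic_read_for_non_utc above and the variants
below) repeat the same branch to pick the scan node expected to fall back to the CPU: with
spark.sql.sources.useV1SourceList set to "orc" the v1 FileSourceScanExec is reported, otherwise the
DataSource v2 reader surfaces as BatchScanExec. A hypothetical helper capturing that choice (not
part of this patch; the tests additionally branch on read_func and orc_impl, omitted here):

    # sketch: expected CPU scan class asserted by the ORC non-UTC fallback tests
    def expected_orc_cpu_scan_class(v1_enabled_list):
        # 'orc' in useV1SourceList -> v1 FileSourceScanExec, else v2 BatchScanExec
        return 'FileSourceScanExec' if v1_enabled_list == 'orc' else 'BatchScanExec'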
@@ -153,6 +178,7 @@ def test_orc_fallback(spark_tmp_path, read_func, disable_conf): conf={disable_conf: 'false', "spark.sql.sources.useV1SourceList": "orc"}) +@pytest.mark.xfail(is_not_utc(), reason="TODO sub-issue in https://github.com/NVIDIA/spark-rapids/issues/9653 to support ORC file with timestamp") @pytest.mark.order(2) @pytest.mark.parametrize('orc_gens', orc_gens_list, ids=idfn) @pytest.mark.parametrize('read_func', [read_orc_df, read_orc_sql]) @@ -168,6 +194,31 @@ def test_read_round_trip(spark_tmp_path, orc_gens, read_func, reader_confs, v1_e read_func(data_path), conf=all_confs) +@pytest.mark.skipif(is_utc(), reason="TODO sub-issue in https://github.com/NVIDIA/spark-rapids/issues/9653 to support ORC file with timestamp") +@allow_non_gpu('FileSourceScanExec', 'ColumnarToRowExec', 'ProjectExec', 'BatchScanExec') +@pytest.mark.order(2) +@pytest.mark.parametrize('orc_gens', orc_gens_list, ids=idfn) +@pytest.mark.parametrize('read_func', [read_orc_df, read_orc_sql]) +@pytest.mark.parametrize('reader_confs', reader_opt_confs, ids=idfn) +@pytest.mark.parametrize('v1_enabled_list', ["", "orc"]) +def test_read_round_trip_for_non_utc(spark_tmp_path, orc_gens, read_func, reader_confs, v1_enabled_list): + gen_list = [('_c' + str(i), gen) for i, gen in enumerate(orc_gens)] + data_path = spark_tmp_path + '/ORC_DATA' + with_cpu_session( + lambda spark : gen_df(spark, gen_list).write.orc(data_path)) + all_confs = copy_and_update(reader_confs, {'spark.sql.sources.useV1SourceList': v1_enabled_list}) + if v1_enabled_list == '' and 'read_orc_df' in str(read_func): + assert_gpu_fallback_collect( + read_func(data_path), + 'BatchScanExec', + conf=all_confs) + else: + assert_gpu_fallback_collect( + read_func(data_path), + 'FileSourceScanExec', + conf=all_confs) + + orc_pred_push_gens = [ byte_gen, short_gen, int_gen, long_gen, float_gen, double_gen, boolean_gen, string_gen, @@ -178,8 +229,10 @@ def test_read_round_trip(spark_tmp_path, orc_gens, read_func, reader_confs, v1_e # timestamp_gen orc_timestamp_gen] +orc_pred_push_gens_no_ts, orc_pred_push_gens_ts = split_list(orc_pred_push_gens, ['Timestamp']) + @pytest.mark.order(2) -@pytest.mark.parametrize('orc_gen', orc_pred_push_gens, ids=idfn) +@pytest.mark.parametrize('orc_gen', orc_pred_push_gens_no_ts, ids=idfn) @pytest.mark.parametrize('read_func', [read_orc_df, read_orc_sql]) @pytest.mark.parametrize('v1_enabled_list', ["", "orc"]) @pytest.mark.parametrize('reader_confs', reader_opt_confs, ids=idfn) @@ -198,6 +251,58 @@ def test_pred_push_round_trip(spark_tmp_path, orc_gen, read_func, v1_enabled_lis lambda spark: rf(spark).select(f.col('a') >= s0, f.col('s1.sa') >= s0, f.col('s2.sa.ssa') >= s0), conf=all_confs) + +@pytest.mark.xfail(is_not_utc(), reason="TODO sub-issue in https://github.com/NVIDIA/spark-rapids/issues/9653 to support ORC file with timestamp") +@pytest.mark.order(2) +@pytest.mark.parametrize('orc_gen', orc_pred_push_gens_ts, ids=idfn) +@pytest.mark.parametrize('read_func', [read_orc_df, read_orc_sql]) +@pytest.mark.parametrize('v1_enabled_list', ["", "orc"]) +@pytest.mark.parametrize('reader_confs', reader_opt_confs, ids=idfn) +def test_pred_push_round_trip_2(spark_tmp_path, orc_gen, read_func, v1_enabled_list, reader_confs): + data_path = spark_tmp_path + '/ORC_DATA' + # Append two struct columns to verify nested predicate pushdown. 
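+    # Covers the timestamp generators split out of orc_pred_push_gens above; this copy is
+    # xfailed for non-UTC time zones, while the *_for_non_utc variant below asserts the CPU
+    # scan fallback instead.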
+ gen_list = [('a', RepeatSeqGen(orc_gen, 100)), ('b', orc_gen), + ('s1', StructGen([['sa', orc_gen]])), + ('s2', StructGen([['sa', StructGen([['ssa', orc_gen]])]]))] + s0 = with_cpu_session(lambda spark: gen_scalar(orc_gen, force_no_nulls=True)) + with_cpu_session( + lambda spark : gen_df(spark, gen_list).orderBy('a').write.orc(data_path)) + all_confs = copy_and_update(reader_confs, {'spark.sql.sources.useV1SourceList': v1_enabled_list}) + rf = read_func(data_path) + assert_gpu_and_cpu_are_equal_collect( + lambda spark: rf(spark).select(f.col('a') >= s0, f.col('s1.sa') >= s0, f.col('s2.sa.ssa') >= s0), + conf=all_confs) + +@allow_non_gpu('FileSourceScanExec', 'ColumnarToRowExec', 'ProjectExec', 'BatchScanExec') +@pytest.mark.skipif(is_utc(), reason="TODO sub-issue in https://github.com/NVIDIA/spark-rapids/issues/9653 to support ORC file with timestamp") +@pytest.mark.order(2) +@pytest.mark.parametrize('orc_gen', orc_pred_push_gens_ts, ids=idfn) +@pytest.mark.parametrize('read_func', [read_orc_df, read_orc_sql]) +@pytest.mark.parametrize('v1_enabled_list', ["", "orc"]) +@pytest.mark.parametrize('reader_confs', reader_opt_confs, ids=idfn) +def test_pred_push_round_trip_2_for_non_utc(spark_tmp_path, orc_gen, read_func, v1_enabled_list, reader_confs): + data_path = spark_tmp_path + '/ORC_DATA' + # Append two struct columns to verify nested predicate pushdown. + gen_list = [('a', RepeatSeqGen(orc_gen, 100)), ('b', orc_gen), + ('s1', StructGen([['sa', orc_gen]])), + ('s2', StructGen([['sa', StructGen([['ssa', orc_gen]])]]))] + s0 = with_cpu_session(lambda spark: gen_scalar(orc_gen, force_no_nulls=True)) + with_cpu_session( + lambda spark : gen_df(spark, gen_list).orderBy('a').write.orc(data_path)) + all_confs = copy_and_update(reader_confs, {'spark.sql.sources.useV1SourceList': v1_enabled_list}) + rf = read_func(data_path) + if v1_enabled_list == '' and 'read_orc_df' in str(read_func): + assert_gpu_fallback_collect( + lambda spark: rf(spark).select(f.col('a') >= s0, f.col('s1.sa') >= s0, f.col('s2.sa.ssa') >= s0), + 'BatchScanExec', + conf=all_confs) + else: + assert_gpu_fallback_collect( + lambda spark: rf(spark).select(f.col('a') >= s0, f.col('s1.sa') >= s0, f.col('s2.sa.ssa') >= s0), + 'FileSourceScanExec', + conf=all_confs) + + orc_compress_options = ['none', 'uncompressed', 'snappy', 'zlib'] # zstd is available in spark 3.2.0 and later. 
if not is_before_spark_320() and not is_spark_cdh(): @@ -237,6 +342,7 @@ def test_compress_read_round_trip(spark_tmp_path, compress, v1_enabled_list, rea lambda spark : spark.read.orc(data_path), conf=all_confs) +@pytest.mark.xfail(is_not_utc(), reason="TODO sub-issue in https://github.com/NVIDIA/spark-rapids/issues/9653 to support ORC file with timestamp") @pytest.mark.parametrize('v1_enabled_list', ["", "orc"]) @pytest.mark.parametrize('reader_confs', reader_opt_confs, ids=idfn) def test_simple_partitioned_read(spark_tmp_path, v1_enabled_list, reader_confs): @@ -261,6 +367,40 @@ def test_simple_partitioned_read(spark_tmp_path, v1_enabled_list, reader_confs): lambda spark: spark.read.orc(data_path), conf=all_confs) +@pytest.mark.skipif(is_utc(), reason="TODO sub-issue in https://github.com/NVIDIA/spark-rapids/issues/9653 to support ORC file with timestamp") +@allow_non_gpu('FileSourceScanExec', 'ColumnarToRowExec', 'BatchScanExec') +@pytest.mark.parametrize('v1_enabled_list', ["", "orc"]) +@pytest.mark.parametrize('reader_confs', reader_opt_confs, ids=idfn) +def test_simple_partitioned_read_for_non_utc(spark_tmp_path, v1_enabled_list, reader_confs): + # Once https://github.com/NVIDIA/spark-rapids/issues/131 is fixed + # we should go with a more standard set of generators + orc_gens = [byte_gen, short_gen, int_gen, long_gen, float_gen, double_gen, + string_gen, boolean_gen, DateGen(start=date(1590, 1, 1)), + orc_timestamp_gen] + gen_list = [('_c' + str(i), gen) for i, gen in enumerate(orc_gens)] + first_data_path = spark_tmp_path + '/ORC_DATA/key=0/key2=20' + with_cpu_session( + lambda spark : gen_df(spark, gen_list).write.orc(first_data_path)) + second_data_path = spark_tmp_path + '/ORC_DATA/key=1/key2=21' + with_cpu_session( + lambda spark : gen_df(spark, gen_list).write.orc(second_data_path)) + third_data_path = spark_tmp_path + '/ORC_DATA/key=2/key2=22' + with_cpu_session( + lambda spark : gen_df(spark, gen_list).write.orc(third_data_path)) + data_path = spark_tmp_path + '/ORC_DATA' + all_confs = copy_and_update(reader_confs, {'spark.sql.sources.useV1SourceList': v1_enabled_list}) + if v1_enabled_list == '': + assert_gpu_fallback_collect( + lambda spark: spark.read.orc(data_path), + 'BatchScanExec', + conf=all_confs) + else: + assert_gpu_fallback_collect( + lambda spark: spark.read.orc(data_path), + 'FileSourceScanExec', + conf=all_confs) + + # Setup external table by altering column names def setup_external_table_with_forced_positions(spark, table_name, data_path): rename_cols_query = "CREATE EXTERNAL TABLE `{}` (`col10` INT, `_c1` STRING, `col30` DOUBLE) STORED AS orc LOCATION '{}'".format(table_name, data_path) @@ -303,6 +443,7 @@ def test_partitioned_read_just_partitions(spark_tmp_path, v1_enabled_list, reade lambda spark : spark.read.orc(data_path).select("key"), conf=all_confs) +@pytest.mark.xfail(is_not_utc(), reason="TODO sub-issue in https://github.com/NVIDIA/spark-rapids/issues/9653 to support ORC file with timestamp") @pytest.mark.parametrize('v1_enabled_list', ["", "orc"]) @pytest.mark.parametrize('reader_confs', reader_opt_confs, ids=idfn) def test_merge_schema_read(spark_tmp_path, v1_enabled_list, reader_confs): @@ -325,6 +466,39 @@ def test_merge_schema_read(spark_tmp_path, v1_enabled_list, reader_confs): lambda spark : spark.read.option('mergeSchema', 'true').orc(data_path), conf=all_confs) + +@pytest.mark.skipif(is_utc(), reason="TODO sub-issue in https://github.com/NVIDIA/spark-rapids/issues/9653 to support ORC file with timestamp") 
+@allow_non_gpu('FileSourceScanExec', 'ColumnarToRowExec') +@pytest.mark.parametrize('v1_enabled_list', ["", "orc"]) +@pytest.mark.parametrize('reader_confs', reader_opt_confs, ids=idfn) +def test_merge_schema_read_for_non_utc(spark_tmp_path, v1_enabled_list, reader_confs): + # Once https://github.com/NVIDIA/spark-rapids/issues/131 is fixed + # we should go with a more standard set of generators + orc_gens = [byte_gen, short_gen, int_gen, long_gen, float_gen, double_gen, + string_gen, boolean_gen, DateGen(start=date(1590, 1, 1)), + orc_timestamp_gen] + first_gen_list = [('_c' + str(i), gen) for i, gen in enumerate(orc_gens)] + first_data_path = spark_tmp_path + '/ORC_DATA/key=0' + with_cpu_session( + lambda spark : gen_df(spark, first_gen_list).write.orc(first_data_path)) + second_gen_list = [(('_c' if i % 2 == 0 else '_b') + str(i), gen) for i, gen in enumerate(orc_gens)] + second_data_path = spark_tmp_path + '/ORC_DATA/key=1' + with_cpu_session( + lambda spark : gen_df(spark, second_gen_list).write.orc(second_data_path)) + data_path = spark_tmp_path + '/ORC_DATA' + all_confs = copy_and_update(reader_confs, {'spark.sql.sources.useV1SourceList': v1_enabled_list}) + if v1_enabled_list == '': + assert_gpu_fallback_collect( + lambda spark : spark.read.option('mergeSchema', 'true').orc(data_path), + 'BatchScanExec', + conf=all_confs) + else: + assert_gpu_fallback_collect( + lambda spark : spark.read.option('mergeSchema', 'true').orc(data_path), + 'FileSourceScanExec', + conf=all_confs) + + @pytest.mark.parametrize('v1_enabled_list', ["", "orc"]) @pytest.mark.parametrize('reader_confs', reader_opt_confs, ids=idfn) def test_read_orc_with_empty_clipped_schema(spark_tmp_path, v1_enabled_list, reader_confs): @@ -578,8 +752,8 @@ def test_read_struct_without_stream(spark_tmp_path): assert_gpu_and_cpu_are_equal_collect( lambda spark : spark.read.orc(data_path)) - -@pytest.mark.parametrize('orc_gen', flattened_orc_gens, ids=idfn) +flattened_orc_gens_no_ts, flattened_orc_gens_ts = split_timestamp(flattened_orc_gens) +@pytest.mark.parametrize('orc_gen', flattened_orc_gens_no_ts, ids=idfn) @pytest.mark.parametrize('reader_confs', reader_opt_confs, ids=idfn) @pytest.mark.parametrize('v1_enabled_list', ["", "orc"]) @pytest.mark.parametrize('case_sensitive', ["false", "true"]) @@ -607,6 +781,73 @@ def test_read_with_more_columns(spark_tmp_path, orc_gen, reader_confs, v1_enable lambda spark : spark.read.schema(rs).orc(data_path), conf=all_confs) +@pytest.mark.xfail(is_not_utc(), reason="TODO sub-issue in https://github.com/NVIDIA/spark-rapids/issues/9653 to support ORC file with timestamp") +@pytest.mark.parametrize('orc_gen', flattened_orc_gens_ts, ids=idfn) +@pytest.mark.parametrize('reader_confs', reader_opt_confs, ids=idfn) +@pytest.mark.parametrize('v1_enabled_list', ["", "orc"]) +@pytest.mark.parametrize('case_sensitive', ["false", "true"]) +def test_read_with_more_columns_2(spark_tmp_path, orc_gen, reader_confs, v1_enabled_list, case_sensitive): + struct_gen = StructGen([('nested_col', orc_gen)]) + # Map is not supported yet. 
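+    # Covers the timestamp-bearing generators split out of flattened_orc_gens above; xfailed
+    # for non-UTC time zones, while the *_for_non_utc variant below asserts the CPU scan
+    # fallback instead.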
+ gen_list = [("top_pri", orc_gen), + ("top_st", struct_gen), + ("top_ar", ArrayGen(struct_gen, max_length=10))] + data_path = spark_tmp_path + '/ORC_DATA' + with_cpu_session( + lambda spark : gen_df(spark, gen_list).write.orc(data_path)) + all_confs = reader_confs.copy() + all_confs.update({'spark.sql.sources.useV1SourceList': v1_enabled_list, + 'spark.sql.caseSensitive': case_sensitive}) + # This is a hack to get the type in a slightly less verbose way + extra_struct_gen = StructGen([('nested_col', orc_gen), ("nested_non_existing", orc_gen)]) + extra_gen_list = [("top_pri", orc_gen), + ("top_non_existing_mid", orc_gen), + ("TOP_AR", ArrayGen(extra_struct_gen, max_length=10)), + ("top_ST", extra_struct_gen), + ("top_non_existing_end", orc_gen)] + rs = StructGen(extra_gen_list, nullable=False).data_type + assert_gpu_and_cpu_are_equal_collect( + lambda spark : spark.read.schema(rs).orc(data_path), + conf=all_confs) + +@pytest.mark.skipif(is_utc(), reason="TODO sub-issue in https://github.com/NVIDIA/spark-rapids/issues/9653 to support ORC file with timestamp") +@allow_non_gpu('FileSourceScanExec', 'ColumnarToRowExec', 'BatchScanExec', 'ProjectExec') +@pytest.mark.parametrize('orc_gen', flattened_orc_gens_ts, ids=idfn) +@pytest.mark.parametrize('reader_confs', reader_opt_confs, ids=idfn) +@pytest.mark.parametrize('v1_enabled_list', ["", "orc"]) +@pytest.mark.parametrize('case_sensitive', ["false", "true"]) +def test_read_with_more_columns_2_for_non_utc(spark_tmp_path, orc_gen, reader_confs, v1_enabled_list, case_sensitive): + struct_gen = StructGen([('nested_col', orc_gen)]) + # Map is not supported yet. + gen_list = [("top_pri", orc_gen), + ("top_st", struct_gen), + ("top_ar", ArrayGen(struct_gen, max_length=10))] + data_path = spark_tmp_path + '/ORC_DATA' + with_cpu_session( + lambda spark : gen_df(spark, gen_list).write.orc(data_path)) + all_confs = reader_confs.copy() + all_confs.update({'spark.sql.sources.useV1SourceList': v1_enabled_list, + 'spark.sql.caseSensitive': case_sensitive}) + # This is a hack to get the type in a slightly less verbose way + extra_struct_gen = StructGen([('nested_col', orc_gen), ("nested_non_existing", orc_gen)]) + extra_gen_list = [("top_pri", orc_gen), + ("top_non_existing_mid", orc_gen), + ("TOP_AR", ArrayGen(extra_struct_gen, max_length=10)), + ("top_ST", extra_struct_gen), + ("top_non_existing_end", orc_gen)] + rs = StructGen(extra_gen_list, nullable=False).data_type + if v1_enabled_list == '': + assert_gpu_fallback_collect( + lambda spark : spark.read.schema(rs).orc(data_path), + 'BatchScanExec', + conf=all_confs) + else: + assert_gpu_fallback_collect( + lambda spark : spark.read.schema(rs).orc(data_path), + 'FileSourceScanExec', + conf=all_confs) + + @pytest.mark.skipif(is_before_spark_330(), reason='Hidden file metadata columns are a new feature of Spark 330') @allow_non_gpu(any = True) @pytest.mark.parametrize('metadata_column', ["file_path", "file_name", "file_size", "file_modification_time"]) @@ -768,9 +1009,10 @@ def test_orc_read_varchar_as_string(std_input_path): lambda spark: spark.read.schema("id bigint, name string").orc(std_input_path + "/test_orc_varchar.orc")) -@pytest.mark.parametrize('gens', orc_gens_list, ids=idfn) +orc_gens_list_no_ts, orc_gens_list_ts = split_timestamp(orc_gens_list) +@pytest.mark.parametrize('gens', orc_gens_list_no_ts, ids=idfn) @pytest.mark.parametrize('keep_order', [True, pytest.param(False, marks=pytest.mark.ignore_order(local=True))]) -def test_read_round_trip_for_multithreaded_combining(spark_tmp_path, gens, 
keep_order): +def test_read_round_trip_for_multithreaded_combining_1(spark_tmp_path, gens, keep_order): gen_list = [('_c' + str(i), gen) for i, gen in enumerate(gens)] data_path = spark_tmp_path + '/ORC_DATA' # 50 partitions to generate enough small files @@ -783,11 +1025,44 @@ def test_read_round_trip_for_multithreaded_combining(spark_tmp_path, gens, keep_ lambda spark: spark.read.orc(data_path), conf=all_confs) +@pytest.mark.xfail(is_not_utc(), reason="TODO sub-issue in https://github.com/NVIDIA/spark-rapids/issues/9653 to support ORC file with timestamp") +@pytest.mark.parametrize('gens', orc_gens_list_ts, ids=idfn) @pytest.mark.parametrize('keep_order', [True, pytest.param(False, marks=pytest.mark.ignore_order(local=True))]) -def test_simple_partitioned_read_for_multithreaded_combining(spark_tmp_path, keep_order): +def test_read_round_trip_for_multithreaded_combining_2(spark_tmp_path, gens, keep_order): + gen_list = [('_c' + str(i), gen) for i, gen in enumerate(gens)] + data_path = spark_tmp_path + '/ORC_DATA' + # 50 partitions to generate enough small files + with_cpu_session( + lambda spark: gen_df(spark, gen_list).repartition(50).write.orc(data_path)) + all_confs = {'spark.rapids.sql.format.orc.reader.type': 'MULTITHREADED', + 'spark.rapids.sql.reader.multithreaded.combine.sizeBytes': '64m', + 'spark.rapids.sql.reader.multithreaded.read.keepOrder': keep_order} + assert_gpu_and_cpu_are_equal_collect( + lambda spark: spark.read.orc(data_path), conf=all_confs) + + +@pytest.mark.skipif(is_utc(), reason="TODO sub-issue in https://github.com/NVIDIA/spark-rapids/issues/9653 to support ORC file with timestamp") +@allow_non_gpu('FileSourceScanExec', 'ColumnarToRowExec') +@pytest.mark.parametrize('gens', orc_gens_list_ts, ids=idfn) +@pytest.mark.parametrize('keep_order', [True, pytest.param(False, marks=pytest.mark.ignore_order(local=True))]) +def test_read_round_trip_for_multithreaded_combining_2_for_non_utc(spark_tmp_path, gens, keep_order): + gen_list = [('_c' + str(i), gen) for i, gen in enumerate(gens)] + data_path = spark_tmp_path + '/ORC_DATA' + # 50 partitions to generate enough small files + with_cpu_session( + lambda spark: gen_df(spark, gen_list).repartition(50).write.orc(data_path)) + all_confs = {'spark.rapids.sql.format.orc.reader.type': 'MULTITHREADED', + 'spark.rapids.sql.reader.multithreaded.combine.sizeBytes': '64m', + 'spark.rapids.sql.reader.multithreaded.read.keepOrder': keep_order} + assert_gpu_fallback_collect( + lambda spark: spark.read.orc(data_path), + 'FileSourceScanExec', + conf=all_confs) + +@pytest.mark.parametrize('keep_order', [True, pytest.param(False, marks=pytest.mark.ignore_order(local=True))]) +def test_simple_partitioned_read_for_multithreaded_combining_1(spark_tmp_path, keep_order): orc_gens = [byte_gen, short_gen, int_gen, long_gen, float_gen, double_gen, - string_gen, boolean_gen, DateGen(start=date(1590, 1, 1)), - orc_timestamp_gen] + string_gen, boolean_gen, DateGen(start=date(1590, 1, 1))] gen_list = [('_c' + str(i), gen) for i, gen in enumerate(orc_gens)] first_data_path = spark_tmp_path + '/ORC_DATA/key=0/key2=20' with_cpu_session( @@ -805,6 +1080,50 @@ def test_simple_partitioned_read_for_multithreaded_combining(spark_tmp_path, kee assert_gpu_and_cpu_are_equal_collect( lambda spark: spark.read.orc(data_path), conf=all_confs) +@pytest.mark.xfail(is_not_utc(), reason="TODO sub-issue in https://github.com/NVIDIA/spark-rapids/issues/9653 to support ORC file with timestamp") +@pytest.mark.parametrize('keep_order', [True, pytest.param(False, 
marks=pytest.mark.ignore_order(local=True))]) +def test_simple_partitioned_read_for_multithreaded_combining_2(spark_tmp_path, keep_order): + orc_gens = [orc_timestamp_gen] + gen_list = [('_c' + str(i), gen) for i, gen in enumerate(orc_gens)] + first_data_path = spark_tmp_path + '/ORC_DATA/key=0/key2=20' + with_cpu_session( + lambda spark: gen_df(spark, gen_list).repartition(50).write.orc(first_data_path)) + second_data_path = spark_tmp_path + '/ORC_DATA/key=1/key2=21' + with_cpu_session( + lambda spark: gen_df(spark, gen_list).repartition(50).write.orc(second_data_path)) + third_data_path = spark_tmp_path + '/ORC_DATA/key=2/key2=22' + with_cpu_session( + lambda spark: gen_df(spark, gen_list).repartition(50).write.orc(third_data_path)) + data_path = spark_tmp_path + '/ORC_DATA' + all_confs = {'spark.rapids.sql.format.orc.reader.type': 'MULTITHREADED', + 'spark.rapids.sql.reader.multithreaded.combine.sizeBytes': '64m', + 'spark.rapids.sql.reader.multithreaded.read.keepOrder': keep_order} + assert_gpu_and_cpu_are_equal_collect( + lambda spark: spark.read.orc(data_path), conf=all_confs) + +@pytest.mark.skipif(is_utc(), reason="TODO sub-issue in https://github.com/NVIDIA/spark-rapids/issues/9653 to support ORC file with timestamp") +@allow_non_gpu('FileSourceScanExec', 'ColumnarToRowExec') +@pytest.mark.parametrize('keep_order', [True, pytest.param(False, marks=pytest.mark.ignore_order(local=True))]) +def test_simple_partitioned_read_for_multithreaded_combining_2_for_non_utc(spark_tmp_path, keep_order): + orc_gens = [orc_timestamp_gen] + gen_list = [('_c' + str(i), gen) for i, gen in enumerate(orc_gens)] + first_data_path = spark_tmp_path + '/ORC_DATA/key=0/key2=20' + with_cpu_session( + lambda spark: gen_df(spark, gen_list).repartition(50).write.orc(first_data_path)) + second_data_path = spark_tmp_path + '/ORC_DATA/key=1/key2=21' + with_cpu_session( + lambda spark: gen_df(spark, gen_list).repartition(50).write.orc(second_data_path)) + third_data_path = spark_tmp_path + '/ORC_DATA/key=2/key2=22' + with_cpu_session( + lambda spark: gen_df(spark, gen_list).repartition(50).write.orc(third_data_path)) + data_path = spark_tmp_path + '/ORC_DATA' + all_confs = {'spark.rapids.sql.format.orc.reader.type': 'MULTITHREADED', + 'spark.rapids.sql.reader.multithreaded.combine.sizeBytes': '64m', + 'spark.rapids.sql.reader.multithreaded.read.keepOrder': keep_order} + assert_gpu_fallback_collect( + lambda spark: spark.read.orc(data_path), + 'FileSourceScanExec', + conf=all_confs) @pytest.mark.skipif(is_spark_340_or_later(), reason="https://github.com/NVIDIA/spark-rapids/issues/8324") @pytest.mark.parametrize('data_file', ['fixed-length-char-column-from-hive.orc']) diff --git a/integration_tests/src/main/python/parquet_test.py b/integration_tests/src/main/python/parquet_test.py index b3e04b91d93..46d8d89e903 100644 --- a/integration_tests/src/main/python/parquet_test.py +++ b/integration_tests/src/main/python/parquet_test.py @@ -1013,7 +1013,7 @@ def test_parquet_scan_without_aggregation_pushdown_not_fallback(spark_tmp_path): No aggregation will be pushed down in this test, so we should not fallback to CPU """ data_path = spark_tmp_path + "/pushdown.parquet" - + conf = copy_and_update(conf_for_parquet_aggregate_pushdown, writer_confs_for_DB) def do_parquet_scan(spark): spark.range(10).selectExpr("id", "id % 3 as p").write.partitionBy("p").mode("overwrite").parquet(data_path) df = spark.read.parquet(data_path).selectExpr("Max(p)") @@ -1021,7 +1021,7 @@ def do_parquet_scan(spark): 
assert_gpu_and_cpu_are_equal_collect( do_parquet_scan, - conf_for_parquet_aggregate_pushdown + conf=conf ) @@ -1287,7 +1287,8 @@ def test_parquet_read_daytime_interval_gpu_file(spark_tmp_path): data_path = spark_tmp_path + '/PARQUET_DATA' gen_list = [('_c1', DayTimeIntervalGen())] # write DayTimeInterval with GPU - with_gpu_session(lambda spark :gen_df(spark, gen_list).coalesce(1).write.mode("overwrite").parquet(data_path)) + with_gpu_session(lambda spark :gen_df(spark, gen_list).coalesce(1).write.mode("overwrite").parquet(data_path), + conf=writer_confs_for_DB) assert_gpu_and_cpu_are_equal_collect( lambda spark: spark.read.parquet(data_path)) diff --git a/integration_tests/src/main/python/parquet_write_test.py b/integration_tests/src/main/python/parquet_write_test.py index 6c8b8e4dca5..e97dd7b0637 100644 --- a/integration_tests/src/main/python/parquet_write_test.py +++ b/integration_tests/src/main/python/parquet_write_test.py @@ -15,6 +15,7 @@ import pytest from asserts import assert_gpu_and_cpu_sql_writes_are_equal_collect, assert_gpu_and_cpu_are_equal_collect, assert_gpu_and_cpu_writes_are_equal_collect, assert_gpu_fallback_write, assert_spark_exception +from conftest import is_utc, is_not_utc from datetime import date, datetime, timezone from data_gen import * from enum import Enum @@ -124,7 +125,8 @@ def test_write_round_trip_corner(spark_tmp_path, par_gen): assert_gpu_and_cpu_writes_are_equal_collect( lambda spark, path: gen_df(spark, gen_list, 128000, num_slices=1).write.parquet(path), lambda spark, path: spark.read.parquet(path), - data_path) + data_path, + conf=writer_confs_for_DB) @pytest.mark.parametrize('parquet_gens', [[ TimestampGen(), @@ -232,6 +234,7 @@ def start(self, rand): def test_compress_write_round_trip(spark_tmp_path, compress): data_path = spark_tmp_path + '/PARQUET_DATA' all_confs = {'spark.sql.parquet.compression.codec': compress} + all_confs = copy_and_update(all_confs, writer_confs_for_DB) assert_gpu_and_cpu_writes_are_equal_collect( lambda spark, path : binary_op_df(spark, long_gen).coalesce(1).write.parquet(path), lambda spark, path : spark.read.parquet(path), @@ -305,9 +308,11 @@ def test_ts_write_twice_fails_exception(spark_tmp_path, spark_tmp_table_factory) data_path = spark_tmp_path + '/PARQUET_DATA' table_name = spark_tmp_table_factory.get() with_gpu_session( - lambda spark : unary_op_df(spark, gen).coalesce(1).write.format("parquet").mode('overwrite').option("path", data_path).saveAsTable(table_name)) + lambda spark : unary_op_df(spark, gen).coalesce(1).write.format("parquet").mode('overwrite').option("path", data_path).saveAsTable(table_name), + conf=writer_confs_for_DB) with_gpu_session( - lambda spark : writeParquetNoOverwriteCatchException(spark, unary_op_df(spark, gen), data_path, table_name)) + lambda spark : writeParquetNoOverwriteCatchException(spark, unary_op_df(spark, gen), data_path, table_name), + conf=writer_confs_for_DB) @allow_non_gpu('DataWritingCommandExec,ExecutedCommandExec,WriteFilesExec') @pytest.mark.parametrize('ts_write', parquet_ts_write_options) @@ -468,7 +473,8 @@ def generate_map_with_empty_validity(spark, path): assert_gpu_and_cpu_writes_are_equal_collect( generate_map_with_empty_validity, lambda spark, path: spark.read.parquet(path), - data_path) + data_path, + conf=writer_confs_for_DB) @pytest.mark.parametrize('ts_write_data_gen', [('INT96', TimestampGen()), ('TIMESTAMP_MICROS', TimestampGen(start=datetime(1, 1, 1, tzinfo=timezone.utc), end=datetime(1582, 1, 1, tzinfo=timezone.utc))), @@ -495,9 +501,10 @@ def 
test_timestamp_roundtrip_no_legacy_rebase(spark_tmp_path, ts_write_data_gen, # This should be merged to `test_timestamp_roundtrip_no_legacy_rebase` above when # we have rebase for int96 supported. +@pytest.mark.xfail(is_not_utc(), reason="TODO sub-issue in https://github.com/NVIDIA/spark-rapids/issues/9653 to support non-UTC tz for Cast from StringType to DateType") @pytest.mark.parametrize('ts_write', ['TIMESTAMP_MICROS', 'TIMESTAMP_MILLIS']) @pytest.mark.parametrize('data_gen', parquet_nested_datetime_gen, ids=idfn) -def test_datetime_roundtrip_with_legacy_rebase(spark_tmp_path, ts_write, data_gen): +def test_datetime_roundtrip_with_legacy_rebase_for_utc(spark_tmp_path, ts_write, data_gen): data_path = spark_tmp_path + '/PARQUET_DATA' all_confs = {'spark.sql.parquet.outputTimestampType': ts_write, 'spark.sql.legacy.parquet.datetimeRebaseModeInWrite': 'LEGACY', @@ -510,6 +517,27 @@ def test_datetime_roundtrip_with_legacy_rebase(spark_tmp_path, ts_write, data_ge lambda spark, path: spark.read.parquet(path), data_path, conf=all_confs) + +# This should be merged to `test_timestamp_roundtrip_no_legacy_rebase` above when +# we have rebase for int96 supported. +@allow_non_gpu('DataWritingCommandExec', 'WriteFilesExec') +@pytest.mark.xfail(is_utc(), reason="TODO sub-issue in https://github.com/NVIDIA/spark-rapids/issues/9653 to support non-UTC tz for Cast from StringType to DateType") +@pytest.mark.parametrize('ts_write', ['TIMESTAMP_MICROS', 'TIMESTAMP_MILLIS']) +@pytest.mark.parametrize('data_gen', parquet_nested_datetime_gen, ids=idfn) +def test_datetime_roundtrip_with_legacy_rebase_for_non_utc(spark_tmp_path, ts_write, data_gen): + data_path = spark_tmp_path + '/PARQUET_DATA' + all_confs = {'spark.sql.parquet.outputTimestampType': ts_write, + 'spark.sql.legacy.parquet.datetimeRebaseModeInWrite': 'LEGACY', + 'spark.sql.legacy.parquet.datetimeRebaseModeInRead': 'CORRECTED', + # set the int96 rebase mode values because its LEGACY in databricks which will preclude this op from running on GPU + 'spark.sql.legacy.parquet.int96RebaseModeInWrite' : 'CORRECTED', + 'spark.sql.legacy.parquet.int96RebaseModeInRead' : 'CORRECTED'} + assert_gpu_fallback_write( + lambda spark, path: unary_op_df(spark, data_gen).coalesce(1).write.parquet(path), + lambda spark, path: spark.read.parquet(path), + data_path, + ['DataWritingCommandExec'], + conf=all_confs) test_non_empty_ctas_non_gpu_execs = ["DataWritingCommandExec", "InsertIntoHiveTable", "WriteFilesExec"] if is_spark_340_or_later() or is_databricks122_or_later() else ["DataWritingCommandExec", "HiveTableScanExec"] @@ -521,6 +549,7 @@ def test_non_empty_ctas(spark_tmp_path, spark_tmp_table_factory, allow_non_empty "spark.sql.hive.convertCTAS": "true", "spark.sql.legacy.allowNonEmptyLocationInCTAS": str(allow_non_empty) } + conf = copy_and_update(conf, writer_confs_for_DB) def test_it(spark): src_name = spark_tmp_table_factory.get() spark.sql("CREATE TABLE {}(id string) LOCATION '{}/src1'".format(src_name, data_path)) @@ -564,10 +593,11 @@ def get_nested_parquet_meta_data_for_field_id(): def test_parquet_write_field_id(spark_tmp_path): data_path = spark_tmp_path + '/PARQUET_DATA' schema, data = get_nested_parquet_meta_data_for_field_id() + conf = copy_and_update(enable_parquet_field_id_read, writer_confs_for_DB) with_gpu_session( # default write Parquet IDs lambda spark: spark.createDataFrame(data, schema).coalesce(1).write.mode("overwrite") - .parquet(data_path), conf=enable_parquet_field_id_write) + .parquet(data_path), conf=conf) # check data, for schema 
check refer to Scala test case `ParquetFieldIdSuite` assert_gpu_and_cpu_writes_are_equal_collect( @@ -575,24 +605,26 @@ def test_parquet_write_field_id(spark_tmp_path): .mode("overwrite").parquet(path), lambda spark, path: spark.read.parquet(path), data_path, - conf=enable_parquet_field_id_read) + conf=conf) @pytest.mark.skipif(is_before_spark_330(), reason='Field ID is not supported before Spark 330') def test_parquet_write_field_id_disabled(spark_tmp_path): data_path = spark_tmp_path + '/PARQUET_DATA' schema, data = get_nested_parquet_meta_data_for_field_id() + conf = copy_and_update(disable_parquet_field_id_write, writer_confs_for_DB) with_gpu_session( lambda spark: spark.createDataFrame(data, schema).coalesce(1).write.mode("overwrite") .parquet(data_path), - conf=disable_parquet_field_id_write) # disable write Parquet IDs + conf=conf) # disable write Parquet IDs + conf = copy_and_update(enable_parquet_field_id_read, writer_confs_for_DB) # check data, for schema check refer to Scala test case `ParquetFieldIdSuite` assert_gpu_and_cpu_writes_are_equal_collect( lambda spark, path: spark.createDataFrame(data, schema).coalesce(1).write .mode("overwrite").parquet(path), lambda spark, path: spark.read.parquet(path), data_path, - conf=enable_parquet_field_id_read) + conf=conf) @pytest.mark.order(1) # at the head of xdist worker queue if pytest-order is installed @pytest.mark.skipif(is_before_spark_330(), reason='DayTimeInterval is not supported before Pyspark 3.3.0') @@ -616,6 +648,7 @@ def test_concurrent_writer(spark_tmp_path): lambda spark, path: spark.read.parquet(path), data_path, copy_and_update( + writer_confs_for_DB, # 26 > 25, will not fall back to single writer {"spark.sql.maxConcurrentOutputFileWriters": 26} )) @@ -756,7 +789,7 @@ def write_partitions(spark, table_path): lambda spark, path: write_partitions(spark, path), lambda spark, path: spark.read.parquet(path), base_output_path, - conf={} + conf=writer_confs_for_DB ) def hive_timestamp_value(spark_tmp_table_factory, spark_tmp_path, ts_rebase, func): @@ -829,7 +862,7 @@ def test_write_with_planned_write_enabled(spark_tmp_path, planned_write_enabled, def test_write_list_struct_single_element(spark_tmp_path): data_path = spark_tmp_path + '/PARQUET_DATA' data_gen = ArrayGen(StructGen([('element', long_gen)], nullable=False), max_length=10, nullable=False) - conf = {} + conf = writer_confs_for_DB assert_gpu_and_cpu_writes_are_equal_collect( lambda spark, path: gen_df(spark, data_gen).write.parquet(path), lambda spark, path: spark.read.parquet(path), data_path, conf) @@ -849,4 +882,5 @@ def test_parquet_write_column_name_with_dots(spark_tmp_path): assert_gpu_and_cpu_writes_are_equal_collect( lambda spark, path: gen_df(spark, gens).coalesce(1).write.parquet(path), lambda spark, path: spark.read.parquet(path), - data_path) + data_path, + conf=writer_confs_for_DB) diff --git a/integration_tests/src/main/python/qa_nightly_select_test.py b/integration_tests/src/main/python/qa_nightly_select_test.py index ba3414e51fe..585e7bbff7c 100644 --- a/integration_tests/src/main/python/qa_nightly_select_test.py +++ b/integration_tests/src/main/python/qa_nightly_select_test.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2022, NVIDIA CORPORATION. +# Copyright (c) 2020-2023, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
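Note: the hunks below import split_list from data_gen, whose definition is outside this excerpt.
From the call sites here it is assumed to partition a list of (sql, name) entries into those not
contained in the second argument and those that are; the ORC tests earlier pass type-name strings
instead, so the real helper may also support matching by name. A minimal sketch of the membership
form used in this file:

    # sketch: split full_list into (entries not in utc_only, entries in utc_only)
    def split_list(full_list, utc_only):
        utc_only_set = set(utc_only)
        kept = [entry for entry in full_list if entry not in utc_only_set]
        moved = [entry for entry in full_list if entry in utc_only_set]
        return kept, moved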
@@ -18,12 +18,14 @@ import pyspark.sql.functions as f import datetime from argparse import ArgumentParser +from conftest import is_utc, is_not_utc +from data_gen import split_list from decimal import Decimal -from asserts import assert_gpu_and_cpu_are_equal_collect +from asserts import assert_gpu_and_cpu_are_equal_collect, assert_gpu_fallback_collect from qa_nightly_sql import * import pytest from spark_session import with_cpu_session, is_jvm_charset_utf8 -from marks import approximate_float, ignore_order, incompat, qarun +from marks import approximate_float, ignore_order, incompat, qarun, allow_non_gpu from data_gen import copy_and_update def num_stringDf(spark): @@ -154,10 +156,12 @@ def idfn(val): # some of the first/last tests need a single partition to work reliably when run on a large cluster. 'spark.sql.shuffle.partitions': '1'}) +SELECT_SQL_ALL_PASSED, SELECT_SQL_UTC = split_list(SELECT_SQL, SELECT_SQL_UTC_ONLY) + @approximate_float @incompat @qarun -@pytest.mark.parametrize('sql_query_line', SELECT_SQL, ids=idfn) +@pytest.mark.parametrize('sql_query_line', SELECT_SQL_ALL_PASSED, ids=idfn) def test_select(sql_query_line, pytestconfig): sql_query = sql_query_line[0] if sql_query: @@ -165,11 +169,42 @@ def test_select(sql_query_line, pytestconfig): with_cpu_session(num_stringDf) assert_gpu_and_cpu_are_equal_collect(lambda spark: spark.sql(sql_query), conf=_qa_conf) +@qarun +@pytest.mark.xfail(is_not_utc(), reason="TODO sub-issue in https://github.com/NVIDIA/spark-rapids/issues/9653 to support non-UTC tz for Cast from StringType to DateType") +@pytest.mark.parametrize('sql_query_line', SELECT_SQL_UTC, ids=idfn) +def test_select_for_utc(sql_query_line, pytestconfig): + sql_query = sql_query_line[0] + if sql_query: + print(sql_query) + with_cpu_session(num_stringDf) + assert_gpu_and_cpu_are_equal_collect(lambda spark: spark.sql(sql_query), conf=_qa_conf) + +@qarun +@pytest.mark.xfail(is_utc(), reason="TODO sub-issue in https://github.com/NVIDIA/spark-rapids/issues/9653 to support non-UTC tz for Cast from StringType to DateType") +@allow_non_gpu('ProjectExec') +@pytest.mark.parametrize('sql_query_line', SELECT_SQL_UTC, ids=idfn) +def test_select_for_non_utc(sql_query_line, pytestconfig): + sql_query = sql_query_line[0] + if sql_query: + print(sql_query) + if ("hour" in sql_query): + fallback_class = "Hour" + elif ("minute" in sql_query): + fallback_class = "Minute" + elif ("second" in sql_query): + fallback_class = "Second" + else: + fallback_class = "Cast" + with_cpu_session(num_stringDf) + assert_gpu_fallback_collect(lambda spark: spark.sql(sql_query), fallback_class, conf=_qa_conf) + +SELECT_NEEDS_SORT_SQL_ALL_PASSED, SELECT_NEEDS_SORT_SQL_UTC = split_list(SELECT_NEEDS_SORT_SQL, SELECT_SQL_UTC_ONLY) + @ignore_order @approximate_float @incompat @qarun -@pytest.mark.parametrize('sql_query_line', SELECT_NEEDS_SORT_SQL, ids=idfn) +@pytest.mark.parametrize('sql_query_line', SELECT_NEEDS_SORT_SQL_ALL_PASSED, ids=idfn) def test_needs_sort_select(sql_query_line, pytestconfig): sql_query = sql_query_line[0] if sql_query: @@ -177,6 +212,28 @@ def test_needs_sort_select(sql_query_line, pytestconfig): with_cpu_session(num_stringDf) assert_gpu_and_cpu_are_equal_collect(lambda spark: spark.sql(sql_query), conf=_qa_conf) +@ignore_order +@qarun +@pytest.mark.xfail(is_not_utc(), reason="TODO sub-issue in https://github.com/NVIDIA/spark-rapids/issues/9653 to support non-UTC tz for Cast from StringType to DateType") +@pytest.mark.parametrize('sql_query_line', SELECT_NEEDS_SORT_SQL_UTC, ids=idfn) +def 
test_needs_sort_select_for_utc(sql_query_line, pytestconfig): + sql_query = sql_query_line[0] + if sql_query: + print(sql_query) + with_cpu_session(num_stringDf) + assert_gpu_and_cpu_are_equal_collect(lambda spark: spark.sql(sql_query), conf=_qa_conf) + +@ignore_order +@qarun +@pytest.mark.xfail(is_utc(), reason="TODO sub-issue in https://github.com/NVIDIA/spark-rapids/issues/9653 to support non-UTC tz for Cast from StringType to DateType") +@allow_non_gpu('ProjectExec') +@pytest.mark.parametrize('sql_query_line', SELECT_NEEDS_SORT_SQL_UTC, ids=idfn) +def test_needs_sort_select_for_non_utc(sql_query_line, pytestconfig): + sql_query = sql_query_line[0] + if sql_query: + with_cpu_session(num_stringDf) + assert_gpu_fallback_collect(lambda spark: spark.sql(sql_query), "Cast", conf=_qa_conf) + @approximate_float @incompat @ignore_order(local=True) diff --git a/integration_tests/src/main/python/qa_nightly_sql.py b/integration_tests/src/main/python/qa_nightly_sql.py index c5432091c78..0101381d4e5 100644 --- a/integration_tests/src/main/python/qa_nightly_sql.py +++ b/integration_tests/src/main/python/qa_nightly_sql.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2021, NVIDIA CORPORATION. +# Copyright (c) 2020-2023, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -547,6 +547,22 @@ ("SELECT CASE WHEN timestampF > '2020-05-01 12:01:015' THEN 'good time' ELSE 'bad time' END FROM test_table", "CASE WHEN timestampF > '2020-05-01 12:01:015' THEN 'good time' ELSE 'bad time' END"), ] +SELECT_SQL_UTC_ONLY = [ +("SELECT dayofmonth(timestampF) from test_table", "dayofmonth(timestampF)"), +("SELECT hour(timestampF) from test_table", "hour(timestampF)"), +("SELECT minute(timestampF) from test_table", "minute(timestampF)"), +("SELECT second(timestampF) from test_table", "second(timestampF)"), +("SELECT year(timestampF) from test_table", "year(timestampF)"), +("SELECT month(timestampF) from test_table", "month(timestampF)"), +("SELECT IFNULL(dateF, 'nobody') as if_null FROM test_table", "IFNULL(dateF, 'nobody')"), +("SELECT IFNULL(timestampF, 'nobody') as if_null FROM test_table", "IFNULL(timestampF, 'nobody')"), +("SELECT NVL(dateF, '1990-1-1') as nvl_value FROM test_table", "NVL(dateF, '1990-1-1')"), +("SELECT NVL(timestampF, '2022-12-01 12:01:01') as nvl_value FROM test_table", "NVL(timestampF, '2022-12-01 12:01:01')"), +("SELECT dateF, COALESCE(dateF,'N/A') FROM test_table", "dateF, COALESCE(dateF,'N/A')"), +("SELECT timestampF, COALESCE(timestampF,'N/A') FROM test_table", "timestampF, COALESCE(timestampF,'N/A')"), +("SELECT SUM(byteF) OVER (PARTITION BY byteF ORDER BY CAST(dateF AS TIMESTAMP) RANGE BETWEEN INTERVAL 1 DAYS PRECEDING AND INTERVAL 1 DAYS FOLLOWING ) as sum_total FROM test_table", "SUM(byteF) OVER (PARTITION BY byteF ORDER BY CAST(dateF AS TIMESTAMP) RANGE BETWEEN INTERVAL 1 DAYS PRECEDING AND INTERVAL 1 DAYS FOLLOWING ) as sum_total"), +] + SELECT_NEEDS_SORT_SQL = [ # (" AGG functions", "AGG functions"), ("SELECT AVG(intF) FROM test_table", "AVG(intF)"), diff --git a/integration_tests/src/main/python/spark_session.py b/integration_tests/src/main/python/spark_session.py index df6f1329471..116b30d3b87 100644 --- a/integration_tests/src/main/python/spark_session.py +++ b/integration_tests/src/main/python/spark_session.py @@ -55,16 +55,6 @@ def _from_scala_map(scala_map): 'spark.sql.legacy.allowNegativeScaleOfDecimal': 'true', } -def is_tz_utc(spark=_spark): - """ - true if the tz is UTC else 
false - """ - # Now we have to do some kind of ugly internal java stuff - jvm = spark.sparkContext._jvm - utc = jvm.java.time.ZoneId.of('UTC').normalized() - sys_tz = jvm.java.time.ZoneId.systemDefault().normalized() - return utc == sys_tz - def _set_all_confs(conf): newconf = _default_conf.copy() if (should_inject_oom()): diff --git a/integration_tests/src/main/python/window_function_test.py b/integration_tests/src/main/python/window_function_test.py index 7b39594894a..171597ca3bb 100644 --- a/integration_tests/src/main/python/window_function_test.py +++ b/integration_tests/src/main/python/window_function_test.py @@ -14,7 +14,8 @@ import math import pytest -from asserts import assert_gpu_and_cpu_are_equal_collect, assert_gpu_and_cpu_are_equal_sql, assert_gpu_fallback_collect, assert_gpu_sql_fallback_collect +from asserts import assert_gpu_and_cpu_are_equal_collect, assert_gpu_and_cpu_are_equal_sql, assert_gpu_fallback_collect, assert_gpu_sql_fallback_collect, assert_gpu_fallback_sql +from conftest import is_utc, is_not_utc from data_gen import * from marks import * from pyspark.sql.types import * @@ -1447,9 +1448,9 @@ def test_window_first_last_nth_ignore_nulls(data_gen): 'SELECT a, b, c, ' + exprs_for_nth_first_last_ignore_nulls + 'FROM window_agg_table') - +@pytest.mark.xfail(is_not_utc(), reason="TODO sub-issue in https://github.com/NVIDIA/spark-rapids/issues/9653 to support non-UTC tz for Cast from StringType to DateType") @ignore_order(local=True) -def test_to_date_with_window_functions(): +def test_to_date_with_window_functions_for_utc(): """ This test ensures that date expressions participating alongside window aggregations are initialized correctly. (See: https://github.com/NVIDIA/spark-rapids/issues/5984) @@ -1486,6 +1487,22 @@ def test_to_date_with_window_functions(): """ ) +@allow_non_gpu('ProjectExec') +@pytest.mark.xfail(is_utc(), reason="TODO sub-issue in https://github.com/NVIDIA/spark-rapids/issues/9653 to support non-UTC tz for Cast from StringType to DateType") +@ignore_order(local=True) +def test_to_date_with_window_functions_for_non_utc(): + assert_gpu_fallback_sql( + df_fun=lambda spark: gen_df(spark, [('id', RepeatSeqGen(int_gen, 20)), + ('date_1', DateGen()), + ('date_2', DateGen())]), + table_name="window_input", + sql=""" + SELECT TO_DATE( CAST(date_1 AS STRING), 'yyyy-MM-dd' ) AS my_date, + SUM(1) OVER(PARTITION BY id ORDER BY date_2) AS my_sum + FROM window_input + """, + fallback_class_name="Cast" + ) @ignore_order(local=True) @approximate_float diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuCast.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuCast.scala index 5cfde1c9d15..467020d4151 100644 --- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuCast.scala +++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuCast.scala @@ -90,13 +90,17 @@ abstract class CastExprMetaBase[INPUT <: UnaryExpression with TimeZoneAwareExpre recursiveTagExprForGpuCheck() } + // tag time zone by Cast itself, do not delegate to parent class + override def tagTimeZoneBySelf: Boolean = true + protected def recursiveTagExprForGpuCheck( fromDataType: DataType = fromType, toDataType: DataType = toType, depth: Int = 0): Unit = { val checks = rule.getChecks.get.asInstanceOf[CastChecks] + val checkUtc = !conf.nonUtcTimeZoneEnabled if (depth > 0 && - !checks.gpuCanCast(fromDataType, toDataType)) { + !checks.gpuCanCast(fromDataType, toDataType, checkUtcTimeZone = checkUtc)) { willNotWorkOnGpu(s"Casting child type $fromDataType to $toDataType is not 
supported") } @@ -177,9 +181,6 @@ abstract class CastExprMetaBase[INPUT <: UnaryExpression with TimeZoneAwareExpre def buildTagMessage(entry: ConfEntry[_]): String = { s"${entry.doc}. To enable this operation on the GPU, set ${entry.key} to true." } - - // timezone tagging in type checks is good enough, so always false - override protected val needTimezoneTagging: Boolean = false } object CastOptions { diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuOverrides.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuOverrides.scala index 77fb39b9546..53357829c11 100644 --- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuOverrides.scala +++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuOverrides.scala @@ -669,9 +669,7 @@ object GpuOverrides extends Logging { case FloatType => true case DoubleType => true case DateType => true - case TimestampType => - TypeChecks.areTimestampsSupported(ZoneId.systemDefault()) && - TypeChecks.areTimestampsSupported(SQLConf.get.sessionLocalTimeZone) + case TimestampType => true case StringType => true case dt: DecimalType if allowDecimal => dt.precision <= DType.DECIMAL64_MAX_PRECISION case NullType => allowNull @@ -1728,11 +1726,11 @@ object GpuOverrides extends Logging { GpuMinute(expr) }), expr[Second]( + "Returns the second component of the string/timestamp", ExprChecks.unaryProject(TypeSig.INT, TypeSig.INT, TypeSig.TIMESTAMP, TypeSig.TIMESTAMP), (second, conf, p, r) => new UnaryExprMeta[Second](second, conf, p, r) { - override def convertToGpu(expr: Expression): GpuExpression = GpuSecond(expr) }), diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuParquetScan.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuParquetScan.scala index 34f6777c26f..3ff50f8d1d5 100644 --- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuParquetScan.scala +++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuParquetScan.scala @@ -199,7 +199,9 @@ object GpuParquetScan { s"${RapidsConf.ENABLE_PARQUET_READ} to true") } - FileFormatChecks.tag(meta, readSchema, ParquetFormatType, ReadFileOp) + val checkUtc = !meta.conf.nonUtcTimeZoneEnabled + FileFormatChecks.tag(meta, readSchema, ParquetFormatType, ReadFileOp, + checkUtcTimeZone = checkUtc) val schemaHasTimestamps = readSchema.exists { field => TrampolineUtil.dataTypeExistsRecursively(field.dataType, _.isInstanceOf[TimestampType]) diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/Plugin.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/Plugin.scala index 6520ff4c1b7..d386585ef14 100644 --- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/Plugin.scala +++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/Plugin.scala @@ -374,13 +374,12 @@ class RapidsExecutorPlugin extends ExecutorPlugin with Logging { case Some(value) => ZoneId.of(value) case None => throw new RuntimeException(s"Driver time zone cannot be determined.") } - if (TypeChecks.areTimestampsSupported(driverTimezone)) { - val executorTimezone = ZoneId.systemDefault() - if (executorTimezone.normalized() != driverTimezone.normalized()) { - throw new RuntimeException(s" Driver and executor timezone mismatch. " + - s"Driver timezone is $driverTimezone and executor timezone is " + - s"$executorTimezone. Set executor timezone to $driverTimezone.") - } + + val executorTimezone = ZoneId.systemDefault() + if (executorTimezone.normalized() != driverTimezone.normalized()) { + throw new RuntimeException(s" Driver and executor timezone mismatch. 
" + + s"Driver timezone is $driverTimezone and executor timezone is " + + s"$executorTimezone. Set executor timezone to $driverTimezone.") } GpuCoreDumpHandler.executorInit(conf, pluginContext) diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RapidsConf.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RapidsConf.scala index 2d421b9a6b3..528f3d74faa 100644 --- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RapidsConf.scala +++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RapidsConf.scala @@ -2049,6 +2049,13 @@ object RapidsConf { .longConf .createOptional + val NON_UTC_TIME_ZONE_ENABLED = conf("spark.rapids.sql.nonUtc.enabled") + .doc("An option to enable/disable non UTC time zone support.") + .startupOnly() + .internal() + .booleanConf + .createWithDefault(true) + private def printSectionHeader(category: String): Unit = println(s"\n### $category") @@ -2741,6 +2748,8 @@ class RapidsConf(conf: Map[String, String]) extends Logging { lazy val splitUntilSizeOverride: Option[Long] = get(SPLIT_UNTIL_SIZE_OVERRIDE) + lazy val nonUtcTimeZoneEnabled: Boolean = get(NON_UTC_TIME_ZONE_ENABLED) + private val optimizerDefaults = Map( // this is not accurate because CPU projections do have a cost due to appending values // to each row that is produced, but this needs to be a really small number because diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RapidsMeta.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RapidsMeta.scala index 445a99051b5..29d6f723e2a 100644 --- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RapidsMeta.scala +++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RapidsMeta.scala @@ -22,7 +22,7 @@ import scala.collection.mutable import com.nvidia.spark.rapids.shims.{DistributionUtil, SparkShimImpl} -import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference, BinaryExpression, ComplexTypeMergingExpression, Expression, QuaternaryExpression, String2TrimExpression, TernaryExpression, TimeZoneAwareExpression, UnaryExpression, WindowExpression, WindowFunction} +import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.expressions.aggregate.{AggregateExpression, AggregateFunction, ImperativeAggregate, TypedImperativeAggregate} import org.apache.spark.sql.catalyst.plans.physical.Partitioning import org.apache.spark.sql.catalyst.trees.TreeNodeTag @@ -1071,22 +1071,9 @@ abstract class BaseExprMeta[INPUT <: Expression]( val isFoldableNonLitAllowed: Boolean = false - /** - * Whether to tag a TimeZoneAwareExpression for timezone after all the other tagging - * is done. - * By default a TimeZoneAwareExpression always requires the timezone tagging, but - * there are some exceptions, e.g. 'Cast', who requires timezone tagging only when it - * has timezone sensitive type as input or output. - * - * Override this to match special cases. - */ - protected def needTimezoneTagging: Boolean = { - // A TimeZoneAwareExpression with no timezone sensitive types as input/output will - // escape from the timezone tagging in the prior type checks. So ask for tagging here. - // e.g. 'UnixTimestamp' with 'DateType' as the input, timezone will be taken into - // account when converting a Date to a Long. 
-    !(dataType +: childExprs.map(_.dataType)).exists(TypeChecks.isTimezoneSensitiveType)
-  }
+  // For common exprs like AttributeReference, just skip the UTC check
+  lazy val skipUtcCheckForCommonExpr: Boolean = conf.nonUtcTimeZoneEnabled &&
+    expr.isInstanceOf[AttributeReference]
 
   final override def tagSelfForGpu(): Unit = {
     if (wrapped.foldable && !GpuOverrides.isLit(wrapped) && !isFoldableNonLitAllowed) {
@@ -1095,13 +1082,32 @@ abstract class BaseExprMeta[INPUT <: Expression](
     }
     rule.getChecks.foreach(_.tag(this))
     tagExprForGpu()
-    wrapped match {
-      case tzAware: TimeZoneAwareExpression if needTimezoneTagging =>
-        checkTimeZoneId(tzAware.zoneId)
-      case _ => // do nothing
+
+    if (!skipUtcCheckForCommonExpr) {
+      // If the expr is time zone aware and the GPU does not yet support non-UTC time zones
+      // for it, require the zone to be UTC
+      if (!tagTimeZoneBySelf && isTimeZoneAwareExpr && !supportsNonUTCTimeZone) {
+        checkTimeZoneId(expr.asInstanceOf[TimeZoneAwareExpression].zoneId)
+      }
     }
   }
 
+  // Whether the wrapped expression is time zone aware
+  private final def isTimeZoneAwareExpr: Boolean = expr.isInstanceOf[TimeZoneAwareExpression]
+
+  /**
+   * Whether the GPU supports non-UTC time zones for this expression. Each expression that
+   * supports non-UTC time zones should override this method to return true.
+   */
+  def supportsNonUTCTimeZone: Boolean = false
+
+  /**
+   * Some time-zone-aware expressions, such as Cast, can skip the time zone check for certain
+   * input/output types (e.g. cast(int as long)). Such expressions should override this method
+   * to return true, meaning the expr meta tags the time zone itself.
+   */
+  def tagTimeZoneBySelf: Boolean = false
+
   /**
    * Called to verify that this expression will work on the GPU. For most expressions without
    * extra checks all of the checks should have already been done.
diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/TypeChecks.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/TypeChecks.scala
index 77cd8cee7fa..4ad92392bbe 100644
--- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/TypeChecks.scala
+++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/TypeChecks.scala
@@ -321,11 +321,14 @@ final class TypeSig private(
 
   /**
    * Check if this type is supported by the plugin or not.
+   *
    * @param dataType the data type to be checked
+   * @param checkUtcTimeZone whether to check for a UTC time zone.
+   *                         Set to false once both UTC and non-UTC time zones are fully supported
    * @return true if it is allowed else false.
    */
-  def isSupportedByPlugin(dataType: DataType): Boolean =
-    isSupported(initialTypes, dataType)
+  def isSupportedByPlugin(dataType: DataType, checkUtcTimeZone: Boolean = true): Boolean =
+    isSupported(initialTypes, dataType, checkUtcTimeZone)
 
   private [this] def isLitOnly(dataType: DataType): Boolean = dataType match {
     case BooleanType => litOnlyTypes.contains(TypeEnum.BOOLEAN)
@@ -348,12 +351,24 @@ final class TypeSig private(
     case _ => TypeSigUtil.isSupported(litOnlyTypes, dataType)
   }
 
-  def isSupportedBySpark(dataType: DataType): Boolean =
-    isSupported(initialTypes, dataType)
+  /**
+   * Check if this type is supported by Spark.
+   * @param checkUtcTimeZone whether to check for a UTC time zone.
+   *                         Set to false once both UTC and non-UTC time zones are fully supported
+   * @return true if it is supported else false
+   */
+  def isSupportedBySpark(dataType: DataType, checkUtcTimeZone: Boolean = true): Boolean =
+    isSupported(initialTypes, dataType, checkUtcTimeZone)
 
+  /**
+   *
+   * @param checkUtcTimeZone whether to check UTC time zone. 
+ * Set false when fully support time zones(utc and non utc) + */ private[this] def isSupported( check: TypeEnum.ValueSet, - dataType: DataType): Boolean = + dataType: DataType, + checkUtcTimeZone: Boolean): Boolean = dataType match { case BooleanType => check.contains(TypeEnum.BOOLEAN) case ByteType => check.contains(TypeEnum.BYTE) @@ -364,7 +379,11 @@ final class TypeSig private( case DoubleType => check.contains(TypeEnum.DOUBLE) case DateType => check.contains(TypeEnum.DATE) case TimestampType if check.contains(TypeEnum.TIMESTAMP) => + if (checkUtcTimeZone) { TypeChecks.areTimestampsSupported() + } else { + true + } case StringType => check.contains(TypeEnum.STRING) case dt: DecimalType => check.contains(TypeEnum.DECIMAL) && @@ -373,13 +392,13 @@ final class TypeSig private( case BinaryType => check.contains(TypeEnum.BINARY) case CalendarIntervalType => check.contains(TypeEnum.CALENDAR) case ArrayType(elementType, _) if check.contains(TypeEnum.ARRAY) => - isSupported(childTypes, elementType) + isSupported(childTypes, elementType, checkUtcTimeZone) case MapType(keyType, valueType, _) if check.contains(TypeEnum.MAP) => - isSupported(childTypes, keyType) && - isSupported(childTypes, valueType) + isSupported(childTypes, keyType, checkUtcTimeZone) && + isSupported(childTypes, valueType, checkUtcTimeZone) case StructType(fields) if check.contains(TypeEnum.STRUCT) => fields.map(_.dataType).forall { t => - isSupported(childTypes, t) + isSupported(childTypes, t, checkUtcTimeZone) } case _ => TypeSigUtil.isSupported(check, dataType) } @@ -479,7 +498,7 @@ final class TypeSig private( } def areAllSupportedByPlugin(types: Seq[DataType]): Boolean = - types.forall(isSupportedByPlugin) + types.forall(isSupportedByPlugin(_)) /** * Get the level of support for a given type compared to what Spark supports. @@ -808,10 +827,11 @@ abstract class TypeChecks[RET] { meta: RapidsMeta[_, _, _], sig: TypeSig, fields: Seq[StructField], - msgFormat: String + msgFormat: String, + checkUtcTimeZone: Boolean = true ): Unit = { val unsupportedTypes: Map[DataType, Set[String]] = fields - .filterNot(attr => sig.isSupportedByPlugin(attr.dataType)) + .filterNot(attr => sig.isSupportedByPlugin(attr.dataType, checkUtcTimeZone)) .groupBy(_.dataType) .mapValues(_.map(_.name).toSet).toMap @@ -841,10 +861,6 @@ object TypeChecks { areTimestampsSupported(SQLConf.get.sessionLocalTimeZone) } - def isTimezoneSensitiveType(dataType: DataType): Boolean = { - dataType == TimestampType - } - def timezoneNotSupportedString(dataType: DataType): String = { s"$dataType is not supported with timezone settings: (JVM:" + s" ${ZoneId.systemDefault()}, session: ${SQLConf.get.sessionLocalTimeZone})." 
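
The TypeSig.isSupported change above threads the new checkUtcTimeZone flag through nested array, map, and struct types, and only the TimestampType branch ever consults the time zone. The sketch below mirrors that recursion over Spark's DataType tree; it is an illustrative model, not the plugin's TypeSig code, and for brevity it checks only the JVM default zone, whereas the plugin also consults the session time zone.

```scala
import java.time.ZoneId
import org.apache.spark.sql.types._

object TimestampZoneWalk {
  private def timestampsOk(checkUtcTimeZone: Boolean): Boolean =
    !checkUtcTimeZone || ZoneId.systemDefault().normalized() == ZoneId.of("UTC").normalized()

  // Recursively decide support, passing the flag down into nested children.
  def isSupported(dt: DataType, checkUtcTimeZone: Boolean): Boolean = dt match {
    case TimestampType => timestampsOk(checkUtcTimeZone)
    case ArrayType(elementType, _) => isSupported(elementType, checkUtcTimeZone)
    case MapType(keyType, valueType, _) =>
      isSupported(keyType, checkUtcTimeZone) && isSupported(valueType, checkUtcTimeZone)
    case StructType(fields) => fields.map(_.dataType).forall(isSupported(_, checkUtcTimeZone))
    case _ => true // every other type is treated as time zone agnostic in this sketch
  }

  def main(args: Array[String]): Unit = {
    val nested = ArrayType(StructType(Seq(StructField("ts", TimestampType))), containsNull = true)
    println(isSupported(nested, checkUtcTimeZone = true))  // true only on a UTC JVM
    println(isSupported(nested, checkUtcTimeZone = false)) // always true
  }
}
```

Passing checkUtcTimeZone = false is the behavior callers such as GpuParquetScan opt into when spark.rapids.sql.nonUtc.enabled is on, so nested timestamp columns no longer force a fallback purely because of the time zone.
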
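
Stepping back to the BaseExprMeta hunk further above: the per-expression time zone tagging now hinges on three pieces of state, skipUtcCheckForCommonExpr, tagTimeZoneBySelf, and supportsNonUTCTimeZone. Below is a small, self-contained model of that decision; the case class, its field names, and the fallback message are stand-ins rather than plugin types.

```scala
import java.time.ZoneId

// Illustrative inputs to the tagging decision; not the plugin's expression metadata.
final case class ExprTagInfo(
    isCommonExpr: Boolean,           // e.g. an AttributeReference
    isTimeZoneAware: Boolean,        // wraps a TimeZoneAwareExpression
    supportsNonUTCTimeZone: Boolean, // the expr meta opted in to non-UTC zones
    tagTimeZoneBySelf: Boolean,      // the expr meta does its own zone tagging (e.g. Cast)
    zoneId: ZoneId)

object TimezoneTagging {
  private def isUTC(zoneId: ZoneId): Boolean =
    zoneId.normalized() == ZoneId.of("UTC").normalized()

  /** Returns a reason to fall back to the CPU, or None if the expr can stay on the GPU. */
  def tag(info: ExprTagInfo, nonUtcTimeZoneEnabled: Boolean): Option[String] = {
    val skipUtcCheckForCommonExpr = nonUtcTimeZoneEnabled && info.isCommonExpr
    val mustBeUtc = !skipUtcCheckForCommonExpr &&
      !info.tagTimeZoneBySelf && info.isTimeZoneAware && !info.supportsNonUTCTimeZone
    if (mustBeUtc && !isUTC(info.zoneId)) {
      Some(s"only UTC zone id is supported, actual zone id: ${info.zoneId}")
    } else {
      None
    }
  }

  def main(args: Array[String]): Unit = {
    val tzAwareExpr = ExprTagInfo(
      isCommonExpr = false, isTimeZoneAware = true,
      supportsNonUTCTimeZone = false, tagTimeZoneBySelf = false,
      zoneId = ZoneId.of("Asia/Tokyo"))
    println(tag(tzAwareExpr, nonUtcTimeZoneEnabled = true)) // Some(...): still UTC-only
  }
}
```

With the default spark.rapids.sql.nonUtc.enabled=true, an AttributeReference escapes the check entirely, a time-zone-aware expression that has not overridden supportsNonUTCTimeZone still requires UTC, and an expression like Cast that sets tagTimeZoneBySelf performs its own tagging instead.
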
+ @@ -893,7 +909,7 @@ case class ContextChecks( val expr = meta.wrapped.asInstanceOf[Expression] meta.typeMeta.dataType match { case Some(dt: DataType) => - if (!outputCheck.isSupportedByPlugin(dt)) { + if (!outputCheck.isSupportedByPlugin(dt, !meta.skipUtcCheckForCommonExpr)) { willNotWork(s"expression ${expr.getClass.getSimpleName} $expr " + s"produces an unsupported type $dt") } @@ -949,9 +965,10 @@ class FileFormatChecks private ( def tag(meta: RapidsMeta[_, _, _], schema: StructType, fileType: FileFormatType, - op: FileFormatOp): Unit = { + op: FileFormatOp, + checkUtcTimeZone: Boolean): Unit = { tagUnsupportedTypes(meta, sig, schema.fields, - s"unsupported data types %s in $op for $fileType") + s"unsupported data types %s in $op for $fileType", checkUtcTimeZone) } override def support(dataType: TypeEnum.Value): SupportLevel = @@ -986,8 +1003,9 @@ object FileFormatChecks { def tag(meta: RapidsMeta[_, _, _], schema: StructType, fileType: FileFormatType, - op: FileFormatOp): Unit = { - GpuOverrides.fileFormats(fileType)(op).tag(meta, schema, fileType, op) + op: FileFormatOp, + checkUtcTimeZone: Boolean = true): Unit = { + GpuOverrides.fileFormats(fileType)(op).tag(meta, schema, fileType, op, checkUtcTimeZone) } } @@ -1004,6 +1022,17 @@ class ExecChecks private( override val shown: Boolean = true) extends TypeChecks[Map[String, SupportLevel]] { + def nonUtcTimeZoneEnable(): Boolean = { + val key = RapidsConf.NON_UTC_TIME_ZONE_ENABLED.key + val nonUtc = SQLConf.get.getConfString(key, "true") + try { + nonUtc.trim.toBoolean + } catch { + case _: IllegalArgumentException => + throw new IllegalArgumentException(s"$key should be boolean, but was $nonUtc") + } + } + override def tag(rapidsMeta: RapidsMeta[_, _, _]): Unit = { val meta = rapidsMeta.asInstanceOf[SparkPlanMeta[_]] @@ -1023,6 +1052,7 @@ class ExecChecks private( s"is missing ExecChecks for ${missing.mkString(",")}") } + val checkUtcTz = !nonUtcTimeZoneEnable() namedChecks.foreach { case (fieldName, pc) => val fieldMeta = namedChildExprs(fieldName) @@ -1030,7 +1060,7 @@ class ExecChecks private( .zipWithIndex .map(t => StructField(s"c${t._2}", t._1)) tagUnsupportedTypes(meta, pc.cudf, fieldMeta, - s"unsupported data types in '$fieldName': %s") + s"unsupported data types in '$fieldName': %s", checkUtcTz) } } @@ -1480,7 +1510,9 @@ class CastChecks extends ExprChecks { val cast = meta.wrapped.asInstanceOf[UnaryExpression] val from = cast.child.dataType val to = cast.dataType - if (!gpuCanCast(from, to)) { + + val checkUtc = !meta.conf.nonUtcTimeZoneEnabled + if (!gpuCanCast(from, to, checkUtc)) { willNotWork(s"${meta.wrapped.getClass.getSimpleName} from $from to $to is not supported") } } @@ -1500,9 +1532,23 @@ class CastChecks extends ExprChecks { sparkSig.isSupportedBySpark(to) } - def gpuCanCast(from: DataType, to: DataType): Boolean = { + def gpuCanCast(from: DataType, to: DataType, checkUtcTimeZone: Boolean = true): Boolean = { val (checks, _) = getChecksAndSigs(from) - checks.isSupportedByPlugin(to) + + checks.isSupportedByPlugin(to, checkUtcTimeZone) && + gpuCanCastConsiderTimezone(from, to) + } + + // Check UTC in this method + private def gpuCanCastConsiderTimezone(from: DataType, to: DataType): Boolean = { + // remove this check after non UTC timezone is supported for cast + (from, to) match { + case (StringType, TimestampType | DateType) => TypeChecks.areTimestampsSupported() + case (TimestampType | DateType, StringType) => TypeChecks.areTimestampsSupported() + case (DateType, TimestampType) => 
TypeChecks.areTimestampsSupported() + case (TimestampType, DateType) => TypeChecks.areTimestampsSupported() + case _ => true + } } } diff --git a/sql-plugin/src/main/scala/org/apache/spark/sql/catalyst/expressions/rapids/Timestamp.scala b/sql-plugin/src/main/scala/org/apache/spark/sql/catalyst/expressions/rapids/Timestamp.scala index b441627c928..931c3144c4c 100644 --- a/sql-plugin/src/main/scala/org/apache/spark/sql/catalyst/expressions/rapids/Timestamp.scala +++ b/sql-plugin/src/main/scala/org/apache/spark/sql/catalyst/expressions/rapids/Timestamp.scala @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/datetimeExpressions.scala b/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/datetimeExpressions.scala index e0fe58b0857..dff25b27ca4 100644 --- a/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/datetimeExpressions.scala +++ b/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/datetimeExpressions.scala @@ -1045,6 +1045,7 @@ class FromUTCTimestampExprMeta( extends BinaryExprMeta[FromUTCTimestamp](expr, conf, parent, rule) { override def tagExprForGpu(): Unit = { + // remove this check after non-UTC timezone is supported extractStringLit(expr.right) match { case None => willNotWorkOnGpu("timezone input must be a literal string") diff --git a/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/execution/GpuShuffleExchangeExecBase.scala b/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/execution/GpuShuffleExchangeExecBase.scala index 5323fc89019..8e6bc4140d1 100644 --- a/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/execution/GpuShuffleExchangeExecBase.scala +++ b/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/execution/GpuShuffleExchangeExecBase.scala @@ -93,8 +93,10 @@ abstract class GpuShuffleMetaBase( val orderableTypes = GpuOverrides.pluginSupportedOrderableSig + TypeSig.ARRAY.nested(GpuOverrides.gpuCommonTypes) + // shuffle does not require UTC time zone, so skip UTC time zone check + val checkUtc = !conf.nonUtcTimeZoneEnabled shuffle.output.map(_.dataType) - .filterNot(orderableTypes.isSupportedByPlugin) + .filterNot(orderableTypes.isSupportedByPlugin(_, checkUtcTimeZone = checkUtc)) .foreach { dataType => willNotWorkOnGpu(s"round-robin partitioning cannot sort $dataType to run " + s"this on the GPU set ${SQLConf.SORT_BEFORE_REPARTITION.key} to false")
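
The CastChecks change above layers an extra gate (gpuCanCastConsiderTimezone) on top of the regular from/to type signatures, so only casts that actually interpret a time zone keep requiring UTC for now. The sketch below models that gate; as before it consults only the JVM default zone, while the plugin's check also covers the session time zone.

```scala
import java.time.ZoneId
import org.apache.spark.sql.types._

object CastTimezoneGate {
  private def utcOnly: Boolean =
    ZoneId.systemDefault().normalized() == ZoneId.of("UTC").normalized()

  // Only casts that parse or render a time zone are restricted to UTC.
  def canCastConsiderTimezone(from: DataType, to: DataType): Boolean = (from, to) match {
    case (StringType, TimestampType | DateType) => utcOnly
    case (TimestampType | DateType, StringType) => utcOnly
    case (DateType, TimestampType) => utcOnly
    case (TimestampType, DateType) => utcOnly
    case _ => true // e.g. cast(int as long) never looks at the time zone
  }

  def main(args: Array[String]): Unit = {
    println(canCastConsiderTimezone(IntegerType, LongType))     // true everywhere
    println(canCastConsiderTimezone(StringType, TimestampType)) // true only on a UTC JVM
  }
}
```
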
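
Finally, the new ExecChecks.nonUtcTimeZoneEnable helper shown earlier reads spark.rapids.sql.nonUtc.enabled as a raw string and validates it before use. Here is a minimal stand-alone version of that parse; readConf is a stand-in for SQLConf.get.getConfString, not a real API.

```scala
object NonUtcFlag {
  def parse(readConf: (String, String) => String): Boolean = {
    val key = "spark.rapids.sql.nonUtc.enabled"
    val raw = readConf(key, "true") // the conf defaults to true (non-UTC support enabled)
    try {
      raw.trim.toBoolean
    } catch {
      case _: IllegalArgumentException =>
        throw new IllegalArgumentException(s"$key should be boolean, but was $raw")
    }
  }

  def main(args: Array[String]): Unit = {
    val confs = Map("spark.rapids.sql.nonUtc.enabled" -> "false")
    println(parse((k, default) => confs.getOrElse(k, default))) // prints: false
  }
}
```
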