From e9e2e305e6940824d9f76b7dba86b4f69fdf3efa Mon Sep 17 00:00:00 2001 From: Chong Gao Date: Fri, 17 Nov 2023 20:46:42 +0800 Subject: [PATCH 01/20] re-organize imports Signed-off-by: Chong Gao --- integration_tests/src/main/python/cast_test.py | 5 ++--- integration_tests/src/main/python/csv_test.py | 5 ++--- integration_tests/src/main/python/hash_aggregate_test.py | 5 +---- integration_tests/src/main/python/hive_write_test.py | 6 ++---- integration_tests/src/main/python/json_test.py | 6 ++---- integration_tests/src/main/python/map_test.py | 5 ++--- integration_tests/src/main/python/orc_test.py | 5 ++--- integration_tests/src/main/python/parquet_test.py | 5 ++--- integration_tests/src/main/python/parquet_write_test.py | 5 ++--- 9 files changed, 17 insertions(+), 30 deletions(-) diff --git a/integration_tests/src/main/python/cast_test.py b/integration_tests/src/main/python/cast_test.py index f9bdca6000f..f2f0a619a4a 100644 --- a/integration_tests/src/main/python/cast_test.py +++ b/integration_tests/src/main/python/cast_test.py @@ -14,10 +14,9 @@ import pytest -from asserts import assert_gpu_and_cpu_are_equal_collect, assert_gpu_and_cpu_are_equal_sql, assert_gpu_and_cpu_error, assert_gpu_fallback_collect, assert_spark_exception +from asserts import * from data_gen import * -from spark_session import is_before_spark_320, is_before_spark_330, is_spark_340_or_later, \ - is_databricks113_or_later +from spark_session import * from marks import allow_non_gpu, approximate_float, datagen_overrides from pyspark.sql.types import * from spark_init_internal import spark_version diff --git a/integration_tests/src/main/python/csv_test.py b/integration_tests/src/main/python/csv_test.py index 19ad8d29151..e76d19787a2 100644 --- a/integration_tests/src/main/python/csv_test.py +++ b/integration_tests/src/main/python/csv_test.py @@ -14,14 +14,13 @@ import pytest -from asserts import assert_gpu_and_cpu_are_equal_collect, assert_gpu_and_cpu_error, assert_gpu_and_cpu_row_counts_equal, assert_gpu_fallback_write, \ - assert_cpu_and_gpu_are_equal_collect_with_capture, assert_gpu_fallback_collect +from asserts import * from conftest import get_non_gpu_allowed from datetime import datetime, timezone from data_gen import * from marks import * from pyspark.sql.types import * -from spark_session import with_cpu_session, is_before_spark_330, is_spark_350_or_later, is_before_spark_340, is_before_spark_341 +from spark_session import * _acq_schema = StructType([ StructField('loan_id', LongType()), diff --git a/integration_tests/src/main/python/hash_aggregate_test.py b/integration_tests/src/main/python/hash_aggregate_test.py index a9300a51c79..344e00fc5a0 100644 --- a/integration_tests/src/main/python/hash_aggregate_test.py +++ b/integration_tests/src/main/python/hash_aggregate_test.py @@ -15,10 +15,7 @@ import math import pytest -from asserts import assert_gpu_and_cpu_are_equal_collect, assert_gpu_and_cpu_row_counts_equal,\ - assert_gpu_and_cpu_are_equal_sql,\ - assert_gpu_fallback_collect, assert_cpu_and_gpu_are_equal_sql_with_capture,\ - assert_cpu_and_gpu_are_equal_collect_with_capture, run_with_cpu, run_with_cpu_and_gpu +from asserts import * from conftest import is_databricks_runtime from data_gen import * from functools import reduce diff --git a/integration_tests/src/main/python/hive_write_test.py b/integration_tests/src/main/python/hive_write_test.py index d7de6f1084e..8325fdfaa92 100644 --- a/integration_tests/src/main/python/hive_write_test.py +++ b/integration_tests/src/main/python/hive_write_test.py @@ -14,14 
+14,12 @@ import pytest -from asserts import assert_gpu_and_cpu_sql_writes_are_equal_collect, assert_gpu_fallback_collect, \ - assert_gpu_and_cpu_are_equal_collect, assert_equal, run_with_cpu_and_gpu +from asserts import * from conftest import spark_jvm from data_gen import * from datetime import date, datetime, timezone from marks import * -from spark_session import is_hive_available, is_spark_33X, is_spark_340_or_later, with_cpu_session, \ - is_databricks122_or_later +from spark_session import * # Using timestamps from 1970 to work around a cudf ORC bug # https://github.com/NVIDIA/spark-rapids/issues/140. diff --git a/integration_tests/src/main/python/json_test.py b/integration_tests/src/main/python/json_test.py index 5b7cee85440..7220ffb4c4e 100644 --- a/integration_tests/src/main/python/json_test.py +++ b/integration_tests/src/main/python/json_test.py @@ -15,14 +15,12 @@ import pyspark.sql.functions as f import pytest -from asserts import assert_gpu_and_cpu_are_equal_collect, assert_gpu_and_cpu_error, assert_gpu_and_cpu_row_counts_equal, \ - assert_gpu_fallback_collect, assert_cpu_and_gpu_are_equal_collect_with_capture +from asserts import * from data_gen import * from datetime import timezone from conftest import is_databricks_runtime from marks import approximate_float, allow_non_gpu, ignore_order -from spark_session import with_cpu_session, with_gpu_session, is_before_spark_330, is_before_spark_340, \ - is_before_spark_341 +from spark_session import * json_supported_gens = [ # Spark does not escape '\r' or '\n' even though it uses it to mark end of record diff --git a/integration_tests/src/main/python/map_test.py b/integration_tests/src/main/python/map_test.py index 8504b38d00d..07b01fbff87 100644 --- a/integration_tests/src/main/python/map_test.py +++ b/integration_tests/src/main/python/map_test.py @@ -14,12 +14,11 @@ import pytest -from asserts import assert_gpu_and_cpu_are_equal_collect, assert_gpu_and_cpu_error, \ - assert_gpu_fallback_collect, assert_cpu_and_gpu_are_equal_collect_with_capture +from asserts import * from data_gen import * from conftest import is_databricks_runtime from marks import allow_non_gpu, ignore_order, datagen_overrides -from spark_session import is_before_spark_330, is_databricks104_or_later, is_databricks113_or_later, is_spark_33X, is_spark_340_or_later +from spark_session import * from pyspark.sql.functions import create_map, col, lit, row_number from pyspark.sql.types import * from pyspark.sql.types import IntegralType diff --git a/integration_tests/src/main/python/orc_test.py b/integration_tests/src/main/python/orc_test.py index cbb2ee9e703..337c4de9815 100644 --- a/integration_tests/src/main/python/orc_test.py +++ b/integration_tests/src/main/python/orc_test.py @@ -14,13 +14,12 @@ import pytest -from asserts import assert_cpu_and_gpu_are_equal_sql_with_capture, assert_gpu_and_cpu_are_equal_collect, assert_gpu_and_cpu_row_counts_equal, assert_gpu_fallback_collect, \ - assert_cpu_and_gpu_are_equal_collect_with_capture, assert_gpu_and_cpu_writes_are_equal_collect, assert_gpu_and_cpu_are_equal_sql +from asserts import * from data_gen import * from marks import * from pyspark.sql.types import * from spark_init_internal import spark_version -from spark_session import with_cpu_session, is_before_spark_320, is_before_spark_330, is_spark_cdh, is_spark_340_or_later +from spark_session import * from parquet_test import _nested_pruning_schemas from conftest import is_databricks_runtime diff --git a/integration_tests/src/main/python/parquet_test.py 
b/integration_tests/src/main/python/parquet_test.py index 8efacc18d3e..e124ea7ee4e 100644 --- a/integration_tests/src/main/python/parquet_test.py +++ b/integration_tests/src/main/python/parquet_test.py @@ -15,8 +15,7 @@ import pytest -from asserts import assert_cpu_and_gpu_are_equal_collect_with_capture, assert_cpu_and_gpu_are_equal_sql_with_capture, assert_gpu_and_cpu_are_equal_collect, assert_gpu_and_cpu_row_counts_equal, \ - assert_gpu_fallback_collect, assert_gpu_and_cpu_are_equal_sql, assert_gpu_and_cpu_error, assert_spark_exception +from asserts import * from data_gen import * from parquet_write_test import parquet_nested_datetime_gen, parquet_ts_write_options from marks import * @@ -25,7 +24,7 @@ from pyspark.sql.types import * from pyspark.sql.functions import * from spark_init_internal import spark_version -from spark_session import with_cpu_session, with_gpu_session, is_before_spark_320, is_before_spark_330, is_spark_321cdh +from spark_session import * from conftest import is_databricks_runtime, is_dataproc_runtime diff --git a/integration_tests/src/main/python/parquet_write_test.py b/integration_tests/src/main/python/parquet_write_test.py index bd330b569bb..411f8cdf153 100644 --- a/integration_tests/src/main/python/parquet_write_test.py +++ b/integration_tests/src/main/python/parquet_write_test.py @@ -14,14 +14,13 @@ import pytest -from asserts import assert_gpu_and_cpu_sql_writes_are_equal_collect, assert_gpu_and_cpu_are_equal_collect, assert_gpu_and_cpu_writes_are_equal_collect, assert_gpu_fallback_write, assert_spark_exception +from asserts import * from datetime import date, datetime, timezone from data_gen import * from enum import Enum from marks import * from pyspark.sql.types import * -from spark_session import with_cpu_session, with_gpu_session, is_before_spark_330, is_before_spark_320, is_spark_cdh, \ - is_databricks_runtime, is_before_spark_340, is_spark_340_or_later, is_databricks122_or_later +from spark_session import * import pyspark.sql.functions as f import pyspark.sql.utils From 64b84712b129d73b4a7d407b37167a815093bc05 Mon Sep 17 00:00:00 2001 From: Chong Gao Date: Fri, 17 Nov 2023 21:14:08 +0800 Subject: [PATCH 02/20] xfail all the cases when it's non-UTC time zone --- integration_tests/src/main/python/aqe_test.py | 2 ++ .../src/main/python/arithmetic_ops_test.py | 3 ++ .../src/main/python/array_test.py | 21 +++++++++++- integration_tests/src/main/python/ast_test.py | 9 ++++++ .../src/main/python/cache_test.py | 5 +++ .../src/main/python/cast_test.py | 11 +++++++ integration_tests/src/main/python/cmp_test.py | 16 ++++++++++ .../src/main/python/collection_ops_test.py | 16 ++++++++++ .../src/main/python/conditionals_test.py | 10 ++++++ integration_tests/src/main/python/csv_test.py | 6 +++- .../src/main/python/datasourcev2_read_test.py | 6 ++++ .../src/main/python/date_time_test.py | 32 +++++++++++++++++++ .../src/main/python/expand_exec_test.py | 2 ++ .../src/main/python/explain_test.py | 2 ++ .../python/fastparquet_compatibility_test.py | 2 ++ .../src/main/python/generate_expr_test.py | 18 +++++++++++ .../src/main/python/hash_aggregate_test.py | 25 +++++++++++++++ .../src/main/python/hashing_test.py | 3 ++ .../main/python/hive_delimited_text_test.py | 7 +++- .../src/main/python/hive_write_test.py | 3 +- .../src/main/python/join_test.py | 30 ++++++++++++++++- .../src/main/python/json_test.py | 11 +++++++ .../src/main/python/limit_test.py | 2 ++ integration_tests/src/main/python/map_test.py | 20 ++++++++++++ .../src/main/python/mortgage_test.py | 2 ++ 
.../src/main/python/orc_cast_test.py | 5 +++ integration_tests/src/main/python/orc_test.py | 9 ++++++ .../src/main/python/orc_write_test.py | 7 ++++ .../src/main/python/parquet_test.py | 10 ++++++ .../src/main/python/parquet_write_test.py | 14 ++++++++ .../src/main/python/qa_nightly_select_test.py | 7 ++++ .../src/main/python/repart_test.py | 10 ++++++ .../src/main/python/row-based_udf_test.py | 3 ++ .../src/main/python/row_conversion_test.py | 4 +++ .../src/main/python/sample_test.py | 3 ++ .../src/main/python/schema_evolution_test.py | 2 ++ .../src/main/python/sort_test.py | 15 +++++++++ .../src/main/python/struct_test.py | 3 ++ .../src/main/python/subquery_test.py | 4 +++ .../src/main/python/time_window_test.py | 7 ++++ integration_tests/src/main/python/udf_test.py | 7 +++- .../src/main/python/window_function_test.py | 21 ++++++++++++ 42 files changed, 389 insertions(+), 6 deletions(-) diff --git a/integration_tests/src/main/python/aqe_test.py b/integration_tests/src/main/python/aqe_test.py index dd683c04fd2..189bef329d7 100755 --- a/integration_tests/src/main/python/aqe_test.py +++ b/integration_tests/src/main/python/aqe_test.py @@ -16,6 +16,7 @@ from pyspark.sql.functions import when, col, current_date, current_timestamp from pyspark.sql.types import * from asserts import assert_gpu_and_cpu_are_equal_collect, assert_cpu_and_gpu_are_equal_collect_with_capture +from conftest import is_not_utc from data_gen import * from marks import ignore_order, allow_non_gpu from spark_session import with_cpu_session, is_databricks113_or_later @@ -195,6 +196,7 @@ def do_it(spark): @ignore_order(local=True) @allow_non_gpu('BroadcastNestedLoopJoinExec', 'Cast', 'DateSub', *db_113_cpu_bnlj_join_allow) @pytest.mark.parametrize('join', joins, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_aqe_join_reused_exchange_inequality_condition(spark_tmp_path, join): data_path = spark_tmp_path + '/PARQUET_DATA' def prep(spark): diff --git a/integration_tests/src/main/python/arithmetic_ops_test.py b/integration_tests/src/main/python/arithmetic_ops_test.py index cb3c4ebd151..1408894310d 100644 --- a/integration_tests/src/main/python/arithmetic_ops_test.py +++ b/integration_tests/src/main/python/arithmetic_ops_test.py @@ -16,6 +16,7 @@ import pytest from asserts import assert_gpu_and_cpu_are_equal_collect, assert_gpu_and_cpu_error, assert_gpu_fallback_collect, assert_gpu_and_cpu_are_equal_sql +from conftest import is_not_utc from data_gen import * from marks import ignore_order, incompat, approximate_float, allow_non_gpu, datagen_overrides from pyspark.sql.types import * @@ -985,6 +986,7 @@ def test_columnar_pow(data_gen): lambda spark : binary_op_df(spark, data_gen).selectExpr('pow(a, b)')) @pytest.mark.parametrize('data_gen', all_basic_gens + _arith_decimal_gens, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_least(data_gen): num_cols = 20 s1 = with_cpu_session( @@ -1001,6 +1003,7 @@ def test_least(data_gen): f.least(*command_args))) @pytest.mark.parametrize('data_gen', all_basic_gens + _arith_decimal_gens, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_greatest(data_gen): num_cols = 20 s1 = with_cpu_session( diff --git 
a/integration_tests/src/main/python/array_test.py b/integration_tests/src/main/python/array_test.py index ec29dce70d1..49e809f61be 100644 --- a/integration_tests/src/main/python/array_test.py +++ b/integration_tests/src/main/python/array_test.py @@ -16,7 +16,7 @@ from asserts import assert_gpu_and_cpu_are_equal_collect, assert_gpu_and_cpu_are_equal_sql, assert_gpu_and_cpu_error, assert_gpu_fallback_collect from data_gen import * -from conftest import is_databricks_runtime +from conftest import is_databricks_runtime, is_not_utc from marks import incompat from spark_session import is_before_spark_313, is_before_spark_330, is_databricks113_or_later, is_spark_330_or_later, is_databricks104_or_later, is_spark_33X, is_spark_340_or_later, is_spark_330, is_spark_330cdh from pyspark.sql.types import * @@ -103,11 +103,13 @@ @pytest.mark.parametrize('data_gen', array_item_test_gens, ids=idfn) @pytest.mark.parametrize('index_gen', array_index_gens, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_array_item(data_gen, index_gen): assert_gpu_and_cpu_are_equal_collect( lambda spark: two_col_df(spark, data_gen, index_gen).selectExpr('a[b]')) @pytest.mark.parametrize('data_gen', array_item_test_gens, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_array_item_lit_ordinal(data_gen): assert_gpu_and_cpu_are_equal_collect( lambda spark: unary_op_df(spark, data_gen).selectExpr( @@ -145,6 +147,7 @@ def test_array_item_with_strict_index(strict_index_enabled, index): # No need to test this for multiple data types for array. Only one is enough, but with two kinds of invalid index. 
@pytest.mark.parametrize('index', [-2, 100, array_neg_index_gen, array_out_index_gen], ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_array_item_ansi_fail_invalid_index(index): message = "SparkArrayIndexOutOfBoundsException" if (is_databricks104_or_later() or is_spark_330_or_later()) else "java.lang.ArrayIndexOutOfBoundsException" if isinstance(index, int): @@ -171,6 +174,7 @@ def test_array_item_ansi_not_fail_all_null_data(): decimal_gen_32bit, decimal_gen_64bit, decimal_gen_128bit, binary_gen, StructGen([['child0', StructGen([['child01', IntegerGen()]])], ['child1', string_gen], ['child2', float_gen]], nullable=False), StructGen([['child0', byte_gen], ['child1', string_gen], ['child2', float_gen]], nullable=False)], ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_make_array(data_gen): (s1, s2) = with_cpu_session( lambda spark: gen_scalars_for_sql(data_gen, 2, force_no_nulls=not isinstance(data_gen, NullGen))) @@ -183,6 +187,7 @@ def test_make_array(data_gen): @pytest.mark.parametrize('data_gen', single_level_array_gens, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_orderby_array_unique(data_gen): assert_gpu_and_cpu_are_equal_sql( lambda spark : append_unique_int_col_to_df(spark, unary_op_df(spark, data_gen)), @@ -212,6 +217,7 @@ def test_orderby_array_of_structs(data_gen): @pytest.mark.parametrize('data_gen', [byte_gen, short_gen, int_gen, long_gen, float_gen, double_gen, string_gen, boolean_gen, date_gen, timestamp_gen], ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_array_contains(data_gen): arr_gen = ArrayGen(data_gen) literal = with_cpu_session(lambda spark: gen_scalar(data_gen, force_no_nulls=True)) @@ -239,6 +245,7 @@ def test_array_contains_for_nans(data_gen): @pytest.mark.parametrize('data_gen', array_item_test_gens, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_array_element_at(data_gen): assert_gpu_and_cpu_are_equal_collect( lambda spark: two_col_df(spark, data_gen, array_no_zero_index_gen).selectExpr( @@ -303,6 +310,7 @@ def test_array_element_at_zero_index_fail(index, ansi_enabled): @pytest.mark.parametrize('data_gen', array_gens_sample, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_array_transform(data_gen): def do_it(spark): columns = ['a', 'b', @@ -337,6 +345,7 @@ def do_it(spark): string_gen, boolean_gen, date_gen, timestamp_gen, null_gen] + decimal_gens @pytest.mark.parametrize('data_gen', array_min_max_gens, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_array_min_max(data_gen): assert_gpu_and_cpu_are_equal_collect( lambda spark : unary_op_df(spark, ArrayGen(data_gen)).selectExpr( @@ -361,6 +370,7 @@ def test_array_concat_decimal(data_gen): 'concat(a, a)'))) @pytest.mark.parametrize('data_gen', orderable_gens + nested_gens_sample, 
ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_array_repeat_with_count_column(data_gen): cnt_gen = IntegerGen(min_val=-5, max_val=5, special_cases=[]) cnt_not_null_gen = IntegerGen(min_val=-5, max_val=5, special_cases=[], nullable=False) @@ -374,6 +384,7 @@ def test_array_repeat_with_count_column(data_gen): @pytest.mark.parametrize('data_gen', orderable_gens + nested_gens_sample, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_array_repeat_with_count_scalar(data_gen): assert_gpu_and_cpu_are_equal_collect( lambda spark: unary_op_df(spark, data_gen).selectExpr( @@ -403,6 +414,7 @@ def test_sql_array_scalars(query): @pytest.mark.parametrize('data_gen', all_basic_gens + nested_gens_sample, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_get_array_struct_fields(data_gen): array_struct_gen = ArrayGen( StructGen([['child0', data_gen], ['child1', int_gen]]), @@ -441,6 +453,7 @@ def do_it(spark): @pytest.mark.parametrize('data_gen', array_zips_gen, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_arrays_zip(data_gen): gen = StructGen( [('a', data_gen), ('b', data_gen), ('c', data_gen), ('d', data_gen)], nullable=False) @@ -514,6 +527,7 @@ def test_array_intersect_spark330(data_gen): @incompat @pytest.mark.parametrize('data_gen', no_neg_zero_all_basic_gens_no_nans + decimal_gens, ids=idfn) @pytest.mark.skipif(not is_before_spark_313(), reason="NaN equality is only handled in Spark 3.1.3+") +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_array_intersect_before_spark313(data_gen): gen = StructGen( [('a', ArrayGen(data_gen, nullable=True)), @@ -554,6 +568,7 @@ def test_array_union(data_gen): @incompat @pytest.mark.parametrize('data_gen', no_neg_zero_all_basic_gens_no_nans + decimal_gens, ids=idfn) @pytest.mark.skipif(not is_before_spark_313(), reason="NaN equality is only handled in Spark 3.1.3+") +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_array_union_before_spark313(data_gen): gen = StructGen( [('a', ArrayGen(data_gen, nullable=True)), @@ -594,6 +609,7 @@ def test_array_except(data_gen): @incompat @pytest.mark.parametrize('data_gen', no_neg_zero_all_basic_gens_no_nans + decimal_gens, ids=idfn) @pytest.mark.skipif(not is_before_spark_313(), reason="NaN equality is only handled in Spark 3.1.3+") +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_array_except_before_spark313(data_gen): gen = StructGen( [('a', ArrayGen(data_gen, nullable=True)), @@ -635,6 +651,7 @@ def test_arrays_overlap(data_gen): @incompat @pytest.mark.parametrize('data_gen', no_neg_zero_all_basic_gens_no_nans + decimal_gens, ids=idfn) @pytest.mark.skipif(not is_before_spark_313(), reason="NaN equality is only handled in Spark 3.1.3+") +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of 
https://github.com/NVIDIA/spark-rapids/issues/9653') def test_arrays_overlap_before_spark313(data_gen): gen = StructGen( [('a', ArrayGen(data_gen, nullable=True)), @@ -672,6 +689,7 @@ def test_array_remove_scalar(data_gen): FloatGen(special_cases=_non_neg_zero_float_special_cases + [-0.0]), DoubleGen(special_cases=_non_neg_zero_double_special_cases + [-0.0]), StringGen(pattern='[0-9]{1,5}'), boolean_gen, date_gen, timestamp_gen] + decimal_gens, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_array_remove(data_gen): gen = StructGen( [('a', ArrayGen(data_gen, nullable=True)), @@ -686,6 +704,7 @@ def test_array_remove(data_gen): @pytest.mark.parametrize('data_gen', [ArrayGen(sub_gen) for sub_gen in array_gens_sample], ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_flatten_array(data_gen): assert_gpu_and_cpu_are_equal_collect( lambda spark: unary_op_df(spark, data_gen).selectExpr('flatten(a)') diff --git a/integration_tests/src/main/python/ast_test.py b/integration_tests/src/main/python/ast_test.py index e228712b7a4..32213a496be 100644 --- a/integration_tests/src/main/python/ast_test.py +++ b/integration_tests/src/main/python/ast_test.py @@ -15,6 +15,7 @@ import pytest from asserts import assert_cpu_and_gpu_are_equal_collect_with_capture +from conftest import is_not_utc from data_gen import * from marks import approximate_float, datagen_overrides from spark_session import with_cpu_session, is_before_spark_330 @@ -70,6 +71,7 @@ def assert_binary_ast(data_descr, func, conf={}): assert_gpu_ast(is_supported, lambda spark: func(binary_op_df(spark, data_gen)), conf=conf) @pytest.mark.parametrize('data_gen', [boolean_gen, byte_gen, short_gen, int_gen, long_gen, float_gen, double_gen, timestamp_gen, date_gen], ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_literal(spark_tmp_path, data_gen): # Write data to Parquet so Spark generates a plan using just the count of the data. data_path = spark_tmp_path + '/AST_TEST_DATA' @@ -79,6 +81,7 @@ def test_literal(spark_tmp_path, data_gen): func=lambda spark: spark.read.parquet(data_path).select(scalar)) @pytest.mark.parametrize('data_gen', [boolean_gen, byte_gen, short_gen, int_gen, long_gen, float_gen, double_gen, timestamp_gen, date_gen], ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_null_literal(spark_tmp_path, data_gen): # Write data to Parquet so Spark generates a plan using just the count of the data. 
data_path = spark_tmp_path + '/AST_TEST_DATA' @@ -232,6 +235,7 @@ def test_expm1(data_descr): assert_unary_ast(data_descr, lambda df: df.selectExpr('expm1(a)')) @pytest.mark.parametrize('data_descr', ast_comparable_descrs, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_eq(data_descr): (s1, s2) = with_cpu_session(lambda spark: gen_scalars(data_descr[0], 2)) assert_binary_ast(data_descr, @@ -241,6 +245,7 @@ def test_eq(data_descr): f.col('a') == f.col('b'))) @pytest.mark.parametrize('data_descr', ast_comparable_descrs, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_ne(data_descr): (s1, s2) = with_cpu_session(lambda spark: gen_scalars(data_descr[0], 2)) assert_binary_ast(data_descr, @@ -250,6 +255,7 @@ def test_ne(data_descr): f.col('a') != f.col('b'))) @pytest.mark.parametrize('data_descr', ast_comparable_descrs, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_lt(data_descr): (s1, s2) = with_cpu_session(lambda spark: gen_scalars(data_descr[0], 2)) assert_binary_ast(data_descr, @@ -260,6 +266,7 @@ def test_lt(data_descr): @datagen_overrides(seed=0, reason='https://github.com/NVIDIA/spark-rapids/issues/9711') @pytest.mark.parametrize('data_descr', ast_comparable_descrs, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_lte(data_descr): (s1, s2) = with_cpu_session(lambda spark: gen_scalars(data_descr[0], 2)) assert_binary_ast(data_descr, @@ -269,6 +276,7 @@ def test_lte(data_descr): f.col('a') <= f.col('b'))) @pytest.mark.parametrize('data_descr', ast_comparable_descrs, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_gt(data_descr): (s1, s2) = with_cpu_session(lambda spark: gen_scalars(data_descr[0], 2)) assert_binary_ast(data_descr, @@ -278,6 +286,7 @@ def test_gt(data_descr): f.col('a') > f.col('b'))) @pytest.mark.parametrize('data_descr', ast_comparable_descrs, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_gte(data_descr): (s1, s2) = with_cpu_session(lambda spark: gen_scalars(data_descr[0], 2)) assert_binary_ast(data_descr, diff --git a/integration_tests/src/main/python/cache_test.py b/integration_tests/src/main/python/cache_test.py index 662d4d9d8aa..e028e93a959 100644 --- a/integration_tests/src/main/python/cache_test.py +++ b/integration_tests/src/main/python/cache_test.py @@ -15,6 +15,7 @@ import pytest from asserts import assert_gpu_and_cpu_are_equal_collect, assert_equal +from conftest import is_not_utc from data_gen import * import pyspark.sql.functions as f from spark_session import with_cpu_session, with_gpu_session, is_before_spark_330 @@ -64,6 +65,7 @@ def test_passing_gpuExpr_as_Expr(enable_vectorized_conf): @pytest.mark.parametrize('data_gen', all_gen, ids=idfn) @pytest.mark.parametrize('enable_vectorized_conf', enable_vectorized_confs, ids=idfn) @ignore_order +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of 
https://github.com/NVIDIA/spark-rapids/issues/9653') def test_cache_join(data_gen, enable_vectorized_conf): def do_join(spark): left, right = create_df(spark, data_gen, 500, 500) @@ -91,6 +93,7 @@ def do_join(spark): @pytest.mark.parametrize('data_gen', all_gen, ids=idfn) @pytest.mark.parametrize('enable_vectorized_conf', enable_vectorized_confs, ids=idfn) @ignore_order +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_cache_expand_exec(data_gen, enable_vectorized_conf): def op_df(spark, length=2048): cached = gen_df(spark, StructGen([ @@ -165,6 +168,7 @@ def n_fold(spark): @pytest.mark.parametrize('enable_vectorized', ['true', 'false'], ids=idfn) @ignore_order @allow_non_gpu("SortExec", "ShuffleExchangeExec", "RangePartitioning") +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_cache_columnar(spark_tmp_path, data_gen, enable_vectorized, ts_write): data_path_gpu = spark_tmp_path + '/PARQUET_DATA' def read_parquet_cached(data_path): @@ -277,6 +281,7 @@ def helper(spark): @pytest.mark.parametrize('enable_vectorized_conf', enable_vectorized_confs, ids=idfn) @pytest.mark.parametrize('batch_size', [{"spark.rapids.sql.batchSizeBytes": "100"}, {}], ids=idfn) @ignore_order +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_cache_count(data_gen, with_x_session, enable_vectorized_conf, batch_size): test_conf = copy_and_update(enable_vectorized_conf, batch_size) generate_data_and_test_func_on_cached_df(with_x_session, lambda df: df.count(), data_gen, test_conf) diff --git a/integration_tests/src/main/python/cast_test.py b/integration_tests/src/main/python/cast_test.py index f2f0a619a4a..d1fa25c60d2 100644 --- a/integration_tests/src/main/python/cast_test.py +++ b/integration_tests/src/main/python/cast_test.py @@ -15,6 +15,7 @@ import pytest from asserts import * +from conftest import is_not_utc from data_gen import * from spark_session import * from marks import allow_non_gpu, approximate_float, datagen_overrides @@ -150,6 +151,7 @@ def test_cast_string_date_non_ansi(): StringGen('[0-9]{1,4}-[0-3][0-9]-[0-5][0-9][ |T][0-3][0-9]:[0-6][0-9]:[0-6][0-9]'), StringGen('[0-9]{1,4}-[0-3][0-9]-[0-5][0-9][ |T][0-3][0-9]:[0-6][0-9]:[0-6][0-9].[0-9]{0,6}Z?')], ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_cast_string_ts_valid_format(data_gen): # In Spark 3.2.0+ the valid format changed, and we cannot support all of the format. # This provides values that are valid in all of those formats. 
@@ -297,6 +299,7 @@ def _assert_cast_to_string_equal (data_gen, conf): @pytest.mark.parametrize('data_gen', all_array_gens_for_cast_to_string, ids=idfn) @pytest.mark.parametrize('legacy', ['true', 'false']) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_cast_array_to_string(data_gen, legacy): _assert_cast_to_string_equal( data_gen, @@ -316,6 +319,7 @@ def test_cast_array_with_unmatched_element_to_string(data_gen, legacy): @pytest.mark.parametrize('data_gen', basic_map_gens_for_cast_to_string, ids=idfn) @pytest.mark.parametrize('legacy', ['true', 'false']) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_cast_map_to_string(data_gen, legacy): _assert_cast_to_string_equal( data_gen, @@ -335,6 +339,7 @@ def test_cast_map_with_unmatched_element_to_string(data_gen, legacy): @pytest.mark.parametrize('data_gen', [StructGen([[str(i), gen] for i, gen in enumerate(basic_array_struct_gens_for_cast_to_string)] + [["map", MapGen(ByteGen(nullable=False), null_gen)]])], ids=idfn) @pytest.mark.parametrize('legacy', ['true', 'false']) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_cast_struct_to_string(data_gen, legacy): _assert_cast_to_string_equal( data_gen, @@ -429,6 +434,7 @@ def getDf(spark): # non ansi mode, will get null @pytest.mark.parametrize('type', [DoubleType(), FloatType()], ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_cast_float_to_timestamp_for_nan_inf(type): def fun(spark): data = [(float("inf"),), (float("-inf"),), (float("nan"),)] @@ -448,6 +454,7 @@ def fun(spark): short_gen, int_gen, long_gen_to_timestamp], ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_cast_integral_to_timestamp(gen, ansi_enabled): if(is_before_spark_330() and ansi_enabled): # 330- does not support in ANSI mode pytest.skip() @@ -456,6 +463,7 @@ def test_cast_integral_to_timestamp(gen, ansi_enabled): conf={"spark.sql.ansi.enabled": ansi_enabled}) @pytest.mark.parametrize('ansi_enabled', [True, False], ids=['ANSI_ON', 'ANSI_OFF']) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_cast_float_to_timestamp(ansi_enabled): if(is_before_spark_330() and ansi_enabled): # 330- does not support in ANSI mode pytest.skip() @@ -465,6 +473,7 @@ def test_cast_float_to_timestamp(ansi_enabled): conf={"spark.sql.ansi.enabled": ansi_enabled}) @pytest.mark.parametrize('ansi_enabled', [True, False], ids=['ANSI_ON', 'ANSI_OFF']) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_cast_double_to_timestamp(ansi_enabled): if (is_before_spark_330() and ansi_enabled): # 330- does not support in ANSI mode pytest.skip() @@ -500,12 +509,14 @@ def test_cast_timestamp_to_numeric_ansi_no_overflow(): "cast(value as float)", "cast(value as double)"), conf=ansi_enabled_conf) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of 
https://github.com/NVIDIA/spark-rapids/issues/9653') def test_cast_timestamp_to_numeric_non_ansi(): assert_gpu_and_cpu_are_equal_collect( lambda spark: unary_op_df(spark, timestamp_gen) .selectExpr("cast(a as byte)", "cast(a as short)", "cast(a as int)", "cast(a as long)", "cast(a as float)", "cast(a as double)")) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_cast_timestamp_to_string(): assert_gpu_and_cpu_are_equal_collect( lambda spark: unary_op_df(spark, timestamp_gen) diff --git a/integration_tests/src/main/python/cmp_test.py b/integration_tests/src/main/python/cmp_test.py index e21c2e68a2f..dc0527f8f0d 100644 --- a/integration_tests/src/main/python/cmp_test.py +++ b/integration_tests/src/main/python/cmp_test.py @@ -15,6 +15,7 @@ import pytest from asserts import assert_gpu_and_cpu_are_equal_collect +from conftest import is_not_utc from data_gen import * from spark_session import with_cpu_session, is_before_spark_330 from pyspark.sql.types import * @@ -22,6 +23,7 @@ import pyspark.sql.functions as f @pytest.mark.parametrize('data_gen', eq_gens_with_decimal_gen + struct_gens_sample_with_decimal128_no_list, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_eq(data_gen): (s1, s2) = with_cpu_session( lambda spark: gen_scalars(data_gen, 2, force_no_nulls=not isinstance(data_gen, NullGen))) @@ -55,6 +57,7 @@ def test_func(data_gen): test_func(data_gen) @pytest.mark.parametrize('data_gen', eq_gens_with_decimal_gen + struct_gens_sample_with_decimal128_no_list, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_eq_ns(data_gen): (s1, s2) = with_cpu_session( lambda spark: gen_scalars(data_gen, 2, force_no_nulls=not isinstance(data_gen, NullGen))) @@ -82,6 +85,7 @@ def test_eq_ns_for_interval(): f.col('a').eqNullSafe(f.col('b')))) @pytest.mark.parametrize('data_gen', eq_gens_with_decimal_gen + struct_gens_sample_with_decimal128_no_list, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_ne(data_gen): (s1, s2) = with_cpu_session( lambda spark: gen_scalars(data_gen, 2, force_no_nulls=not isinstance(data_gen, NullGen))) @@ -115,6 +119,7 @@ def test_func(data_gen): test_func(data_gen) @pytest.mark.parametrize('data_gen', orderable_gens + struct_gens_sample_with_decimal128_no_list, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_lt(data_gen): (s1, s2) = with_cpu_session( lambda spark: gen_scalars(data_gen, 2, force_no_nulls=not isinstance(data_gen, NullGen))) @@ -148,6 +153,7 @@ def test_func(data_gen): test_func(data_gen) @pytest.mark.parametrize('data_gen', orderable_gens + struct_gens_sample_with_decimal128_no_list, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_lte(data_gen): (s1, s2) = with_cpu_session( lambda spark: gen_scalars(data_gen, 2, force_no_nulls=not isinstance(data_gen, NullGen))) @@ -181,6 +187,7 @@ def test_func(data_gen): test_func(data_gen) @pytest.mark.parametrize('data_gen', 
orderable_gens, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_gt(data_gen): (s1, s2) = with_cpu_session( lambda spark: gen_scalars(data_gen, 2, force_no_nulls=not isinstance(data_gen, NullGen))) @@ -214,6 +221,7 @@ def test_func(data_gen): test_func(data_gen) @pytest.mark.parametrize('data_gen', orderable_gens + struct_gens_sample_with_decimal128_no_list, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_gte(data_gen): (s1, s2) = with_cpu_session( lambda spark: gen_scalars(data_gen, 2, force_no_nulls=not isinstance(data_gen, NullGen))) @@ -247,6 +255,7 @@ def test_func(data_gen): test_func(data_gen) @pytest.mark.parametrize('data_gen', eq_gens_with_decimal_gen + [binary_gen] + array_gens_sample + struct_gens_sample + map_gens_sample, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_isnull(data_gen): assert_gpu_and_cpu_are_equal_collect( lambda spark : unary_op_df(spark, data_gen).select( @@ -266,23 +275,27 @@ def test_isnan(data_gen): f.isnan(f.col('a')))) @pytest.mark.parametrize('data_gen', eq_gens_with_decimal_gen + [binary_gen] + array_gens_sample + struct_gens_sample + map_gens_sample, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_dropna_any(data_gen): assert_gpu_and_cpu_are_equal_collect( lambda spark : binary_op_df(spark, data_gen).dropna()) @pytest.mark.parametrize('data_gen', eq_gens_with_decimal_gen + [binary_gen] + array_gens_sample + struct_gens_sample + map_gens_sample, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_dropna_all(data_gen): assert_gpu_and_cpu_are_equal_collect( lambda spark : binary_op_df(spark, data_gen).dropna(how='all')) #dropna is really a filter along with a test for null, but lets do an explicit filter test too @pytest.mark.parametrize('data_gen', eq_gens_with_decimal_gen + array_gens_sample + struct_gens_sample + map_gens_sample, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_filter(data_gen): assert_gpu_and_cpu_are_equal_collect( lambda spark : three_col_df(spark, BooleanGen(), data_gen, data_gen).filter(f.col('a'))) # coalesce batch happens after a filter, but only if something else happens on the GPU after that @pytest.mark.parametrize('data_gen', eq_gens_with_decimal_gen + array_gens_sample + struct_gens_sample + map_gens_sample, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_filter_with_project(data_gen): assert_gpu_and_cpu_are_equal_collect( lambda spark : two_col_df(spark, BooleanGen(), data_gen).filter(f.col('a')).selectExpr('*', 'a as a2')) @@ -292,6 +305,7 @@ def test_filter_with_project(data_gen): # and some constants that then make it so all we need is the number of rows # of input. 
@pytest.mark.parametrize('op', ['>', '<']) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_empty_filter(op, spark_tmp_path): def do_it(spark): @@ -320,6 +334,7 @@ def test_filter_with_lit(expr): # Spark supports two different versions of 'IN', and it depends on the spark.sql.optimizer.inSetConversionThreshold conf # This is to test entries under that value. @pytest.mark.parametrize('data_gen', eq_gens_with_decimal_gen, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_in(data_gen): # nulls are not supported for in on the GPU yet num_entries = int(with_cpu_session(lambda spark: spark.conf.get('spark.sql.optimizer.inSetConversionThreshold'))) - 1 @@ -332,6 +347,7 @@ def test_in(data_gen): # This is to test entries over that value. @datagen_overrides(seed=0, reason='https://github.com/NVIDIA/spark-rapids/issues/9687') @pytest.mark.parametrize('data_gen', eq_gens_with_decimal_gen, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_in_set(data_gen): # nulls are not supported for in on the GPU yet num_entries = int(with_cpu_session(lambda spark: spark.conf.get('spark.sql.optimizer.inSetConversionThreshold'))) + 1 diff --git a/integration_tests/src/main/python/collection_ops_test.py b/integration_tests/src/main/python/collection_ops_test.py index 971523248ab..43cc782df0f 100644 --- a/integration_tests/src/main/python/collection_ops_test.py +++ b/integration_tests/src/main/python/collection_ops_test.py @@ -15,6 +15,7 @@ import pytest from asserts import assert_gpu_and_cpu_are_equal_collect, assert_gpu_and_cpu_error +from conftest import is_not_utc from data_gen import * from pyspark.sql.types import * from string_test import mk_str_gen @@ -34,6 +35,7 @@ for sub_gen in all_gen + [null_gen]] @pytest.mark.parametrize('data_gen', non_nested_array_gens + nested_array_gens_sample, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_concat_list(data_gen): assert_gpu_and_cpu_are_equal_collect( lambda spark: three_col_df(spark, data_gen, data_gen, data_gen).selectExpr( @@ -44,6 +46,7 @@ def test_concat_list(data_gen): ) @pytest.mark.parametrize('dg', non_nested_array_gens, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_concat_double_list_with_lit(dg): data_gen = ArrayGen(dg, max_length=2) array_lit = with_cpu_session(lambda spark: gen_scalar(data_gen)) @@ -67,6 +70,7 @@ def test_concat_double_list_with_lit(dg): @pytest.mark.parametrize('data_gen', non_nested_array_gens, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_concat_list_with_lit(data_gen): lit_col1 = with_cpu_session(lambda spark: f.lit(gen_scalar(data_gen))).cast(data_gen.data_type) lit_col2 = with_cpu_session(lambda spark: f.lit(gen_scalar(data_gen))).cast(data_gen.data_type) @@ -95,6 +99,7 @@ def test_concat_string(): f.concat(f.col('a'), f.lit('')))) @pytest.mark.parametrize('data_gen', map_gens_sample + decimal_64_map_gens + decimal_128_map_gens, 
ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_map_concat(data_gen): assert_gpu_and_cpu_are_equal_collect( lambda spark: three_col_df(spark, data_gen, data_gen, data_gen @@ -106,6 +111,7 @@ def test_map_concat(data_gen): ) @pytest.mark.parametrize('data_gen', map_gens_sample + decimal_64_map_gens + decimal_128_map_gens, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_map_concat_with_lit(data_gen): lit_col1 = with_cpu_session(lambda spark: f.lit(gen_scalar(data_gen))).cast(data_gen.data_type) lit_col2 = with_cpu_session(lambda spark: f.lit(gen_scalar(data_gen))).cast(data_gen.data_type) @@ -119,6 +125,7 @@ def test_map_concat_with_lit(data_gen): @pytest.mark.parametrize('data_gen', all_gen + nested_gens, ids=idfn) @pytest.mark.parametrize('size_of_null', ['true', 'false'], ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_size_of_array(data_gen, size_of_null): gen = ArrayGen(data_gen) assert_gpu_and_cpu_are_equal_collect( @@ -127,12 +134,14 @@ def test_size_of_array(data_gen, size_of_null): @pytest.mark.parametrize('data_gen', map_gens_sample, ids=idfn) @pytest.mark.parametrize('size_of_null', ['true', 'false'], ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_size_of_map(data_gen, size_of_null): assert_gpu_and_cpu_are_equal_collect( lambda spark: unary_op_df(spark, data_gen).selectExpr('size(a)'), conf={'spark.sql.legacy.sizeOfNull': size_of_null}) @pytest.mark.parametrize('data_gen', array_gens_sample + [string_gen], ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_reverse(data_gen): assert_gpu_and_cpu_are_equal_collect( lambda spark: unary_op_df(spark, data_gen).selectExpr('reverse(a)')) @@ -143,6 +152,7 @@ def test_reverse(data_gen): ] @pytest.mark.parametrize('data_gen', _sort_array_gens, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_sort_array(data_gen): assert_gpu_and_cpu_are_equal_collect( lambda spark: unary_op_df(spark, data_gen).select( @@ -150,6 +160,7 @@ def test_sort_array(data_gen): f.sort_array(f.col('a'), False))) @pytest.mark.parametrize('data_gen', _sort_array_gens, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_sort_array_lit(data_gen): array_lit = with_cpu_session(lambda spark: gen_scalar(data_gen)) assert_gpu_and_cpu_are_equal_collect( @@ -250,6 +261,7 @@ def test_sort_array_normalize_nans(): gens in sequence_normal_integral_gens] @pytest.mark.parametrize('start_gen,stop_gen', sequence_normal_no_step_integral_gens, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_sequence_without_step(start_gen, stop_gen): assert_gpu_and_cpu_are_equal_collect( lambda spark: two_col_df(spark, start_gen, stop_gen).selectExpr( @@ -258,6 
+270,7 @@ def test_sequence_without_step(start_gen, stop_gen): "sequence(20, b)")) @pytest.mark.parametrize('start_gen,stop_gen,step_gen', sequence_normal_integral_gens, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_sequence_with_step(start_gen, stop_gen, step_gen): # Get the datagen seed we use for all datagens, since we need to call start # on step_gen @@ -304,6 +317,7 @@ def test_sequence_with_step(start_gen, stop_gen, step_gen): ] @pytest.mark.parametrize('start_gen,stop_gen,step_gen', sequence_illegal_boundaries_integral_gens, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_sequence_illegal_boundaries(start_gen, stop_gen, step_gen): assert_gpu_and_cpu_error( lambda spark:three_col_df(spark, start_gen, stop_gen, step_gen).selectExpr( @@ -318,6 +332,7 @@ def test_sequence_illegal_boundaries(start_gen, stop_gen, step_gen): ] @pytest.mark.parametrize('stop_gen', sequence_too_long_length_gens, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_sequence_too_long_sequence(stop_gen): assert_gpu_and_cpu_error( # To avoid OOM, reduce the row number to 1, it is enough to verify this case. @@ -359,6 +374,7 @@ def get_sequence_data(gen, len): mixed_schema) # test for 3 cases mixed in a single dataset +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_sequence_with_step_mixed_cases(): assert_gpu_and_cpu_are_equal_collect( lambda spark: get_sequence_cases_mixed_df(spark) diff --git a/integration_tests/src/main/python/conditionals_test.py b/integration_tests/src/main/python/conditionals_test.py index c819a64f549..de9c50546a2 100644 --- a/integration_tests/src/main/python/conditionals_test.py +++ b/integration_tests/src/main/python/conditionals_test.py @@ -15,6 +15,7 @@ import pytest from asserts import assert_gpu_and_cpu_are_equal_collect +from conftest import is_not_utc from data_gen import * from spark_session import is_before_spark_320, is_jvm_charset_utf8 from pyspark.sql.types import * @@ -44,6 +45,7 @@ def mk_str_gen(pattern): if_nested_gens = if_array_gens_sample + if_struct_gens_sample @pytest.mark.parametrize('data_gen', all_gens + if_nested_gens, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_if_else(data_gen): (s1, s2) = with_cpu_session( lambda spark: gen_scalars_for_sql(data_gen, 2, force_no_nulls=not isinstance(data_gen, NullGen))) @@ -63,6 +65,7 @@ def test_if_else(data_gen): # Maps scalars are not really supported by Spark from python without jumping through a lot of hoops # so for now we are going to skip them @pytest.mark.parametrize('data_gen', map_gens_sample, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_if_else_map(data_gen): assert_gpu_and_cpu_are_equal_collect( lambda spark : three_col_df(spark, boolean_gen, data_gen, data_gen).selectExpr( @@ -72,6 +75,7 @@ def test_if_else_map(data_gen): @datagen_overrides(seed=0, reason='https://github.com/NVIDIA/spark-rapids/issues/9685') 
@pytest.mark.order(1) # at the head of xdist worker queue if pytest-order is installed @pytest.mark.parametrize('data_gen', all_gens + all_nested_gens, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_case_when(data_gen): num_cmps = 20 s1 = with_cpu_session( @@ -115,6 +119,7 @@ def test_nanvl(data_gen): f.nanvl(f.lit(float('nan')).cast(data_type), f.col('b')))) @pytest.mark.parametrize('data_gen', all_basic_gens + decimal_gens, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_nvl(data_gen): (s1, s2) = with_cpu_session( lambda spark: gen_scalars_for_sql(data_gen, 2, force_no_nulls=not isinstance(data_gen, NullGen))) @@ -134,6 +139,7 @@ def test_nvl(data_gen): # at least one `BoundReference` @datagen_overrides(seed=0, reason='https://github.com/NVIDIA/spark-rapids/issues/9684') @pytest.mark.parametrize('data_gen', all_gens + all_nested_gens_nonempty_struct + map_gens_sample, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_coalesce(data_gen): num_cols = 20 s1 = with_cpu_session( @@ -155,6 +161,7 @@ def test_coalesce_constant_output(): lambda spark : spark.range(1, 100).selectExpr("4 + coalesce(5, id) as nine")) @pytest.mark.parametrize('data_gen', all_basic_gens + decimal_gens, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_nvl2(data_gen): (s1, s2) = with_cpu_session( lambda spark: gen_scalars_for_sql(data_gen, 2, force_no_nulls=not isinstance(data_gen, NullGen))) @@ -168,6 +175,7 @@ def test_nvl2(data_gen): 'nvl2(a, {}, c)'.format(null_lit))) @pytest.mark.parametrize('data_gen', eq_gens_with_decimal_gen, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_nullif(data_gen): (s1, s2) = with_cpu_session( lambda spark: gen_scalars_for_sql(data_gen, 2, force_no_nulls=not isinstance(data_gen, NullGen))) @@ -181,6 +189,7 @@ def test_nullif(data_gen): 'nullif(a, {})'.format(null_lit))) @pytest.mark.parametrize('data_gen', eq_gens_with_decimal_gen, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_ifnull(data_gen): (s1, s2) = with_cpu_session( lambda spark: gen_scalars_for_sql(data_gen, 2, force_no_nulls=not isinstance(data_gen, NullGen))) @@ -232,6 +241,7 @@ def test_conditional_with_side_effects_case_when(data_gen): conf = test_conf) @pytest.mark.parametrize('data_gen', [mk_str_gen('[a-z]{0,3}')], ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_conditional_with_side_effects_sequence(data_gen): assert_gpu_and_cpu_are_equal_collect( lambda spark : unary_op_df(spark, data_gen).selectExpr( diff --git a/integration_tests/src/main/python/csv_test.py b/integration_tests/src/main/python/csv_test.py index e76d19787a2..74dab108c25 100644 --- a/integration_tests/src/main/python/csv_test.py +++ b/integration_tests/src/main/python/csv_test.py @@ -15,7 +15,7 @@ import pytest from asserts 
import * -from conftest import get_non_gpu_allowed +from conftest import get_non_gpu_allowed, is_not_utc from datetime import datetime, timezone from data_gen import * from marks import * @@ -248,6 +248,7 @@ def read_impl(spark): @pytest.mark.parametrize('read_func', [read_csv_df, read_csv_sql]) @pytest.mark.parametrize('v1_enabled_list', ["", "csv"]) @pytest.mark.parametrize('ansi_enabled', ["true", "false"]) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_basic_csv_read(std_input_path, name, schema, options, read_func, v1_enabled_list, ansi_enabled, spark_tmp_table_factory): updated_conf=copy_and_update(_enable_all_types_conf, { 'spark.sql.sources.useV1SourceList': v1_enabled_list, @@ -288,6 +289,7 @@ def test_csv_read_small_floats(std_input_path, name, schema, options, read_func, @approximate_float @pytest.mark.parametrize('data_gen', csv_supported_gens, ids=idfn) @pytest.mark.parametrize('v1_enabled_list', ["", "csv"]) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_round_trip(spark_tmp_path, data_gen, v1_enabled_list): gen = StructGen([('a', data_gen)], nullable=False) data_path = spark_tmp_path + '/CSV_DATA' @@ -404,6 +406,7 @@ def test_read_valid_and_invalid_dates(std_input_path, filename, v1_enabled_list, @pytest.mark.parametrize('ts_part', csv_supported_ts_parts) @pytest.mark.parametrize('date_format', csv_supported_date_formats) @pytest.mark.parametrize('v1_enabled_list', ["", "csv"]) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_ts_formats_round_trip(spark_tmp_path, date_format, ts_part, v1_enabled_list): full_format = date_format + ts_part data_gen = TimestampGen() @@ -474,6 +477,7 @@ def test_input_meta_fallback(spark_tmp_path, v1_enabled_list, disable_conf): conf=updated_conf) @allow_non_gpu('DataWritingCommandExec,ExecutedCommandExec,WriteFilesExec') +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_csv_save_as_table_fallback(spark_tmp_path, spark_tmp_table_factory): gen = TimestampGen() data_path = spark_tmp_path + '/CSV_DATA' diff --git a/integration_tests/src/main/python/datasourcev2_read_test.py b/integration_tests/src/main/python/datasourcev2_read_test.py index c4834a53c1c..cc141700cb8 100644 --- a/integration_tests/src/main/python/datasourcev2_read_test.py +++ b/integration_tests/src/main/python/datasourcev2_read_test.py @@ -15,6 +15,7 @@ import pytest from asserts import assert_gpu_and_cpu_are_equal_collect, assert_gpu_and_cpu_row_counts_equal +from conftest import is_not_utc from marks import * columnarClass = 'com.nvidia.spark.rapids.tests.datasourcev2.parquet.ArrowColumnarDataSourceV2' @@ -26,26 +27,31 @@ def readTable(types, classToUse): .orderBy("col1") @validate_execs_in_gpu_plan('HostColumnarToGpu') +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_read_int(): assert_gpu_and_cpu_are_equal_collect(readTable("int", columnarClass)) @validate_execs_in_gpu_plan('HostColumnarToGpu') +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of 
https://github.com/NVIDIA/spark-rapids/issues/9653') def test_read_strings(): assert_gpu_and_cpu_are_equal_collect(readTable("string", columnarClass)) @validate_execs_in_gpu_plan('HostColumnarToGpu') +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_read_all_types(): assert_gpu_and_cpu_are_equal_collect( readTable("int,bool,byte,short,long,string,float,double,date,timestamp", columnarClass), conf={'spark.rapids.sql.castFloatToString.enabled': 'true'}) @validate_execs_in_gpu_plan('HostColumnarToGpu') +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_read_all_types_count(): assert_gpu_and_cpu_row_counts_equal( readTable("int,bool,byte,short,long,string,float,double,date,timestamp", columnarClass), conf={'spark.rapids.sql.castFloatToString.enabled': 'true'}) @validate_execs_in_gpu_plan('HostColumnarToGpu') +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_read_arrow_off(): assert_gpu_and_cpu_are_equal_collect( readTable("int,bool,byte,short,long,string,float,double,date,timestamp", columnarClass), diff --git a/integration_tests/src/main/python/date_time_test.py b/integration_tests/src/main/python/date_time_test.py index 0e58be01c44..971ee0c9b05 100644 --- a/integration_tests/src/main/python/date_time_test.py +++ b/integration_tests/src/main/python/date_time_test.py @@ -14,6 +14,7 @@ import pytest from asserts import assert_gpu_and_cpu_are_equal_collect, assert_gpu_fallback_collect, assert_gpu_and_cpu_error +from conftest import is_not_utc from data_gen import * from datetime import date, datetime, timezone from marks import ignore_order, incompat, allow_non_gpu @@ -25,6 +26,7 @@ vals = [(-584, 1563), (1943, 1101), (2693, 2167), (2729, 0), (44, 1534), (2635, 3319), (1885, -2828), (0, 2463), (932, 2286), (0, 0)] @pytest.mark.parametrize('data_gen', vals, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_timesub(data_gen): days, seconds = data_gen assert_gpu_and_cpu_are_equal_collect( @@ -33,6 +35,7 @@ def test_timesub(data_gen): .selectExpr("a - (interval {} days {} seconds)".format(days, seconds))) @pytest.mark.parametrize('data_gen', vals, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_timeadd(data_gen): days, seconds = data_gen assert_gpu_and_cpu_are_equal_collect( @@ -59,6 +62,7 @@ def test_interval_seconds_overflow_exception(): error_message="IllegalArgumentException") @pytest.mark.parametrize('data_gen', vals, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_timeadd_from_subquery(data_gen): def fun(spark): @@ -70,6 +74,7 @@ def fun(spark): assert_gpu_and_cpu_are_equal_collect(fun) @pytest.mark.parametrize('data_gen', vals, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_timesub_from_subquery(data_gen): def fun(spark): @@ -85,6 +90,7 @@ def fun(spark): # [SPARK-34896][SQL] Return day-time 
interval from dates subtraction # 1. Add the SQL config `spark.sql.legacy.interval.enabled` which will control when Spark SQL should use `CalendarIntervalType` instead of ANSI intervals. @pytest.mark.parametrize('data_gen', vals, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_dateaddinterval(data_gen): days, seconds = data_gen assert_gpu_and_cpu_are_equal_collect( @@ -95,6 +101,7 @@ def test_dateaddinterval(data_gen): # test add days(not specify hours, minutes, seconds, milliseconds, microseconds) in ANSI mode. @pytest.mark.parametrize('data_gen', vals, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_dateaddinterval_ansi(data_gen): days, _ = data_gen # only specify the `days` @@ -122,14 +129,17 @@ def test_datediff(data_gen): 'datediff(a, date(null))', 'datediff(a, \'2016-03-02\')')) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_hour(): assert_gpu_and_cpu_are_equal_collect( lambda spark : unary_op_df(spark, timestamp_gen).selectExpr('hour(a)')) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_minute(): assert_gpu_and_cpu_are_equal_collect( lambda spark : unary_op_df(spark, timestamp_gen).selectExpr('minute(a)')) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_second(): assert_gpu_and_cpu_are_equal_collect( lambda spark : unary_op_df(spark, timestamp_gen).selectExpr('second(a)')) @@ -188,6 +198,7 @@ def test_datesub(data_gen): to_unix_timestamp_days_gen=[ByteGen(), ShortGen(), IntegerGen(min_val=-106032829, max_val=103819094, special_cases=[-106032829, 103819094,0,1,-1])] @pytest.mark.parametrize('data_gen', to_unix_timestamp_days_gen, ids=idfn) @incompat +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_dateadd_with_date_overflow(data_gen): string_type = to_cast_string(data_gen.data_type) assert_gpu_and_cpu_are_equal_collect( @@ -201,6 +212,7 @@ def test_dateadd_with_date_overflow(data_gen): to_unix_timestamp_days_gen=[ByteGen(), ShortGen(), IntegerGen(max_val=106032829, min_val=-103819094, special_cases=[106032829, -103819094,0,1,-1])] @pytest.mark.parametrize('data_gen', to_unix_timestamp_days_gen, ids=idfn) @incompat +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_datesub_with_date_overflow(data_gen): string_type = to_cast_string(data_gen.data_type) assert_gpu_and_cpu_are_equal_collect( @@ -232,6 +244,7 @@ def test_dayofyear(data_gen): lambda spark : unary_op_df(spark, data_gen).select(f.dayofyear(f.col('a')))) @pytest.mark.parametrize('data_gen', date_n_time_gens, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_unix_timestamp(data_gen): assert_gpu_and_cpu_are_equal_collect( lambda spark : unary_op_df(spark, data_gen).select(f.unix_timestamp(f.col('a')))) @@ -248,6 +261,7 @@ def 
test_unsupported_fallback_unix_timestamp(data_gen): @pytest.mark.parametrize('ansi_enabled', [True, False], ids=['ANSI_ON', 'ANSI_OFF']) @pytest.mark.parametrize('data_gen', date_n_time_gens, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_to_unix_timestamp(data_gen, ansi_enabled): assert_gpu_and_cpu_are_equal_collect( lambda spark : unary_op_df(spark, data_gen).selectExpr("to_unix_timestamp(a)"), @@ -265,6 +279,7 @@ def test_unsupported_fallback_to_unix_timestamp(data_gen): @pytest.mark.parametrize('time_zone', ["UTC", "UTC+0", "UTC-0", "GMT", "GMT+0", "GMT-0"], ids=idfn) @pytest.mark.parametrize('data_gen', [timestamp_gen], ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_from_utc_timestamp(data_gen, time_zone): assert_gpu_and_cpu_are_equal_collect( lambda spark: unary_op_df(spark, data_gen).select(f.from_utc_timestamp(f.col('a'), time_zone))) @@ -329,6 +344,7 @@ def fun(spark): @pytest.mark.parametrize('parser_policy', ["CORRECTED", "EXCEPTION"], ids=idfn) # first get expected string via `date_format` +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_string_to_timestamp_functions_ansi_valid(parser_policy): expr_format = "{operator}(date_format(a, '{fmt}'), '{fmt}')" formats = ['yyyy-MM-dd', 'yyyy/MM/dd', 'yyyy-MM', 'yyyy/MM', 'dd/MM/yyyy', 'yyyy-MM-dd HH:mm:ss', @@ -346,6 +362,7 @@ def fun(spark): @pytest.mark.parametrize('ansi_enabled', [True, False], ids=['ANSI_ON', 'ANSI_OFF']) @pytest.mark.parametrize('data_gen', date_n_time_gens, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_unix_timestamp_improved(data_gen, ansi_enabled): conf = {"spark.rapids.sql.improvedTimeOps.enabled": "true", "spark.sql.legacy.timeParserPolicy": "CORRECTED"} @@ -362,6 +379,7 @@ def test_unix_timestamp(data_gen, ansi_enabled): @pytest.mark.parametrize('ansi_enabled', [True, False], ids=['ANSI_ON', 'ANSI_OFF']) @pytest.mark.parametrize('data_gen', date_n_time_gens, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_to_unix_timestamp_improved(data_gen, ansi_enabled): conf = {"spark.rapids.sql.improvedTimeOps.enabled": "true"} assert_gpu_and_cpu_are_equal_collect( @@ -380,6 +398,7 @@ def invalid_date_string_df(spark): @pytest.mark.parametrize('ansi_enabled', [True, False], ids=['ANSI_ON', 'ANSI_OFF']) @pytest.mark.parametrize('data_gen,date_form', str_date_and_format_gen, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_string_to_unix_timestamp(data_gen, date_form, ansi_enabled): assert_gpu_and_cpu_are_equal_collect( lambda spark : unary_op_df(spark, data_gen, seed=1).selectExpr("to_unix_timestamp(a, '{}')".format(date_form)), @@ -393,6 +412,7 @@ def test_string_to_unix_timestamp_ansi_exception(): @pytest.mark.parametrize('ansi_enabled', [True, False], ids=['ANSI_ON', 'ANSI_OFF']) @pytest.mark.parametrize('data_gen,date_form', str_date_and_format_gen, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 
'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_string_unix_timestamp(data_gen, date_form, ansi_enabled): assert_gpu_and_cpu_are_equal_collect( lambda spark : unary_op_df(spark, data_gen, seed=1).select(f.unix_timestamp(f.col('a'), date_form)), @@ -406,6 +426,7 @@ def test_string_unix_timestamp_ansi_exception(): @pytest.mark.parametrize('data_gen', [StringGen('200[0-9]-0[1-9]-[0-2][1-8]')], ids=idfn) @pytest.mark.parametrize('ansi_enabled', [True, False], ids=['ANSI_ON', 'ANSI_OFF']) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_gettimestamp(data_gen, ansi_enabled): assert_gpu_and_cpu_are_equal_collect( lambda spark : unary_op_df(spark, data_gen).select(f.to_date(f.col("a"), "yyyy-MM-dd")), @@ -413,6 +434,7 @@ def test_gettimestamp(data_gen, ansi_enabled): @pytest.mark.parametrize('data_gen', [StringGen('0[1-9]200[0-9]')], ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_gettimestamp_format_MMyyyy(data_gen): assert_gpu_and_cpu_are_equal_collect( lambda spark: unary_op_df(spark, data_gen).select(f.to_date(f.col("a"), "MMyyyy"))) @@ -427,6 +449,7 @@ def test_gettimestamp_ansi_exception(): 'MM-dd', 'MM/dd', 'dd-MM', 'dd/MM'] @pytest.mark.parametrize('date_format', supported_date_formats, ids=idfn) @pytest.mark.parametrize('data_gen', date_n_time_gens, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_date_format(data_gen, date_format): assert_gpu_and_cpu_are_equal_collect( lambda spark : unary_op_df(spark, data_gen).selectExpr("date_format(a, '{}')".format(date_format))) @@ -461,6 +484,7 @@ def test_date_format_maybe(data_gen, date_format): @pytest.mark.parametrize('date_format', maybe_supported_date_formats, ids=idfn) @pytest.mark.parametrize('data_gen', date_n_time_gens, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_date_format_maybe_incompat(data_gen, date_format): conf = {"spark.rapids.sql.incompatibleDateFormats.enabled": "true"} assert_gpu_and_cpu_are_equal_collect( @@ -472,6 +496,7 @@ def test_date_format_maybe_incompat(data_gen, date_format): # input_file_name(), otherwise filter happens before project. 
@allow_non_gpu('CollectLimitExec,FileSourceScanExec,DeserializeToObjectExec') @ignore_order() +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_date_format_mmyyyy_cast_canonicalization(spark_tmp_path): data_path = spark_tmp_path + '/CSV_DATA' gen = StringGen(pattern='[0][0-9][1][8-9][1-9][1-9]', nullable=False) @@ -517,10 +542,12 @@ def test_unsupported_fallback_to_date(): seconds_gens = [LongGen(min_val=-62135510400, max_val=253402214400), IntegerGen(), ShortGen(), ByteGen(), DoubleGen(min_exp=0, max_exp=32), ts_float_gen, DecimalGen(16, 6), DecimalGen(13, 3), DecimalGen(10, 0), DecimalGen(7, -3), DecimalGen(6, 6)] @pytest.mark.parametrize('data_gen', seconds_gens, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_timestamp_seconds(data_gen): assert_gpu_and_cpu_are_equal_collect( lambda spark : unary_op_df(spark, data_gen).selectExpr("timestamp_seconds(a)")) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_timestamp_seconds_long_overflow(): assert_gpu_and_cpu_error( lambda spark : unary_op_df(spark, long_gen).selectExpr("timestamp_seconds(a)").collect(), @@ -528,6 +555,7 @@ def test_timestamp_seconds_long_overflow(): error_message='long overflow') @pytest.mark.parametrize('data_gen', [DecimalGen(7, 7), DecimalGen(20, 7)], ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_timestamp_seconds_rounding_necessary(data_gen): assert_gpu_and_cpu_error( lambda spark : unary_op_df(spark, data_gen).selectExpr("timestamp_seconds(a)").collect(), @@ -535,6 +563,7 @@ def test_timestamp_seconds_rounding_necessary(data_gen): error_message='Rounding necessary') @pytest.mark.parametrize('data_gen', [DecimalGen(19, 6), DecimalGen(20, 6)], ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_timestamp_seconds_decimal_overflow(data_gen): assert_gpu_and_cpu_error( lambda spark : unary_op_df(spark, data_gen).selectExpr("timestamp_seconds(a)").collect(), @@ -543,10 +572,12 @@ def test_timestamp_seconds_decimal_overflow(data_gen): millis_gens = [LongGen(min_val=-62135510400000, max_val=253402214400000), IntegerGen(), ShortGen(), ByteGen()] @pytest.mark.parametrize('data_gen', millis_gens, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_timestamp_millis(data_gen): assert_gpu_and_cpu_are_equal_collect( lambda spark : unary_op_df(spark, data_gen).selectExpr("timestamp_millis(a)")) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_timestamp_millis_long_overflow(): assert_gpu_and_cpu_error( lambda spark : unary_op_df(spark, long_gen).selectExpr("timestamp_millis(a)").collect(), @@ -555,6 +586,7 @@ def test_timestamp_millis_long_overflow(): micros_gens = [LongGen(min_val=-62135510400000000, max_val=253402214400000000), IntegerGen(), ShortGen(), ByteGen()] @pytest.mark.parametrize('data_gen', micros_gens, ids=idfn) 
+@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_timestamp_micros(data_gen): assert_gpu_and_cpu_are_equal_collect( lambda spark : unary_op_df(spark, data_gen).selectExpr("timestamp_micros(a)")) diff --git a/integration_tests/src/main/python/expand_exec_test.py b/integration_tests/src/main/python/expand_exec_test.py index d53000e9849..abb9a7bd094 100644 --- a/integration_tests/src/main/python/expand_exec_test.py +++ b/integration_tests/src/main/python/expand_exec_test.py @@ -14,6 +14,7 @@ import pytest from asserts import assert_gpu_and_cpu_are_equal_collect, assert_equal +from conftest import is_not_utc from data_gen import * import pyspark.sql.functions as f from marks import ignore_order @@ -22,6 +23,7 @@ # Many Spark versions have issues sorting large decimals, # see https://issues.apache.org/jira/browse/SPARK-40089. @ignore_order(local=True) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_expand_exec(data_gen): def op_df(spark, length=2048): return gen_df(spark, StructGen([ diff --git a/integration_tests/src/main/python/explain_test.py b/integration_tests/src/main/python/explain_test.py index b84754a3d3f..1837f31aa95 100644 --- a/integration_tests/src/main/python/explain_test.py +++ b/integration_tests/src/main/python/explain_test.py @@ -14,6 +14,7 @@ import pytest +from conftest import is_not_utc from data_gen import * from marks import * from pyspark.sql.functions import * @@ -49,6 +50,7 @@ def do_join_explain(spark): with_cpu_session(do_join_explain) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_explain_set_config(): conf = {'spark.rapids.sql.hasExtendedYearValues': 'false', 'spark.rapids.sql.castStringToTimestamp.enabled': 'true'} diff --git a/integration_tests/src/main/python/fastparquet_compatibility_test.py b/integration_tests/src/main/python/fastparquet_compatibility_test.py index d2636d58d01..2bf23e6b9a2 100644 --- a/integration_tests/src/main/python/fastparquet_compatibility_test.py +++ b/integration_tests/src/main/python/fastparquet_compatibility_test.py @@ -15,6 +15,7 @@ import pytest from asserts import assert_gpu_and_cpu_are_equal_collect +from conftest import is_not_utc from data_gen import * from fastparquet_utils import get_fastparquet_result_canonicalizer from spark_session import spark_version, with_cpu_session, with_gpu_session @@ -378,6 +379,7 @@ def write_with_fastparquet(spark, data_gen): marks=pytest.mark.xfail(reason="fastparquet fails to read nullable Struct columns written from Apache Spark. 
" "It fails the rewrite to parquet, thereby failing the test.")), ], ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_reading_file_rewritten_with_fastparquet(column_gen, time_format, spark_tmp_path): """ This test is a workaround to test data-types that have problems being converted diff --git a/integration_tests/src/main/python/generate_expr_test.py b/integration_tests/src/main/python/generate_expr_test.py index 46ac5c92350..cde16352236 100644 --- a/integration_tests/src/main/python/generate_expr_test.py +++ b/integration_tests/src/main/python/generate_expr_test.py @@ -15,6 +15,7 @@ import pytest from asserts import assert_gpu_and_cpu_are_equal_collect, assert_gpu_fallback_collect +from conftest import is_not_utc from data_gen import * from marks import allow_non_gpu, ignore_order from pyspark.sql.types import * @@ -37,6 +38,7 @@ def four_op_df(spark, gen, length=2048): # After 3.1.0 is the min spark version we can drop this @ignore_order(local=True) @pytest.mark.parametrize('data_gen', explode_gens, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_explode_makearray(data_gen): assert_gpu_and_cpu_are_equal_collect( lambda spark : four_op_df(spark, data_gen).selectExpr('a', 'explode(array(b, c, d))')) @@ -45,6 +47,7 @@ def test_explode_makearray(data_gen): # After 3.1.0 is the min spark version we can drop this @ignore_order(local=True) @pytest.mark.parametrize('data_gen', explode_gens, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_explode_litarray(data_gen): array_lit = with_cpu_session( lambda spark: gen_scalar(ArrayGen(data_gen, min_length=3, max_length=3, nullable=False))) @@ -60,6 +63,7 @@ def test_explode_litarray(data_gen): @pytest.mark.parametrize('data_gen', explode_gens + struct_gens_sample_with_decimal128 + array_gens_sample + map_gens_sample + arrays_with_binary + maps_with_binary, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_explode_array_data(data_gen): data_gen = [int_gen, ArrayGen(data_gen)] assert_gpu_and_cpu_are_equal_collect( @@ -70,6 +74,7 @@ def test_explode_array_data(data_gen): # After 3.1.0 is the min spark version we can drop this @ignore_order(local=True) @pytest.mark.parametrize('map_gen', map_gens_sample + decimal_128_map_gens + maps_with_binary, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_explode_map_data(map_gen): data_gen = [int_gen, map_gen] assert_gpu_and_cpu_are_equal_collect( @@ -80,6 +85,7 @@ def test_explode_map_data(map_gen): # After 3.1.0 is the min spark version we can drop this @ignore_order(local=True) @pytest.mark.parametrize('data_gen', explode_gens, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_explode_nested_array_data(data_gen): data_gen = [int_gen, ArrayGen(ArrayGen(data_gen))] assert_gpu_and_cpu_are_equal_collect( @@ -94,6 +100,7 @@ def test_explode_nested_array_data(data_gen): @pytest.mark.parametrize('data_gen', 
explode_gens + struct_gens_sample_with_decimal128 + array_gens_sample + arrays_with_binary + map_gens_sample + maps_with_binary, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_explode_outer_array_data(data_gen): data_gen = [int_gen, ArrayGen(data_gen)] assert_gpu_and_cpu_are_equal_collect( @@ -104,6 +111,7 @@ def test_explode_outer_array_data(data_gen): # After 3.1.0 is the min spark version we can drop this @ignore_order(local=True) @pytest.mark.parametrize('map_gen', map_gens_sample + decimal_128_map_gens + maps_with_binary, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_explode_outer_map_data(map_gen): data_gen = [int_gen, map_gen] assert_gpu_and_cpu_are_equal_collect( @@ -114,6 +122,7 @@ def test_explode_outer_map_data(map_gen): # After 3.1.0 is the min spark version we can drop this @ignore_order(local=True) @pytest.mark.parametrize('data_gen', explode_gens, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_explode_outer_nested_array_data(data_gen): data_gen = [int_gen, ArrayGen(ArrayGen(data_gen))] assert_gpu_and_cpu_are_equal_collect( @@ -125,6 +134,7 @@ def test_explode_outer_nested_array_data(data_gen): # After 3.1.0 is the min spark version we can drop this @ignore_order(local=True) @pytest.mark.parametrize('data_gen', explode_gens, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_posexplode_makearray(data_gen): assert_gpu_and_cpu_are_equal_collect( lambda spark : four_op_df(spark, data_gen).selectExpr('posexplode(array(b, c, d))', 'a')) @@ -133,6 +143,7 @@ def test_posexplode_makearray(data_gen): # After 3.1.0 is the min spark version we can drop this @ignore_order(local=True) @pytest.mark.parametrize('data_gen', explode_gens, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_posexplode_litarray(data_gen): array_lit = with_cpu_session( lambda spark: gen_scalar(ArrayGen(data_gen, min_length=3, max_length=3, nullable=False))) @@ -147,6 +158,7 @@ def test_posexplode_litarray(data_gen): @pytest.mark.parametrize('data_gen', explode_gens + struct_gens_sample_with_decimal128 + array_gens_sample + arrays_with_binary + map_gens_sample + maps_with_binary, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_posexplode_array_data(data_gen): data_gen = [int_gen, ArrayGen(data_gen)] assert_gpu_and_cpu_are_equal_collect( @@ -157,6 +169,7 @@ def test_posexplode_array_data(data_gen): # After 3.1.0 is the min spark version we can drop this @ignore_order(local=True) @pytest.mark.parametrize('map_gen', map_gens_sample + decimal_128_map_gens + maps_with_binary, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_posexplode_map_data(map_gen): data_gen = [int_gen, map_gen] assert_gpu_and_cpu_are_equal_collect( @@ -167,6 +180,7 @@ def test_posexplode_map_data(map_gen): # After 3.1.0 
is the min spark version we can drop this @ignore_order(local=True) @pytest.mark.parametrize('data_gen', explode_gens, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_posexplode_nested_array_data(data_gen): data_gen = [int_gen, ArrayGen(ArrayGen(data_gen))] assert_gpu_and_cpu_are_equal_collect( @@ -181,6 +195,7 @@ def test_posexplode_nested_array_data(data_gen): @pytest.mark.parametrize('data_gen', explode_gens + struct_gens_sample_with_decimal128 + array_gens_sample + arrays_with_binary + map_gens_sample + maps_with_binary, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_posexplode_outer_array_data(data_gen): data_gen = [int_gen, ArrayGen(data_gen)] assert_gpu_and_cpu_are_equal_collect( @@ -191,6 +206,7 @@ def test_posexplode_outer_array_data(data_gen): # After 3.1.0 is the min spark version we can drop this @ignore_order(local=True) @pytest.mark.parametrize('map_gen', map_gens_sample + decimal_128_map_gens + maps_with_binary, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_posexplode_outer_map_data(map_gen): data_gen = [int_gen, map_gen] assert_gpu_and_cpu_are_equal_collect( @@ -201,6 +217,7 @@ def test_posexplode_outer_map_data(map_gen): # After 3.1.0 is the min spark version we can drop this @ignore_order(local=True) @pytest.mark.parametrize('data_gen', explode_gens, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_posexplode_nested_outer_array_data(data_gen): data_gen = [int_gen, ArrayGen(ArrayGen(data_gen))] assert_gpu_and_cpu_are_equal_collect( @@ -225,6 +242,7 @@ def test_stack(): # gpu stack not guarantee to produce the same output order as Spark does @ignore_order(local=True) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_stack_mixed_types(): base_gens = [byte_gen, short_gen, int_gen, long_gen, float_gen, double_gen, string_gen, boolean_gen, date_gen, timestamp_gen, null_gen, DecimalGen(precision=7, scale=3), diff --git a/integration_tests/src/main/python/hash_aggregate_test.py b/integration_tests/src/main/python/hash_aggregate_test.py index 344e00fc5a0..e80dbcbde3e 100644 --- a/integration_tests/src/main/python/hash_aggregate_test.py +++ b/integration_tests/src/main/python/hash_aggregate_test.py @@ -17,6 +17,7 @@ from asserts import * from conftest import is_databricks_runtime +from conftest import is_not_utc from data_gen import * from functools import reduce from pyspark.sql.types import * @@ -333,6 +334,7 @@ def test_hash_grpby_sum_count_action(data_gen, override_split_until_size, overri @allow_non_gpu("SortAggregateExec", "SortExec", "ShuffleExchangeExec") @ignore_order @pytest.mark.parametrize('data_gen', _grpkey_nested_structs_with_array_basic_child + _grpkey_list_with_non_nested_children, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_hash_grpby_list_min_max(data_gen): assert_gpu_and_cpu_are_equal_collect( lambda spark: gen_df(spark, data_gen, 
length=100).coalesce(1).groupby('a').agg(f.min('b'), f.max('b')) @@ -614,6 +616,7 @@ def test_decimal128_min_max_group_by(data_gen): @ignore_order(local=True) @pytest.mark.parametrize('data_gen', _all_basic_gens_with_all_nans_cases, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_min_max_group_by(data_gen): assert_gpu_and_cpu_are_equal_collect( lambda spark: two_col_df(spark, byte_gen, data_gen) @@ -629,6 +632,7 @@ def test_min_max_group_by(data_gen): @ignore_order(local=True) @pytest.mark.parametrize('data_gen', _gen_data_for_collect_list_op, ids=idfn) @pytest.mark.parametrize('use_obj_hash_agg', [True, False], ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_hash_groupby_collect_list(data_gen, use_obj_hash_agg): def doit(spark): df = gen_df(spark, data_gen, length=100)\ @@ -660,6 +664,7 @@ def doit(spark): @ignore_order(local=True) @pytest.mark.parametrize('data_gen', _full_gen_data_for_collect_op, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_hash_groupby_collect_set(data_gen): assert_gpu_and_cpu_are_equal_collect( lambda spark: gen_df(spark, data_gen, length=100) @@ -668,6 +673,7 @@ def test_hash_groupby_collect_set(data_gen): @ignore_order(local=True) @pytest.mark.parametrize('data_gen', _gen_data_for_collect_set_op, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_hash_groupby_collect_set_on_nested_type(data_gen): assert_gpu_and_cpu_are_equal_collect( lambda spark: gen_df(spark, data_gen, length=100) @@ -682,6 +688,7 @@ def test_hash_groupby_collect_set_on_nested_type(data_gen): @ignore_order(local=True) @allow_non_gpu("ProjectExec", "SortArray") @pytest.mark.parametrize('data_gen', _gen_data_for_collect_set_op_nested, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_hash_groupby_collect_set_on_nested_array_type(data_gen): conf = copy_and_update(_float_conf, { "spark.rapids.sql.castFloatToString.enabled": "true", @@ -703,6 +710,7 @@ def do_it(spark): @ignore_order(local=True) @pytest.mark.parametrize('data_gen', _full_gen_data_for_collect_op, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_hash_reduction_collect_set(data_gen): assert_gpu_and_cpu_are_equal_collect( lambda spark: gen_df(spark, data_gen, length=100) @@ -710,6 +718,7 @@ def test_hash_reduction_collect_set(data_gen): @ignore_order(local=True) @pytest.mark.parametrize('data_gen', _gen_data_for_collect_set_op, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_hash_reduction_collect_set_on_nested_type(data_gen): assert_gpu_and_cpu_are_equal_collect( lambda spark: gen_df(spark, data_gen, length=100) @@ -723,6 +732,7 @@ def test_hash_reduction_collect_set_on_nested_type(data_gen): @ignore_order(local=True) @allow_non_gpu("ProjectExec", "SortArray") @pytest.mark.parametrize('data_gen', 
_gen_data_for_collect_set_op_nested, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_hash_reduction_collect_set_on_nested_array_type(data_gen): conf = copy_and_update(_float_conf, { "spark.rapids.sql.castFloatToString.enabled": "true", @@ -742,6 +752,7 @@ def do_it(spark): @ignore_order(local=True) @pytest.mark.parametrize('data_gen', _full_gen_data_for_collect_op, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_hash_groupby_collect_with_single_distinct(data_gen): # test collect_ops with other distinct aggregations assert_gpu_and_cpu_are_equal_collect( @@ -754,6 +765,7 @@ def test_hash_groupby_collect_with_single_distinct(data_gen): @ignore_order(local=True) @pytest.mark.parametrize('data_gen', _gen_data_for_collect_op, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_hash_groupby_single_distinct_collect(data_gen): # test distinct collect sql = """select a, @@ -777,6 +789,7 @@ def test_hash_groupby_single_distinct_collect(data_gen): @ignore_order(local=True) @pytest.mark.parametrize('data_gen', _gen_data_for_collect_op, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_hash_groupby_collect_with_multi_distinct(data_gen): def spark_fn(spark_session): return gen_df(spark_session, data_gen, length=100).groupby('a').agg( @@ -803,6 +816,7 @@ def spark_fn(spark_session): @pytest.mark.parametrize('replace_mode', _replace_modes_non_distinct, ids=idfn) @pytest.mark.parametrize('aqe_enabled', ['false', 'true'], ids=idfn) @pytest.mark.parametrize('use_obj_hash_agg', ['false', 'true'], ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_hash_groupby_collect_partial_replace_fallback(data_gen, replace_mode, aqe_enabled, @@ -850,6 +864,7 @@ def test_hash_groupby_collect_partial_replace_fallback(data_gen, @pytest.mark.parametrize('aqe_enabled', ['false', 'true'], ids=idfn) @pytest.mark.parametrize('use_obj_hash_agg', ['false', 'true'], ids=idfn) @pytest.mark.xfail(condition=is_databricks104_or_later(), reason='https://github.com/NVIDIA/spark-rapids/issues/4963') +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_hash_groupby_collect_partial_replace_with_distinct_fallback(data_gen, replace_mode, aqe_enabled, @@ -1248,6 +1263,7 @@ def test_first_last_reductions_decimal_types(data_gen): 'first(a)', 'last(a)', 'first(a, true)', 'last(a, true)')) @pytest.mark.parametrize('data_gen', _nested_gens, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_first_last_reductions_nested_types(data_gen): assert_gpu_and_cpu_are_equal_collect( # Coalesce and sort are to make sure that first and last, which are non-deterministic @@ -1256,6 +1272,7 @@ def test_first_last_reductions_nested_types(data_gen): 'first(a)', 'last(a)', 'first(a, true)', 'last(a, true)')) @pytest.mark.parametrize('data_gen', 
_all_basic_gens_with_all_nans_cases, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_generic_reductions(data_gen): local_conf = copy_and_update(_float_conf, {'spark.sql.legacy.allowParameterlessCount': 'true'}) assert_gpu_and_cpu_are_equal_collect( @@ -1273,6 +1290,7 @@ def test_generic_reductions(data_gen): conf=local_conf) @pytest.mark.parametrize('data_gen', all_gen + _nested_gens, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_count(data_gen): assert_gpu_and_cpu_are_equal_collect( lambda spark : unary_op_df(spark, data_gen) \ @@ -1284,6 +1302,7 @@ def test_count(data_gen): conf = {'spark.sql.legacy.allowParameterlessCount': 'true'}) @pytest.mark.parametrize('data_gen', all_basic_gens, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_distinct_count_reductions(data_gen): assert_gpu_and_cpu_are_equal_collect( lambda spark : binary_op_df(spark, data_gen).selectExpr( @@ -1307,6 +1326,7 @@ def test_arithmetic_reductions(data_gen): @pytest.mark.parametrize('data_gen', all_basic_gens + decimal_gens + _nested_gens, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_collect_list_reductions(data_gen): assert_gpu_and_cpu_are_equal_collect( # coalescing because collect_list is not deterministic @@ -1325,6 +1345,7 @@ def test_collect_list_reductions(data_gen): @pytest.mark.parametrize('data_gen', _no_neg_zero_all_basic_gens + decimal_gens + _struct_only_nested_gens, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_collect_set_reductions(data_gen): assert_gpu_and_cpu_are_equal_collect( lambda spark: unary_op_df(spark, data_gen).selectExpr('sort_array(collect_set(a))'), @@ -1338,6 +1359,7 @@ def test_collect_empty(): @ignore_order(local=True) @pytest.mark.parametrize('data_gen', all_gen + _nested_gens, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_groupby_first_last(data_gen): gen_fn = [('a', RepeatSeqGen(LongGen(), length=20)), ('b', data_gen)] agg_fn = lambda df: df.groupBy('a').agg( @@ -1351,6 +1373,7 @@ def test_groupby_first_last(data_gen): @ignore_order(local=True) @pytest.mark.parametrize('data_gen', all_gen + _struct_only_nested_gens, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_sorted_groupby_first_last(data_gen): gen_fn = [('a', RepeatSeqGen(LongGen(), length=20)), ('b', data_gen)] # sort by more than the group by columns to be sure that first/last don't remove the ordering @@ -1368,6 +1391,7 @@ def test_sorted_groupby_first_last(data_gen): @ignore_order(local=True) @pytest.mark.parametrize('data_gen', all_gen, ids=idfn) @pytest.mark.parametrize('count_func', [f.count, f.countDistinct]) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def 
test_agg_count(data_gen, count_func): assert_gpu_and_cpu_are_equal_collect( lambda spark : gen_df(spark, [('a', data_gen), ('b', data_gen)], @@ -2024,6 +2048,7 @@ def test_std_variance_partial_replace_fallback(data_gen, null_gen] + array_gens_sample + struct_gens_sample @ignore_order(local=True) @pytest.mark.parametrize('data_gen', gens_for_max_min, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_min_max_in_groupby_and_reduction(data_gen): df_gen = [('a', data_gen), ('b', RepeatSeqGen(IntegerGen(), length=20))] diff --git a/integration_tests/src/main/python/hashing_test.py b/integration_tests/src/main/python/hashing_test.py index 6bd56da933d..e2a753ecaeb 100644 --- a/integration_tests/src/main/python/hashing_test.py +++ b/integration_tests/src/main/python/hashing_test.py @@ -15,6 +15,7 @@ import pytest from asserts import assert_gpu_and_cpu_are_equal_collect, assert_gpu_fallback_collect +from conftest import is_not_utc from data_gen import * from marks import allow_non_gpu, ignore_order from spark_session import is_before_spark_320 @@ -46,11 +47,13 @@ @ignore_order(local=True) @pytest.mark.parametrize("gen", _xxhash_gens, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_xxhash64_single_column(gen): assert_gpu_and_cpu_are_equal_collect( lambda spark : unary_op_df(spark, gen).selectExpr("a", "xxhash64(a)")) @ignore_order(local=True) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_xxhash64_multi_column(): gen = StructGen(_struct_of_xxhash_gens.children, nullable=False) col_list = ",".join(gen.data_type.fieldNames()) diff --git a/integration_tests/src/main/python/hive_delimited_text_test.py b/integration_tests/src/main/python/hive_delimited_text_test.py index e316f0df934..4d07a077ec0 100644 --- a/integration_tests/src/main/python/hive_delimited_text_test.py +++ b/integration_tests/src/main/python/hive_delimited_text_test.py @@ -13,7 +13,7 @@ # limitations under the License. 
from asserts import assert_gpu_and_cpu_are_equal_collect, assert_gpu_and_cpu_are_equal_sql, assert_gpu_and_cpu_sql_writes_are_equal_collect, assert_gpu_fallback_collect -from conftest import get_non_gpu_allowed +from conftest import get_non_gpu_allowed, is_not_utc from data_gen import * from enum import Enum from marks import * @@ -187,6 +187,7 @@ def read_impl(spark): ('hive-delim-text/carriage-return', StructType([StructField("str", StringType())]), {}), ('hive-delim-text/carriage-return-err', StructType([StructField("str", StringType())]), {}), ], ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_basic_hive_text_read(std_input_path, name, schema, spark_tmp_table_factory, options): assert_gpu_and_cpu_are_equal_collect(read_hive_text_sql(std_input_path + '/' + name, schema, spark_tmp_table_factory, options), @@ -239,6 +240,7 @@ def read_hive_text_table(spark, text_table_name, fields="my_field"): "https://github.com/NVIDIA/spark-rapids/pull/7628") @approximate_float @pytest.mark.parametrize('data_gen', hive_text_supported_gens, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_hive_text_round_trip(spark_tmp_path, data_gen, spark_tmp_table_factory): gen = StructGen([('my_field', data_gen)], nullable=False) data_path = spark_tmp_path + '/hive_text_table' @@ -282,6 +284,7 @@ def read_hive_text_table_partitions(spark, text_table_name, partition): @approximate_float @allow_non_gpu("EqualTo,IsNotNull,Literal") # Accounts for partition predicate: `WHERE dt='1'` @pytest.mark.parametrize('data_gen', hive_text_supported_gens, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_hive_text_round_trip_partitioned(spark_tmp_path, data_gen, spark_tmp_table_factory): gen = StructGen([('my_field', data_gen)], nullable=False) data_path = spark_tmp_path + '/hive_text_table' @@ -300,6 +303,7 @@ def test_hive_text_round_trip_partitioned(spark_tmp_path, data_gen, spark_tmp_ta @approximate_float @allow_non_gpu("EqualTo,IsNotNull,Literal,Or") # Accounts for partition predicate @pytest.mark.parametrize('data_gen', hive_text_supported_gens, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_hive_text_round_trip_two_partitions(spark_tmp_path, data_gen, spark_tmp_table_factory): """ Added to reproduce: https://github.com/NVIDIA/spark-rapids/issues/7383 @@ -525,6 +529,7 @@ def create_table_with_compressed_files(spark): ('hive-delim-text/carriage-return', StructType([StructField("str", StringType())]), {}), ('hive-delim-text/carriage-return-err', StructType([StructField("str", StringType())]), {}), ], ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_basic_hive_text_write(std_input_path, input_dir, schema, spark_tmp_table_factory, mode, options): # Configure table options, including schema. 
if options is None: diff --git a/integration_tests/src/main/python/hive_write_test.py b/integration_tests/src/main/python/hive_write_test.py index 8325fdfaa92..7bc5ceede85 100644 --- a/integration_tests/src/main/python/hive_write_test.py +++ b/integration_tests/src/main/python/hive_write_test.py @@ -15,7 +15,7 @@ import pytest from asserts import * -from conftest import spark_jvm +from conftest import spark_jvm, is_not_utc from data_gen import * from datetime import date, datetime, timezone from marks import * @@ -59,6 +59,7 @@ def _restricted_timestamp(nullable=True): @pytest.mark.skipif(not is_hive_available(), reason="Hive is missing") @pytest.mark.parametrize("gens", _write_gens, ids=idfn) @pytest.mark.parametrize("storage", ["PARQUET", "nativeorc", "hiveorc"]) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_optimized_hive_ctas_basic(gens, storage, spark_tmp_table_factory): data_table = spark_tmp_table_factory.get() gen_list = [('c' + str(i), gen) for i, gen in enumerate(gens)] diff --git a/integration_tests/src/main/python/join_test.py b/integration_tests/src/main/python/join_test.py index 5ee48dbb0bd..eec696da538 100644 --- a/integration_tests/src/main/python/join_test.py +++ b/integration_tests/src/main/python/join_test.py @@ -17,7 +17,7 @@ from pyspark.sql.functions import array_contains, broadcast, col from pyspark.sql.types import * from asserts import assert_gpu_and_cpu_are_equal_collect, assert_gpu_fallback_collect, assert_cpu_and_gpu_are_equal_collect_with_capture -from conftest import is_databricks_runtime, is_emr_runtime +from conftest import is_databricks_runtime, is_emr_runtime, is_not_utc from data_gen import * from marks import ignore_order, allow_non_gpu, incompat, validate_execs_in_gpu_plan from spark_session import with_cpu_session, is_before_spark_330, is_databricks_runtime @@ -170,6 +170,7 @@ def do_join(spark): (all_gen, '1g'), (join_small_batch_gens, '1000')), ids=idfn) @pytest.mark.parametrize('join_type', all_join_types, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_sortmerge_join(data_gen, join_type, batch_size): def do_join(spark): left, right = create_df(spark, data_gen, 500, 500) @@ -180,6 +181,7 @@ def do_join(spark): @ignore_order(local=True) @pytest.mark.parametrize('data_gen', basic_nested_gens + [decimal_gen_128bit], ids=idfn) @pytest.mark.parametrize('join_type', all_join_types, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_sortmerge_join_ridealong(data_gen, join_type): def do_join(spark): left, right = create_ridealong_df(spark, short_gen, data_gen, 500, 500) @@ -193,6 +195,7 @@ def do_join(spark): @ignore_order(local=True) @pytest.mark.parametrize('data_gen', single_level_array_gens + [binary_gen], ids=idfn) @pytest.mark.parametrize('join_type', all_join_types, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_sortmerge_join_wrong_key_fallback(data_gen, join_type): def do_join(spark): left, right = create_df(spark, data_gen, 500, 500) @@ -212,6 +215,7 @@ def do_join(spark): @pytest.mark.parametrize('data_gen', basic_nested_gens + [decimal_gen_128bit], ids=idfn) 
 @pytest.mark.parametrize('join_type', all_join_types, ids=idfn)
 @pytest.mark.parametrize('sub_part_enabled', ['false', 'true'], ids=['SubPartition_OFF', 'SubPartition_ON'])
+@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653')
 def test_hash_join_ridealong(data_gen, join_type, sub_part_enabled):
     def do_join(spark):
         left, right = create_ridealong_df(spark, short_gen, data_gen, 50, 500)
@@ -228,6 +232,7 @@ def do_join(spark):
 # Not all join types can be translated to a broadcast join, but this tests them to be sure we
 # can handle what spark is doing
 @pytest.mark.parametrize('join_type', all_join_types, ids=idfn)
+@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653')
 def test_broadcast_join_right_table(data_gen, join_type):
     def do_join(spark):
         left, right = create_df(spark, data_gen, 500, 250)
@@ -239,6 +244,7 @@ def do_join(spark):
 # Not all join types can be translated to a broadcast join, but this tests them to be sure we
 # can handle what spark is doing
 @pytest.mark.parametrize('join_type', all_join_types, ids=idfn)
+@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653')
 def test_broadcast_join_right_table_ridealong(data_gen, join_type):
     def do_join(spark):
         left, right = create_ridealong_df(spark, short_gen, data_gen, 500, 500)
@@ -252,6 +258,7 @@ def do_join(spark):
 # Not all join types can be translated to a broadcast join, but this tests them to be sure we
 # can handle what spark is doing
 @pytest.mark.parametrize('join_type', all_join_types, ids=idfn)
+@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653')
 def test_broadcast_join_right_table_with_job_group(data_gen, join_type):
     with_cpu_session(lambda spark : spark.sparkContext.setJobGroup("testjob1", "test", False))
     def do_join(spark):
@@ -266,6 +273,7 @@ def do_join(spark):
 @pytest.mark.parametrize('data_gen,batch_size', join_batch_size_test_params(
     (all_gen + basic_nested_gens, '1g'),
     (join_small_batch_gens + [basic_struct_gen, ArrayGen(string_gen)], '100')), ids=idfn)
+@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653')
 def test_cartesian_join(data_gen, batch_size):
     def do_join(spark):
         left, right = create_df(spark, data_gen, 50, 25)
@@ -305,6 +313,7 @@ def do_join(spark):
 @pytest.mark.parametrize('data_gen,batch_size', join_batch_size_test_params(
     (all_gen, '1g'),
     (join_small_batch_gens, '100')), ids=idfn)
+@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653')
 def test_cartesian_join_with_condition(data_gen, batch_size):
     def do_join(spark):
         left, right = create_df(spark, data_gen, 50, 25)
@@ -322,6 +331,7 @@ def do_join(spark):
 @pytest.mark.parametrize('data_gen,batch_size', join_batch_size_test_params(
     (all_gen + basic_nested_gens, '1g'),
     (join_small_batch_gens, '100')), ids=idfn)
+@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653')
 def test_broadcast_nested_loop_join(data_gen, batch_size):
     def do_join(spark):
         left, right = create_df(spark, data_gen, 50, 25)
@@ -357,6 +367,7 @@ def do_join(spark):
     (join_ast_gen, '1g'),
     ([int_gen], 100)), ids=idfn)
 @pytest.mark.parametrize('join_type', ['Left', 'Inner', 'LeftSemi', 'LeftAnti', 'Cross'], ids=idfn)
+@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653')
 def test_right_broadcast_nested_loop_join_with_ast_condition(data_gen, join_type, batch_size):
     def do_join(spark):
         left, right = create_df(spark, data_gen, 50, 25)
@@ -371,6 +382,7 @@ def do_join(spark):
 # After 3.1.0 is the min spark version we can drop this
 @ignore_order(local=True)
 @pytest.mark.parametrize('data_gen', join_ast_gen, ids=idfn)
+@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653')
 def test_left_broadcast_nested_loop_join_with_ast_condition(data_gen):
     def do_join(spark):
         left, right = create_df(spark, data_gen, 50, 25)
@@ -429,6 +441,7 @@ def do_join(spark):
     float_gen, double_gen,
     string_gen, boolean_gen, date_gen, timestamp_gen], ids=idfn)
 @pytest.mark.parametrize('join_type', ['Left', 'Right', 'FullOuter', 'LeftSemi', 'LeftAnti'], ids=idfn)
+@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653')
 def test_broadcast_nested_loop_join_with_array_contains(data_gen, join_type):
     arr_gen = ArrayGen(data_gen)
     literal = with_cpu_session(lambda spark: gen_scalar(data_gen))
@@ -441,6 +454,7 @@ def do_join(spark):
 @ignore_order(local=True)
 @pytest.mark.parametrize('data_gen', all_gen, ids=idfn)
 @pytest.mark.parametrize('join_type', ['Left', 'LeftSemi', 'LeftAnti'], ids=idfn)
+@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653')
 def test_right_broadcast_nested_loop_join_condition_missing(data_gen, join_type):
     def do_join(spark):
         left, right = create_df(spark, data_gen, 50, 25)
@@ -456,6 +470,7 @@ def do_join(spark):
 @ignore_order(local=True)
 @pytest.mark.parametrize('data_gen', all_gen, ids=idfn)
 @pytest.mark.parametrize('join_type', ['Right'], ids=idfn)
+@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653')
 def test_left_broadcast_nested_loop_join_condition_missing(data_gen, join_type):
     def do_join(spark):
         left, right = create_df(spark, data_gen, 50, 25)
@@ -470,6 +485,7 @@ def do_join(spark):
 
 @pytest.mark.parametrize('data_gen', all_gen + single_level_array_gens + [binary_gen], ids=idfn)
 @pytest.mark.parametrize('join_type', ['Left', 'LeftSemi', 'LeftAnti'], ids=idfn)
+@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653')
 def test_right_broadcast_nested_loop_join_condition_missing_count(data_gen, join_type):
     def do_join(spark):
         left, right = create_df(spark, data_gen, 50, 25)
@@ -478,6 +494,7 @@ def do_join(spark):
 
 @pytest.mark.parametrize('data_gen', all_gen + single_level_array_gens + [binary_gen], ids=idfn)
 @pytest.mark.parametrize('join_type', ['Right'], ids=idfn)
+@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653')
 def test_left_broadcast_nested_loop_join_condition_missing_count(data_gen, join_type):
     def do_join(spark):
         left, right = create_df(spark, data_gen, 50, 25)
@@ -488,6 +505,7 @@ def do_join(spark):
 @ignore_order(local=True)
 @pytest.mark.parametrize('data_gen', all_gen, ids=idfn)
 @pytest.mark.parametrize('join_type', ['LeftOuter', 'LeftSemi', 'LeftAnti', 'FullOuter'], ids=idfn)
+@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653')
 def test_broadcast_nested_loop_join_with_conditionals_build_left_fallback(data_gen, join_type):
     def do_join(spark):
         left, right = create_df(spark, data_gen, 50, 25)
@@ -498,6 +516,7 @@ def do_join(spark):
 @ignore_order(local=True)
 @pytest.mark.parametrize('data_gen', all_gen, ids=idfn)
 @pytest.mark.parametrize('join_type', ['RightOuter', 'FullOuter'], ids=idfn)
+@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653')
 def test_broadcast_nested_loop_with_conditionals_build_right_fallback(data_gen, join_type):
     def do_join(spark):
         left, right = create_df(spark, data_gen, 50, 25)
@@ -514,6 +533,7 @@ def do_join(spark):
 # Specify 200 shuffle partitions to test cases where streaming side is empty
 # as in https://github.com/NVIDIA/spark-rapids/issues/7516
 @pytest.mark.parametrize('shuffle_conf', [{}, {'spark.sql.shuffle.partitions': 200}], ids=idfn)
+@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653')
 def test_broadcast_join_left_table(data_gen, join_type, shuffle_conf):
     def do_join(spark):
         left, right = create_df(spark, data_gen, 250, 500)
@@ -525,6 +545,7 @@ def do_join(spark):
 @ignore_order(local=True)
 @pytest.mark.parametrize('data_gen', join_ast_gen, ids=idfn)
 @pytest.mark.parametrize('join_type', all_join_types, ids=idfn)
+@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653')
 def test_broadcast_join_with_conditionals(data_gen, join_type):
     def do_join(spark):
         left, right = create_df(spark, data_gen, 500, 250)
@@ -579,6 +600,7 @@ def do_join(spark):
 @ignore_order(local=True)
 @pytest.mark.parametrize('data_gen', join_ast_gen, ids=idfn)
 @pytest.mark.parametrize('join_type', ['Left', 'Right', 'Inner', 'FullOuter', 'LeftSemi', 'LeftAnti'], ids=idfn)
+@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653')
 def test_sortmerge_join_with_condition_ast(data_gen, join_type):
     def do_join(spark):
         left, right = create_df(spark, data_gen, 500, 250)
@@ -695,6 +717,7 @@ def do_join(spark):
 @ignore_order(local=True)
 @pytest.mark.parametrize('data_gen', struct_gens, ids=idfn)
 @pytest.mark.parametrize('join_type', ['Inner', 'Left', 'Right', 'Cross', 'LeftSemi', 'LeftAnti'], ids=idfn)
+@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653')
 def test_sortmerge_join_struct_as_key(data_gen, join_type):
     def do_join(spark):
         left, right = create_df(spark, data_gen, 500, 250)
@@ -706,6 +729,7 @@ def do_join(spark):
 @ignore_order(local=True)
 @pytest.mark.parametrize('data_gen', struct_gens, ids=idfn)
 @pytest.mark.parametrize('join_type', ['Inner', 'Left', 'Right', 'Cross', 'LeftSemi', 'LeftAnti'], ids=idfn)
+@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653')
 def test_sortmerge_join_struct_mixed_key(data_gen, join_type):
def do_join(spark): left = two_col_df(spark, data_gen, int_gen, length=500) @@ -718,6 +742,7 @@ def do_join(spark): @ignore_order(local=True) @pytest.mark.parametrize('data_gen', struct_gens, ids=idfn) @pytest.mark.parametrize('join_type', ['Inner', 'Left', 'Right', 'Cross', 'LeftSemi', 'LeftAnti'], ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_sortmerge_join_struct_mixed_key_with_null_filter(data_gen, join_type): def do_join(spark): left = two_col_df(spark, data_gen, int_gen, length=500) @@ -732,6 +757,7 @@ def do_join(spark): @ignore_order(local=True) @pytest.mark.parametrize('data_gen', struct_gens, ids=idfn) @pytest.mark.parametrize('join_type', ['Inner', 'Left', 'Right', 'Cross', 'LeftSemi', 'LeftAnti'], ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_broadcast_join_right_struct_as_key(data_gen, join_type): def do_join(spark): left, right = create_df(spark, data_gen, 500, 250) @@ -743,6 +769,7 @@ def do_join(spark): @ignore_order(local=True) @pytest.mark.parametrize('data_gen', struct_gens, ids=idfn) @pytest.mark.parametrize('join_type', ['Inner', 'Left', 'Right', 'Cross', 'LeftSemi', 'LeftAnti'], ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_broadcast_join_right_struct_mixed_key(data_gen, join_type): def do_join(spark): left = two_col_df(spark, data_gen, int_gen, length=500) @@ -767,6 +794,7 @@ def do_join(spark): @ignore_order(local=True) @pytest.mark.parametrize('data_gen', struct_gens, ids=idfn) @pytest.mark.parametrize('join_type', ['FullOuter'], ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_sortmerge_join_struct_as_key_fallback(data_gen, join_type): def do_join(spark): left, right = create_df(spark, data_gen, 500, 500) diff --git a/integration_tests/src/main/python/json_test.py b/integration_tests/src/main/python/json_test.py index 7220ffb4c4e..3d0c50401ba 100644 --- a/integration_tests/src/main/python/json_test.py +++ b/integration_tests/src/main/python/json_test.py @@ -17,6 +17,7 @@ from asserts import * from data_gen import * +from conftest import is_not_utc from datetime import timezone from conftest import is_databricks_runtime from marks import approximate_float, allow_non_gpu, ignore_order @@ -183,6 +184,7 @@ def test_json_date_formats_round_trip(spark_tmp_path, date_format, v1_enabled_li @pytest.mark.parametrize('ts_part', json_supported_ts_parts) @pytest.mark.parametrize('date_format', json_supported_date_formats) @pytest.mark.parametrize('v1_enabled_list', ["", "json"]) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_json_ts_formats_round_trip(spark_tmp_path, date_format, ts_part, v1_enabled_list): full_format = date_format + ts_part data_gen = TimestampGen() @@ -393,6 +395,7 @@ def test_json_read_invalid_dates(std_input_path, filename, schema, read_func, an 'CORRECTED', 'EXCEPTION' ]) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def 
test_json_read_valid_timestamps(std_input_path, filename, schema, read_func, ansi_enabled, time_parser_policy, \ spark_tmp_table_factory): updated_conf = copy_and_update(_enable_all_types_conf, @@ -450,6 +453,7 @@ def test_json_read_count(spark_tmp_path, v1_enabled_list): lambda spark : spark.read.schema(schema).json(data_path), conf=updated_conf) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_from_json_map(): # The test here is working around some inconsistencies in how the keys are parsed for maps # on the GPU the keys are dense, but on the CPU they are sparse @@ -484,6 +488,7 @@ def test_from_json_map_fallback(): 'struct', 'struct', ]) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_from_json_struct(schema): # note that column 'a' does not use leading zeroes due to https://github.com/NVIDIA/spark-rapids/issues/9588 json_string_gen = StringGen(r'{"a": [1-9]{0,5}, "b": "[A-Z]{0,5}", "c": 1\d\d\d}') \ @@ -503,6 +508,7 @@ def test_from_json_struct(schema): r'{ "bool": [0-9]{4}-[0-9]{2}-[0-9]{2} }', r'{ "bool": "[0-9]{4}-[0-9]{2}-[0-9]{2}" }' ]) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_from_json_struct_boolean(pattern): json_string_gen = StringGen(pattern) \ .with_special_case('', weight=50) \ @@ -512,6 +518,7 @@ def test_from_json_struct_boolean(pattern): .select(f.col('a'), f.from_json('a', 'struct')), conf={"spark.rapids.sql.expression.JsonToStructs": True}) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_from_json_struct_decimal(): json_string_gen = StringGen(r'{ "a": "[+-]?([0-9]{0,5})?(\.[0-9]{0,2})?([eE][+-]?[0-9]{1,2})?" 
}') \ .with_special_pattern('', weight=50) \ @@ -524,6 +531,7 @@ def test_from_json_struct_decimal(): @pytest.mark.parametrize('schema', ['struct', 'struct>', 'struct>']) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_from_json_struct_of_struct(schema): json_string_gen = StringGen(r'{"teacher": "[A-Z]{1}[a-z]{2,5}",' \ r'"student": {"name": "[A-Z]{1}[a-z]{2,5}", "age": 1\d}}') \ @@ -538,6 +546,7 @@ def test_from_json_struct_of_struct(schema): @pytest.mark.parametrize('schema', ['struct', 'struct>>', 'struct>>']) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_from_json_struct_of_list(schema): json_string_gen = StringGen(r'{"teacher": "[A-Z]{1}[a-z]{2,5}",' \ r'"student": \[{"name": "[A-Z]{1}[a-z]{2,5}", "class": "junior"},' \ @@ -550,6 +559,7 @@ def test_from_json_struct_of_list(schema): conf={"spark.rapids.sql.expression.JsonToStructs": True}) @pytest.mark.parametrize('schema', ['struct', 'struct']) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_from_json_struct_all_empty_string_input(schema): json_string_gen = StringGen('') assert_gpu_and_cpu_are_equal_collect( @@ -626,6 +636,7 @@ def test_read_case_col_name(spark_tmp_path, v1_enabled_list, col_name): pytest.param(True, marks=pytest.mark.xfail(reason='https://github.com/NVIDIA/spark-rapids/issues/9517')), False ]) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_structs_to_json(spark_tmp_path, data_gen, ignore_null_fields, pretty): struct_gen = StructGen([ ('a', data_gen), diff --git a/integration_tests/src/main/python/limit_test.py b/integration_tests/src/main/python/limit_test.py index 5e116b00654..369c4dc2ab1 100644 --- a/integration_tests/src/main/python/limit_test.py +++ b/integration_tests/src/main/python/limit_test.py @@ -15,11 +15,13 @@ import pytest from asserts import assert_gpu_and_cpu_are_equal_collect +from conftest import is_not_utc from data_gen import * from spark_session import is_before_spark_340 from marks import allow_non_gpu, approximate_float @pytest.mark.parametrize('data_gen', all_basic_gens + decimal_gens + array_gens_sample + map_gens_sample + struct_gens_sample, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_simple_limit(data_gen): assert_gpu_and_cpu_are_equal_collect( # We need some processing after the limit to avoid a CollectLimitExec diff --git a/integration_tests/src/main/python/map_test.py b/integration_tests/src/main/python/map_test.py index 07b01fbff87..5daeb916e22 100644 --- a/integration_tests/src/main/python/map_test.py +++ b/integration_tests/src/main/python/map_test.py @@ -15,6 +15,7 @@ import pytest from asserts import * +from conftest import is_not_utc from data_gen import * from conftest import is_databricks_runtime from marks import allow_non_gpu, ignore_order, datagen_overrides @@ -56,6 +57,7 @@ @pytest.mark.parametrize('data_gen', supported_key_map_gens, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def 
test_map_keys(data_gen): assert_gpu_and_cpu_are_equal_collect( lambda spark: unary_op_df(spark, data_gen).selectExpr( @@ -67,6 +69,7 @@ def test_map_keys(data_gen): @pytest.mark.parametrize('data_gen', supported_key_map_gens, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_map_values(data_gen): assert_gpu_and_cpu_are_equal_collect( lambda spark: unary_op_df(spark, data_gen).selectExpr( @@ -78,6 +81,7 @@ def test_map_values(data_gen): @pytest.mark.parametrize('data_gen', supported_key_map_gens, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_map_entries(data_gen): assert_gpu_and_cpu_are_equal_collect( lambda spark: unary_op_df(spark, data_gen).selectExpr( @@ -113,6 +117,7 @@ def decimal_value_gen(): [MapGen(StringGen(pattern='key_[0-9]', nullable=False), value(), max_length=6) for value in get_map_value_gens()], ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_get_map_value_string_keys(data_gen): index_gen = StringGen() assert_gpu_and_cpu_are_equal_collect( @@ -136,6 +141,7 @@ def test_get_map_value_string_keys(data_gen): @pytest.mark.parametrize('data_gen', numeric_key_map_gens, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_get_map_value_numeric_keys(data_gen): key_gen = data_gen._key_gen assert_gpu_and_cpu_are_equal_collect( @@ -149,6 +155,7 @@ def test_get_map_value_numeric_keys(data_gen): @pytest.mark.parametrize('data_gen', supported_key_map_gens, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_get_map_value_supported_keys(data_gen): key_gen = data_gen._key_gen # first expression is not guaranteed to hit @@ -187,6 +194,7 @@ def query_map_scalar(spark): @allow_non_gpu('WindowLocalExec') @datagen_overrides(seed=0, reason='https://github.com/NVIDIA/spark-rapids/issues/9683') @pytest.mark.parametrize('data_gen', supported_key_map_gens, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_map_scalars_supported_key_types(data_gen): key_gen = data_gen._key_gen def query_map_scalar(spark): @@ -224,6 +232,7 @@ def query_map_scalar(spark): @pytest.mark.parametrize('data_gen', [MapGen(DateGen(nullable=False), value(), max_length=6) for value in get_map_value_gens()], ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_get_map_value_date_keys(data_gen): assert_gpu_and_cpu_are_equal_collect( lambda spark: unary_op_df(spark, data_gen).selectExpr( @@ -235,6 +244,7 @@ def test_get_map_value_date_keys(data_gen): @pytest.mark.parametrize('data_gen', [MapGen(TimestampGen(nullable=False), value(), max_length=6) for value in get_map_value_gens()], ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_get_map_value_timestamp_keys(data_gen): 
assert_gpu_and_cpu_are_equal_collect( lambda spark: unary_op_df(spark, data_gen).selectExpr( @@ -253,6 +263,7 @@ def test_map_side_effects(): @pytest.mark.parametrize('key_gen', [StringGen(nullable=False), IntegerGen(nullable=False), basic_struct_gen], ids=idfn) @pytest.mark.parametrize('value_gen', [StringGen(nullable=True), IntegerGen(nullable=True), basic_struct_gen], ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_single_entry_map(key_gen, value_gen): data_gen = [('a', key_gen), ('b', value_gen)] assert_gpu_and_cpu_are_equal_collect( @@ -456,6 +467,7 @@ def test_simple_get_map_value_with_strict_index(strict_index, data_gen): [MapGen(StringGen(pattern='key_[0-9]', nullable=False), value(), max_length=6) for value in get_map_value_gens()], ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_element_at_map_string_keys(data_gen): assert_gpu_and_cpu_are_equal_collect( lambda spark: unary_op_df(spark, data_gen).selectExpr( @@ -469,6 +481,7 @@ def test_element_at_map_string_keys(data_gen): @pytest.mark.parametrize('data_gen', numeric_key_map_gens, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_element_at_map_numeric_keys(data_gen): assert_gpu_and_cpu_are_equal_collect( lambda spark: unary_op_df(spark, data_gen).selectExpr( @@ -484,6 +497,7 @@ def test_element_at_map_numeric_keys(data_gen): [MapGen(DecimalGen(precision=35, scale=2, nullable=False), value(), max_length=6) for value in get_map_value_gens(precision=37, scale=0)], ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_get_map_value_element_at_map_dec_col_keys(data_gen): keys = DecimalGen(precision=35, scale=2) assert_gpu_and_cpu_are_equal_collect( @@ -509,6 +523,7 @@ def test_get_map_value_element_at_map_string_col_keys_ansi(data_gen, ansi): [MapGen(StringGen(pattern='key_[0-9]', nullable=False), value(), max_length=6) for value in get_map_value_gens(precision=37, scale=0)], ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_get_map_value_element_at_map_string_col_keys(data_gen): keys = StringGen(pattern='key_[0-9]') assert_gpu_and_cpu_are_equal_collect( @@ -565,6 +580,7 @@ def test_get_map_value_string_col_keys_ansi_null(data_gen): @pytest.mark.parametrize('data_gen', [MapGen(DateGen(nullable=False), value(), max_length=6) for value in get_map_value_gens()], ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_element_at_map_date_keys(data_gen): assert_gpu_and_cpu_are_equal_collect( lambda spark: unary_op_df(spark, data_gen).selectExpr( @@ -578,6 +594,7 @@ def test_element_at_map_date_keys(data_gen): [MapGen(TimestampGen(nullable=False), value(), max_length=6) for value in get_map_value_gens()], ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_element_at_map_timestamp_keys(data_gen): 
assert_gpu_and_cpu_are_equal_collect( lambda spark: unary_op_df(spark, data_gen).selectExpr( @@ -609,6 +626,7 @@ def test_map_element_at_ansi_null(data_gen): conf=ansi_enabled_conf) @pytest.mark.parametrize('data_gen', map_gens_sample, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_transform_values(data_gen): def do_it(spark): columns = ['a', 'b', @@ -647,6 +665,7 @@ def do_it(spark): @pytest.mark.parametrize('data_gen', map_gens_sample + decimal_128_map_gens + decimal_64_map_gens, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_transform_keys(data_gen): # The processing here is very limited, because we need to be sure we do not create duplicate keys. # This can happen because of integer overflow, round off errors in floating point, etc. So for now @@ -706,6 +725,7 @@ def test_sql_map_scalars(query): @pytest.mark.parametrize('data_gen', map_gens_sample, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_map_filter(data_gen): columns = ['map_filter(a, (key, value) -> isnotnull(value) )', 'map_filter(a, (key, value) -> isnull(value) )', diff --git a/integration_tests/src/main/python/mortgage_test.py b/integration_tests/src/main/python/mortgage_test.py index aed9aa63c85..00bab066651 100644 --- a/integration_tests/src/main/python/mortgage_test.py +++ b/integration_tests/src/main/python/mortgage_test.py @@ -15,6 +15,7 @@ import pytest from asserts import assert_gpu_and_cpu_are_equal_iterator +from conftest import is_not_utc from marks import approximate_float, incompat, ignore_order, allow_non_gpu, limit @incompat @@ -22,6 +23,7 @@ @limit @ignore_order @allow_non_gpu(any=True) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_mortgage(mortgage): assert_gpu_and_cpu_are_equal_iterator( lambda spark : mortgage.do_test_query(spark)) diff --git a/integration_tests/src/main/python/orc_cast_test.py b/integration_tests/src/main/python/orc_cast_test.py index 45860d5b299..cccd60125b9 100644 --- a/integration_tests/src/main/python/orc_cast_test.py +++ b/integration_tests/src/main/python/orc_cast_test.py @@ -15,6 +15,7 @@ import pytest from asserts import assert_gpu_and_cpu_are_equal_collect, assert_gpu_and_cpu_error +from conftest import is_not_utc from data_gen import * from pyspark.sql.types import * from spark_session import with_cpu_session @@ -49,6 +50,7 @@ def test_casting_among_integer_types(spark_tmp_path, reader_confs, v1_enabled_li @pytest.mark.parametrize('to_type', ['float', 'double', 'string', 'timestamp']) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_casting_from_integer(spark_tmp_path, to_type): orc_path = spark_tmp_path + '/orc_cast_integer' # The Python 'datetime' module only supports a max-year of 10000, so we set the Long type max @@ -70,6 +72,7 @@ def test_casting_from_integer(spark_tmp_path, to_type): @pytest.mark.parametrize('overflow_long_gen', [LongGen(min_val=int(1e16)), LongGen(max_val=int(-1e16))]) @pytest.mark.parametrize('to_type', ['timestamp']) +@pytest.mark.xfail(condition = is_not_utc(), reason = 
'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_casting_from_overflow_long(spark_tmp_path, overflow_long_gen,to_type): # Timestamp(micro-seconds) is actually type of int64, when casting long(int64) to timestamp, # we need to multiply 1e6 (or 1e3), and it may cause overflow. This function aims to test @@ -100,6 +103,7 @@ def test_casting_from_float_and_double(spark_tmp_path, to_type): @pytest.mark.parametrize('data_gen', [DoubleGen(max_exp=32, special_cases=None), DoubleGen(max_exp=32, special_cases=[8.88e9, 9.99e10, 1.314e11])]) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_casting_from_double_to_timestamp(spark_tmp_path, data_gen): # ORC will assume the original double value in seconds, we need to convert them to # timestamp(INT64 in micro-seconds). @@ -123,6 +127,7 @@ def test_casting_from_double_to_timestamp(spark_tmp_path, data_gen): ) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_casting_from_overflow_double_to_timestamp(spark_tmp_path): orc_path = spark_tmp_path + '/orc_casting_from_overflow_double_to_timestamp' with_cpu_session( diff --git a/integration_tests/src/main/python/orc_test.py b/integration_tests/src/main/python/orc_test.py index 337c4de9815..409d0850987 100644 --- a/integration_tests/src/main/python/orc_test.py +++ b/integration_tests/src/main/python/orc_test.py @@ -15,6 +15,7 @@ import pytest from asserts import * +from conftest import is_not_utc from data_gen import * from marks import * from pyspark.sql.types import * @@ -67,6 +68,7 @@ def get_orc_timestamp_gen(nullable=True): @pytest.mark.parametrize('v1_enabled_list', ["", "orc"]) @pytest.mark.parametrize('orc_impl', ["native", "hive"]) @pytest.mark.parametrize('reader_confs', reader_opt_confs, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_basic_read(std_input_path, name, read_func, v1_enabled_list, orc_impl, reader_confs): all_confs = copy_and_update(reader_confs, { 'spark.sql.sources.useV1SourceList': v1_enabled_list, @@ -158,6 +160,7 @@ def test_orc_fallback(spark_tmp_path, read_func, disable_conf): @pytest.mark.parametrize('read_func', [read_orc_df, read_orc_sql]) @pytest.mark.parametrize('reader_confs', reader_opt_confs, ids=idfn) @pytest.mark.parametrize('v1_enabled_list', ["", "orc"]) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_read_round_trip(spark_tmp_path, orc_gens, read_func, reader_confs, v1_enabled_list): gen_list = [('_c' + str(i), gen) for i, gen in enumerate(orc_gens)] data_path = spark_tmp_path + '/ORC_DATA' @@ -183,6 +186,7 @@ def test_read_round_trip(spark_tmp_path, orc_gens, read_func, reader_confs, v1_e @pytest.mark.parametrize('read_func', [read_orc_df, read_orc_sql]) @pytest.mark.parametrize('v1_enabled_list', ["", "orc"]) @pytest.mark.parametrize('reader_confs', reader_opt_confs, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_pred_push_round_trip(spark_tmp_path, orc_gen, read_func, v1_enabled_list, reader_confs): data_path = spark_tmp_path + '/ORC_DATA' # Append 
two struct columns to verify nested predicate pushdown.
@@ -239,6 +243,7 @@ def test_compress_read_round_trip(spark_tmp_path, compress, v1_enabled_list, rea
 
 @pytest.mark.parametrize('v1_enabled_list', ["", "orc"])
 @pytest.mark.parametrize('reader_confs', reader_opt_confs, ids=idfn)
+@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653')
 def test_simple_partitioned_read(spark_tmp_path, v1_enabled_list, reader_confs):
     # Once https://github.com/NVIDIA/spark-rapids/issues/131 is fixed
     # we should go with a more standard set of generators
@@ -305,6 +310,7 @@ def test_partitioned_read_just_partitions(spark_tmp_path, v1_enabled_list, reade
 
 @pytest.mark.parametrize('v1_enabled_list', ["", "orc"])
 @pytest.mark.parametrize('reader_confs', reader_opt_confs, ids=idfn)
+@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653')
 def test_merge_schema_read(spark_tmp_path, v1_enabled_list, reader_confs):
     # Once https://github.com/NVIDIA/spark-rapids/issues/131 is fixed
     # we should go with a more standard set of generators
@@ -583,6 +589,7 @@ def test_read_struct_without_stream(spark_tmp_path):
 @pytest.mark.parametrize('reader_confs', reader_opt_confs, ids=idfn)
 @pytest.mark.parametrize('v1_enabled_list', ["", "orc"])
 @pytest.mark.parametrize('case_sensitive', ["false", "true"])
+@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653')
 def test_read_with_more_columns(spark_tmp_path, orc_gen, reader_confs, v1_enabled_list, case_sensitive):
     struct_gen = StructGen([('nested_col', orc_gen)])
     # Map is not supported yet.
@@ -770,6 +777,7 @@ def test_orc_read_varchar_as_string(std_input_path): @pytest.mark.parametrize('gens', orc_gens_list, ids=idfn) @pytest.mark.parametrize('keep_order', [True, pytest.param(False, marks=pytest.mark.ignore_order(local=True))]) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_read_round_trip_for_multithreaded_combining(spark_tmp_path, gens, keep_order): gen_list = [('_c' + str(i), gen) for i, gen in enumerate(gens)] data_path = spark_tmp_path + '/ORC_DATA' @@ -784,6 +792,7 @@ def test_read_round_trip_for_multithreaded_combining(spark_tmp_path, gens, keep_ @pytest.mark.parametrize('keep_order', [True, pytest.param(False, marks=pytest.mark.ignore_order(local=True))]) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_simple_partitioned_read_for_multithreaded_combining(spark_tmp_path, keep_order): orc_gens = [byte_gen, short_gen, int_gen, long_gen, float_gen, double_gen, string_gen, boolean_gen, DateGen(start=date(1590, 1, 1)), diff --git a/integration_tests/src/main/python/orc_write_test.py b/integration_tests/src/main/python/orc_write_test.py index cee10b9ce4e..5617f8e20e5 100644 --- a/integration_tests/src/main/python/orc_write_test.py +++ b/integration_tests/src/main/python/orc_write_test.py @@ -16,6 +16,7 @@ from asserts import assert_gpu_and_cpu_writes_are_equal_collect, assert_gpu_fallback_write from spark_session import is_before_spark_320, is_spark_321cdh, is_spark_cdh, with_cpu_session, with_gpu_session +from conftest import is_not_utc from datetime import date, datetime, timezone from data_gen import * from marks import * @@ -80,6 +81,7 @@ @pytest.mark.parametrize('orc_gens', orc_write_gens_list, ids=idfn) @pytest.mark.parametrize('orc_impl', ["native", "hive"]) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_write_round_trip(spark_tmp_path, orc_gens, orc_impl): gen_list = [('_c' + str(i), gen) for i, gen in enumerate(orc_gens)] data_path = spark_tmp_path + '/ORC_DATA' @@ -114,6 +116,7 @@ def test_write_round_trip_corner(spark_tmp_path, orc_gen, orc_impl): # There are race conditions around when individual files are read in for partitioned data @ignore_order @pytest.mark.parametrize('orc_gen', orc_part_write_gens, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_part_write_round_trip(spark_tmp_path, orc_gen): gen_list = [('a', RepeatSeqGen(orc_gen, 10)), ('b', orc_gen)] @@ -167,6 +170,7 @@ def test_compress_write_round_trip(spark_tmp_path, compress): @pytest.mark.order(2) @pytest.mark.parametrize('orc_gens', orc_write_gens_list, ids=idfn) @pytest.mark.parametrize('orc_impl', ["native", "hive"]) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_write_save_table(spark_tmp_path, orc_gens, orc_impl, spark_tmp_table_factory): gen_list = [('_c' + str(i), gen) for i, gen in enumerate(orc_gens)] data_path = spark_tmp_path + '/ORC_DATA' @@ -189,6 +193,7 @@ def write_orc_sql_from(spark, df, data_path, write_to_table): @pytest.mark.parametrize('orc_gens', orc_write_gens_list, ids=idfn) @pytest.mark.parametrize('ts_type', 
["TIMESTAMP_MICROS", "TIMESTAMP_MILLIS"]) @pytest.mark.parametrize('orc_impl', ["native", "hive"]) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_write_sql_save_table(spark_tmp_path, orc_gens, ts_type, orc_impl, spark_tmp_table_factory): gen_list = [('_c' + str(i), gen) for i, gen in enumerate(orc_gens)] data_path = spark_tmp_path + '/ORC_DATA' @@ -200,6 +205,7 @@ def test_write_sql_save_table(spark_tmp_path, orc_gens, ts_type, orc_impl, spark @allow_non_gpu('DataWritingCommandExec,ExecutedCommandExec,WriteFilesExec') @pytest.mark.parametrize('codec', ['zlib', 'lzo']) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_orc_write_compression_fallback(spark_tmp_path, codec, spark_tmp_table_factory): gen = TimestampGen() data_path = spark_tmp_path + '/PARQUET_DATA' @@ -256,6 +262,7 @@ def sql_write(spark, path): @pytest.mark.parametrize('orc_gens', orc_write_gens_list, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_write_empty_orc_round_trip(spark_tmp_path, orc_gens): def create_empty_df(spark, path): gen_list = [('_c' + str(i), gen) for i, gen in enumerate(orc_gens)] diff --git a/integration_tests/src/main/python/parquet_test.py b/integration_tests/src/main/python/parquet_test.py index e124ea7ee4e..f6cc2a0141b 100644 --- a/integration_tests/src/main/python/parquet_test.py +++ b/integration_tests/src/main/python/parquet_test.py @@ -16,6 +16,7 @@ import pytest from asserts import * +from conftest import is_not_utc from data_gen import * from parquet_write_test import parquet_nested_datetime_gen, parquet_ts_write_options from marks import * @@ -163,6 +164,7 @@ def setup_table(spark): @pytest.mark.parametrize('read_func', [read_parquet_df, read_parquet_sql]) @pytest.mark.parametrize('reader_confs', reader_opt_confs) @pytest.mark.parametrize('v1_enabled_list', ["", "parquet"]) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_parquet_read_round_trip(spark_tmp_path, parquet_gens, read_func, reader_confs, v1_enabled_list): gen_list = [('_c' + str(i), gen) for i, gen in enumerate(parquet_gens)] data_path = spark_tmp_path + '/PARQUET_DATA' @@ -297,6 +299,7 @@ def test_parquet_compress_read_round_trip(spark_tmp_path, compress, v1_enabled_l @pytest.mark.parametrize('read_func', [read_parquet_df, read_parquet_sql]) @pytest.mark.parametrize('reader_confs', reader_opt_confs) @pytest.mark.parametrize('v1_enabled_list', ["", "parquet"]) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_parquet_pred_push_round_trip(spark_tmp_path, parquet_gen, read_func, v1_enabled_list, reader_confs): data_path = spark_tmp_path + '/PARQUET_DATA' gen_list = [('a', RepeatSeqGen(parquet_gen, 100)), ('b', parquet_gen)] @@ -316,6 +319,7 @@ def test_parquet_pred_push_round_trip(spark_tmp_path, parquet_gen, read_func, v1 @pytest.mark.parametrize('ts_rebase_read', [('CORRECTED', 'LEGACY'), ('LEGACY', 'CORRECTED')]) @pytest.mark.parametrize('reader_confs', reader_opt_confs) @pytest.mark.parametrize('v1_enabled_list', ["", "parquet"]) +@pytest.mark.xfail(condition = is_not_utc(), 
reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_parquet_read_roundtrip_datetime_with_legacy_rebase(spark_tmp_path, parquet_gens, ts_type, ts_rebase_write, ts_rebase_read, reader_confs, v1_enabled_list): @@ -355,6 +359,7 @@ def test_parquet_decimal_read_legacy(spark_tmp_path, parquet_gens, read_func, re @pytest.mark.parametrize('reader_confs', reader_opt_confs) @pytest.mark.parametrize('v1_enabled_list', ["", "parquet"]) @pytest.mark.parametrize('batch_size', [100, INT_MAX]) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_parquet_simple_partitioned_read(spark_tmp_path, v1_enabled_list, reader_confs, batch_size): # Once https://github.com/NVIDIA/spark-rapids/issues/133 and https://github.com/NVIDIA/spark-rapids/issues/132 are fixed # we should go with a more standard set of generators @@ -386,6 +391,7 @@ def test_parquet_simple_partitioned_read(spark_tmp_path, v1_enabled_list, reader # In this we are reading the data, but only reading the key the data was partitioned by @pytest.mark.parametrize('reader_confs', reader_opt_confs) @pytest.mark.parametrize('v1_enabled_list', ["", "parquet"]) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_parquet_partitioned_read_just_partitions(spark_tmp_path, v1_enabled_list, reader_confs): parquet_gens = [byte_gen] gen_list = [('_c' + str(i), gen) for i, gen in enumerate(parquet_gens)] @@ -528,6 +534,7 @@ def read_and_remove(spark): @pytest.mark.parametrize('reader_confs', reader_opt_confs) @pytest.mark.parametrize('v1_enabled_list', ["", "parquet"]) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_parquet_read_merge_schema(spark_tmp_path, v1_enabled_list, reader_confs): # Once https://github.com/NVIDIA/spark-rapids/issues/133 and https://github.com/NVIDIA/spark-rapids/issues/132 are fixed # we should go with a more standard set of generators @@ -552,6 +559,7 @@ def test_parquet_read_merge_schema(spark_tmp_path, v1_enabled_list, reader_confs @pytest.mark.parametrize('reader_confs', reader_opt_confs) @pytest.mark.parametrize('v1_enabled_list', ["", "parquet"]) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_parquet_read_merge_schema_from_conf(spark_tmp_path, v1_enabled_list, reader_confs): # Once https://github.com/NVIDIA/spark-rapids/issues/133 and https://github.com/NVIDIA/spark-rapids/issues/132 are fixed # we should go with a more standard set of generators @@ -867,6 +875,7 @@ def test_parquet_reading_from_unaligned_pages_basic_filters(spark_tmp_path, read @pytest.mark.parametrize('reader_confs', reader_opt_confs, ids=idfn) @pytest.mark.parametrize('enable_dictionary', ["true", "false"], ids=idfn) @pytest.mark.parametrize('v1_enabled_list', ["", "parquet"]) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_parquet_reading_from_unaligned_pages_all_types(spark_tmp_path, reader_confs, enable_dictionary, v1_enabled_list): all_confs = copy_and_update(reader_confs, {'spark.sql.sources.useV1SourceList': v1_enabled_list}) data_path = spark_tmp_path + 
'/PARQUET_UNALIGNED_DATA' @@ -894,6 +903,7 @@ def test_parquet_reading_from_unaligned_pages_all_types(spark_tmp_path, reader_c @pytest.mark.parametrize('reader_confs', reader_opt_confs, ids=idfn) @pytest.mark.parametrize('enable_dictionary', ["true", "false"], ids=idfn) @pytest.mark.parametrize('v1_enabled_list', ["", "parquet"]) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_parquet_reading_from_unaligned_pages_all_types_dict_optimized(spark_tmp_path, reader_confs, enable_dictionary, v1_enabled_list): all_confs = copy_and_update(reader_confs, {'spark.sql.sources.useV1SourceList': v1_enabled_list}) data_path = spark_tmp_path + '/PARQUET_UNALIGNED_DATA' diff --git a/integration_tests/src/main/python/parquet_write_test.py b/integration_tests/src/main/python/parquet_write_test.py index 411f8cdf153..c661270e159 100644 --- a/integration_tests/src/main/python/parquet_write_test.py +++ b/integration_tests/src/main/python/parquet_write_test.py @@ -15,6 +15,7 @@ import pytest from asserts import * +from conftest import is_not_utc from datetime import date, datetime, timezone from data_gen import * from enum import Enum @@ -89,6 +90,7 @@ @pytest.mark.order(1) # at the head of xdist worker queue if pytest-order is installed @pytest.mark.parametrize('parquet_gens', parquet_write_gens_list, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_write_round_trip(spark_tmp_path, parquet_gens): gen_list = [('_c' + str(i), gen) for i, gen in enumerate(parquet_gens)] data_path = spark_tmp_path + '/PARQUET_DATA' @@ -134,6 +136,7 @@ def test_write_round_trip_corner(spark_tmp_path, par_gen): ArrayGen(TimestampGen(), max_length=10), MapGen(TimestampGen(nullable=False), TimestampGen())]], ids=idfn) @pytest.mark.parametrize('ts_type', parquet_ts_write_options) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_timestamp_write_round_trip(spark_tmp_path, parquet_gens, ts_type): gen_list = [('_c' + str(i), gen) for i, gen in enumerate(parquet_gens)] data_path = spark_tmp_path + '/PARQUET_DATA' @@ -147,6 +150,7 @@ def test_timestamp_write_round_trip(spark_tmp_path, parquet_gens, ts_type): @pytest.mark.parametrize('ts_type', parquet_ts_write_options) @pytest.mark.parametrize('ts_rebase', ['CORRECTED']) @ignore_order +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_write_ts_millis(spark_tmp_path, ts_type, ts_rebase): gen = TimestampGen() data_path = spark_tmp_path + '/PARQUET_DATA' @@ -170,6 +174,7 @@ def test_write_ts_millis(spark_tmp_path, ts_type, ts_rebase): @ignore_order @pytest.mark.order(1) # at the head of xdist worker queue if pytest-order is installed @pytest.mark.parametrize('parquet_gen', parquet_part_write_gens, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_part_write_round_trip(spark_tmp_path, parquet_gen): gen_list = [('a', RepeatSeqGen(parquet_gen, 10)), ('b', parquet_gen)] @@ -184,6 +189,7 @@ def test_part_write_round_trip(spark_tmp_path, parquet_gen): @pytest.mark.skipif(is_spark_340_or_later() or is_databricks122_or_later(), 
reason="`WriteFilesExec` is only supported in Spark 340+") @pytest.mark.parametrize('data_gen', [TimestampGen()], ids=idfn) @pytest.mark.allow_non_gpu("DataWritingCommandExec") +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_int96_write_conf(spark_tmp_path, data_gen): data_path = spark_tmp_path + '/PARQUET_DATA' confs = copy_and_update(writer_confs, { @@ -214,6 +220,7 @@ def test_int96_write_conf_with_write_exec(spark_tmp_path, data_gen): ['DataWritingCommandExec', 'WriteFilesExec'], confs) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_all_null_int96(spark_tmp_path): class AllNullTimestampGen(TimestampGen): def start(self, rand): @@ -243,6 +250,7 @@ def test_compress_write_round_trip(spark_tmp_path, compress): @pytest.mark.order(2) @pytest.mark.parametrize('parquet_gens', parquet_write_gens_list, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_write_save_table(spark_tmp_path, parquet_gens, spark_tmp_table_factory): gen_list = [('_c' + str(i), gen) for i, gen in enumerate(parquet_gens)] data_path = spark_tmp_path + '/PARQUET_DATA' @@ -260,6 +268,7 @@ def write_parquet_sql_from(spark, df, data_path, write_to_table): @pytest.mark.order(2) @pytest.mark.parametrize('parquet_gens', parquet_write_gens_list, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_write_sql_save_table(spark_tmp_path, parquet_gens, spark_tmp_table_factory): gen_list = [('_c' + str(i), gen) for i, gen in enumerate(parquet_gens)] data_path = spark_tmp_path + '/PARQUET_DATA' @@ -282,6 +291,7 @@ def writeParquetUpgradeCatchException(spark, df, data_path, spark_tmp_table_fact ('TIMESTAMP_MICROS', TimestampGen(start=datetime(1, 1, 1, tzinfo=timezone.utc), end=datetime(1899, 12, 31, tzinfo=timezone.utc))), ('TIMESTAMP_MILLIS', TimestampGen(start=datetime(1, 1, 1, tzinfo=timezone.utc), end=datetime(1899, 12, 31, tzinfo=timezone.utc)))]) @pytest.mark.parametrize('rebase', ["CORRECTED","EXCEPTION"]) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_ts_write_fails_datetime_exception(spark_tmp_path, ts_write_data_gen, spark_tmp_table_factory, rebase): ts_write, gen = ts_write_data_gen data_path = spark_tmp_path + '/PARQUET_DATA' @@ -460,6 +470,7 @@ def generate_map_with_empty_validity(spark, path): @pytest.mark.parametrize('data_gen', parquet_nested_datetime_gen, ids=idfn) @pytest.mark.parametrize('ts_write', parquet_ts_write_options) @pytest.mark.parametrize('ts_rebase_write', ['EXCEPTION']) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_parquet_write_fails_legacy_datetime(spark_tmp_path, data_gen, ts_write, ts_rebase_write): data_path = spark_tmp_path + '/PARQUET_DATA' all_confs = {'spark.sql.parquet.outputTimestampType': ts_write, @@ -477,6 +488,7 @@ def writeParquetCatchException(spark, data_gen, data_path): @pytest.mark.parametrize('ts_write', parquet_ts_write_options) @pytest.mark.parametrize('ts_rebase_write', [('CORRECTED', 'LEGACY'), ('LEGACY', 
'CORRECTED')]) @pytest.mark.parametrize('ts_rebase_read', [('CORRECTED', 'LEGACY'), ('LEGACY', 'CORRECTED')]) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_parquet_write_roundtrip_datetime_with_legacy_rebase(spark_tmp_path, data_gen, ts_write, ts_rebase_write, ts_rebase_read): data_path = spark_tmp_path + '/PARQUET_DATA' @@ -520,6 +532,7 @@ def test_it(spark): with_gpu_session(test_it, conf) @pytest.mark.parametrize('parquet_gens', parquet_write_gens_list, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_write_empty_parquet_round_trip(spark_tmp_path, parquet_gens): def create_empty_df(spark, path): gen_list = [('_c' + str(i), gen) for i, gen in enumerate(parquet_gens)] @@ -759,6 +772,7 @@ def read_table(spark, path): # Test to avoid regression on a known bug in Spark. For details please visit https://github.com/NVIDIA/spark-rapids/issues/8693 @pytest.mark.parametrize('ts_rebase', ['LEGACY', 'CORRECTED']) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_hive_timestamp_value(spark_tmp_table_factory, spark_tmp_path, ts_rebase): def func_test(create_table, read_table, data_path, conf): assert_gpu_and_cpu_writes_are_equal_collect(create_table, read_table, data_path, conf=conf) diff --git a/integration_tests/src/main/python/qa_nightly_select_test.py b/integration_tests/src/main/python/qa_nightly_select_test.py index ba3414e51fe..1349de3fcdf 100644 --- a/integration_tests/src/main/python/qa_nightly_select_test.py +++ b/integration_tests/src/main/python/qa_nightly_select_test.py @@ -16,6 +16,7 @@ from pyspark.sql.types import * from pyspark import SparkConf, SparkContext, SQLContext import pyspark.sql.functions as f +from conftest import is_not_utc import datetime from argparse import ArgumentParser from decimal import Decimal @@ -158,6 +159,7 @@ def idfn(val): @incompat @qarun @pytest.mark.parametrize('sql_query_line', SELECT_SQL, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_select(sql_query_line, pytestconfig): sql_query = sql_query_line[0] if sql_query: @@ -170,6 +172,7 @@ def test_select(sql_query_line, pytestconfig): @incompat @qarun @pytest.mark.parametrize('sql_query_line', SELECT_NEEDS_SORT_SQL, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_needs_sort_select(sql_query_line, pytestconfig): sql_query = sql_query_line[0] if sql_query: @@ -182,6 +185,7 @@ def test_needs_sort_select(sql_query_line, pytestconfig): @ignore_order(local=True) @qarun @pytest.mark.parametrize('sql_query_line', SELECT_JOIN_SQL, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_select_join(sql_query_line, pytestconfig): sql_query = sql_query_line[0] if sql_query: @@ -198,6 +202,7 @@ def init_tables(spark): @ignore_order(local=True) @qarun @pytest.mark.parametrize('sql_query_line', SELECT_PRE_ORDER_SQL, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of 
https://github.com/NVIDIA/spark-rapids/issues/9653') def test_select_first_last(sql_query_line, pytestconfig): sql_query = sql_query_line[0] if sql_query: @@ -210,6 +215,7 @@ def test_select_first_last(sql_query_line, pytestconfig): @ignore_order(local=True) @qarun @pytest.mark.parametrize('sql_query_line', SELECT_FLOAT_SQL, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_select_float_order_local(sql_query_line, pytestconfig): sql_query = sql_query_line[0] if sql_query: @@ -224,6 +230,7 @@ def test_select_float_order_local(sql_query_line, pytestconfig): @qarun @pytest.mark.parametrize('sql_query_line', SELECT_REGEXP_SQL, ids=idfn) @pytest.mark.skipif(not is_jvm_charset_utf8(), reason="Regular expressions require UTF-8") +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_select_regexp(sql_query_line, pytestconfig): sql_query = sql_query_line[0] if sql_query: diff --git a/integration_tests/src/main/python/repart_test.py b/integration_tests/src/main/python/repart_test.py index d44280ada69..60e0a191f25 100644 --- a/integration_tests/src/main/python/repart_test.py +++ b/integration_tests/src/main/python/repart_test.py @@ -16,6 +16,7 @@ from asserts import assert_gpu_and_cpu_are_equal_collect, assert_gpu_fallback_collect from spark_session import is_before_spark_320, is_before_spark_330 +from conftest import is_not_utc from data_gen import * from marks import ignore_order, allow_non_gpu import pyspark.sql.functions as f @@ -89,6 +90,7 @@ def test_union_struct_missing_children(data_gen): nested_struct, struct_of_maps], ids=idfn) # This tests union of two DFs of two cols each. The types of the left col and right col is the same +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_union(data_gen): assert_gpu_and_cpu_are_equal_collect( lambda spark : binary_op_df(spark, data_gen).union(binary_op_df(spark, data_gen))) @@ -99,6 +101,7 @@ def test_union(data_gen): nested_struct, struct_of_maps], ids=idfn) # This tests union of two DFs of two cols each. The types of the left col and right col is the same +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_unionAll(data_gen): assert_gpu_and_cpu_are_equal_collect( lambda spark : binary_op_df(spark, data_gen).unionAll(binary_op_df(spark, data_gen))) @@ -113,6 +116,7 @@ def test_unionAll(data_gen): struct_of_maps], ids=idfn) # This tests the union of two DFs of structs with missing child column names. The missing child # column will be replaced by nulls in the output DF. 
This is a feature added in 3.1+ +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_union_by_missing_col_name(data_gen): assert_gpu_and_cpu_are_equal_collect( lambda spark : binary_op_df(spark, data_gen).withColumnRenamed("a", "x") @@ -154,6 +158,7 @@ def assert_union_equal(gen1, gen2): StructGen([['child0', DecimalGen(7, 2)]]), nested_struct, struct_of_maps], ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_union_by_name(data_gen): assert_gpu_and_cpu_are_equal_collect( lambda spark : binary_op_df(spark, data_gen).unionByName(binary_op_df(spark, data_gen))) @@ -165,12 +170,14 @@ def test_union_by_name(data_gen): pytest.param([('array' + str(i), gen) for i, gen in enumerate(array_gens_sample + [ArrayGen(BinaryGen(max_length=5), max_length=5)])]), pytest.param([('map' + str(i), gen) for i, gen in enumerate(map_gens_sample)]), ], ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_coalesce_types(data_gen): assert_gpu_and_cpu_are_equal_collect( lambda spark: gen_df(spark, data_gen).coalesce(2)) @pytest.mark.parametrize('num_parts', [1, 10, 100, 1000, 2000], ids=idfn) @pytest.mark.parametrize('length', [0, 2048, 4096], ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_coalesce_df(num_parts, length): #This should change eventually to be more than just the basic gens gen_list = [('_c' + str(i), gen) for i, gen in enumerate(all_basic_gens + decimal_gens + [binary_gen])] @@ -186,6 +193,7 @@ def test_coalesce_df(num_parts, length): @pytest.mark.parametrize('num_parts', [1, 10, 2345], ids=idfn) @pytest.mark.parametrize('length', [0, 2048, 4096], ids=idfn) @ignore_order(local=True) # To avoid extra data shuffle by 'sort on Spark' for this repartition test. +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_repartition_df(data_gen, num_parts, length): from pyspark.sql.functions import lit assert_gpu_and_cpu_are_equal_collect( @@ -202,6 +210,7 @@ def test_repartition_df(data_gen, num_parts, length): @pytest.mark.parametrize('num_parts', [1, 10, 2345], ids=idfn) @pytest.mark.parametrize('length', [0, 2048, 4096], ids=idfn) @ignore_order(local=True) # To avoid extra data shuffle by 'sort on Spark' for this repartition test. 
+@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_repartition_df_for_round_robin(data_gen, num_parts, length): from pyspark.sql.functions import lit assert_gpu_and_cpu_are_equal_collect( @@ -275,6 +284,7 @@ def test_hash_fallback(data_gen): ([('a', decimal_gen_64bit), ('b', decimal_gen_64bit), ('c', decimal_gen_64bit)], ['a', 'b', 'c']), ([('a', decimal_gen_128bit), ('b', decimal_gen_128bit), ('c', decimal_gen_128bit)], ['a', 'b', 'c']), ], ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_hash_repartition_exact(gen, num_parts): data_gen = gen[0] part_on = gen[1] diff --git a/integration_tests/src/main/python/row-based_udf_test.py b/integration_tests/src/main/python/row-based_udf_test.py index e849a87b10e..19b02f2e24e 100644 --- a/integration_tests/src/main/python/row-based_udf_test.py +++ b/integration_tests/src/main/python/row-based_udf_test.py @@ -15,6 +15,7 @@ import pytest from asserts import assert_gpu_and_cpu_are_equal_sql +from conftest import is_not_utc from data_gen import * from spark_session import with_spark_session, is_spark_350_or_later from conftest import skip_unless_precommit_tests @@ -33,6 +34,7 @@ def load_hive_udf(spark, udfname, udfclass): @pytest.mark.xfail(condition=is_spark_350_or_later(), reason='https://github.com/NVIDIA/spark-rapids/issues/9064') +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_hive_empty_simple_udf(): with_spark_session(skip_if_no_hive) @@ -46,6 +48,7 @@ def evalfn(spark): "SELECT i, emptysimple(s, 'const_string') FROM hive_simple_udf_test_table", conf={'spark.rapids.sql.rowBasedUDF.enabled': 'true'}) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_hive_empty_generic_udf(): with_spark_session(skip_if_no_hive) def evalfn(spark): diff --git a/integration_tests/src/main/python/row_conversion_test.py b/integration_tests/src/main/python/row_conversion_test.py index 92ea05d68be..bc13419c8ec 100644 --- a/integration_tests/src/main/python/row_conversion_test.py +++ b/integration_tests/src/main/python/row_conversion_test.py @@ -15,6 +15,7 @@ import pytest from asserts import assert_gpu_and_cpu_are_equal_collect +from conftest import is_not_utc from data_gen import * from marks import allow_non_gpu, approximate_float, incompat from pyspark.sql.types import * @@ -28,6 +29,7 @@ # to be brought back to the CPU (rows) to be returned. # So we just need a very simple operation in the middle that # can be done on the GPU. 
+@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_row_conversions(): gens = [["a", byte_gen], ["b", short_gen], ["c", int_gen], ["d", long_gen], ["e", float_gen], ["f", double_gen], ["g", string_gen], ["h", boolean_gen], @@ -42,6 +44,7 @@ def test_row_conversions(): assert_gpu_and_cpu_are_equal_collect( lambda spark : gen_df(spark, gens).selectExpr("*", "a as a_again")) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_row_conversions_fixed_width(): gens = [["a", byte_gen], ["b", short_gen], ["c", int_gen], ["d", long_gen], ["e", float_gen], ["f", double_gen], ["h", boolean_gen], @@ -50,6 +53,7 @@ def test_row_conversions_fixed_width(): assert_gpu_and_cpu_are_equal_collect( lambda spark : gen_df(spark, gens).selectExpr("*", "a as a_again")) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_row_conversions_fixed_width_wide(): gens = [["a{}".format(i), ByteGen(nullable=True)] for i in range(10)] + \ [["b{}".format(i), ShortGen(nullable=True)] for i in range(10)] + \ diff --git a/integration_tests/src/main/python/sample_test.py b/integration_tests/src/main/python/sample_test.py index fc9d9fc4cbf..5ae72212bed 100644 --- a/integration_tests/src/main/python/sample_test.py +++ b/integration_tests/src/main/python/sample_test.py @@ -14,6 +14,7 @@ import pytest from asserts import assert_gpu_and_cpu_are_equal_collect +from conftest import is_not_utc from data_gen import * from pyspark.sql.types import * from spark_session import is_before_spark_330 @@ -38,6 +39,7 @@ def test_sample_produce_empty_batch(data_gen): # the following cases is the normal cases and do not use @ignore_order nested_gens = array_gens_sample + struct_gens_sample + map_gens_sample @pytest.mark.parametrize('data_gen', basic_gens + nested_gens, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_sample(data_gen): assert_gpu_and_cpu_are_equal_collect( lambda spark: unary_op_df(spark, data_gen, num_slices = 10) @@ -45,6 +47,7 @@ def test_sample(data_gen): ) @pytest.mark.parametrize('data_gen', basic_gens + nested_gens, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_sample_with_replacement(data_gen): assert_gpu_and_cpu_are_equal_collect( lambda spark: unary_op_df(spark, data_gen, num_slices = 10).sample( diff --git a/integration_tests/src/main/python/schema_evolution_test.py b/integration_tests/src/main/python/schema_evolution_test.py index a457b082858..c14b7aeb29d 100644 --- a/integration_tests/src/main/python/schema_evolution_test.py +++ b/integration_tests/src/main/python/schema_evolution_test.py @@ -13,6 +13,7 @@ # limitations under the License. 
from asserts import assert_gpu_and_cpu_are_equal_collect +from conftest import is_not_utc from data_gen import * from datetime import datetime, timezone from marks import ignore_order @@ -58,6 +59,7 @@ def get_ddl(col_gen_pairs): @ignore_order(local=True) @pytest.mark.parametrize("format", _formats) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_column_add_after_partition(spark_tmp_table_factory, format): # Databricks 10.4 appears to be missing https://issues.apache.org/jira/browse/SPARK-39417 # so avoid generating nulls for numeric partitions diff --git a/integration_tests/src/main/python/sort_test.py b/integration_tests/src/main/python/sort_test.py index f3a73066af5..7fe208ae12d 100644 --- a/integration_tests/src/main/python/sort_test.py +++ b/integration_tests/src/main/python/sort_test.py @@ -15,6 +15,7 @@ import pytest from asserts import assert_gpu_and_cpu_are_equal_collect, assert_gpu_fallback_collect +from conftest import is_not_utc from data_gen import * from marks import allow_non_gpu from pyspark.sql.types import * @@ -51,6 +52,7 @@ def test_sort_nonbinary_carry_binary(data_gen): @pytest.mark.parametrize('data_gen', orderable_gens + orderable_not_null_gen, ids=idfn) @pytest.mark.parametrize('order', [f.col('a').asc(), f.col('a').asc_nulls_last(), f.col('a').desc(), f.col('a').desc_nulls_first()], ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_single_orderby(data_gen, order): assert_gpu_and_cpu_are_equal_collect( lambda spark : unary_op_df(spark, data_gen).orderBy(order)) @@ -58,6 +60,7 @@ def test_single_orderby(data_gen, order): @pytest.mark.parametrize('data_gen', single_level_array_gens, ids=idfn) @pytest.mark.parametrize('order', [f.col('a').asc(), f.col('a').asc_nulls_first(), f.col('a').asc_nulls_last(), f.col('a').desc(), f.col('a').desc_nulls_first(), f.col('a').desc_nulls_last()], ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_single_orderby_on_array(data_gen, order): assert_gpu_and_cpu_are_equal_collect( lambda spark : unary_op_df(spark, data_gen).orderBy(order)) @@ -102,6 +105,7 @@ def test_single_orderby_fallback_for_array_of_struct(data_gen, order): marks=pytest.mark.xfail(reason='opposite null order not supported')), pytest.param(f.col('a').desc_nulls_last()), ], ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_single_nested_orderby_plain(data_gen, order, shuffle_parts, stable_sort): assert_gpu_and_cpu_are_equal_collect( lambda spark : unary_op_df(spark, data_gen).orderBy(order), @@ -129,6 +133,7 @@ def test_single_nested_orderby_fallback_for_nullorder(data_gen, order): orderable_without_neg_decimal = [n for n in (orderable_gens + orderable_not_null_gen) if not (isinstance(n, DecimalGen) and n.scale < 0)] @pytest.mark.parametrize('data_gen', orderable_without_neg_decimal + single_level_array_gens, ids=idfn) @pytest.mark.parametrize('order', [f.col('a').asc(), f.col('a').asc_nulls_last(), f.col('a').desc(), f.col('a').desc_nulls_first()], ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def 
test_single_orderby_with_limit(data_gen, order): assert_gpu_and_cpu_are_equal_collect( lambda spark : unary_op_df(spark, data_gen).orderBy(order).limit(100)) @@ -139,6 +144,7 @@ def test_single_orderby_with_limit(data_gen, order): pytest.param(f.col('a').desc(), all_basic_struct_gen), pytest.param(f.col('a').desc_nulls_last(), all_basic_struct_gen) ], ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_single_nested_orderby_with_limit(data_gen, order): assert_gpu_and_cpu_are_equal_collect( lambda spark : unary_op_df(spark, data_gen).orderBy(order).limit(100), @@ -161,6 +167,7 @@ def test_single_nested_orderby_with_limit_fallback(data_gen, order): @pytest.mark.parametrize('data_gen', orderable_gens + orderable_not_null_gen + single_level_array_gens, ids=idfn) @pytest.mark.parametrize('order', [f.col('a').asc(), f.col('a').asc_nulls_last(), f.col('a').desc(), f.col('a').desc_nulls_first()], ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_single_sort_in_part(data_gen, order): # We set `num_slices` to handle https://github.com/NVIDIA/spark-rapids/issues/2477 assert_gpu_and_cpu_are_equal_collect( @@ -183,6 +190,7 @@ def test_single_sort_in_part(data_gen, order): pytest.param(f.col('a').desc_nulls_last()), ], ids=idfn) @pytest.mark.parametrize('stable_sort', ['STABLE', 'OUTOFCORE'], ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_single_nested_sort_in_part(data_gen, order, stable_sort): sort_conf = {'spark.rapids.sql.stableSort.enabled': stable_sort == 'STABLE'} assert_gpu_and_cpu_are_equal_collect( @@ -193,11 +201,13 @@ def test_single_nested_sort_in_part(data_gen, order, stable_sort): boolean_gen, timestamp_gen, date_gen, string_gen, null_gen, StructGen([('child0', long_gen)]) ] + orderable_decimal_gens + single_level_array_gens @pytest.mark.parametrize('data_gen', orderable_gens_sort, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_multi_orderby(data_gen): assert_gpu_and_cpu_are_equal_collect( lambda spark : binary_op_df(spark, data_gen).orderBy(f.col('a'), f.col('b').desc())) @pytest.mark.parametrize('data_gen', single_level_array_gens, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_multi_orderby_on_array(data_gen): assert_gpu_and_cpu_are_equal_collect( lambda spark : binary_op_df(spark, data_gen).orderBy(f.col('a'), f.col('b').desc())) @@ -205,6 +215,7 @@ def test_multi_orderby_on_array(data_gen): # SPARK CPU itself has issue with negative scale for take ordered and project orderable_gens_sort_without_neg_decimal = [n for n in orderable_gens_sort if not (isinstance(n, DecimalGen) and n.scale < 0)] @pytest.mark.parametrize('data_gen', orderable_gens_sort_without_neg_decimal + single_level_array_gens, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_multi_orderby_with_limit(data_gen): assert_gpu_and_cpu_are_equal_collect( lambda spark : binary_op_df(spark, 
data_gen).orderBy(f.col('a'), f.col('b').desc()).limit(100)) @@ -212,6 +223,7 @@ def test_multi_orderby_with_limit(data_gen): # We added in a partitioning optimization to take_ordered_and_project # This should trigger it. @pytest.mark.parametrize('data_gen', orderable_gens_sort_without_neg_decimal + single_level_array_gens, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_multi_orderby_with_limit_single_part(data_gen): assert_gpu_and_cpu_are_equal_collect( lambda spark : binary_op_df(spark, data_gen).coalesce(1).orderBy(f.col('a'), f.col('b').desc()).limit(100)) @@ -256,6 +268,7 @@ def test_single_orderby_with_skew(data_gen): # We are not trying all possibilities, just doing a few with numbers so the query works. @pytest.mark.parametrize('data_gen', [all_basic_struct_gen, StructGen([['child0', all_basic_struct_gen]])], ids=idfn) @pytest.mark.parametrize('stable_sort', ['STABLE', 'OUTOFCORE'], ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_single_nested_orderby_with_skew(data_gen, stable_sort): sort_conf = {'spark.rapids.sql.stableSort.enabled': stable_sort == 'STABLE'} # When doing range partitioning the upstream data is sampled to try and get the bounds for cutoffs. @@ -299,6 +312,7 @@ def test_large_orderby(data_gen, stable_sort): simple_string_to_string_map_gen, ArrayGen(byte_gen, max_length=5)], ids=idfn) @pytest.mark.order(2) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_large_orderby_nested_ridealong(data_gen): # We use a UniqueLongGen to avoid duplicate keys that can cause ambiguity in the sort # results, especially on distributed clusters. @@ -319,6 +333,7 @@ def test_large_orderby_nested_ridealong(data_gen): ArrayGen(byte_gen, max_length=5), ArrayGen(decimal_gen_128bit, max_length=5)], ids=idfn) @pytest.mark.order(2) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_orderby_nested_ridealong_limit(data_gen): # We use a UniqueLongGen to avoid duplicate keys that can cause ambiguity in the sort # results, especially on distributed clusters. 
diff --git a/integration_tests/src/main/python/struct_test.py b/integration_tests/src/main/python/struct_test.py index 0e230a95408..986781c32e0 100644 --- a/integration_tests/src/main/python/struct_test.py +++ b/integration_tests/src/main/python/struct_test.py @@ -15,6 +15,7 @@ import pytest from asserts import assert_gpu_and_cpu_are_equal_collect, assert_gpu_and_cpu_are_equal_sql +from conftest import is_not_utc from data_gen import * from pyspark.sql.types import * @@ -33,6 +34,7 @@ def test_struct_scalar_project(): StructGen([["first", decimal_gen_64bit], ["second", decimal_gen_32bit], ["third", decimal_gen_32bit]]), StructGen([["first", decimal_gen_128bit], ["second", decimal_gen_128bit], ["third", decimal_gen_128bit]]), StructGen([["first", binary_gen], ["second", ArrayGen(BinaryGen(max_length=10), max_length=10)], ["third", binary_gen]])], ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_struct_get_item(data_gen): assert_gpu_and_cpu_are_equal_collect( lambda spark : unary_op_df(spark, data_gen).selectExpr( @@ -43,6 +45,7 @@ def test_struct_get_item(data_gen): @pytest.mark.parametrize('data_gen', all_basic_gens + decimal_gens + [binary_gen, null_gen] + single_level_array_gens + struct_gens_sample + map_gens_sample, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_make_struct(data_gen): # Spark has no good way to create a map literal without the map function # so we are inserting one. diff --git a/integration_tests/src/main/python/subquery_test.py b/integration_tests/src/main/python/subquery_test.py index e6d641d4212..25a70b47a17 100644 --- a/integration_tests/src/main/python/subquery_test.py +++ b/integration_tests/src/main/python/subquery_test.py @@ -14,11 +14,13 @@ import pytest from asserts import assert_gpu_and_cpu_are_equal_sql +from conftest import is_not_utc from data_gen import * from marks import * @ignore_order(local=True) @pytest.mark.parametrize('data_gen', all_basic_gens, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_scalar_subquery_basics(data_gen): # Fix num_slices at 1 to make sure that first/last returns same results under CPU and GPU. 
assert_gpu_and_cpu_are_equal_sql( @@ -31,6 +33,7 @@ def test_scalar_subquery_basics(data_gen): @ignore_order(local=True) @pytest.mark.parametrize('basic_gen', all_basic_gens, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_scalar_subquery_struct(basic_gen): # single-level struct gen = [('ss', StructGen([['a', basic_gen], ['b', basic_gen]]))] @@ -65,6 +68,7 @@ def test_scalar_subquery_struct(basic_gen): @ignore_order(local=True) @pytest.mark.parametrize('basic_gen', all_basic_gens, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_scalar_subquery_array(basic_gen): # single-level array assert_gpu_and_cpu_are_equal_sql( diff --git a/integration_tests/src/main/python/time_window_test.py b/integration_tests/src/main/python/time_window_test.py index ff367b506fb..52071926309 100644 --- a/integration_tests/src/main/python/time_window_test.py +++ b/integration_tests/src/main/python/time_window_test.py @@ -15,6 +15,7 @@ import pytest from asserts import assert_gpu_and_cpu_are_equal_collect +from conftest import is_not_utc from data_gen import * from datetime import datetime from marks import ignore_order, allow_non_gpu @@ -29,6 +30,7 @@ @pytest.mark.parametrize('data_gen', integral_gens + [string_gen], ids=idfn) @ignore_order +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_grouped_tumbling_window(data_gen): row_gen = StructGen([['ts', _restricted_ts_gen],['data', data_gen]], nullable=False) assert_gpu_and_cpu_are_equal_collect( @@ -40,6 +42,7 @@ def test_grouped_tumbling_window(data_gen): @pytest.mark.parametrize('data_gen', integral_gens + [string_gen], ids=idfn) @ignore_order +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_grouped_sliding_window(data_gen): row_gen = StructGen([['ts', _restricted_ts_gen],['data', data_gen]], nullable=False) assert_gpu_and_cpu_are_equal_collect( @@ -47,6 +50,7 @@ def test_grouped_sliding_window(data_gen): @pytest.mark.parametrize('data_gen', integral_gens + [string_gen], ids=idfn) @ignore_order +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_grouped_sliding_window_array(data_gen): row_gen = StructGen([['ts', _restricted_ts_gen],['data', ArrayGen(data_gen)]], nullable=False) assert_gpu_and_cpu_are_equal_collect( @@ -54,6 +58,7 @@ def test_grouped_sliding_window_array(data_gen): @pytest.mark.parametrize('data_gen', integral_gens + [string_gen], ids=idfn) @ignore_order +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_tumbling_window(data_gen): row_gen = StructGen([['ts', _restricted_ts_gen],['data', data_gen]], nullable=False) w = Window.partitionBy(f.window('ts', '5 hour')) @@ -62,6 +67,7 @@ def test_tumbling_window(data_gen): @pytest.mark.parametrize('data_gen', integral_gens + [string_gen], ids=idfn) @ignore_order +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_sliding_window(data_gen): 
row_gen = StructGen([['ts', _restricted_ts_gen],['data', data_gen]], nullable=False) w = Window.partitionBy(f.window('ts', '5 hour', '1 hour')) @@ -72,6 +78,7 @@ def test_sliding_window(data_gen): @pytest.mark.parametrize('data_gen', all_basic_gens + decimal_gens + array_gens_sample + map_gens_sample, ids=idfn) # This includes an expand and we produce a different order than the CPU does. Sort locally to allow sorting of all types @ignore_order(local=True) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_just_window(data_gen): row_gen = StructGen([['ts', timestamp_gen],['data', data_gen]], nullable=False) assert_gpu_and_cpu_are_equal_collect( diff --git a/integration_tests/src/main/python/udf_test.py b/integration_tests/src/main/python/udf_test.py index 14fc57cf972..0597c0714c3 100644 --- a/integration_tests/src/main/python/udf_test.py +++ b/integration_tests/src/main/python/udf_test.py @@ -14,7 +14,7 @@ import pytest -from conftest import is_at_least_precommit_run +from conftest import is_at_least_precommit_run, is_not_utc from spark_session import is_databricks_runtime, is_before_spark_330, is_before_spark_350, is_spark_350_or_later from pyspark.sql.pandas.utils import require_minimum_pyarrow_version, require_minimum_pandas_version @@ -84,6 +84,7 @@ def iterator_add(to_process: Iterator[Tuple[pd.Series, pd.Series]]) -> Iterator[ @pytest.mark.parametrize('data_gen', data_gens_nested_for_udf, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_pandas_scalar_udf_nested_type(data_gen): def nested_size(nested): return pd.Series([nested.size]).repeat(len(nested)) @@ -110,6 +111,7 @@ def pandas_sum(to_process: pd.Series) -> float: @approximate_float @pytest.mark.parametrize('data_gen', arrow_common_gen, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_single_aggregate_udf_more_types(data_gen): @f.pandas_udf('double') def group_size_udf(to_process: pd.Series) -> float: @@ -140,6 +142,7 @@ def pandas_sum(to_process: pd.Series) -> int: @ignore_order(local=True) @pytest.mark.parametrize('data_gen', arrow_common_gen, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_group_aggregate_udf_more_types(data_gen): @f.pandas_udf('long') def group_size_udf(to_process: pd.Series) -> int: @@ -255,6 +258,7 @@ def pandas_add(data): @ignore_order(local=True) @pytest.mark.parametrize('data_gen', arrow_common_gen, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_group_apply_udf_more_types(data_gen): def group_size_udf(key, pdf): return pd.DataFrame([[len(key), len(pdf), len(pdf.columns)]]) @@ -282,6 +286,7 @@ def pandas_filter(iterator): @ignore_order(local=True) @pytest.mark.parametrize('data_gen', data_gens_nested_for_udf, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_pandas_map_udf_nested_type(data_gen): # Supported UDF output types by plugin: (commonCudfTypes + ARRAY).nested() + STRUCT # STRUCT represents the whole 
dataframe in Map Pandas UDF, so no struct column in UDF output. diff --git a/integration_tests/src/main/python/window_function_test.py b/integration_tests/src/main/python/window_function_test.py index e01c68ed35c..178900c99f6 100644 --- a/integration_tests/src/main/python/window_function_test.py +++ b/integration_tests/src/main/python/window_function_test.py @@ -15,6 +15,7 @@ import pytest from asserts import assert_gpu_and_cpu_are_equal_collect, assert_gpu_and_cpu_are_equal_sql, assert_gpu_fallback_collect, assert_gpu_sql_fallback_collect +from conftest import is_not_utc from data_gen import * from marks import * from pyspark.sql.types import * @@ -450,6 +451,7 @@ def test_range_windows_with_string_order_by_column(data_gen, batch_size): # the order returned should be consistent because the data ends up in a single task (no partitioning) @pytest.mark.parametrize('batch_size', ['1000', '1g'], ids=idfn) # set the batch size so we can test multiple stream batches @pytest.mark.parametrize('b_gen', all_basic_gens + [decimal_gen_32bit, decimal_gen_128bit], ids=meta_idfn('data:')) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_window_batched_unbounded_no_part(b_gen, batch_size): conf = {'spark.rapids.sql.batchSizeBytes': batch_size, 'spark.rapids.sql.castFloatToDecimal.enabled': True} @@ -467,6 +469,7 @@ def test_window_batched_unbounded_no_part(b_gen, batch_size): @pytest.mark.parametrize('batch_size', ['1000', '1g'], ids=idfn) # set the batch size so we can test multiple stream batches @pytest.mark.parametrize('b_gen', all_basic_gens + [decimal_gen_32bit, decimal_gen_128bit], ids=meta_idfn('data:')) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_window_batched_unbounded(b_gen, batch_size): conf = {'spark.rapids.sql.batchSizeBytes': batch_size, 'spark.rapids.sql.castFloatToDecimal.enabled': True} @@ -487,6 +490,7 @@ def test_window_batched_unbounded(b_gen, batch_size): # the order returned should be consistent because the data ends up in a single task (no partitioning) @pytest.mark.parametrize('batch_size', ['1000', '1g'], ids=idfn) # set the batch size so we can test multiple stream batches @pytest.mark.parametrize('b_gen', all_basic_gens + [decimal_gen_32bit, decimal_gen_128bit], ids=meta_idfn('data:')) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_rows_based_running_window_unpartitioned(b_gen, batch_size): conf = {'spark.rapids.sql.batchSizeBytes': batch_size, 'spark.rapids.sql.castFloatToDecimal.enabled': True} @@ -522,6 +526,7 @@ def test_rows_based_running_window_unpartitioned(b_gen, batch_size): @pytest.mark.parametrize('batch_size', ['1000', '1g'], ids=idfn) # Testing multiple batch sizes. 
@pytest.mark.parametrize('a_gen', integral_gens + [string_gen, date_gen, timestamp_gen], ids=meta_idfn('data:')) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_running_window_without_partitions_runs_batched(a_gen, batch_size): """ This tests the running window optimization as applied to RANGE-based window specifications, @@ -645,6 +650,7 @@ def test_running_window_float_sum_without_partitions_runs_batched(batch_size): @pytest.mark.parametrize('data_gen', all_basic_gens + [decimal_gen_32bit, orderable_decimal_gen_128bit], ids=meta_idfn('data:')) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_window_running_rank_no_part(data_gen): # Keep the batch size small. We have tested these with operators with exact inputs already, this is mostly # testing the fixup operation. @@ -672,6 +678,7 @@ def test_window_running_rank_no_part(data_gen): # but small batch sizes can make sort very slow, so do the final order by locally @ignore_order(local=True) @pytest.mark.parametrize('data_gen', all_basic_gens + [decimal_gen_32bit], ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_window_running_rank(data_gen): # Keep the batch size small. We have tested these with operators with exact inputs already, this is mostly # testing the fixup operation. @@ -699,6 +706,7 @@ def test_window_running_rank(data_gen): @pytest.mark.parametrize('batch_size', ['1000', '1g'], ids=idfn) # set the batch size so we can test multiple stream batches @pytest.mark.parametrize('b_gen, c_gen', [(long_gen, x) for x in running_part_and_order_gens] + [(x, long_gen) for x in all_basic_gens + [decimal_gen_32bit]], ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_rows_based_running_window_partitioned(b_gen, c_gen, batch_size): conf = {'spark.rapids.sql.batchSizeBytes': batch_size, 'spark.rapids.sql.variableFloatAgg.enabled': True, @@ -738,6 +746,7 @@ def test_rows_based_running_window_partitioned(b_gen, c_gen, batch_size): @pytest.mark.parametrize('batch_size', ['1000', '1g'], ids=idfn) # Test different batch sizes. @pytest.mark.parametrize('part_gen', [int_gen, long_gen], ids=idfn) # Partitioning is not really the focus of the test. 
@pytest.mark.parametrize('order_gen', [x for x in all_basic_gens_no_null if x not in boolean_gens] + [decimal_gen_32bit], ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_range_running_window_runs_batched(part_gen, order_gen, batch_size): """ This tests the running window optimization as applied to RANGE-based window specifications, @@ -881,6 +890,7 @@ def window(oby_column): @pytest.mark.parametrize('batch_size', ['1000', '1g'], ids=idfn) # set the batch size so we can test multiple stream batches @pytest.mark.parametrize('c_gen', lead_lag_data_gens, ids=idfn) @pytest.mark.parametrize('a_b_gen', part_and_order_gens, ids=meta_idfn('partAndOrderBy:')) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_multi_types_window_aggs_for_rows_lead_lag(a_b_gen, c_gen, batch_size): conf = {'spark.rapids.sql.batchSizeBytes': batch_size} data_gen = [ @@ -938,6 +948,7 @@ def do_it(spark): @approximate_float @pytest.mark.parametrize('struct_gen', lead_lag_struct_with_arrays_gen, ids=idfn) @pytest.mark.parametrize('a_b_gen', part_and_order_gens, ids=meta_idfn('partAndOrderBy:')) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_lead_lag_for_structs_with_arrays(a_b_gen, struct_gen): data_gen = [ ('a', RepeatSeqGen(a_b_gen, length=20)), @@ -971,6 +982,7 @@ def do_it(spark): @pytest.mark.parametrize('c_gen', [UniqueLongGen()], ids=meta_idfn('orderBy:')) @pytest.mark.parametrize('b_gen', [long_gen], ids=meta_idfn('orderBy:')) @pytest.mark.parametrize('a_gen', [long_gen], ids=meta_idfn('partBy:')) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_window_aggs_for_rows_lead_lag_on_arrays(a_gen, b_gen, c_gen, d_gen): data_gen = [ ('a', RepeatSeqGen(a_gen, length=20)), @@ -1000,6 +1012,7 @@ def test_window_aggs_for_rows_lead_lag_on_arrays(a_gen, b_gen, c_gen, d_gen): @approximate_float @pytest.mark.parametrize('c_gen', [string_gen], ids=idfn) @pytest.mark.parametrize('a_b_gen', part_and_order_gens, ids=meta_idfn('partAndOrderBy:')) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_multi_types_window_aggs_for_rows(a_b_gen, c_gen): data_gen = [ ('a', RepeatSeqGen(a_b_gen, length=20)), @@ -1105,6 +1118,7 @@ def test_window_aggs_lag_ignore_nulls_fallback(a_gen, b_gen, c_gen, d_gen): @pytest.mark.parametrize('data_gen', [_grpkey_longs_with_timestamps, pytest.param(_grpkey_longs_with_nullable_timestamps)], ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_window_aggs_for_ranges_timestamps(data_gen): assert_gpu_and_cpu_are_equal_sql( lambda spark: gen_df(spark, data_gen, length=2048), @@ -1252,6 +1266,7 @@ def test_window_aggregations_for_big_decimal_ranges(data_gen): # SortExec does not support array type, so sort the result locally. 
@ignore_order(local=True) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_window_aggs_for_rows_collect_list(): assert_gpu_and_cpu_are_equal_sql( lambda spark : gen_df(spark, _gen_data_for_collect_list), @@ -1298,6 +1313,7 @@ def test_window_aggs_for_rows_collect_list(): @ignore_order(local=True) # This test is more directed at Databricks and their running window optimization instead of ours # this is why we do not validate that we inserted in a GpuRunningWindowExec, yet. +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_running_window_function_exec_for_all_aggs(): assert_gpu_and_cpu_are_equal_sql( lambda spark : gen_df(spark, _gen_data_for_collect_list), @@ -1406,6 +1422,7 @@ def do_it(spark): # SortExec does not support array type, so sort the result locally. @ignore_order(local=True) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_window_aggs_for_rows_collect_set(): assert_gpu_and_cpu_are_equal_sql( lambda spark: gen_df(spark, _gen_data_for_collect_set), @@ -1467,6 +1484,7 @@ def test_window_aggs_for_rows_collect_set(): # and https://github.com/rapidsai/cudf/issues/11222 @ignore_order(local=True) @allow_non_gpu("ProjectExec", "SortArray") +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_window_aggs_for_rows_collect_set_nested_array(): conf = copy_and_update(_float_conf, { "spark.rapids.sql.castFloatToString.enabled": "true", @@ -1579,6 +1597,7 @@ def do_it(spark): # but small batch sizes can make sort very slow, so do the final order by locally @ignore_order(local=True) @pytest.mark.parametrize('ride_along', all_basic_gens + decimal_gens + array_gens_sample + struct_gens_sample + map_gens_sample, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_window_ride_along(ride_along): assert_gpu_and_cpu_are_equal_sql( lambda spark : gen_df(spark, [('a', UniqueLongGen()), ('b', ride_along)]), @@ -1654,6 +1673,7 @@ def test_unbounded_to_unbounded_window(): 'last(a) IGNORE NULLS OVER (PARTITION BY b ORDER BY c) ' @pytest.mark.parametrize('data_gen', all_basic_gens_no_null + decimal_gens + _nested_gens, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_window_first_last_nth(data_gen): assert_gpu_and_cpu_are_equal_sql( # Coalesce is to make sure that first and last, which are non-deterministic become deterministic @@ -1674,6 +1694,7 @@ def test_window_first_last_nth_ignore_nulls(data_gen): @ignore_order(local=True) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_to_date_with_window_functions(): """ This test ensures that date expressions participating alongside window aggregations From 9530b23bb70ff12c5381c6c3a96129912a0e096f Mon Sep 17 00:00:00 2001 From: Chong Gao Date: Fri, 17 Nov 2023 21:54:39 +0800 Subject: [PATCH 03/20] Temporarily testing non-UTC test cases because the non-UTC TZ pipeline is not ready ---
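Notes: this patch temporarily exports TZ=Iran in run_pyspark_from_build.sh so that the non-UTC code paths, and the is_not_utc() xfail markers added earlier in this series, are actually exercised; the script forwards the zone to the JVM via -Duser.timezone=$TZ. The is_not_utc() helper is imported from integration_tests/src/main/python/conftest.py and its definition is not shown here; a minimal sketch, assuming the helper simply compares the TZ environment variable (defaulting to UTC), could look like:

    # Hypothetical sketch only; the real conftest.py implementation may differ.
    import os

    def get_test_tz():
        # Assumption: the test time zone is whatever TZ the harness exported.
        return os.environ.get('TZ', 'UTC')

    def is_not_utc():
        return get_test_tz() != 'UTC'

With a non-UTC zone such as Iran exported, is_not_utc() returns True and the markers above turn the affected tests into expected failures rather than hard failures while https://github.com/NVIDIA/spark-rapids/issues/9653 is open.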
integration_tests/run_pyspark_from_build.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/integration_tests/run_pyspark_from_build.sh b/integration_tests/run_pyspark_from_build.sh index f6e32c72161..0a5a6557940 100755 --- a/integration_tests/run_pyspark_from_build.sh +++ b/integration_tests/run_pyspark_from_build.sh @@ -224,7 +224,7 @@ else fi # time zone will be tested; use export TZ=time_zone_name before run this script - TZ=${TZ:-UTC} + export TZ=Iran # Set the Delta log cache size to prevent the driver from caching every Delta log indefinitely export PYSP_TEST_spark_driver_extraJavaOptions="-ea -Duser.timezone=$TZ -Ddelta.log.cacheSize=10 $COVERAGE_SUBMIT_FLAGS" From 3f8bc40fc0ac37edd1dd4ec5aae7e48992c33ced Mon Sep 17 00:00:00 2001 From: Chong Gao Date: Mon, 20 Nov 2023 16:22:15 +0800 Subject: [PATCH 04/20] Xfail more cases that involve timestamp type --- integration_tests/src/main/python/date_time_test.py | 1 + integration_tests/src/main/python/parquet_testing_test.py | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/integration_tests/src/main/python/date_time_test.py b/integration_tests/src/main/python/date_time_test.py index 971ee0c9b05..5e891bea526 100644 --- a/integration_tests/src/main/python/date_time_test.py +++ b/integration_tests/src/main/python/date_time_test.py @@ -372,6 +372,7 @@ def test_unix_timestamp_improved(data_gen, ansi_enabled): @pytest.mark.parametrize('ansi_enabled', [True, False], ids=['ANSI_ON', 'ANSI_OFF']) @pytest.mark.parametrize('data_gen', date_n_time_gens, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_unix_timestamp(data_gen, ansi_enabled): assert_gpu_and_cpu_are_equal_collect( lambda spark : unary_op_df(spark, data_gen).select(f.unix_timestamp(f.col("a"))), diff --git a/integration_tests/src/main/python/parquet_testing_test.py b/integration_tests/src/main/python/parquet_testing_test.py index 642d99c8f0b..efa4b62ca61 100644 --- a/integration_tests/src/main/python/parquet_testing_test.py +++ b/integration_tests/src/main/python/parquet_testing_test.py @@ -16,7 +16,7 @@ # https://github.com/apache/parquet-testing from asserts import assert_gpu_and_cpu_are_equal_collect, assert_gpu_and_cpu_error -from conftest import get_std_input_path, is_parquet_testing_tests_forced, is_precommit_run +from conftest import get_std_input_path, is_not_utc, is_parquet_testing_tests_forced, is_precommit_run from data_gen import copy_and_update from pathlib import Path import pytest @@ -122,6 +122,7 @@ def gen_testing_params_for_valid_files(): @pytest.mark.parametrize("path", gen_testing_params_for_valid_files()) @pytest.mark.parametrize("confs", [_native_reader_confs, _java_reader_confs]) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_parquet_testing_valid_files(path, confs): assert_gpu_and_cpu_are_equal_collect(lambda spark: spark.read.parquet(path), conf=confs) From c2b5ffb8a66fa47f778ca397c8aa0ae1b2f87ba7 Mon Sep 17 00:00:00 2001 From: Chong Gao Date: Mon, 20 Nov 2023 18:11:58 +0800 Subject: [PATCH 05/20] Xfail Databricks cases because its default rebase mode is legacy --- integration_tests/src/main/python/hive_write_test.py | 2 ++ integration_tests/src/main/python/parquet_test.py | 2 ++ .../src/main/python/parquet_write_test.py | 11 +++++++++++ 3 files changed, 15 insertions(+) diff --git 
a/integration_tests/src/main/python/hive_write_test.py b/integration_tests/src/main/python/hive_write_test.py index 7bc5ceede85..c6b9e7b7d8a 100644 --- a/integration_tests/src/main/python/hive_write_test.py +++ b/integration_tests/src/main/python/hive_write_test.py @@ -153,9 +153,11 @@ def test_optimized_hive_bucketed_fallback(gens, storage, planned_write, spark_tm "ExecutedCommandExec", {"spark.sql.optimizer.plannedWrite.enabled": planned_write}) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'DB rebase mode is legacy: https://github.com/NVIDIA/spark-rapids/issues/9792') def test_hive_copy_ints_to_long(spark_tmp_table_factory): do_hive_copy(spark_tmp_table_factory, int_gen, "INT", "BIGINT") +@pytest.mark.xfail(condition = is_not_utc(), reason = 'DB rebase mode is legacy: https://github.com/NVIDIA/spark-rapids/issues/9792') def test_hive_copy_longs_to_float(spark_tmp_table_factory): do_hive_copy(spark_tmp_table_factory, long_gen, "BIGINT", "FLOAT") diff --git a/integration_tests/src/main/python/parquet_test.py b/integration_tests/src/main/python/parquet_test.py index f6cc2a0141b..97efbf83049 100644 --- a/integration_tests/src/main/python/parquet_test.py +++ b/integration_tests/src/main/python/parquet_test.py @@ -957,6 +957,7 @@ def test_parquet_reading_from_unaligned_pages_basic_filters_with_nulls(spark_tmp } @pytest.mark.skipif(is_before_spark_330(), reason='Aggregate push down on Parquet is a new feature of Spark 330') +@pytest.mark.xfail(condition = is_not_utc(), reason = 'DB rebase mode is legacy: https://github.com/NVIDIA/spark-rapids/issues/9792') def test_parquet_scan_without_aggregation_pushdown_not_fallback(spark_tmp_path): """ No aggregation will be pushed down in this test, so we should not fallback to CPU @@ -1232,6 +1233,7 @@ def test_parquet_read_daytime_interval_cpu_file(spark_tmp_path): lambda spark: spark.read.parquet(data_path)) @pytest.mark.skipif(is_before_spark_330(), reason='DayTimeInterval is not supported before Pyspark 3.3.0') +@pytest.mark.xfail(condition = is_not_utc(), reason = 'DB rebase mode is legacy: https://github.com/NVIDIA/spark-rapids/issues/9792') def test_parquet_read_daytime_interval_gpu_file(spark_tmp_path): data_path = spark_tmp_path + '/PARQUET_DATA' gen_list = [('_c1', DayTimeIntervalGen())] diff --git a/integration_tests/src/main/python/parquet_write_test.py b/integration_tests/src/main/python/parquet_write_test.py index c661270e159..89421c2bb72 100644 --- a/integration_tests/src/main/python/parquet_write_test.py +++ b/integration_tests/src/main/python/parquet_write_test.py @@ -123,6 +123,7 @@ def test_write_round_trip(spark_tmp_path, parquet_gens): all_empty_map_gen] @pytest.mark.parametrize('par_gen', par_write_odd_empty_strings_gens_sample, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'DB rebase mode is legacy: https://github.com/NVIDIA/spark-rapids/issues/9792') def test_write_round_trip_corner(spark_tmp_path, par_gen): gen_list = [('_c0', par_gen)] data_path = spark_tmp_path + '/PAR_DATA' @@ -239,6 +240,7 @@ def start(self, rand): parquet_write_compress_options.append('zstd') @pytest.mark.parametrize('compress', parquet_write_compress_options) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'DB rebase mode is legacy: https://github.com/NVIDIA/spark-rapids/issues/9792') def test_compress_write_round_trip(spark_tmp_path, compress): data_path = spark_tmp_path + '/PARQUET_DATA' all_confs = {'spark.sql.parquet.compression.codec': compress} @@ -313,6 +315,7 @@ def writeParquetNoOverwriteCatchException(spark, df, 
data_path, table_name): df.coalesce(1).write.format("parquet").option("path", data_path).saveAsTable(table_name) assert e_info.match(r".*already exists.*") +@pytest.mark.xfail(condition = is_not_utc(), reason = 'DB rebase mode is legacy: https://github.com/NVIDIA/spark-rapids/issues/9792') def test_ts_write_twice_fails_exception(spark_tmp_path, spark_tmp_table_factory): gen = IntegerGen() data_path = spark_tmp_path + '/PARQUET_DATA' @@ -451,6 +454,7 @@ def sql_write(spark, path): # This test is testing how the parquet_writer will behave if column has a validity mask without having any nulls. # There is no straight forward to do it besides creating a vector with nulls and then dropping nulls # cudf will create a vector with a null_mask even though we have just filtered them +@pytest.mark.xfail(condition = is_not_utc(), reason = 'DB rebase mode is legacy: https://github.com/NVIDIA/spark-rapids/issues/9792') def test_write_map_nullable(spark_tmp_path): data_path = spark_tmp_path + '/PARQUET_DATA' @@ -509,6 +513,7 @@ def test_parquet_write_roundtrip_datetime_with_legacy_rebase(spark_tmp_path, dat @pytest.mark.allow_non_gpu(*test_non_empty_ctas_non_gpu_execs) @pytest.mark.parametrize('allow_non_empty', [True, False]) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'DB rebase mode is legacy: https://github.com/NVIDIA/spark-rapids/issues/9792') def test_non_empty_ctas(spark_tmp_path, spark_tmp_table_factory, allow_non_empty): data_path = spark_tmp_path + "/CTAS" conf = { @@ -556,6 +561,7 @@ def get_nested_parquet_meta_data_for_field_id(): @pytest.mark.skipif(is_before_spark_330(), reason='Field ID is not supported before Spark 330') +@pytest.mark.xfail(condition = is_not_utc(), reason = 'DB rebase mode is legacy: https://github.com/NVIDIA/spark-rapids/issues/9792') def test_parquet_write_field_id(spark_tmp_path): data_path = spark_tmp_path + '/PARQUET_DATA' schema, data = get_nested_parquet_meta_data_for_field_id() @@ -573,6 +579,7 @@ def test_parquet_write_field_id(spark_tmp_path): conf=enable_parquet_field_id_read) @pytest.mark.skipif(is_before_spark_330(), reason='Field ID is not supported before Spark 330') +@pytest.mark.xfail(condition = is_not_utc(), reason = 'DB rebase mode is legacy: https://github.com/NVIDIA/spark-rapids/issues/9792') def test_parquet_write_field_id_disabled(spark_tmp_path): data_path = spark_tmp_path + '/PARQUET_DATA' schema, data = get_nested_parquet_meta_data_for_field_id() @@ -602,6 +609,7 @@ def test_write_daytime_interval(spark_tmp_path): @ignore_order @pytest.mark.skipif(is_before_spark_320(), reason="is only supported in Spark 320+") +@pytest.mark.xfail(condition = is_not_utc(), reason = 'DB rebase mode is legacy: https://github.com/NVIDIA/spark-rapids/issues/9792') def test_concurrent_writer(spark_tmp_path): data_path = spark_tmp_path + '/PARQUET_DATA' assert_gpu_and_cpu_writes_are_equal_collect( @@ -722,6 +730,7 @@ def write_partitions(spark, table_name): @ignore_order(local=True) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'DB rebase mode is legacy: https://github.com/NVIDIA/spark-rapids/issues/9792') def test_dynamic_partitioned_parquet_write(spark_tmp_table_factory, spark_tmp_path): def create_input_table(spark): @@ -807,6 +816,7 @@ def test_write_with_planned_write_enabled(spark_tmp_path, planned_write_enabled, # Issue to test a known bug https://github.com/NVIDIA/spark-rapids/issues/8694 to avoid regression @ignore_order @allow_non_gpu("SortExec", "ShuffleExchangeExec") +@pytest.mark.xfail(condition = is_not_utc(), reason = 'DB rebase 
mode is legacy: https://github.com/NVIDIA/spark-rapids/issues/9792') def test_write_list_struct_single_element(spark_tmp_path): data_path = spark_tmp_path + '/PARQUET_DATA' data_gen = ArrayGen(StructGen([('element', long_gen)], nullable=False), max_length=10, nullable=False) @@ -818,6 +828,7 @@ def test_write_list_struct_single_element(spark_tmp_path): assert_gpu_and_cpu_are_equal_collect(lambda spark: spark.read.parquet(cpu_path), conf) @ignore_order +@pytest.mark.xfail(condition = is_not_utc(), reason = 'DB rebase mode is legacy: https://github.com/NVIDIA/spark-rapids/issues/9792') def test_parquet_write_column_name_with_dots(spark_tmp_path): data_path = spark_tmp_path + "/PARQUET_DATA" gens = [ From 57a14769bf392f9b8406e367519e8ffb25c33a9d Mon Sep 17 00:00:00 2001 From: Chong Gao Date: Mon, 20 Nov 2023 18:39:27 +0800 Subject: [PATCH 06/20] Xfail Databricks cases because its default rebase mode is legacy --- .../src/main/python/hive_write_test.py | 6 ++--- .../src/main/python/parquet_test.py | 6 ++--- .../src/main/python/parquet_write_test.py | 22 +++++++++---------- 3 files changed, 17 insertions(+), 17 deletions(-) diff --git a/integration_tests/src/main/python/hive_write_test.py b/integration_tests/src/main/python/hive_write_test.py index c6b9e7b7d8a..99a32cecd79 100644 --- a/integration_tests/src/main/python/hive_write_test.py +++ b/integration_tests/src/main/python/hive_write_test.py @@ -15,7 +15,7 @@ import pytest from asserts import * -from conftest import spark_jvm, is_not_utc +from conftest import is_databricks_runtime, is_not_utc, spark_jvm from data_gen import * from datetime import date, datetime, timezone from marks import * @@ -153,11 +153,11 @@ def test_optimized_hive_bucketed_fallback(gens, storage, planned_write, spark_tm "ExecutedCommandExec", {"spark.sql.optimizer.plannedWrite.enabled": planned_write}) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'DB rebase mode is legacy: https://github.com/NVIDIA/spark-rapids/issues/9792') +@pytest.mark.xfail(condition = is_not_utc() and is_databricks_runtime(), reason = 'DB rebase mode is legacy: https://github.com/NVIDIA/spark-rapids/issues/9792') def test_hive_copy_ints_to_long(spark_tmp_table_factory): do_hive_copy(spark_tmp_table_factory, int_gen, "INT", "BIGINT") -@pytest.mark.xfail(condition = is_not_utc(), reason = 'DB rebase mode is legacy: https://github.com/NVIDIA/spark-rapids/issues/9792') +@pytest.mark.xfail(condition = is_not_utc() and is_databricks_runtime(), reason = 'DB rebase mode is legacy: https://github.com/NVIDIA/spark-rapids/issues/9792') def test_hive_copy_longs_to_float(spark_tmp_table_factory): do_hive_copy(spark_tmp_table_factory, long_gen, "BIGINT", "FLOAT") diff --git a/integration_tests/src/main/python/parquet_test.py b/integration_tests/src/main/python/parquet_test.py index 97efbf83049..042b10991d8 100644 --- a/integration_tests/src/main/python/parquet_test.py +++ b/integration_tests/src/main/python/parquet_test.py @@ -16,7 +16,7 @@ import pytest from asserts import * -from conftest import is_not_utc +from conftest import is_databricks_runtime, is_not_utc from data_gen import * from parquet_write_test import parquet_nested_datetime_gen, parquet_ts_write_options from marks import * @@ -957,7 +957,7 @@ def test_parquet_reading_from_unaligned_pages_basic_filters_with_nulls(spark_tmp } @pytest.mark.skipif(is_before_spark_330(), reason='Aggregate push down on Parquet is a new feature of Spark 330') -@pytest.mark.xfail(condition = is_not_utc(), reason = 'DB rebase mode is legacy: 
https://github.com/NVIDIA/spark-rapids/issues/9792') +@pytest.mark.xfail(condition = is_not_utc() and is_databricks_runtime(), reason = 'DB rebase mode is legacy: https://github.com/NVIDIA/spark-rapids/issues/9792') def test_parquet_scan_without_aggregation_pushdown_not_fallback(spark_tmp_path): """ No aggregation will be pushed down in this test, so we should not fallback to CPU @@ -1233,7 +1233,7 @@ def test_parquet_read_daytime_interval_cpu_file(spark_tmp_path): lambda spark: spark.read.parquet(data_path)) @pytest.mark.skipif(is_before_spark_330(), reason='DayTimeInterval is not supported before Pyspark 3.3.0') -@pytest.mark.xfail(condition = is_not_utc(), reason = 'DB rebase mode is legacy: https://github.com/NVIDIA/spark-rapids/issues/9792') +@pytest.mark.xfail(condition = is_not_utc() and is_databricks_runtime(), reason = 'DB rebase mode is legacy: https://github.com/NVIDIA/spark-rapids/issues/9792') def test_parquet_read_daytime_interval_gpu_file(spark_tmp_path): data_path = spark_tmp_path + '/PARQUET_DATA' gen_list = [('_c1', DayTimeIntervalGen())] diff --git a/integration_tests/src/main/python/parquet_write_test.py b/integration_tests/src/main/python/parquet_write_test.py index 89421c2bb72..2d21ff0b4a0 100644 --- a/integration_tests/src/main/python/parquet_write_test.py +++ b/integration_tests/src/main/python/parquet_write_test.py @@ -123,7 +123,7 @@ def test_write_round_trip(spark_tmp_path, parquet_gens): all_empty_map_gen] @pytest.mark.parametrize('par_gen', par_write_odd_empty_strings_gens_sample, ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'DB rebase mode is legacy: https://github.com/NVIDIA/spark-rapids/issues/9792') +@pytest.mark.xfail(condition = is_not_utc() and is_databricks_runtime(), reason = 'DB rebase mode is legacy: https://github.com/NVIDIA/spark-rapids/issues/9792') def test_write_round_trip_corner(spark_tmp_path, par_gen): gen_list = [('_c0', par_gen)] data_path = spark_tmp_path + '/PAR_DATA' @@ -240,7 +240,7 @@ def start(self, rand): parquet_write_compress_options.append('zstd') @pytest.mark.parametrize('compress', parquet_write_compress_options) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'DB rebase mode is legacy: https://github.com/NVIDIA/spark-rapids/issues/9792') +@pytest.mark.xfail(condition = is_not_utc() and is_databricks_runtime(), reason = 'DB rebase mode is legacy: https://github.com/NVIDIA/spark-rapids/issues/9792') def test_compress_write_round_trip(spark_tmp_path, compress): data_path = spark_tmp_path + '/PARQUET_DATA' all_confs = {'spark.sql.parquet.compression.codec': compress} @@ -315,7 +315,7 @@ def writeParquetNoOverwriteCatchException(spark, df, data_path, table_name): df.coalesce(1).write.format("parquet").option("path", data_path).saveAsTable(table_name) assert e_info.match(r".*already exists.*") -@pytest.mark.xfail(condition = is_not_utc(), reason = 'DB rebase mode is legacy: https://github.com/NVIDIA/spark-rapids/issues/9792') +@pytest.mark.xfail(condition = is_not_utc() and is_databricks_runtime(), reason = 'DB rebase mode is legacy: https://github.com/NVIDIA/spark-rapids/issues/9792') def test_ts_write_twice_fails_exception(spark_tmp_path, spark_tmp_table_factory): gen = IntegerGen() data_path = spark_tmp_path + '/PARQUET_DATA' @@ -454,7 +454,7 @@ def sql_write(spark, path): # This test is testing how the parquet_writer will behave if column has a validity mask without having any nulls. 
# There is no straight forward to do it besides creating a vector with nulls and then dropping nulls # cudf will create a vector with a null_mask even though we have just filtered them -@pytest.mark.xfail(condition = is_not_utc(), reason = 'DB rebase mode is legacy: https://github.com/NVIDIA/spark-rapids/issues/9792') +@pytest.mark.xfail(condition = is_not_utc() and is_databricks_runtime(), reason = 'DB rebase mode is legacy: https://github.com/NVIDIA/spark-rapids/issues/9792') def test_write_map_nullable(spark_tmp_path): data_path = spark_tmp_path + '/PARQUET_DATA' @@ -513,7 +513,7 @@ def test_parquet_write_roundtrip_datetime_with_legacy_rebase(spark_tmp_path, dat @pytest.mark.allow_non_gpu(*test_non_empty_ctas_non_gpu_execs) @pytest.mark.parametrize('allow_non_empty', [True, False]) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'DB rebase mode is legacy: https://github.com/NVIDIA/spark-rapids/issues/9792') +@pytest.mark.xfail(condition = is_not_utc() and is_databricks_runtime(), reason = 'DB rebase mode is legacy: https://github.com/NVIDIA/spark-rapids/issues/9792') def test_non_empty_ctas(spark_tmp_path, spark_tmp_table_factory, allow_non_empty): data_path = spark_tmp_path + "/CTAS" conf = { @@ -561,7 +561,7 @@ def get_nested_parquet_meta_data_for_field_id(): @pytest.mark.skipif(is_before_spark_330(), reason='Field ID is not supported before Spark 330') -@pytest.mark.xfail(condition = is_not_utc(), reason = 'DB rebase mode is legacy: https://github.com/NVIDIA/spark-rapids/issues/9792') +@pytest.mark.xfail(condition = is_not_utc() and is_databricks_runtime(), reason = 'DB rebase mode is legacy: https://github.com/NVIDIA/spark-rapids/issues/9792') def test_parquet_write_field_id(spark_tmp_path): data_path = spark_tmp_path + '/PARQUET_DATA' schema, data = get_nested_parquet_meta_data_for_field_id() @@ -579,7 +579,7 @@ def test_parquet_write_field_id(spark_tmp_path): conf=enable_parquet_field_id_read) @pytest.mark.skipif(is_before_spark_330(), reason='Field ID is not supported before Spark 330') -@pytest.mark.xfail(condition = is_not_utc(), reason = 'DB rebase mode is legacy: https://github.com/NVIDIA/spark-rapids/issues/9792') +@pytest.mark.xfail(condition = is_not_utc() and is_databricks_runtime(), reason = 'DB rebase mode is legacy: https://github.com/NVIDIA/spark-rapids/issues/9792') def test_parquet_write_field_id_disabled(spark_tmp_path): data_path = spark_tmp_path + '/PARQUET_DATA' schema, data = get_nested_parquet_meta_data_for_field_id() @@ -609,7 +609,7 @@ def test_write_daytime_interval(spark_tmp_path): @ignore_order @pytest.mark.skipif(is_before_spark_320(), reason="is only supported in Spark 320+") -@pytest.mark.xfail(condition = is_not_utc(), reason = 'DB rebase mode is legacy: https://github.com/NVIDIA/spark-rapids/issues/9792') +@pytest.mark.xfail(condition = is_not_utc() and is_databricks_runtime(), reason = 'DB rebase mode is legacy: https://github.com/NVIDIA/spark-rapids/issues/9792') def test_concurrent_writer(spark_tmp_path): data_path = spark_tmp_path + '/PARQUET_DATA' assert_gpu_and_cpu_writes_are_equal_collect( @@ -730,7 +730,7 @@ def write_partitions(spark, table_name): @ignore_order(local=True) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'DB rebase mode is legacy: https://github.com/NVIDIA/spark-rapids/issues/9792') +@pytest.mark.xfail(condition = is_not_utc() and is_databricks_runtime(), reason = 'DB rebase mode is legacy: https://github.com/NVIDIA/spark-rapids/issues/9792') def test_dynamic_partitioned_parquet_write(spark_tmp_table_factory, 
spark_tmp_path): def create_input_table(spark): @@ -816,7 +816,7 @@ def test_write_with_planned_write_enabled(spark_tmp_path, planned_write_enabled, # Issue to test a known bug https://github.com/NVIDIA/spark-rapids/issues/8694 to avoid regression @ignore_order @allow_non_gpu("SortExec", "ShuffleExchangeExec") -@pytest.mark.xfail(condition = is_not_utc(), reason = 'DB rebase mode is legacy: https://github.com/NVIDIA/spark-rapids/issues/9792') +@pytest.mark.xfail(condition = is_not_utc() and is_databricks_runtime(), reason = 'DB rebase mode is legacy: https://github.com/NVIDIA/spark-rapids/issues/9792') def test_write_list_struct_single_element(spark_tmp_path): data_path = spark_tmp_path + '/PARQUET_DATA' data_gen = ArrayGen(StructGen([('element', long_gen)], nullable=False), max_length=10, nullable=False) @@ -828,7 +828,7 @@ def test_write_list_struct_single_element(spark_tmp_path): assert_gpu_and_cpu_are_equal_collect(lambda spark: spark.read.parquet(cpu_path), conf) @ignore_order -@pytest.mark.xfail(condition = is_not_utc(), reason = 'DB rebase mode is legacy: https://github.com/NVIDIA/spark-rapids/issues/9792') +@pytest.mark.xfail(condition = is_not_utc() and is_databricks_runtime(), reason = 'DB rebase mode is legacy: https://github.com/NVIDIA/spark-rapids/issues/9792') def test_parquet_write_column_name_with_dots(spark_tmp_path): data_path = spark_tmp_path + "/PARQUET_DATA" gens = [ From 34cbc5d63daad1fcf6b38e1616c48f5dd8cefd8a Mon Sep 17 00:00:00 2001 From: Chong Gao Date: Mon, 20 Nov 2023 18:45:03 +0800 Subject: [PATCH 07/20] Xfail Databricks cases because its default rebase mode is legacy --- integration_tests/src/main/python/parquet_write_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/integration_tests/src/main/python/parquet_write_test.py b/integration_tests/src/main/python/parquet_write_test.py index 2d21ff0b4a0..2e8fc746d89 100644 --- a/integration_tests/src/main/python/parquet_write_test.py +++ b/integration_tests/src/main/python/parquet_write_test.py @@ -15,7 +15,7 @@ import pytest from asserts import * -from conftest import is_not_utc +from conftest import is_databricks_runtime, is_not_utc from datetime import date, datetime, timezone from data_gen import * from enum import Enum From 4a4d249e5aa397533f8f81a803e0edcdbf5b7801 Mon Sep 17 00:00:00 2001 From: Chong Gao Date: Tue, 21 Nov 2023 12:03:11 +0800 Subject: [PATCH 08/20] Xfail failed cases for Spark 341 --- integration_tests/src/main/python/array_test.py | 4 ++++ integration_tests/src/main/python/cast_test.py | 5 +++++ integration_tests/src/main/python/conditionals_test.py | 1 + integration_tests/src/main/python/csv_test.py | 2 ++ integration_tests/src/main/python/date_time_test.py | 1 + .../src/main/python/fastparquet_compatibility_test.py | 2 ++ integration_tests/src/main/python/json_test.py | 2 ++ integration_tests/src/main/python/limit_test.py | 1 + integration_tests/src/main/python/parquet_write_test.py | 1 + integration_tests/src/main/python/window_function_test.py | 3 +++ 10 files changed, 22 insertions(+) diff --git a/integration_tests/src/main/python/array_test.py b/integration_tests/src/main/python/array_test.py index 49e809f61be..29f4e64b893 100644 --- a/integration_tests/src/main/python/array_test.py +++ b/integration_tests/src/main/python/array_test.py @@ -486,6 +486,7 @@ def q1(spark): @incompat @pytest.mark.parametrize('data_gen', no_neg_zero_all_basic_gens + decimal_gens, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of 
https://github.com/NVIDIA/spark-rapids/issues/9653') @pytest.mark.skipif(is_before_spark_313() or is_spark_330() or is_spark_330cdh(), reason="NaN equality is only handled in Spark 3.1.3+ and SPARK-39976 issue with null and ArrayIntersect in Spark 3.3.0") def test_array_intersect(data_gen): gen = StructGen( @@ -548,6 +549,7 @@ def test_array_intersect_before_spark313(data_gen): @incompat @pytest.mark.parametrize('data_gen', no_neg_zero_all_basic_gens + decimal_gens, ids=idfn) @pytest.mark.skipif(is_before_spark_313(), reason="NaN equality is only handled in Spark 3.1.3+") +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_array_union(data_gen): gen = StructGen( [('a', ArrayGen(data_gen, nullable=True)), @@ -589,6 +591,7 @@ def test_array_union_before_spark313(data_gen): @incompat @pytest.mark.parametrize('data_gen', no_neg_zero_all_basic_gens + decimal_gens, ids=idfn) @pytest.mark.skipif(is_before_spark_313(), reason="NaN equality is only handled in Spark 3.1.3+") +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_array_except(data_gen): gen = StructGen( [('a', ArrayGen(data_gen, nullable=True)), @@ -630,6 +633,7 @@ def test_array_except_before_spark313(data_gen): @incompat @pytest.mark.parametrize('data_gen', no_neg_zero_all_basic_gens + decimal_gens, ids=idfn) @pytest.mark.skipif(is_before_spark_313(), reason="NaN equality is only handled in Spark 3.1.3+") +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_arrays_overlap(data_gen): gen = StructGen( [('a', ArrayGen(data_gen, nullable=True)), diff --git a/integration_tests/src/main/python/cast_test.py b/integration_tests/src/main/python/cast_test.py index d1fa25c60d2..7ee826abe97 100644 --- a/integration_tests/src/main/python/cast_test.py +++ b/integration_tests/src/main/python/cast_test.py @@ -404,6 +404,7 @@ def test_cast_string_to_negative_scale_decimal(): @pytest.mark.skipif(is_before_spark_330(), reason="ansi cast throws exception only in 3.3.0+") @pytest.mark.parametrize('type', [DoubleType(), FloatType()], ids=idfn) @pytest.mark.parametrize('invalid_value', [float("inf"), float("-inf"), float("nan")]) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_cast_float_to_timestamp_ansi_for_nan_inf(type, invalid_value): def fun(spark): data = [invalid_value] @@ -415,6 +416,7 @@ def fun(spark): @pytest.mark.skipif(is_before_spark_330(), reason="ansi cast throws exception only in 3.3.0+") @pytest.mark.parametrize('type', [DoubleType(), FloatType()], ids=idfn) @pytest.mark.parametrize('invalid_value', [float(LONG_MAX) + 100, float(LONG_MIN) - 100]) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_cast_float_to_timestamp_ansi_overflow(type, invalid_value): def fun(spark): data = [invalid_value] @@ -423,6 +425,7 @@ def fun(spark): assert_gpu_and_cpu_error(fun, {"spark.sql.ansi.enabled": True}, "ArithmeticException") @pytest.mark.skipif(is_before_spark_330(), reason='330+ throws exception in ANSI mode') +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of 
https://github.com/NVIDIA/spark-rapids/issues/9653') def test_cast_float_to_timestamp_side_effect(): def getDf(spark): data = [(True, float(LONG_MAX) + 100), (False, float(1))] @@ -491,6 +494,7 @@ def test_cast_double_to_timestamp(ansi_enabled): (INT_MIN - 1, IntegerType()), ], ids=idfn) @pytest.mark.skipif(is_before_spark_330(), reason="Spark 330- does not ansi casting between numeric and timestamp") +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_cast_timestamp_to_integral_ansi_overflow(invalid_and_type): (invalid, to_type) = invalid_and_type assert_gpu_and_cpu_error( @@ -501,6 +505,7 @@ def test_cast_timestamp_to_integral_ansi_overflow(invalid_and_type): error_message="overflow") @pytest.mark.skipif(is_before_spark_330(), reason="Spark 330- does not ansi casting between numeric and timestamp") +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_cast_timestamp_to_numeric_ansi_no_overflow(): data = [datetime.fromtimestamp(i) for i in range(BYTE_MIN, BYTE_MAX + 1)] assert_gpu_and_cpu_are_equal_collect( diff --git a/integration_tests/src/main/python/conditionals_test.py b/integration_tests/src/main/python/conditionals_test.py index de9c50546a2..48d5a05c099 100644 --- a/integration_tests/src/main/python/conditionals_test.py +++ b/integration_tests/src/main/python/conditionals_test.py @@ -252,6 +252,7 @@ def test_conditional_with_side_effects_sequence(data_gen): @pytest.mark.skipif(is_before_spark_320(), reason='Earlier versions of Spark cannot cast sequence to string') @pytest.mark.parametrize('data_gen', [mk_str_gen('[a-z]{0,3}')], ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_conditional_with_side_effects_sequence_cast(data_gen): assert_gpu_and_cpu_are_equal_collect( lambda spark : unary_op_df(spark, data_gen).selectExpr( diff --git a/integration_tests/src/main/python/csv_test.py b/integration_tests/src/main/python/csv_test.py index 74dab108c25..c10221a4407 100644 --- a/integration_tests/src/main/python/csv_test.py +++ b/integration_tests/src/main/python/csv_test.py @@ -570,6 +570,7 @@ def test_csv_read_count(spark_tmp_path): @pytest.mark.parametrize("timestamp_type", [ pytest.param('TIMESTAMP_LTZ', marks=pytest.mark.xfail(is_spark_350_or_later(), reason="https://github.com/NVIDIA/spark-rapids/issues/9325")), "TIMESTAMP_NTZ"]) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_csv_infer_schema_timestamp_ntz_v1(spark_tmp_path, date_format, ts_part, timestamp_type): csv_infer_schema_timestamp_ntz(spark_tmp_path, date_format, ts_part, timestamp_type, 'csv', 'FileSourceScanExec') @@ -622,6 +623,7 @@ def do_read(spark): @allow_non_gpu('FileSourceScanExec', 'CollectLimitExec', 'DeserializeToObjectExec') @pytest.mark.skipif(is_before_spark_340(), reason='`preferDate` is only supported in Spark 340+') +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_csv_prefer_date_with_infer_schema(spark_tmp_path): # start date ""0001-01-02" required due to: https://github.com/NVIDIA/spark-rapids/issues/5606 data_gens = [byte_gen, short_gen, int_gen, long_gen, 
boolean_gen, timestamp_gen, DateGen(start=date(1, 1, 2))] diff --git a/integration_tests/src/main/python/date_time_test.py b/integration_tests/src/main/python/date_time_test.py index 5e891bea526..7707bdbfde8 100644 --- a/integration_tests/src/main/python/date_time_test.py +++ b/integration_tests/src/main/python/date_time_test.py @@ -45,6 +45,7 @@ def test_timeadd(data_gen): .selectExpr("a + (interval {} days {} seconds)".format(days, seconds))) @pytest.mark.skipif(is_before_spark_330(), reason='DayTimeInterval is not supported before Pyspark 3.3.0') +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_timeadd_daytime_column(): gen_list = [ # timestamp column max year is 1000 diff --git a/integration_tests/src/main/python/fastparquet_compatibility_test.py b/integration_tests/src/main/python/fastparquet_compatibility_test.py index 2bf23e6b9a2..9e9383898b3 100644 --- a/integration_tests/src/main/python/fastparquet_compatibility_test.py +++ b/integration_tests/src/main/python/fastparquet_compatibility_test.py @@ -135,6 +135,7 @@ def read_with_fastparquet_or_plugin(spark): StructGen(children=[("first", IntegerGen(nullable=False)), ("second", FloatGen(nullable=False))], nullable=False) ], ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_reading_file_written_by_spark_cpu(data_gen, spark_tmp_path): """ This test writes data_gen output to Parquet via Apache Spark, then verifies that fastparquet and the RAPIDS @@ -197,6 +198,7 @@ def test_reading_file_written_by_spark_cpu(data_gen, spark_tmp_path): end=pandas_min_datetime), marks=pytest.mark.xfail(reason="fastparquet reads timestamps preceding 1900 incorrectly.")), ], ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_reading_file_written_with_gpu(spark_tmp_path, column_gen): """ This test writes the data-gen output to file via the RAPIDS plugin, then checks that the data is read identically diff --git a/integration_tests/src/main/python/json_test.py b/integration_tests/src/main/python/json_test.py index 3d0c50401ba..5d576d4b786 100644 --- a/integration_tests/src/main/python/json_test.py +++ b/integration_tests/src/main/python/json_test.py @@ -208,6 +208,7 @@ def test_json_ts_formats_round_trip(spark_tmp_path, date_format, ts_part, v1_ena @pytest.mark.parametrize('ts_part', json_supported_ts_parts) @pytest.mark.parametrize('date_format', json_supported_date_formats) @pytest.mark.parametrize("timestamp_type", ["TIMESTAMP_LTZ", "TIMESTAMP_NTZ"]) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_json_ts_formats_round_trip_ntz_v1(spark_tmp_path, date_format, ts_part, timestamp_type): json_ts_formats_round_trip_ntz(spark_tmp_path, date_format, ts_part, timestamp_type, 'json', 'FileSourceScanExec') @@ -216,6 +217,7 @@ def test_json_ts_formats_round_trip_ntz_v1(spark_tmp_path, date_format, ts_part, @pytest.mark.parametrize('ts_part', json_supported_ts_parts) @pytest.mark.parametrize('date_format', json_supported_date_formats) @pytest.mark.parametrize("timestamp_type", ["TIMESTAMP_LTZ", "TIMESTAMP_NTZ"]) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of 
https://github.com/NVIDIA/spark-rapids/issues/9653') def test_json_ts_formats_round_trip_ntz_v2(spark_tmp_path, date_format, ts_part, timestamp_type): json_ts_formats_round_trip_ntz(spark_tmp_path, date_format, ts_part, timestamp_type, '', 'BatchScanExec') diff --git a/integration_tests/src/main/python/limit_test.py b/integration_tests/src/main/python/limit_test.py index 369c4dc2ab1..efe81c1058a 100644 --- a/integration_tests/src/main/python/limit_test.py +++ b/integration_tests/src/main/python/limit_test.py @@ -82,6 +82,7 @@ def test_non_zero_offset_with_limit(limit, offset, batch_size): @pytest.mark.skipif(is_before_spark_340(), reason='offset is introduced from Spark 3.4.0') @allow_non_gpu('ShuffleExchangeExec') # when limit = 0, ShuffleExchangeExec is not replaced. @approximate_float +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_order_by_offset_with_limit(limit, offset, data_gen, batch_size): # In CPU version of spark, (limit, offset) can not be negative number. # Test case description: diff --git a/integration_tests/src/main/python/parquet_write_test.py b/integration_tests/src/main/python/parquet_write_test.py index 2e8fc746d89..d7cfc980fd4 100644 --- a/integration_tests/src/main/python/parquet_write_test.py +++ b/integration_tests/src/main/python/parquet_write_test.py @@ -208,6 +208,7 @@ def test_int96_write_conf(spark_tmp_path, data_gen): @pytest.mark.parametrize('data_gen', [TimestampGen()], ids=idfn) # Note: From Spark 340, WriteFilesExec is introduced. @pytest.mark.allow_non_gpu("DataWritingCommandExec", "WriteFilesExec") +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_int96_write_conf_with_write_exec(spark_tmp_path, data_gen): data_path = spark_tmp_path + '/PARQUET_DATA' confs = copy_and_update(writer_confs, { diff --git a/integration_tests/src/main/python/window_function_test.py b/integration_tests/src/main/python/window_function_test.py index 178900c99f6..d850403d118 100644 --- a/integration_tests/src/main/python/window_function_test.py +++ b/integration_tests/src/main/python/window_function_test.py @@ -1070,6 +1070,7 @@ def do_it(spark): @pytest.mark.parametrize('c_gen', [UniqueLongGen()], ids=meta_idfn('orderBy:')) @pytest.mark.parametrize('b_gen', [long_gen], ids=meta_idfn('orderBy:')) @pytest.mark.parametrize('a_gen', [long_gen], ids=meta_idfn('partBy:')) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_window_aggs_lead_ignore_nulls_fallback(a_gen, b_gen, c_gen, d_gen): data_gen = [ ('a', RepeatSeqGen(a_gen, length=20)), @@ -1094,6 +1095,7 @@ def test_window_aggs_lead_ignore_nulls_fallback(a_gen, b_gen, c_gen, d_gen): @pytest.mark.parametrize('c_gen', [UniqueLongGen()], ids=meta_idfn('orderBy:')) @pytest.mark.parametrize('b_gen', [long_gen], ids=meta_idfn('orderBy:')) @pytest.mark.parametrize('a_gen', [long_gen], ids=meta_idfn('partBy:')) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_window_aggs_lag_ignore_nulls_fallback(a_gen, b_gen, c_gen, d_gen): data_gen = [ ('a', RepeatSeqGen(a_gen, length=20)), @@ -1684,6 +1686,7 @@ def test_window_first_last_nth(data_gen): @pytest.mark.skipif(is_before_spark_320(), reason='IGNORE NULLS clause 
is not supported for FIRST(), LAST() and NTH_VALUE in Spark 3.1.x') @pytest.mark.parametrize('data_gen', all_basic_gens_no_null + decimal_gens + _nested_gens, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_window_first_last_nth_ignore_nulls(data_gen): assert_gpu_and_cpu_are_equal_sql( # Coalesce is to make sure that first and last, which are non-deterministic become deterministic From 879927413b22676b4965457c32fb1132e8b2e15e Mon Sep 17 00:00:00 2001 From: Chong Gao Date: Thu, 23 Nov 2023 16:39:37 +0800 Subject: [PATCH 09/20] Revert "Xfail Databricks cases because its default rebase mode is legacy" This reverts commit 34cbc5d63daad1fcf6b38e1616c48f5dd8cefd8a. --- integration_tests/src/main/python/parquet_write_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/integration_tests/src/main/python/parquet_write_test.py b/integration_tests/src/main/python/parquet_write_test.py index 43cb7aa7e2e..67190733026 100644 --- a/integration_tests/src/main/python/parquet_write_test.py +++ b/integration_tests/src/main/python/parquet_write_test.py @@ -15,7 +15,7 @@ import pytest from asserts import * -from conftest import is_databricks_runtime, is_not_utc +from conftest import is_not_utc from datetime import date, datetime, timezone from data_gen import * from enum import Enum From 12a88a8002a8db255dcdfbc368419cec7a123856 Mon Sep 17 00:00:00 2001 From: Chong Gao Date: Thu, 23 Nov 2023 16:39:40 +0800 Subject: [PATCH 10/20] Revert "Xfail Databricks cases because its default rebase mode is legacy" This reverts commit 57a14769bf392f9b8406e367519e8ffb25c33a9d. --- .../src/main/python/hive_write_test.py | 6 ++--- .../src/main/python/parquet_test.py | 6 ++--- .../src/main/python/parquet_write_test.py | 22 +++++++++---------- 3 files changed, 17 insertions(+), 17 deletions(-) diff --git a/integration_tests/src/main/python/hive_write_test.py b/integration_tests/src/main/python/hive_write_test.py index 99a32cecd79..c6b9e7b7d8a 100644 --- a/integration_tests/src/main/python/hive_write_test.py +++ b/integration_tests/src/main/python/hive_write_test.py @@ -15,7 +15,7 @@ import pytest from asserts import * -from conftest import is_databricks_runtime, is_not_utc, spark_jvm +from conftest import spark_jvm, is_not_utc from data_gen import * from datetime import date, datetime, timezone from marks import * @@ -153,11 +153,11 @@ def test_optimized_hive_bucketed_fallback(gens, storage, planned_write, spark_tm "ExecutedCommandExec", {"spark.sql.optimizer.plannedWrite.enabled": planned_write}) -@pytest.mark.xfail(condition = is_not_utc() and is_databricks_runtime(), reason = 'DB rebase mode is legacy: https://github.com/NVIDIA/spark-rapids/issues/9792') +@pytest.mark.xfail(condition = is_not_utc(), reason = 'DB rebase mode is legacy: https://github.com/NVIDIA/spark-rapids/issues/9792') def test_hive_copy_ints_to_long(spark_tmp_table_factory): do_hive_copy(spark_tmp_table_factory, int_gen, "INT", "BIGINT") -@pytest.mark.xfail(condition = is_not_utc() and is_databricks_runtime(), reason = 'DB rebase mode is legacy: https://github.com/NVIDIA/spark-rapids/issues/9792') +@pytest.mark.xfail(condition = is_not_utc(), reason = 'DB rebase mode is legacy: https://github.com/NVIDIA/spark-rapids/issues/9792') def test_hive_copy_longs_to_float(spark_tmp_table_factory): do_hive_copy(spark_tmp_table_factory, long_gen, "BIGINT", "FLOAT") diff --git a/integration_tests/src/main/python/parquet_test.py 
b/integration_tests/src/main/python/parquet_test.py index 042b10991d8..97efbf83049 100644 --- a/integration_tests/src/main/python/parquet_test.py +++ b/integration_tests/src/main/python/parquet_test.py @@ -16,7 +16,7 @@ import pytest from asserts import * -from conftest import is_databricks_runtime, is_not_utc +from conftest import is_not_utc from data_gen import * from parquet_write_test import parquet_nested_datetime_gen, parquet_ts_write_options from marks import * @@ -957,7 +957,7 @@ def test_parquet_reading_from_unaligned_pages_basic_filters_with_nulls(spark_tmp } @pytest.mark.skipif(is_before_spark_330(), reason='Aggregate push down on Parquet is a new feature of Spark 330') -@pytest.mark.xfail(condition = is_not_utc() and is_databricks_runtime(), reason = 'DB rebase mode is legacy: https://github.com/NVIDIA/spark-rapids/issues/9792') +@pytest.mark.xfail(condition = is_not_utc(), reason = 'DB rebase mode is legacy: https://github.com/NVIDIA/spark-rapids/issues/9792') def test_parquet_scan_without_aggregation_pushdown_not_fallback(spark_tmp_path): """ No aggregation will be pushed down in this test, so we should not fallback to CPU @@ -1233,7 +1233,7 @@ def test_parquet_read_daytime_interval_cpu_file(spark_tmp_path): lambda spark: spark.read.parquet(data_path)) @pytest.mark.skipif(is_before_spark_330(), reason='DayTimeInterval is not supported before Pyspark 3.3.0') -@pytest.mark.xfail(condition = is_not_utc() and is_databricks_runtime(), reason = 'DB rebase mode is legacy: https://github.com/NVIDIA/spark-rapids/issues/9792') +@pytest.mark.xfail(condition = is_not_utc(), reason = 'DB rebase mode is legacy: https://github.com/NVIDIA/spark-rapids/issues/9792') def test_parquet_read_daytime_interval_gpu_file(spark_tmp_path): data_path = spark_tmp_path + '/PARQUET_DATA' gen_list = [('_c1', DayTimeIntervalGen())] diff --git a/integration_tests/src/main/python/parquet_write_test.py b/integration_tests/src/main/python/parquet_write_test.py index 67190733026..930fbd0d352 100644 --- a/integration_tests/src/main/python/parquet_write_test.py +++ b/integration_tests/src/main/python/parquet_write_test.py @@ -123,7 +123,7 @@ def test_write_round_trip(spark_tmp_path, parquet_gens): all_empty_map_gen] @pytest.mark.parametrize('par_gen', par_write_odd_empty_strings_gens_sample, ids=idfn) -@pytest.mark.xfail(condition = is_not_utc() and is_databricks_runtime(), reason = 'DB rebase mode is legacy: https://github.com/NVIDIA/spark-rapids/issues/9792') +@pytest.mark.xfail(condition = is_not_utc(), reason = 'DB rebase mode is legacy: https://github.com/NVIDIA/spark-rapids/issues/9792') def test_write_round_trip_corner(spark_tmp_path, par_gen): gen_list = [('_c0', par_gen)] data_path = spark_tmp_path + '/PAR_DATA' @@ -241,7 +241,7 @@ def start(self, rand): parquet_write_compress_options.append('zstd') @pytest.mark.parametrize('compress', parquet_write_compress_options) -@pytest.mark.xfail(condition = is_not_utc() and is_databricks_runtime(), reason = 'DB rebase mode is legacy: https://github.com/NVIDIA/spark-rapids/issues/9792') +@pytest.mark.xfail(condition = is_not_utc(), reason = 'DB rebase mode is legacy: https://github.com/NVIDIA/spark-rapids/issues/9792') def test_compress_write_round_trip(spark_tmp_path, compress): data_path = spark_tmp_path + '/PARQUET_DATA' all_confs = {'spark.sql.parquet.compression.codec': compress} @@ -316,7 +316,7 @@ def writeParquetNoOverwriteCatchException(spark, df, data_path, table_name): df.coalesce(1).write.format("parquet").option("path", 
data_path).saveAsTable(table_name) assert e_info.match(r".*already exists.*") -@pytest.mark.xfail(condition = is_not_utc() and is_databricks_runtime(), reason = 'DB rebase mode is legacy: https://github.com/NVIDIA/spark-rapids/issues/9792') +@pytest.mark.xfail(condition = is_not_utc(), reason = 'DB rebase mode is legacy: https://github.com/NVIDIA/spark-rapids/issues/9792') def test_ts_write_twice_fails_exception(spark_tmp_path, spark_tmp_table_factory): gen = IntegerGen() data_path = spark_tmp_path + '/PARQUET_DATA' @@ -455,7 +455,7 @@ def sql_write(spark, path): # This test is testing how the parquet_writer will behave if column has a validity mask without having any nulls. # There is no straight forward to do it besides creating a vector with nulls and then dropping nulls # cudf will create a vector with a null_mask even though we have just filtered them -@pytest.mark.xfail(condition = is_not_utc() and is_databricks_runtime(), reason = 'DB rebase mode is legacy: https://github.com/NVIDIA/spark-rapids/issues/9792') +@pytest.mark.xfail(condition = is_not_utc(), reason = 'DB rebase mode is legacy: https://github.com/NVIDIA/spark-rapids/issues/9792') def test_write_map_nullable(spark_tmp_path): data_path = spark_tmp_path + '/PARQUET_DATA' @@ -514,7 +514,7 @@ def test_parquet_write_roundtrip_datetime_with_legacy_rebase(spark_tmp_path, dat @pytest.mark.allow_non_gpu(*test_non_empty_ctas_non_gpu_execs) @pytest.mark.parametrize('allow_non_empty', [True, False]) -@pytest.mark.xfail(condition = is_not_utc() and is_databricks_runtime(), reason = 'DB rebase mode is legacy: https://github.com/NVIDIA/spark-rapids/issues/9792') +@pytest.mark.xfail(condition = is_not_utc(), reason = 'DB rebase mode is legacy: https://github.com/NVIDIA/spark-rapids/issues/9792') def test_non_empty_ctas(spark_tmp_path, spark_tmp_table_factory, allow_non_empty): data_path = spark_tmp_path + "/CTAS" conf = { @@ -562,7 +562,7 @@ def get_nested_parquet_meta_data_for_field_id(): @pytest.mark.skipif(is_before_spark_330(), reason='Field ID is not supported before Spark 330') -@pytest.mark.xfail(condition = is_not_utc() and is_databricks_runtime(), reason = 'DB rebase mode is legacy: https://github.com/NVIDIA/spark-rapids/issues/9792') +@pytest.mark.xfail(condition = is_not_utc(), reason = 'DB rebase mode is legacy: https://github.com/NVIDIA/spark-rapids/issues/9792') def test_parquet_write_field_id(spark_tmp_path): data_path = spark_tmp_path + '/PARQUET_DATA' schema, data = get_nested_parquet_meta_data_for_field_id() @@ -580,7 +580,7 @@ def test_parquet_write_field_id(spark_tmp_path): conf=enable_parquet_field_id_read) @pytest.mark.skipif(is_before_spark_330(), reason='Field ID is not supported before Spark 330') -@pytest.mark.xfail(condition = is_not_utc() and is_databricks_runtime(), reason = 'DB rebase mode is legacy: https://github.com/NVIDIA/spark-rapids/issues/9792') +@pytest.mark.xfail(condition = is_not_utc(), reason = 'DB rebase mode is legacy: https://github.com/NVIDIA/spark-rapids/issues/9792') def test_parquet_write_field_id_disabled(spark_tmp_path): data_path = spark_tmp_path + '/PARQUET_DATA' schema, data = get_nested_parquet_meta_data_for_field_id() @@ -610,7 +610,7 @@ def test_write_daytime_interval(spark_tmp_path): @ignore_order @pytest.mark.skipif(is_before_spark_320(), reason="is only supported in Spark 320+") -@pytest.mark.xfail(condition = is_not_utc() and is_databricks_runtime(), reason = 'DB rebase mode is legacy: https://github.com/NVIDIA/spark-rapids/issues/9792') +@pytest.mark.xfail(condition = 
is_not_utc(), reason = 'DB rebase mode is legacy: https://github.com/NVIDIA/spark-rapids/issues/9792') def test_concurrent_writer(spark_tmp_path): data_path = spark_tmp_path + '/PARQUET_DATA' assert_gpu_and_cpu_writes_are_equal_collect( @@ -731,7 +731,7 @@ def write_partitions(spark, table_name): @ignore_order(local=True) -@pytest.mark.xfail(condition = is_not_utc() and is_databricks_runtime(), reason = 'DB rebase mode is legacy: https://github.com/NVIDIA/spark-rapids/issues/9792') +@pytest.mark.xfail(condition = is_not_utc(), reason = 'DB rebase mode is legacy: https://github.com/NVIDIA/spark-rapids/issues/9792') def test_dynamic_partitioned_parquet_write(spark_tmp_table_factory, spark_tmp_path): def create_input_table(spark): @@ -817,7 +817,7 @@ def test_write_with_planned_write_enabled(spark_tmp_path, planned_write_enabled, # Issue to test a known bug https://github.com/NVIDIA/spark-rapids/issues/8694 to avoid regression @ignore_order @allow_non_gpu("SortExec", "ShuffleExchangeExec") -@pytest.mark.xfail(condition = is_not_utc() and is_databricks_runtime(), reason = 'DB rebase mode is legacy: https://github.com/NVIDIA/spark-rapids/issues/9792') +@pytest.mark.xfail(condition = is_not_utc(), reason = 'DB rebase mode is legacy: https://github.com/NVIDIA/spark-rapids/issues/9792') def test_write_list_struct_single_element(spark_tmp_path): data_path = spark_tmp_path + '/PARQUET_DATA' data_gen = ArrayGen(StructGen([('element', long_gen)], nullable=False), max_length=10, nullable=False) @@ -829,7 +829,7 @@ def test_write_list_struct_single_element(spark_tmp_path): assert_gpu_and_cpu_are_equal_collect(lambda spark: spark.read.parquet(cpu_path), conf) @ignore_order -@pytest.mark.xfail(condition = is_not_utc() and is_databricks_runtime(), reason = 'DB rebase mode is legacy: https://github.com/NVIDIA/spark-rapids/issues/9792') +@pytest.mark.xfail(condition = is_not_utc(), reason = 'DB rebase mode is legacy: https://github.com/NVIDIA/spark-rapids/issues/9792') def test_parquet_write_column_name_with_dots(spark_tmp_path): data_path = spark_tmp_path + "/PARQUET_DATA" gens = [ From 63362c7abb5546b1cfcb285b5e95ea9eca977666 Mon Sep 17 00:00:00 2001 From: Chong Gao Date: Thu, 23 Nov 2023 16:39:42 +0800 Subject: [PATCH 11/20] Revert "Xfail Databricks cases because its default rebase mode is legacy" This reverts commit c2b5ffb8a66fa47f778ca397c8aa0ae1b2f87ba7. 
--- integration_tests/src/main/python/hive_write_test.py | 2 -- integration_tests/src/main/python/parquet_test.py | 2 -- .../src/main/python/parquet_write_test.py | 11 ----------- 3 files changed, 15 deletions(-) diff --git a/integration_tests/src/main/python/hive_write_test.py b/integration_tests/src/main/python/hive_write_test.py index c6b9e7b7d8a..7bc5ceede85 100644 --- a/integration_tests/src/main/python/hive_write_test.py +++ b/integration_tests/src/main/python/hive_write_test.py @@ -153,11 +153,9 @@ def test_optimized_hive_bucketed_fallback(gens, storage, planned_write, spark_tm "ExecutedCommandExec", {"spark.sql.optimizer.plannedWrite.enabled": planned_write}) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'DB rebase mode is legacy: https://github.com/NVIDIA/spark-rapids/issues/9792') def test_hive_copy_ints_to_long(spark_tmp_table_factory): do_hive_copy(spark_tmp_table_factory, int_gen, "INT", "BIGINT") -@pytest.mark.xfail(condition = is_not_utc(), reason = 'DB rebase mode is legacy: https://github.com/NVIDIA/spark-rapids/issues/9792') def test_hive_copy_longs_to_float(spark_tmp_table_factory): do_hive_copy(spark_tmp_table_factory, long_gen, "BIGINT", "FLOAT") diff --git a/integration_tests/src/main/python/parquet_test.py b/integration_tests/src/main/python/parquet_test.py index 97efbf83049..f6cc2a0141b 100644 --- a/integration_tests/src/main/python/parquet_test.py +++ b/integration_tests/src/main/python/parquet_test.py @@ -957,7 +957,6 @@ def test_parquet_reading_from_unaligned_pages_basic_filters_with_nulls(spark_tmp } @pytest.mark.skipif(is_before_spark_330(), reason='Aggregate push down on Parquet is a new feature of Spark 330') -@pytest.mark.xfail(condition = is_not_utc(), reason = 'DB rebase mode is legacy: https://github.com/NVIDIA/spark-rapids/issues/9792') def test_parquet_scan_without_aggregation_pushdown_not_fallback(spark_tmp_path): """ No aggregation will be pushed down in this test, so we should not fallback to CPU @@ -1233,7 +1232,6 @@ def test_parquet_read_daytime_interval_cpu_file(spark_tmp_path): lambda spark: spark.read.parquet(data_path)) @pytest.mark.skipif(is_before_spark_330(), reason='DayTimeInterval is not supported before Pyspark 3.3.0') -@pytest.mark.xfail(condition = is_not_utc(), reason = 'DB rebase mode is legacy: https://github.com/NVIDIA/spark-rapids/issues/9792') def test_parquet_read_daytime_interval_gpu_file(spark_tmp_path): data_path = spark_tmp_path + '/PARQUET_DATA' gen_list = [('_c1', DayTimeIntervalGen())] diff --git a/integration_tests/src/main/python/parquet_write_test.py b/integration_tests/src/main/python/parquet_write_test.py index 930fbd0d352..1db3a16c39d 100644 --- a/integration_tests/src/main/python/parquet_write_test.py +++ b/integration_tests/src/main/python/parquet_write_test.py @@ -123,7 +123,6 @@ def test_write_round_trip(spark_tmp_path, parquet_gens): all_empty_map_gen] @pytest.mark.parametrize('par_gen', par_write_odd_empty_strings_gens_sample, ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'DB rebase mode is legacy: https://github.com/NVIDIA/spark-rapids/issues/9792') def test_write_round_trip_corner(spark_tmp_path, par_gen): gen_list = [('_c0', par_gen)] data_path = spark_tmp_path + '/PAR_DATA' @@ -241,7 +240,6 @@ def start(self, rand): parquet_write_compress_options.append('zstd') @pytest.mark.parametrize('compress', parquet_write_compress_options) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'DB rebase mode is legacy: https://github.com/NVIDIA/spark-rapids/issues/9792') def 
test_compress_write_round_trip(spark_tmp_path, compress): data_path = spark_tmp_path + '/PARQUET_DATA' all_confs = {'spark.sql.parquet.compression.codec': compress} @@ -316,7 +314,6 @@ def writeParquetNoOverwriteCatchException(spark, df, data_path, table_name): df.coalesce(1).write.format("parquet").option("path", data_path).saveAsTable(table_name) assert e_info.match(r".*already exists.*") -@pytest.mark.xfail(condition = is_not_utc(), reason = 'DB rebase mode is legacy: https://github.com/NVIDIA/spark-rapids/issues/9792') def test_ts_write_twice_fails_exception(spark_tmp_path, spark_tmp_table_factory): gen = IntegerGen() data_path = spark_tmp_path + '/PARQUET_DATA' @@ -455,7 +452,6 @@ def sql_write(spark, path): # This test is testing how the parquet_writer will behave if column has a validity mask without having any nulls. # There is no straight forward to do it besides creating a vector with nulls and then dropping nulls # cudf will create a vector with a null_mask even though we have just filtered them -@pytest.mark.xfail(condition = is_not_utc(), reason = 'DB rebase mode is legacy: https://github.com/NVIDIA/spark-rapids/issues/9792') def test_write_map_nullable(spark_tmp_path): data_path = spark_tmp_path + '/PARQUET_DATA' @@ -514,7 +510,6 @@ def test_parquet_write_roundtrip_datetime_with_legacy_rebase(spark_tmp_path, dat @pytest.mark.allow_non_gpu(*test_non_empty_ctas_non_gpu_execs) @pytest.mark.parametrize('allow_non_empty', [True, False]) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'DB rebase mode is legacy: https://github.com/NVIDIA/spark-rapids/issues/9792') def test_non_empty_ctas(spark_tmp_path, spark_tmp_table_factory, allow_non_empty): data_path = spark_tmp_path + "/CTAS" conf = { @@ -562,7 +557,6 @@ def get_nested_parquet_meta_data_for_field_id(): @pytest.mark.skipif(is_before_spark_330(), reason='Field ID is not supported before Spark 330') -@pytest.mark.xfail(condition = is_not_utc(), reason = 'DB rebase mode is legacy: https://github.com/NVIDIA/spark-rapids/issues/9792') def test_parquet_write_field_id(spark_tmp_path): data_path = spark_tmp_path + '/PARQUET_DATA' schema, data = get_nested_parquet_meta_data_for_field_id() @@ -580,7 +574,6 @@ def test_parquet_write_field_id(spark_tmp_path): conf=enable_parquet_field_id_read) @pytest.mark.skipif(is_before_spark_330(), reason='Field ID is not supported before Spark 330') -@pytest.mark.xfail(condition = is_not_utc(), reason = 'DB rebase mode is legacy: https://github.com/NVIDIA/spark-rapids/issues/9792') def test_parquet_write_field_id_disabled(spark_tmp_path): data_path = spark_tmp_path + '/PARQUET_DATA' schema, data = get_nested_parquet_meta_data_for_field_id() @@ -610,7 +603,6 @@ def test_write_daytime_interval(spark_tmp_path): @ignore_order @pytest.mark.skipif(is_before_spark_320(), reason="is only supported in Spark 320+") -@pytest.mark.xfail(condition = is_not_utc(), reason = 'DB rebase mode is legacy: https://github.com/NVIDIA/spark-rapids/issues/9792') def test_concurrent_writer(spark_tmp_path): data_path = spark_tmp_path + '/PARQUET_DATA' assert_gpu_and_cpu_writes_are_equal_collect( @@ -731,7 +723,6 @@ def write_partitions(spark, table_name): @ignore_order(local=True) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'DB rebase mode is legacy: https://github.com/NVIDIA/spark-rapids/issues/9792') def test_dynamic_partitioned_parquet_write(spark_tmp_table_factory, spark_tmp_path): def create_input_table(spark): @@ -817,7 +808,6 @@ def test_write_with_planned_write_enabled(spark_tmp_path, 
planned_write_enabled, # Issue to test a known bug https://github.com/NVIDIA/spark-rapids/issues/8694 to avoid regression @ignore_order @allow_non_gpu("SortExec", "ShuffleExchangeExec") -@pytest.mark.xfail(condition = is_not_utc(), reason = 'DB rebase mode is legacy: https://github.com/NVIDIA/spark-rapids/issues/9792') def test_write_list_struct_single_element(spark_tmp_path): data_path = spark_tmp_path + '/PARQUET_DATA' data_gen = ArrayGen(StructGen([('element', long_gen)], nullable=False), max_length=10, nullable=False) @@ -829,7 +819,6 @@ def test_write_list_struct_single_element(spark_tmp_path): assert_gpu_and_cpu_are_equal_collect(lambda spark: spark.read.parquet(cpu_path), conf) @ignore_order -@pytest.mark.xfail(condition = is_not_utc(), reason = 'DB rebase mode is legacy: https://github.com/NVIDIA/spark-rapids/issues/9792') def test_parquet_write_column_name_with_dots(spark_tmp_path): data_path = spark_tmp_path + "/PARQUET_DATA" gens = [ From c54d064dbc215b3bb91b290ece7291ab7f27aedd Mon Sep 17 00:00:00 2001 From: Chong Gao Date: Thu, 23 Nov 2023 16:39:43 +0800 Subject: [PATCH 12/20] Revert "Xfail more cases that involve timestamp type" This reverts commit 3f8bc40fc0ac37edd1dd4ec5aae7e48992c33ced. --- integration_tests/src/main/python/date_time_test.py | 1 - integration_tests/src/main/python/parquet_testing_test.py | 3 +-- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/integration_tests/src/main/python/date_time_test.py b/integration_tests/src/main/python/date_time_test.py index befecb2071f..423617fafce 100644 --- a/integration_tests/src/main/python/date_time_test.py +++ b/integration_tests/src/main/python/date_time_test.py @@ -381,7 +381,6 @@ def test_unix_timestamp_improved(data_gen, ansi_enabled): @pytest.mark.parametrize('ansi_enabled', [True, False], ids=['ANSI_ON', 'ANSI_OFF']) @pytest.mark.parametrize('data_gen', date_n_time_gens, ids=idfn) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_unix_timestamp(data_gen, ansi_enabled): assert_gpu_and_cpu_are_equal_collect( lambda spark : unary_op_df(spark, data_gen).select(f.unix_timestamp(f.col("a"))), diff --git a/integration_tests/src/main/python/parquet_testing_test.py b/integration_tests/src/main/python/parquet_testing_test.py index efa4b62ca61..642d99c8f0b 100644 --- a/integration_tests/src/main/python/parquet_testing_test.py +++ b/integration_tests/src/main/python/parquet_testing_test.py @@ -16,7 +16,7 @@ # https://github.com/apache/parquet-testing from asserts import assert_gpu_and_cpu_are_equal_collect, assert_gpu_and_cpu_error -from conftest import get_std_input_path, is_not_utc, is_parquet_testing_tests_forced, is_precommit_run +from conftest import get_std_input_path, is_parquet_testing_tests_forced, is_precommit_run from data_gen import copy_and_update from pathlib import Path import pytest @@ -122,7 +122,6 @@ def gen_testing_params_for_valid_files(): @pytest.mark.parametrize("path", gen_testing_params_for_valid_files()) @pytest.mark.parametrize("confs", [_native_reader_confs, _java_reader_confs]) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_parquet_testing_valid_files(path, confs): assert_gpu_and_cpu_are_equal_collect(lambda spark: spark.read.parquet(path), conf=confs) From bfc8b387a9299911f3bc847415a0e93df6084b80 Mon Sep 17 00:00:00 2001 From: Chong Gao Date: Thu, 23 Nov 2023 17:12:12 
+0800 Subject: [PATCH 13/20] Revert "Temporarily testing non-UTC test cases becasue of non-UTC TZ pipeline is not ready" This reverts commit 9530b23bb70ff12c5381c6c3a96129912a0e096f. --- integration_tests/run_pyspark_from_build.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/integration_tests/run_pyspark_from_build.sh b/integration_tests/run_pyspark_from_build.sh index 0a5a6557940..f6e32c72161 100755 --- a/integration_tests/run_pyspark_from_build.sh +++ b/integration_tests/run_pyspark_from_build.sh @@ -224,7 +224,7 @@ else fi # time zone will be tested; use export TZ=time_zone_name before run this script - export TZ=Iran + TZ=${TZ:-UTC} # Set the Delta log cache size to prevent the driver from caching every Delta log indefinitely export PYSP_TEST_spark_driver_extraJavaOptions="-ea -Duser.timezone=$TZ -Ddelta.log.cacheSize=10 $COVERAGE_SUBMIT_FLAGS" From 498bca6ac6ab5eb0560c4670b133c7bae1bd2fb9 Mon Sep 17 00:00:00 2001 From: Chong Gao Date: Thu, 23 Nov 2023 17:29:38 +0800 Subject: [PATCH 14/20] Temporarily testing non-UTC test cases --- jenkins/spark-premerge-build.sh | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/jenkins/spark-premerge-build.sh b/jenkins/spark-premerge-build.sh index a13b5137af0..4a26ac44034 100755 --- a/jenkins/spark-premerge-build.sh +++ b/jenkins/spark-premerge-build.sh @@ -154,7 +154,18 @@ ci_2() { # Download a Scala 2.12 build of spark prepare_spark $SPARK_VER 2.12 - ./integration_tests/run_pyspark_from_build.sh + + + # Temporarily test non-UTC TZ + # Only test the impacted 42 files + # Split multiple parts to test to avoid OOM: refer to: https://github.com/NVIDIA/spark-rapids/issues/9829 + TZ=Asia/Shanghai ./integration_tests/run_pyspark_from_build.sh -k 'aqe_test.py or arithmetic_ops_test.py or array_test.py or ast_test.py or cache_test.py or cast_test.py or cmp_test.py' + TZ=Asia/Shanghai ./integration_tests/run_pyspark_from_build.sh -k 'collection_ops_test.py or conditionals_test.py or csv_test.py or datasourcev2_read_test.py or date_time_test.py or expand_exec_test.py or explain_test.py' + TZ=Asia/Shanghai ./integration_tests/run_pyspark_from_build.sh -k 'fastparquet_compatibility_test.py or generate_expr_test.py or hash_aggregate_test.py or hashing_test.py or hive_delimited_text_test.py or hive_write_test.py or join_test.py' + TZ=Asia/Shanghai ./integration_tests/run_pyspark_from_build.sh -k 'json_test.py or limit_test.py or map_test.py or mortgage_test.py or orc_cast_test.py or orc_test.py or orc_write_test.py' + TZ=Asia/Shanghai ./integration_tests/run_pyspark_from_build.sh -k 'parquet_test.py or parquet_write_test.py or qa_nightly_select_test.py or repart_test.py or row-based_udf_test.py or row_conversion_test.py or sample_test.py' + TZ=Asia/Shanghai ./integration_tests/run_pyspark_from_build.sh -k 'schema_evolution_test.py or sort_test.py or struct_test.py or subquery_test.py or time_window_test.py or udf_test.py or window_function_test.py' + # enable avro test separately INCLUDE_SPARK_AVRO_JAR=true TEST='avro_test.py' ./integration_tests/run_pyspark_from_build.sh # export 'LC_ALL' to set locale with UTF-8 so regular expressions are enabled From 500d293234006b5c1c298b117d806116a45fdc07 Mon Sep 17 00:00:00 2001 From: Chong Gao Date: Thu, 23 Nov 2023 17:36:36 +0800 Subject: [PATCH 15/20] Temporarily testing non-UTC test cases --- jenkins/spark-premerge-build.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/jenkins/spark-premerge-build.sh b/jenkins/spark-premerge-build.sh index 4a26ac44034..4a3cc28181f 
100755 --- a/jenkins/spark-premerge-build.sh +++ b/jenkins/spark-premerge-build.sh @@ -159,6 +159,7 @@ ci_2() { # Temporarily test non-UTC TZ # Only test the impacted 42 files # Split multiple parts to test to avoid OOM: refer to: https://github.com/NVIDIA/spark-rapids/issues/9829 + export TEST_PARALLEL=1 TZ=Asia/Shanghai ./integration_tests/run_pyspark_from_build.sh -k 'aqe_test.py or arithmetic_ops_test.py or array_test.py or ast_test.py or cache_test.py or cast_test.py or cmp_test.py' TZ=Asia/Shanghai ./integration_tests/run_pyspark_from_build.sh -k 'collection_ops_test.py or conditionals_test.py or csv_test.py or datasourcev2_read_test.py or date_time_test.py or expand_exec_test.py or explain_test.py' TZ=Asia/Shanghai ./integration_tests/run_pyspark_from_build.sh -k 'fastparquet_compatibility_test.py or generate_expr_test.py or hash_aggregate_test.py or hashing_test.py or hive_delimited_text_test.py or hive_write_test.py or join_test.py' From d1a3e26b0e3e618fd735342aef8e2e409f4a732c Mon Sep 17 00:00:00 2001 From: Chong Gao Date: Thu, 23 Nov 2023 16:40:43 +0330 Subject: [PATCH 16/20] Fix --- integration_tests/src/main/python/date_time_test.py | 2 ++ integration_tests/src/main/python/json_test.py | 2 ++ 2 files changed, 4 insertions(+) diff --git a/integration_tests/src/main/python/date_time_test.py b/integration_tests/src/main/python/date_time_test.py index 423617fafce..1d984193f9e 100644 --- a/integration_tests/src/main/python/date_time_test.py +++ b/integration_tests/src/main/python/date_time_test.py @@ -296,6 +296,7 @@ def test_from_utc_timestamp_unsupported_timezone_fallback(data_gen, time_zone): @pytest.mark.parametrize('time_zone', ["UTC", "Asia/Shanghai", "EST", "MST", "VST"], ids=idfn) @pytest.mark.parametrize('data_gen', [timestamp_gen], ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_from_utc_timestamp_supported_timezones(data_gen, time_zone): # Remove spark.rapids.test.CPU.timezone configuration when GPU kernel is ready to really test on GPU assert_gpu_and_cpu_are_equal_collect( @@ -381,6 +382,7 @@ def test_unix_timestamp_improved(data_gen, ansi_enabled): @pytest.mark.parametrize('ansi_enabled', [True, False], ids=['ANSI_ON', 'ANSI_OFF']) @pytest.mark.parametrize('data_gen', date_n_time_gens, ids=idfn) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_unix_timestamp(data_gen, ansi_enabled): assert_gpu_and_cpu_are_equal_collect( lambda spark : unary_op_df(spark, data_gen).select(f.unix_timestamp(f.col("a"))), diff --git a/integration_tests/src/main/python/json_test.py b/integration_tests/src/main/python/json_test.py index 79337033104..41571a203d5 100644 --- a/integration_tests/src/main/python/json_test.py +++ b/integration_tests/src/main/python/json_test.py @@ -560,6 +560,7 @@ def test_from_json_struct_decimal(): pytest.param("LEGACY", marks=pytest.mark.allow_non_gpu('ProjectExec')), "CORRECTED" ]) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_from_json_struct_date(date_gen, date_format, time_parser_policy): json_string_gen = StringGen(r'{ "a": ' + date_gen + ' }') \ .with_special_case('{ "a": null }') \ @@ -642,6 +643,7 @@ def test_from_json_struct_date_fallback_non_default_format(date_gen, date_format "CORRECTED" ]) 
@pytest.mark.parametrize('ansi_enabled', [ True, False ]) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_from_json_struct_timestamp(timestamp_gen, timestamp_format, time_parser_policy, ansi_enabled): json_string_gen = StringGen(r'{ "a": ' + timestamp_gen + ' }') \ .with_special_case('{ "a": null }') \ From 12a9e756aa00e51da2f4eb85c3435d8c4c704e04 Mon Sep 17 00:00:00 2001 From: Chong Gao Date: Fri, 24 Nov 2023 08:32:39 +0800 Subject: [PATCH 17/20] Restore TEST_PARALLEL from 1 to 5 becasue of running is slow --- jenkins/spark-premerge-build.sh | 1 - 1 file changed, 1 deletion(-) diff --git a/jenkins/spark-premerge-build.sh b/jenkins/spark-premerge-build.sh index 4a3cc28181f..4a26ac44034 100755 --- a/jenkins/spark-premerge-build.sh +++ b/jenkins/spark-premerge-build.sh @@ -159,7 +159,6 @@ ci_2() { # Temporarily test non-UTC TZ # Only test the impacted 42 files # Split multiple parts to test to avoid OOM: refer to: https://github.com/NVIDIA/spark-rapids/issues/9829 - export TEST_PARALLEL=1 TZ=Asia/Shanghai ./integration_tests/run_pyspark_from_build.sh -k 'aqe_test.py or arithmetic_ops_test.py or array_test.py or ast_test.py or cache_test.py or cast_test.py or cmp_test.py' TZ=Asia/Shanghai ./integration_tests/run_pyspark_from_build.sh -k 'collection_ops_test.py or conditionals_test.py or csv_test.py or datasourcev2_read_test.py or date_time_test.py or expand_exec_test.py or explain_test.py' TZ=Asia/Shanghai ./integration_tests/run_pyspark_from_build.sh -k 'fastparquet_compatibility_test.py or generate_expr_test.py or hash_aggregate_test.py or hashing_test.py or hive_delimited_text_test.py or hive_write_test.py or join_test.py' From 6d21a9953f796809146c32478f92727a9dda2602 Mon Sep 17 00:00:00 2001 From: Chong Gao Date: Fri, 24 Nov 2023 08:49:36 +0800 Subject: [PATCH 18/20] Add one more file for non-UTC time zone --- integration_tests/src/main/python/parquet_testing_test.py | 1 + jenkins/spark-premerge-build.sh | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/integration_tests/src/main/python/parquet_testing_test.py b/integration_tests/src/main/python/parquet_testing_test.py index 642d99c8f0b..3b5606efcc9 100644 --- a/integration_tests/src/main/python/parquet_testing_test.py +++ b/integration_tests/src/main/python/parquet_testing_test.py @@ -122,6 +122,7 @@ def gen_testing_params_for_valid_files(): @pytest.mark.parametrize("path", gen_testing_params_for_valid_files()) @pytest.mark.parametrize("confs", [_native_reader_confs, _java_reader_confs]) +@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') def test_parquet_testing_valid_files(path, confs): assert_gpu_and_cpu_are_equal_collect(lambda spark: spark.read.parquet(path), conf=confs) diff --git a/jenkins/spark-premerge-build.sh b/jenkins/spark-premerge-build.sh index 4a26ac44034..3ebf720a411 100755 --- a/jenkins/spark-premerge-build.sh +++ b/jenkins/spark-premerge-build.sh @@ -157,14 +157,14 @@ ci_2() { # Temporarily test non-UTC TZ - # Only test the impacted 42 files + # Only test the impacted 43 files # Split multiple parts to test to avoid OOM: refer to: https://github.com/NVIDIA/spark-rapids/issues/9829 TZ=Asia/Shanghai ./integration_tests/run_pyspark_from_build.sh -k 'aqe_test.py or arithmetic_ops_test.py or array_test.py or ast_test.py or cache_test.py or cast_test.py or cmp_test.py' TZ=Asia/Shanghai 
./integration_tests/run_pyspark_from_build.sh -k 'collection_ops_test.py or conditionals_test.py or csv_test.py or datasourcev2_read_test.py or date_time_test.py or expand_exec_test.py or explain_test.py' TZ=Asia/Shanghai ./integration_tests/run_pyspark_from_build.sh -k 'fastparquet_compatibility_test.py or generate_expr_test.py or hash_aggregate_test.py or hashing_test.py or hive_delimited_text_test.py or hive_write_test.py or join_test.py' TZ=Asia/Shanghai ./integration_tests/run_pyspark_from_build.sh -k 'json_test.py or limit_test.py or map_test.py or mortgage_test.py or orc_cast_test.py or orc_test.py or orc_write_test.py' TZ=Asia/Shanghai ./integration_tests/run_pyspark_from_build.sh -k 'parquet_test.py or parquet_write_test.py or qa_nightly_select_test.py or repart_test.py or row-based_udf_test.py or row_conversion_test.py or sample_test.py' - TZ=Asia/Shanghai ./integration_tests/run_pyspark_from_build.sh -k 'schema_evolution_test.py or sort_test.py or struct_test.py or subquery_test.py or time_window_test.py or udf_test.py or window_function_test.py' + TZ=Asia/Shanghai ./integration_tests/run_pyspark_from_build.sh -k 'schema_evolution_test.py or sort_test.py or struct_test.py or subquery_test.py or time_window_test.py or udf_test.py or window_function_test.py or parquet_testing_test.py' # enable avro test separately INCLUDE_SPARK_AVRO_JAR=true TEST='avro_test.py' ./integration_tests/run_pyspark_from_build.sh From 1311ffeb1e5362b9a3b42131e512b5788cbf9a59 Mon Sep 17 00:00:00 2001 From: Chong Gao Date: Fri, 24 Nov 2023 09:33:04 +0800 Subject: [PATCH 19/20] Fix import error --- integration_tests/src/main/python/parquet_testing_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/integration_tests/src/main/python/parquet_testing_test.py b/integration_tests/src/main/python/parquet_testing_test.py index 3b5606efcc9..a4600de7b86 100644 --- a/integration_tests/src/main/python/parquet_testing_test.py +++ b/integration_tests/src/main/python/parquet_testing_test.py @@ -16,7 +16,7 @@ # https://github.com/apache/parquet-testing from asserts import assert_gpu_and_cpu_are_equal_collect, assert_gpu_and_cpu_error -from conftest import get_std_input_path, is_parquet_testing_tests_forced, is_precommit_run +from conftest import get_std_input_path, is_parquet_testing_tests_forced, is_precommit_run, is_not_utc from data_gen import copy_and_update from pathlib import Path import pytest From 6a02eb7e705d82422056e1b80e57e736dfcd79fc Mon Sep 17 00:00:00 2001 From: Chong Gao Date: Fri, 24 Nov 2023 14:00:27 +0800 Subject: [PATCH 20/20] Test a portion of cases for non-UTC time zone in pre-merge --- jenkins/spark-premerge-build.sh | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/jenkins/spark-premerge-build.sh b/jenkins/spark-premerge-build.sh index 3ebf720a411..c9b1369807c 100755 --- a/jenkins/spark-premerge-build.sh +++ b/jenkins/spark-premerge-build.sh @@ -154,17 +154,12 @@ ci_2() { # Download a Scala 2.12 build of spark prepare_spark $SPARK_VER 2.12 + ./integration_tests/run_pyspark_from_build.sh - - # Temporarily test non-UTC TZ - # Only test the impacted 43 files - # Split multiple parts to test to avoid OOM: refer to: https://github.com/NVIDIA/spark-rapids/issues/9829 - TZ=Asia/Shanghai ./integration_tests/run_pyspark_from_build.sh -k 'aqe_test.py or arithmetic_ops_test.py or array_test.py or ast_test.py or cache_test.py or cast_test.py or cmp_test.py' - TZ=Asia/Shanghai ./integration_tests/run_pyspark_from_build.sh -k 'collection_ops_test.py or 
conditionals_test.py or csv_test.py or datasourcev2_read_test.py or date_time_test.py or expand_exec_test.py or explain_test.py' - TZ=Asia/Shanghai ./integration_tests/run_pyspark_from_build.sh -k 'fastparquet_compatibility_test.py or generate_expr_test.py or hash_aggregate_test.py or hashing_test.py or hive_delimited_text_test.py or hive_write_test.py or join_test.py' - TZ=Asia/Shanghai ./integration_tests/run_pyspark_from_build.sh -k 'json_test.py or limit_test.py or map_test.py or mortgage_test.py or orc_cast_test.py or orc_test.py or orc_write_test.py' - TZ=Asia/Shanghai ./integration_tests/run_pyspark_from_build.sh -k 'parquet_test.py or parquet_write_test.py or qa_nightly_select_test.py or repart_test.py or row-based_udf_test.py or row_conversion_test.py or sample_test.py' - TZ=Asia/Shanghai ./integration_tests/run_pyspark_from_build.sh -k 'schema_evolution_test.py or sort_test.py or struct_test.py or subquery_test.py or time_window_test.py or udf_test.py or window_function_test.py or parquet_testing_test.py' + # Test a portion of cases for non-UTC time zone because of limited GPU resources. + # Here testing: parquet scan, orc scan, csv scan, cast, TimeZoneAwareExpression, FromUTCTimestamp + # Nightly CIs will cover all the cases. + TZ=Iran TEST='test_parquet_read_round_trip or test_read_round_trip or test_basic_csv_read or test_cast_string_ts_valid_format or test_unix_timestamp or test_from_utc_timestamp' ./integration_tests/run_pyspark_from_build.sh # enable avro test separately INCLUDE_SPARK_AVRO_JAR=true TEST='avro_test.py' ./integration_tests/run_pyspark_from_build.sh
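
Note for readers following this series outside the repository: every xfail marker added above hinges on a conftest-style predicate named is_not_utc(), and the premerge script drives it by exporting TZ (e.g. TZ=Iran or TZ=Asia/Shanghai) before invoking run_pyspark_from_build.sh. The snippet below is only a minimal, hypothetical sketch of how such a predicate and marker fit together; the helper name is taken from the diffs, but its body here is an assumption, and the actual implementation in the project's conftest.py may differ. The test function is likewise invented for illustration.

import os
import pytest

def is_not_utc():
    # Hypothetical stand-in for the conftest.py helper: treat the session as
    # non-UTC whenever the TZ environment variable (exported by the premerge
    # script, e.g. TZ=Iran) names a zone other than UTC.
    return os.environ.get('TZ', 'UTC') != 'UTC'

@pytest.mark.xfail(condition=is_not_utc(),
                   reason='xfail non-UTC time zone tests because of '
                          'https://github.com/NVIDIA/spark-rapids/issues/9653')
def test_example_timestamp_round_trip():
    # Placeholder body; the real tests compare CPU and GPU results via the
    # assert_gpu_and_cpu_are_equal_collect helpers shown in the diffs above.
    assert True

Because the condition is an ordinary function call evaluated when the module is imported during collection, exporting TZ before launching the run (as the premerge script does) is enough for the marker to take effect.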